In [None]:
#포르토 세구로 안전 운전자 예측 경진대회
#https://www.kaggle.com/c/porto-seguro-safe-driver-prediction
#운전자A가 내년에 보험을 청구할 확률을 예측하는 경진대회
#학습 데이터는 총 59만, 테스트 데이터는 총 89만개의 운전자 데이터를 포함

# Scoring Metric

In [None]:
#Submissions are evaluated using the Normalized Gini Coefficient.

#During scoring, observations are sorted from the largest to the smallest predictions. Predictions are only used for ordering observations; therefore, the relative magnitude of the predictions are not used during scoring. The scoring algorithm then compares the cumulative proportion of positive class observations to a theoretical uniform proportion.

#The Gini Coefficient ranges from approximately 0 for random guessing, to approximately 0.5 for a perfect score. The theoretical maximum for the discrete calculation is (1 - frac_pos) / 2.

#The Normalized Gini Coefficient adjusts the score by the theoretical maximum so that the maximum score is 1.

#The code to calculate Normalized Gini Coefficient in a number of different languages can be found in this forum thread.

# Submission File

In [None]:
#For each id in the test set, you must predict a probability of an insurance claim in the target column. 
#The file should contain a header and have the following format:

id,target
0,0.1
1,0.9
2,1.0
etc.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
p_train = pd.read_csv('../data/porto-seguro-safe-driver-prediction/train.csv')
p_test = pd.read_csv('../data/porto-seguro-safe-driver-prediction/test.csv')

In [3]:
p_train.shape, p_test.shape

((595212, 59), (892816, 58))

In [4]:
p_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595212 entries, 0 to 595211
Data columns (total 59 columns):
id                595212 non-null int64
target            595212 non-null int64
ps_ind_01         595212 non-null int64
ps_ind_02_cat     595212 non-null int64
ps_ind_03         595212 non-null int64
ps_ind_04_cat     595212 non-null int64
ps_ind_05_cat     595212 non-null int64
ps_ind_06_bin     595212 non-null int64
ps_ind_07_bin     595212 non-null int64
ps_ind_08_bin     595212 non-null int64
ps_ind_09_bin     595212 non-null int64
ps_ind_10_bin     595212 non-null int64
ps_ind_11_bin     595212 non-null int64
ps_ind_12_bin     595212 non-null int64
ps_ind_13_bin     595212 non-null int64
ps_ind_14         595212 non-null int64
ps_ind_15         595212 non-null int64
ps_ind_16_bin     595212 non-null int64
ps_ind_17_bin     595212 non-null int64
ps_ind_18_bin     595212 non-null int64
ps_reg_01         595212 non-null float64
ps_reg_02         595212 non-null float64
ps_re

In [5]:
p_train.isna().sum()

id                0
target            0
ps_ind_01         0
ps_ind_02_cat     0
ps_ind_03         0
ps_ind_04_cat     0
ps_ind_05_cat     0
ps_ind_06_bin     0
ps_ind_07_bin     0
ps_ind_08_bin     0
ps_ind_09_bin     0
ps_ind_10_bin     0
ps_ind_11_bin     0
ps_ind_12_bin     0
ps_ind_13_bin     0
ps_ind_14         0
ps_ind_15         0
ps_ind_16_bin     0
ps_ind_17_bin     0
ps_ind_18_bin     0
ps_reg_01         0
ps_reg_02         0
ps_reg_03         0
ps_car_01_cat     0
ps_car_02_cat     0
ps_car_03_cat     0
ps_car_04_cat     0
ps_car_05_cat     0
ps_car_06_cat     0
ps_car_07_cat     0
ps_car_08_cat     0
ps_car_09_cat     0
ps_car_10_cat     0
ps_car_11_cat     0
ps_car_11         0
ps_car_12         0
ps_car_13         0
ps_car_14         0
ps_car_15         0
ps_calc_01        0
ps_calc_02        0
ps_calc_03        0
ps_calc_04        0
ps_calc_05        0
ps_calc_06        0
ps_calc_07        0
ps_calc_08        0
ps_calc_09        0
ps_calc_10        0
ps_calc_11        0


In [6]:
p_train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [None]:
#0 = 결측치? 특성을 알 수 없으니 결측치 대체가 어려움

In [7]:
p_train['target'].value_counts()

0    573518
1     21694
Name: target, dtype: int64

In [8]:
p_test.head()

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,0,0,1,8,1,0,0,1,0,0,...,1,1,1,12,0,1,1,0,0,1
1,1,4,2,5,1,0,0,0,0,1,...,2,0,3,10,0,0,1,1,0,1
2,2,5,1,3,0,0,0,0,0,1,...,4,0,2,4,0,0,0,0,0,0
3,3,0,1,6,0,0,1,0,0,0,...,5,1,0,5,1,0,1,0,0,0
4,4,5,1,7,0,0,0,0,0,1,...,4,0,0,4,0,1,1,0,0,1


In [9]:
y = p_train['target']

In [10]:
X = p_train.drop('target', axis = 1)

In [11]:
X.head()

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,2,2,5,1,0,0,1,0,0,...,9,1,5,8,0,1,1,0,0,1
1,9,1,1,7,0,0,0,0,1,0,...,3,1,1,9,0,1,1,0,1,0
2,13,5,4,9,1,0,0,0,1,0,...,4,2,7,7,0,1,1,0,1,0
3,16,0,1,2,0,0,1,0,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,2,0,1,0,1,0,0,0,...,3,1,1,3,0,0,0,1,1,0


In [12]:
X_train = p_train.values
X_train

array([[7.000000e+00, 0.000000e+00, 2.000000e+00, ..., 0.000000e+00,
        0.000000e+00, 1.000000e+00],
       [9.000000e+00, 0.000000e+00, 1.000000e+00, ..., 0.000000e+00,
        1.000000e+00, 0.000000e+00],
       [1.300000e+01, 0.000000e+00, 5.000000e+00, ..., 0.000000e+00,
        1.000000e+00, 0.000000e+00],
       ...,
       [1.488017e+06, 0.000000e+00, 1.000000e+00, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00],
       [1.488021e+06, 0.000000e+00, 5.000000e+00, ..., 1.000000e+00,
        0.000000e+00, 0.000000e+00],
       [1.488027e+06, 0.000000e+00, 0.000000e+00, ..., 0.000000e+00,
        0.000000e+00, 0.000000e+00]])

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_scaled = scaler.transform(X_train)

In [14]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.preprocessing.image import ImageDataGenerator

Using TensorFlow backend.


In [90]:
model = Sequential()

model.add(Dense(1280,activation="relu",input_dim = np.shape(X_scaled)[1]))
#model.add(Dropout(0.25))
model.add(Dense(320, activation='relu'))
#model.add(Dropout(0.25))
model.add(Dense(320, activation='relu'))
model.add(Dense(1, activation='softmax'))

In [91]:
model.compile(loss = 'binary_crossentropy',
             optimizer = 'adam', metrics = ['accuracy'])

In [92]:
model.fit(X_scaled, y, epochs=10, batch_size=100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
 19600/595212 [..............................] - ETA: 51s - loss: 15.3681 - acc: 0.0360

KeyboardInterrupt: 

In [None]:
#SVM 그리드서치
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

for gamma  in [0.001, 0.01, 0.01, 1, 10, 100] :
    for C in [0.001, 0.01, 0.01, 1, 10, 100] :
        svm = SVC(gamma = gamma, C=C)
        svm.fit(X_scaled, y)
        score = svm.score(X_scaled, y)
        if score > best_score : 
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
            
print("최고 점수:{:.3f}".format(best_score))
print("최적 매개변수",best_parameters)

In [15]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 20, random_state = 2) 
forest.fit(X_scaled,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=2, verbose=0,
                       warm_start=False)

In [16]:
print("훈련 세트 점수: {:.3f}".format(forest.score(X_train,y)))

훈련 세트 점수: 0.964


In [21]:
predictions = forest.predict(p_test)

In [22]:
predictions = pd.DataFrame(predictions, columns=['target'])

In [23]:
P_test = pd.read_csv('../data/porto-seguro-safe-driver-prediction/test.csv')
predictions = pd.concat((P_test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv('y_test_ver_1.csv', sep=",", index = False)

In [24]:
submission = pd.read_csv('../data/porto-seguro-safe-driver-prediction/y_test_ver_1.csv')

In [27]:
np.unique(submission['target'])

array([0, 1], dtype=int64)

In [None]:
#score : 0.00114