In [1]:
import numpy as np, pandas as pd
import tensorflow as tf
from sklearn.decomposition import PCA
import time
from sklearn import svm

In [2]:
def onehotEncoding(dfWhole, feaName):
    cats = list(np.unique(np.array(dfWhole[feaName])))
    for cat in cats:
        dfWhole[feaName + str(cat)] = dfWhole[feaName] == cat
    return dfWhole

In [3]:
dfTrainOri = pd.read_csv(filepath_or_buffer='D:\\Workfiles\\Kaggle\\Porto Seguro Safe Driver Prediction\\data\\train.csv')
dfTestOri = pd.read_csv(filepath_or_buffer='D:\\Workfiles\\Kaggle\\Porto Seguro Safe Driver Prediction\\data\\test.csv')

In [4]:
dfWhole = pd.concat([dfTrainOri, dfTestOri])
dfWhole.index = dfWhole['id']
dfWhole = dfWhole.drop(['id'], axis = 1)

In [5]:
########################################### onehot encoding ################################################
feaToDelete = []
for fea in dfWhole.columns:
    if fea.split('_')[-1] == 'cat':
        feaToDelete.append(fea)
        dfWhole = onehotEncoding(dfWhole, fea)
dfWhole = dfWhole.drop(feaToDelete, axis = 1)

In [6]:
####################################### train/dev set dividing #############################################
isTest = pd.isnull(dfWhole['target'])
isNotTest = ~pd.isnull(dfWhole['target'])
numNotTest = np.sum(isNotTest)
rateDev = 0.1
numDev = int(numNotTest * rateDev)
numTrain = numNotTest - numDev
permut = np.random.permutation(numNotTest)

dfNotTest = dfWhole.loc[isNotTest, :]
dfTrainX = dfNotTest.iloc[permut[:numTrain], :]
dfDevX = dfNotTest.iloc[permut[numTrain:], :]
dfTrainY = dfTrainX['target']
dfTrainX = dfTrainX.drop(['target'], axis = 1)
dfDevY = dfDevX['target']
dfDevX = dfDevX.drop(['target'], axis = 1)
dfTestX = dfWhole.loc[isTest, :].drop(['target'], axis = 1)

In [7]:
######################################## transform df into ar ##############################################
arTrainX = np.array(dfTrainX).T
arTrainY = np.array(dfTrainY).T.reshape([-1])
arDevX = np.array(dfDevX).T
arDevY = np.array(dfDevY).T.reshape([-1])
arTestX = np.array(dfTestX).T

In [8]:
################################################## PCA #####################################################
pca = PCA(n_components = 20)
pca.fit(X = arTrainX.T)
print("explained_variance_ratio_:", pca.explained_variance_ratio_)
print("sum of above:", sum(pca.explained_variance_ratio_))
arTrainX = pca.transform(arTrainX.T).T
arDevX = pca.transform(arDevX.T).T
arTestX = pca.transform(arTestX.T).T

explained_variance_ratio_: [ 0.1871453   0.11910895  0.10655399  0.10108749  0.07680283  0.05147351
  0.0405334   0.03007437  0.02825043  0.02514396  0.02197689  0.02044376
  0.0181996   0.0176244   0.01301997  0.01054684  0.00971028  0.00805745
  0.00732174  0.00686088]
sum of above: 0.899936022874


In [9]:
################################################## NN ######################################################
layersize = [50, 1]
layercount = len(layersize)

seed = time.time()
X = tf.placeholder(shape = [arTrainX.shape[0], None], dtype = 'float')
Y_ = tf.placeholder(shape = [None], dtype = 'float')
W = dict()
b = dict()
Y = dict()
Y[0] = X
layersizeTemp = [arTrainX.shape[0]] + layersize
for i in range(layercount):
    W[i+1] = tf.Variable(tf.random_normal(seed = np.random.randint(seed), shape = [layersizeTemp[i+1], layersizeTemp[i]]) * 1)
    b[i+1] = tf.Variable(tf.zeros(shape = [layersizeTemp[i+1], 1]))
    if i != layercount - 1:
        Y[i+1] = tf.nn.relu(tf.matmul(W[i+1], Y[i]) + b[i+1])
Y[layercount] = tf.nn.sigmoid(tf.matmul(W[layercount], Y[layercount-1]) + b[layercount])
Youtput = Y[layercount]

correct_prediction = tf.equal(tf.round(Youtput), Y_)
correct_rate = tf.reduce_mean(tf.cast(correct_prediction, 'float'))

cost = tf.reduce_sum(tf.square(Youtput - Y_))

In [10]:
optimizer = tf.train.AdamOptimizer(0.1).minimize(cost)
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
init = tf.global_variables_initializer()
sess.run(init) 

for i in range(101):
    #sess.run(train_step, feed_dict = {X:dpTrainX.getData(size=50), Y_:dpTrainY.getData(size=50)})
    sess.run(optimizer, feed_dict = {X:arTrainX, Y_:arTrainY})
    if i % 10 == 0:
        predTrain = sess.run(Youtput, feed_dict={X:arTrainX, Y_: arTrainY})
        predDev = sess.run(Youtput, feed_dict={X:arDevX, Y_: arDevY})
        print(np.mean(np.round(predTrain).reshape([-1]) == arTrainY.reshape([-1])), np.mean(np.round(predDev).reshape([-1]) == arDevY.reshape([-1])))

0.776260941476 0.772651669159
0.963544282058 0.963626283161
0.963544282058 0.963626283161
0.963544282058 0.963626283161
0.963544282058 0.963626283161
0.963544282058 0.963626283161
0.963544282058 0.963626283161
0.963544282058 0.963626283161
0.963544282058 0.963626283161
0.963544282058 0.963626283161
0.963544282058 0.963626283161


In [11]:
predTest = sess.run(Youtput, feed_dict={X:arTestX})

In [121]:
predTrain = sess.run(Youtput, feed_dict={X:arTrainX, Y_: arTrainY})
print(predTrain)
print(np.sum(np.square(predTrain - np.mean(predTrain))))

[[ 0.  0.  0. ...,  0.  0.  0.]]
0.0


In [None]:
#calculateAuc(arTrainY.reshape([-1]), predTrain.reshape([-1])), calculateAuc(arDevY.reshape([-1]), predDev.reshape([-1]))

In [None]:
clf = svm.SVC(kernel = 'rbf', gamma = 100, C = 50)
clf.fit(arTrainX.T, arTrainY.reshape([-1]))
predTrain = clf.predict(arTrainX.T)
precisionTrain = sum(predTrain == arTrainY.reshape([-1])) / predTrain.shape[0]
print(precisionTrain)
predDev = clf.predict(arDevX.T)
precisionDev = sum(predDev == arDevY.reshape([-1])) / predDev.shape[0]
print(precisionDev)
pred = clf.predict(arTestX.T)

In [17]:
##################################### save result to csvfile ########################################
result = pd.DataFrame({'id':dfTestOri['id']} )
result['target'] = predTest.reshape([-1])
result.to_csv(path_or_buf='result\\result20171008_01.csv', header=True, index=None)