In [334]:
import pandas as pd
import numpy as np


In [335]:
df_dev = pd.read_pickle('df_dev.pkl')

In [336]:
zero_data = np.zeros(shape=(len(df_dev),1),dtype=np.int32)
d = pd.DataFrame(zero_data)
df_dev['group_id']= d

zero_data = np.zeros(shape=(len(df_dev),1),dtype=np.float32)
d = pd.DataFrame(zero_data)
df_dev['prob']=d

In [339]:
n_sample = len(df_dev)
n_fold = 10

shuffle_idx = np.random.permutation(n_sample)
split_array=np.array_split(shuffle_idx,n_fold)

In [341]:
for i in range(n_fold):
    idx_list = split_array[i]
    df_dev.ix[split_array[i],'group_id']=i
    

In [343]:
from sklearn.model_selection import train_test_split
import chainer
from chainer import training, Chain
from chainer.training import extensions
from chainer import links as L
from chainer import functions as F
from chainer.datasets import TupleDataset
from chainer import serializers

from chainer.functions.loss.mean_squared_error import mean_squared_error

In [344]:
class MyChain(Chain):
    def __init__(self):
        super(MyChain,self).__init__(
            l1 = L.Linear(None, 10),
            l2 = L.Linear(None, 6),
            l3 = L.Linear(None,2),
            bn1 = L.BatchNormalization(size = 14),
            bn2 = L.BatchNormalization(size = 10),
            bn3 = L.BatchNormalization(size = 6)
        )
        
    def __call__(self, x, train = True):
        h = F.dropout(self.bn1(x),ratio = 0.2, train=train)
        h = F.dropout(F.relu(self.bn2(self.l1(x))), train = train)
        h = F.dropout(F.relu(self.bn3(self.l2(h))), train = train)
        y = self.l3(h)
        
        return y

In [354]:
for i in range(n_fold):
    df_train = df_dev[df_dev.group_id != i]
    df_test = df_dev[df_dev.group_id == i]
    
    X_train = df_train.drop(['Survived','group_id','prob'],axis=1).as_matrix().astype(np.float32)
    y_train = df_train.Survived.as_matrix().astype(np.int32)
    X_test = df_test.drop(['Survived','group_id','prob'],axis=1).as_matrix().astype(np.float32)
    y_test = df_test.Survived.as_matrix().astype(np.int32)
    
    train = TupleDataset(X_train,y_train.reshape(1,-1)[0])
    test = TupleDataset(X_test,y_test.reshape(1,-1)[0])

    train_iter = chainer.iterators.SerialIterator(train, 100)
    test_iter = chainer.iterators.SerialIterator(test, 100,repeat=False, shuffle=False)
    
    model = L.Classifier(MyChain())
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    updater = training.StandardUpdater(train_iter, optimizer, device=-1)
    trainer = training.Trainer(updater, (1000, 'epoch'), out="result")
    trainer.extend(extensions.Evaluator(test_iter, model, device=-1))
    trainer.extend(extensions.LogReport())
    #trainer.extend(extensions.PrintReport( ['epoch', 'main/loss', 'validation/main/loss', 'main/accuracy', 'validation/main/accuracy']))
    
    trainer.run()
    
    serializers.save_npz("model/model_"+str(i)+".npz",model)
    
    df_prob = pd.DataFrame(F.softmax(model.predictor(X_test,train=False)).data[:,1],index = df_test.index,columns=['prob'])
    df_dev.loc[df_test.index,'prob']=df_prob['prob']


epoch       main/loss   validation/main/loss  main/accuracy  validation/main/accuracy
[J1           0.795707    0.758732              0.452222       0.411111                  
[J2           0.801311    0.860941              0.495          0.511111                  
[J3           0.794178    0.731018              0.44125        0.488889                  
[J4           0.795559    0.760435              0.46875        0.455556                  
[J5           0.717185    0.695872              0.5125         0.533333                  
[J6           0.702118    0.75092               0.52625        0.522222                  
[J7           0.722645    0.690843              0.475          0.488889                  
[J8           0.713711    0.688731              0.52           0.588889                  
[J9           0.694399    0.681615              0.53375        0.6                       
[J10          0.682025    0.691647              0.5475         0.611111                  
[J1

In [356]:
df_dev.to_pickle("meta_featured.pkl")

In [362]:
df_X = df_dev.drop(['group_id','Survived'],axis=1)

In [363]:
df_y = df_dev[['Survived']]

In [365]:
X = df_X.as_matrix().astype(np.float32)
y = df_y.as_matrix().astype(np.int32)

In [387]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier

In [371]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

In [395]:
#clf = RandomForestClassifier()
clf = GradientBoostingClassifier()
clf.fit(X_train,y_train)

pred = clf.predict(X_test)
print confusion_matrix(y_test,pred)
print accuracy_score(y_test,pred)
print classification_report(y_test,pred)

[[90 15]
 [21 53]]
0.798882681564
             precision    recall  f1-score   support

          0       0.81      0.86      0.83       105
          1       0.78      0.72      0.75        74

avg / total       0.80      0.80      0.80       179



In [396]:
clf.predict_proba(X_train)

array([[ 0.00722892,  0.99277108],
       [ 0.11477093,  0.88522907],
       [ 0.90065849,  0.09934151],
       ..., 
       [ 0.95607188,  0.04392812],
       [ 0.69593323,  0.30406677],
       [ 0.82237385,  0.17762615]])