# Kaggle Titanic challenge

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy  as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score

from fastai.structured import *
from fastai.column_data import *

##  1) Load data

In [3]:
train = pd.read_feather("data/CleanedData1_train")
test  = pd.read_feather("data/CleanedData1_test")

In [4]:
train.sample()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked,HasCabin,Title,FareGroup
870,871,0,3,0,32.39875,0,0,0,False,3,1


In [5]:
train.shape

(891, 11)

In [6]:
train_df = train.drop(["Survived",'PassengerId','Age'],axis='columns')
target_df = train['Survived'].astype('float32')
test_df = test.drop(['PassengerId','Age'],axis='columns')

In [7]:
train_df.sample()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,HasCabin,Title,FareGroup
149,2,0,0,0,0,False,5,1


In [8]:
test_df.sample()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,HasCabin,Title,FareGroup
369,2,0,0,0,1,False,3,1


In [9]:
cat_vars = ['Pclass','Sex','Embarked','HasCabin','Title','FareGroup']
contin_vars = ['SibSp','Parch']

In [10]:
for v in cat_vars:
    train_df[v] = train_df[v].astype('category').cat.as_ordered()
    test_df[v]  = test_df[v].astype('category').cat.as_ordered()

In [11]:
for v in contin_vars:
    train_df[v] = train_df[v].astype('float32')
    test_df[v]  = test_df[v].astype('float32')

In [12]:
val_idx = random.sample(range(train_df.shape[0]),int(train_df.shape[0]*0.20))

In [13]:
val_idx[:10]

[836, 781, 395, 624, 445, 8, 510, 7, 321, 600]

In [79]:
md = ColumnarModelData.from_data_frame('data', val_idx, train_df, target_df, cat_flds=cat_vars, bs=64,test_df=test_df)

In [80]:
cat_sz = [(c, len(train_df[c].cat.categories)+1) for c in cat_vars]

In [81]:
emb_szs = [(c,min(50,(c+1))) for _,c in cat_sz]

In [82]:
emb_szs

[(4, 5), (3, 4), (4, 5), (3, 4), (6, 7), (5, 6)]

In [83]:
m = md.get_learner(emb_szs, len(train_df.columns)-len(cat_vars),0.04, 1, [1000,500], [0.001,0.01],y_range=[0,1])

In [84]:
m

MixedInputModel(
  (embs): ModuleList(
    (0): Embedding(4, 5)
    (1): Embedding(3, 4)
    (2): Embedding(4, 5)
    (3): Embedding(3, 4)
    (4): Embedding(6, 7)
    (5): Embedding(5, 6)
  )
  (lins): ModuleList(
    (0): Linear(in_features=33, out_features=1000, bias=True)
    (1): Linear(in_features=1000, out_features=500, bias=True)
  )
  (bns): ModuleList(
    (0): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True)
    (1): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True)
  )
  (outp): Linear(in_features=500, out_features=1, bias=True)
  (emb_drop): Dropout(p=0.04)
  (drops): ModuleList(
    (0): Dropout(p=0.001)
    (1): Dropout(p=0.01)
  )
  (bn): BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True)
)

In [85]:
def y_metric(y_pred, targ):
    y_pred = [ int(xx[0]*2) for xx in y_pred]
    y_val  = [ int(yy[0]*2) for yy in targ]

    return accuracy_score(y_pred, y_val)
    #print(f'Predict: {y_pred[0]} vs {targ[0]}')
    #return ((y_pred-targ)**2).sum()

In [86]:
#m.lr_find()

In [87]:
#m.sched.plot_lr()

In [88]:
#m.sched.plot()

In [89]:
lr = 0.01

In [90]:
m.fit(lr, 3,metrics=[y_metric])

epoch      trn_loss   val_loss   y_metric         
    0      0.297952   0.168211   0.824531  
    1      0.26123    0.191586   0.785            
    2      0.239003   0.185469   0.814531         



[0.18546875, 0.8145312499999999]

In [91]:
m.fit(lr, 5,cycle_len=1,cycle_mult=2,metrics=[y_metric])

epoch      trn_loss   val_loss   y_metric          
    0      0.219568   0.181562   0.818437  
    1      0.207483   0.194998   0.743281         
    2      0.217139   0.203681   0.688125         
    3      0.219137   0.174525   0.76375          
    4      0.215201   0.167656   0.832344         
    5      0.211959   0.167656   0.832344         
    6      0.210343   0.167656   0.832344         
    7      0.206735   0.167656   0.832344         
    8      0.204375   0.167656   0.832344         
    9      0.202337   0.167656   0.832344         
    10     0.20579    0.177656   0.818437         
    11     0.206124   0.171367   0.824531         
    12     0.210542   0.177656   0.818437         
    13     0.210225   0.177656   0.818437         
    14     0.207452   0.177656   0.818437         
    15     0.206214   0.16375    0.83625          
    16     0.20653    0.16375    0.83625          
    17     0.20715    0.171558   0.828437         
    18     0.203614   0.171562   0.82

[0.16936912, 0.830625]

In [92]:
#m.fit(lr, 5,cycle_len=4,metrics=[y_metric])

In [93]:
x,y=m.predict_with_targs()

In [94]:
y_pred = [ int(xx[0]*2) for xx in x]
y_val  = [ int(yy[0]*2) for yy in y]

round(accuracy_score(y_pred, y_val) * 100, 2)

80.9

## Submissions

In [95]:
ids = test['PassengerId']
preds = m.predict(True)
predictions = [int(x[0]) for x in preds]

In [96]:
output = pd.DataFrame( {'PassengerId':ids , 'Survived':predictions})
output.to_csv('data/FastAi_DL.csv',index=False)

In [97]:
!ls -lart data

total 220
-rw-rw-r-- 1 paperspace paperspace 61194 Feb 13 13:59 train.csv
-rw-rw-r-- 1 paperspace paperspace 28629 Feb 13 13:59 test.csv
-rw-rw-r-- 1 paperspace paperspace  3258 Feb 13 13:59 gender_submission.csv
-rw-rw-r-- 1 paperspace paperspace  2839 Feb 13 15:38 test_sub.csv
-rw-rw-r-- 1 paperspace paperspace  2839 Feb 15 15:43 logisticRegression.csv
-rw-rw-r-- 1 paperspace paperspace  2839 Feb 17 11:06 AllModels.csv
-rw-r--r-- 1 paperspace paperspace 66232 Feb 19 13:36 CleanedData1_train
-rw-r--r-- 1 paperspace paperspace 28224 Feb 19 13:36 CleanedData1_test
drwxrwxr-x 2 paperspace paperspace  4096 Feb 19 14:02 tmp
drwxrwxr-x 2 paperspace paperspace  4096 Feb 19 14:02 models
drwxrwxr-x 4 paperspace paperspace  4096 Feb 19 15:38 .
drwxrwxr-x 4 paperspace paperspace  4096 Feb 19 15:41 ..
-rw-rw-r-- 1 paperspace paperspace  2839 Feb 19 15:43 FastAi_DL.csv


In [98]:
# Submit data to kaggle using kaggle-cli
# kg submit data/test_sub.csv -c titanic -u Bobox214 -p XXXXX