# Imports

In [1]:
from fastai import *
from fastai.tabular import *

In [2]:
import os
import numpy as np
import pandas as pd

np.random.seed(42)

path = Path(os.getcwd() + '\data')

pd.set_option('display.max_rows', None)

# Load Data

In [3]:
df_train = pd.read_csv(path/'train.csv', index_col='PassengerId')
df_test = pd.read_csv(path/'test.csv', index_col='PassengerId')

In [4]:
df_train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Feature engineering

In [5]:
titles_dict = {'Capt.': 'Other',
               'Major.': 'Other',
               'Jonkheer.': 'Other',
               'Don.': 'Other',
               'Sir.': 'Other',
               'Dr.': 'Other',
               'Rev.': 'Other',
               'Countess.': 'Other',
               'Dona.': 'Other',
               'Mme.': 'Mrs',
               'Mlle.': 'Miss',
               'Ms.': 'Miss',
               'Mr.': 'Mr',
               'Mrs.': 'Mrs',
               'Miss.': 'Miss',
               'Master.': 'Master',
               'Lady.': 'Other'}

def get_title(name):
    return name.split(',')[1].split(' ')[1]

def create_title_column(df):
    df['Title'] = df['Name'].apply(get_title)
    df['Title'] = df['Title'].map(titles_dict)
    
create_title_column(df_train)
create_title_column(df_test)

In [6]:
def create_family_size_column(df):
    df['FamilySize'] = df['SibSp'] + df['Parch']
    
create_family_size_column(df_train)
create_family_size_column(df_test)

In [7]:
# def get_cabin_count(x):
#     if isinstance(x, float):
#         return 0
#     else:
#         return len(x.split(' '))

# def cabin_count(df):
#     df['CabinCount'] = df['Cabin'].apply(get_cabin_count)
    

    
# cabin_count(df_train)
# cabin_count(df_test)

In [8]:
# def get_cabin_letter(x):
#     if isinstance(x, float):
#         return x
#     else:
#         return x[0]
    
# def cabin_letter(df):
#     df['CabinLetter'] = df['Cabin'].apply(get_cabin_letter)
    
# cabin_letter(df_train)
# cabin_letter(df_test)

In [9]:
df_train.Title.value_counts()

Mr        517
Miss      185
Mrs       126
Master     40
Other      20
Name: Title, dtype: int64

# Clean Data

In [10]:
df_train.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)
df_test.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

df_test.loc[1044, 'Fare']= df_train.Fare.mean()

# Data analysis

In [11]:
cat_names = ['Pclass', 'Sex', 'Embarked', 'Title']
# emb_szs = {
#     'Cabin': 50
# }

# Training

In [12]:
procs = [FillMissing, Categorify, Normalize]
valid_idx = df_train.sample(frac=0.2, random_state=42).index
dep_var = 'Survived'
epochs = 1

In [96]:
# K-fold implementation
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

fold_idx = []

for _, val_idx in skf.split(np.zeros(len(df_train)), df_train['Survived']):
    fold_idx.append(val_idx)
    
def k_fold(idx):
    data = TabularDataBunch.from_df(path, df_train, dep_var, 
                                valid_idx=idx,
                                procs=procs, cat_names=cat_names,
                                test_df=df_test,
                                bs=8)
    
    learn = tabular_learner(data, layers=[50,25], 
                        metrics=accuracy,
                        ps=0.25,
                        emb_drop=0.25,
                        wd=1e-1)
    
    learn.fit_one_cycle(3, 1e-2)
    return learn

learners = list(map(k_fold, fold_idx))

epoch,train_loss,valid_loss,accuracy,time
0,0.62082,0.439613,0.844444,00:36
1,0.541529,0.442981,0.833333,00:36
2,0.521988,0.3811,0.866667,00:36


epoch,train_loss,valid_loss,accuracy,time
0,0.615957,0.515867,0.775281,00:36
1,0.521551,0.507541,0.786517,00:36
2,0.504952,0.477912,0.786517,00:36


epoch,train_loss,valid_loss,accuracy,time
0,0.613881,0.502536,0.775281,00:37
1,0.553904,0.456209,0.820225,00:39
2,0.484408,0.491557,0.786517,00:41


epoch,train_loss,valid_loss,accuracy,time
0,0.651976,0.479256,0.808989,00:38
1,0.52606,0.453178,0.820225,00:39
2,0.489038,0.45392,0.808989,00:37


epoch,train_loss,valid_loss,accuracy,time
0,0.620973,0.532336,0.752809,00:36
1,0.561418,0.534449,0.764045,00:36
2,0.487415,0.492638,0.786517,00:37


epoch,train_loss,valid_loss,accuracy,time
0,0.629957,0.45838,0.831461,00:36
1,0.501691,0.444075,0.797753,00:36
2,0.470044,0.459933,0.808989,00:37


epoch,train_loss,valid_loss,accuracy,time
0,0.606943,0.496407,0.764045,00:36
1,0.540718,0.469747,0.820225,00:36
2,0.517658,0.414496,0.831461,00:36


epoch,train_loss,valid_loss,accuracy,time
0,0.593233,0.510839,0.797753,00:36
1,0.539453,0.494427,0.797753,00:36
2,0.503777,0.453414,0.808989,00:38


epoch,train_loss,valid_loss,accuracy,time
0,0.629498,0.511877,0.797753,00:36
1,0.547965,0.414231,0.853933,00:36
2,0.525512,0.41819,0.808989,00:38


epoch,train_loss,valid_loss,accuracy,time
0,0.610138,0.511797,0.764045,00:36
1,0.534162,0.46249,0.797753,00:36
2,0.490463,0.459532,0.808989,00:36


In [61]:
def get_mean_preds(learners):
    preds = torch.zeros(len(df_test), 2)
    for learner in learners:
        preds += learner.get_preds(DatasetType.Test)[1]
    preds /= len(learners)
    return preds.argmax(1)

In [98]:
def get_vote_preds(learners):
    votes = torch.zeros(len(df_test), dtype=int)
    for learner in learners:
        votes += learner.get_preds(DatasetType.Test)[0].argmax(1)
    preds = votes >= len(learners)/2
    return preds.int()

preds = get_vote_preds(learners)

In [110]:
votes = torch.zeros(len(df_test), dtype=int)
for learner in learners:
    votes += learner.get_preds(DatasetType.Test)[0].argmax(1)

In [111]:
votes

tensor([ 0, 10,  0,  0,  9,  0,  9,  0, 10,  0,  0,  0, 10,  0, 10, 10,  0,  0,
         7, 10,  0,  6, 10,  0, 10,  0, 10,  0,  0,  0,  0,  0,  6,  9,  0,  0,
         9, 10,  0,  0,  0,  0,  0, 10, 10,  0,  0,  0, 10,  7,  0,  0,  7,  9,
         0,  0,  0,  0,  0, 10,  0,  0,  0, 10,  9, 10, 10,  0,  0,  9,  9,  0,
         9,  0, 10,  7,  0, 10,  0,  9,  6,  4,  0,  0,  0,  0,  9, 10,  9,  9,
        10,  0, 10,  0,  0,  0, 10,  0, 10,  0, 10,  0,  0,  0,  9,  0,  0,  0,
         0,  0,  0,  9, 10, 10, 10,  0,  0,  8,  1, 10, 10,  0, 10,  0,  0,  9,
         0,  8,  0,  0,  0,  1,  5,  0,  0,  0,  0,  0,  9,  0,  0, 10,  1,  0,
         0,  0,  0,  0,  0,  0, 10,  0,  0,  7,  0,  0, 10,  9,  0,  8,  9,  6,
        10,  0,  0,  9,  0,  0, 10,  9,  0,  0,  0,  0,  0, 10, 10,  0,  8,  9,
         0,  0, 10,  0, 10,  0, 10,  0,  0,  0,  0,  0,  6,  0, 10,  0, 10, 10,
         0, 10,  9,  9, 10, 10,  0,  0,  9,  0, 10,  0,  0,  0,  0, 10,  0,  0,
         9,  1, 10,  0, 10,  0, 10,  0, 

In [97]:
for i in range(len(learners)):
    learners[i].save(f'fold{i}-50-25-3ep')

# Submission

In [59]:
# pred = learn.predict(df_test.iloc[4])
# pred[1].item()

1

In [60]:
# preds = np.zeros(len(df_test))
# for x in range(len(df_test)):
#     preds[x] = learn.predict(df_test.iloc[x])[1].item()

In [112]:
# print(preds[:10].astype(int))

In [104]:
submission = pd.DataFrame()
submission['PassengerId'] = df_test.index
submission['Survived'] = preds

In [105]:
submission.to_csv('submission.csv', index=False)