# Modeling and predicting survival aboard the Titanic:
This notebook contains feature selection and extraction, model testing, selection and combination, and lastly, prediction. <br>
Entirely written by Alon Shaaltiel (Shaaltiel.Alon@gmail.com), this is my first shot at Machine Learning

using the nbextention "Initialization cells", I first of all save the standard output to the notebook, as it changes with each kernel. <br>
The condition makes sure that no matter what happens, as long as I don't go out of my way to screw up `saved_std` is always the original standard output.

In [1]:
import sys

if str(type(sys.stdout)) == "<class 'ipykernel.iostream.OutStream'>":
    saved_std = sys.stdout
else:
    sys.stdout = saved_std

import warnings
warnings.filterwarnings('ignore')
sys.stdout

<ipykernel.iostream.OutStream at 0x246d66eb460>

In [2]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Binarizer
from category_encoders.binary import BinaryEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier,
                              AdaBoostClassifier,ExtraTreesClassifier, VotingClassifier)
from xgboost import XGBClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn import metrics
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import string
import itertools
from math import ceil

In [3]:
df = pd.read_csv('train.csv', index_col='PassengerId').fillna(value={'Cabin':'Unknown'})
test = pd.read_csv('test.csv', index_col='PassengerId').fillna(value={'Cabin':'Unknown'})
y = df['Survived']
df.drop(['Survived'],axis=1,inplace=True)
df

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,Unknown,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,Unknown,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,Unknown,S
...,...,...,...,...,...,...,...,...,...,...
887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,Unknown,S
888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,Unknown,S
890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Note that only people with a Pclass of 1 have a cabin. Which makes sense since they are the richest.

As shown in this __[forum dicussion](https://www.kaggle.com/c/titanic/discussion/57447)__ the surname contains a lot of information. <br>While it may not generalize, it never hurts to try :).

In [4]:
df['Surname'] =  [Name.split(',')[0] for Name in df['Name']]
test['Surname'] =  [Name.split(',')[0] for Name in test['Name']]
len(df['Surname'].unique())

667

There are many unique surnames and so, instead of the usual OneHot encoding I'd use for categorical data, I'll be using the Binary encoding.

Another feature which may be very useful is the person's title, as it contains both gender and economic status.<br>
The method by which I find all possible titles and assign each person a title is taken from __[here](https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/)__.

In [5]:
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if big_string.find(substring) != -1:
            return substring
    print(big_string)
    return None

In [6]:
# Titles = ['Mr','Mrs','Miss','Master'] 

# # all other titles must be found and entered by hand
# # since there is no pattern to their placement
# for Name in df['Name']:
#     if substrings_in_string(Name,Titles) == None:
#         Title = input("Please enter this person's title:")
#         Titles.append(Title)

# for Name in test['Name']:
#     if substrings_in_string(Name,Titles) == None:
#         Title = input("Please enter this person's title:")
#         Titles.append(Title)

Titles = ['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme',
          'Ms', 'Major', 'Mlle', 'Col', 'Capt', 'Countess', 'Jonkheer']


df['Title'] = df['Name'].apply(substrings_in_string, substrings=Titles)
test['Title'] = test['Name'].apply(substrings_in_string, substrings=Titles)

The list of titles is quite big and contains many titles that appear with a very low frequency.<br>
I'd like to simplify them, the method by which I reduce the number of titles is also taken from __[here](https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/)__.

In [7]:
def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

In [8]:
df['Title'] = df.apply(replace_titles, axis=1)
test['Title'] = test.apply(replace_titles, axis=1)

The deck which a person stayed on may be relevant, as some cabins were closer to the lifeboats.<br>
However, this column contains by far the highest percentage of missing data which may hurt the model or not affect it at all.

In [9]:
df['Deck'] = [Cab[0] for Cab in df['Cabin']]
test['Deck'] = [Cab[0] for Cab in test['Cabin']]
df

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Surname,Title,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,Unknown,S,Braund,Mr,U
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mr,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,Unknown,S,Heikkinen,Miss,U
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Futrelle,Mr,C
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,Unknown,S,Allen,Mr,U
...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,Unknown,S,Montvila,Mr,U
888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Graham,Miss,B
889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,Unknown,S,Johnston,Miss,U
890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Behr,Mr,C


We'll handle different data types differently in our preprocessing.<br>
The pipelines help adding other transformers later and keeping things consistent.<br>
The binary columns don't need an Imputer since they don't have any missing data in both datasets.

In [10]:
Numerical_Pre = make_pipeline(SimpleImputer(strategy='mean'))
OneHot_Pre = make_pipeline(SimpleImputer(strategy='most_frequent'),OneHotEncoder())
Binary_Pre = make_pipeline(BinaryEncoder())
Col_Num = ['Age','Fare','SibSp','Parch']
Col_OHE = ['Sex','Deck','Title','Embarked','Pclass']
Col_Bin = ['Surname','Ticket']


Preprocess = ColumnTransformer([('Col_OHE', OneHot_Pre, Col_OHE),
                                ('Col_Bin', Binary_Pre, Col_Bin),
                                ('Col_Num', Numerical_Pre, Col_Num),
                                ])

In [316]:
PreProOHE = [sorted([str(Col) + "_" + str(i) for i in df[Col].unique() if str(i) != 'nan']) for Col in Col_OHE]
PreProBin = [[str(Col) + "_" + str(i+1) for i in range(ceil(np.log2(len(df[Col].unique()))) + 1)] for Col in Col_Bin]
Cols = list(itertools.chain(*(PreProOHE+PreProBin))) + Col_Num
Cols

['Sex_female',
 'Sex_male',
 'Deck_A',
 'Deck_B',
 'Deck_C',
 'Deck_D',
 'Deck_E',
 'Deck_F',
 'Deck_G',
 'Deck_T',
 'Deck_U',
 'Title_Master',
 'Title_Miss',
 'Title_Mr',
 'Title_Mrs',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S',
 'Pclass_1',
 'Pclass_2',
 'Pclass_3',
 'Surname_1',
 'Surname_2',
 'Surname_3',
 'Surname_4',
 'Surname_5',
 'Surname_6',
 'Surname_7',
 'Surname_8',
 'Surname_9',
 'Surname_10',
 'Surname_11',
 'Ticket_1',
 'Ticket_2',
 'Ticket_3',
 'Ticket_4',
 'Ticket_5',
 'Ticket_6',
 'Ticket_7',
 'Ticket_8',
 'Ticket_9',
 'Ticket_10',
 'Ticket_11',
 'Age',
 'Fare',
 'SibSp',
 'Parch']

All of this was done so we could ?????? 

In [332]:
# data = pd.DataFrame(Preprocess.fit_transform(df), index = df.index, columns=Cols)

In [11]:
# Gradient = GradientBoostingClassifier(warm_start=True)
# params_Grad = dict(n_estimators = [200,300],
#                    max_depth=[3,5], 
#                    random_state=[12],
#                    learning_rate=[0.1,0.15])

Gradient = GradientBoostingClassifier(warm_start=True, n_estimators = 200, 
                                      max_depth=4,random_state=12,learning_rate=0.1)

In [12]:
def print_best(Grid,Prints = True):
    if Prints:
        print(Grid.best_score_,Grid.best_params_)
    if not Prints:
        return [Grid.best_score_,Grid.best_params_]

In [13]:
def fit_verb(Grid,data,y,nbOut = saved_std):
    sys.stdout = open(1, 'w')
    Grid.fit(data,y)
    print('Done with grid :)')
    sys.stdout = nbOut

In [23]:
# Grad_Grid = GridSearchCV(Gradient,param_grid=params_Grad,scoring='accuracy',cv=10,verbose=10)

# fit_verb(Grad_Grid,data,y)
# print_best(Grad_Grid)

print(cross_val_score(Gradient,data,y,cv=5,scoring='accuracy',n_jobs=1).mean())

Gradient.fit(data,y)
print(metrics.confusion_matrix(y,Gradient.predict(data)))

Grad_pred = pd.Series(Gradient.predict(test_data), index=test.index, name='Survived')
Grad_pred.to_csv('Predictions_Feat/Feat_Grad_Pred_2.csv')


0.8350072186303434
[[529  20]
 [ 56 286]]


In [43]:
xgbtree1 = XGBClassifier(eta=0.01,use_label_encoder=False, eval_metric='rmse', objective='binary:logistic')
xgbtree2 = XGBClassifier(booster = 'dart',use_label_encoder=False, eval_metric='rmse', objective='binary:logistic')
xgblin = XGBClassifier(booster = 'gblinear',use_label_encoder=False, eval_metric='rmse', objective='binary:logistic')

print(cross_val_score(xgbtree1,data,y,cv=5,scoring='accuracy',n_jobs=1).mean())
print(cross_val_score(xgbtree2,data,y,cv=5,scoring='accuracy',n_jobs=1).mean())
print(cross_val_score(xgblin,data,y,cv=5,scoring='accuracy',n_jobs=1).mean())

xgbtree1.fit(data,y)
print(metrics.confusion_matrix(y,xgbtree1.predict(data)))

xgb_pred = pd.Series(xgbtree1.predict(test_data), index=test.index, name='Survived')
xgb_pred.to_csv('Predictions_Feat/Feat_xgb_Pred_0.csv')

0.8350134957002073
0.8305818843763731
0.8114744837110036
[[519  30]
 [ 64 278]]
