In [101]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn import tree

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import pandas as pd

# Import test and train datasets
df_train = pd.read_csv('titanic-data/train.csv')
df_test = pd.read_csv('titanic-data/test.csv')

# View first lines of training data
print (df_train.info())
df_train.head(n=4)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [102]:
import string
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if string.find(big_string, substring) != -1:
            return substring
    print (big_string)
    return np.nan

title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']

df_train['Title']=df_train['Name'].map(lambda x: substrings_in_string(x, title_list))
df_test['Title']=df_test['Name'].map(lambda x: substrings_in_string(x, title_list))

In [103]:
def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
df_train['Title']=df_train.apply(replace_titles, axis=1)
df_test['Title']=df_test.apply(replace_titles, axis=1)

In [112]:
#New features that may help 

#Modified Kaggle Function
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if big_string.startswith(substring):
            return substring
    return np.nan

#Kaggle Function: Turning cabin number into Deck
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
df_train['Deck']=df_train['Cabin'].map(lambda x: substrings_in_string(str(x), cabin_list))
df_test['Deck']=df_test['Cabin'].map(lambda x: substrings_in_string(str(x), cabin_list))


df_train['FamilySize']=df_train['SibSp']+df_train['Parch']
df_test['FamilySize']=df_test['SibSp']+df_train['Parch']

df_train['Sex'] = df_train['Sex'].map(lambda x: 1 if x == 'male' else 0)
df_test['Sex'] = df_test['Sex'].map(lambda x: 1 if x == 'male' else 0)


df_train['Deck'] = df_train['Deck'].map(lambda x: 0 if pd.isnull(x) else 1)
df_test['Deck'] = df_test['Deck'].map(lambda x: 0 if pd.isnull(x) else 1)

In [113]:
sexes = sorted(df_train['Sex'].unique())
genders_mapping = dict(zip(sexes, range(0, len(sexes) + 1)))
genders_mapping

{0: 0}

In [114]:
age_group = pd.cut(df_train["Age"], np.arange(0, 100, 10))

In [115]:
# #Our initial data is all passengers for training
# X = df_drop.drop(['Survived'], 1).values
# #Our actual data of passengers that survived
# Y = df_drop['Survived'].values

#Can mess with test_size
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

df_dt = DecisionTreeClassifier(max_depth=3, random_state=60)

df_dt.fit(X_train, Y_train)
df_dt.score(X_test, Y_test)

0.8044692737430168

In [119]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Deck,FamilySize
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Q,Mr,0,0.0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,S,Mrs,0,1.0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Q,Mr,0,0.0
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,S,Mr,0,0.0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,S,Mrs,0,1.0
