# Analysis of Titanic dataset

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns

In [2]:
# First load the training and test datasets
train_df = pd.read_csv("./dataset/train.csv")
test_df = pd.read_csv("./dataset/test.csv") 

In [3]:
# Display an overview of the training dataset first rows
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Check number of NaN values per column

In [4]:
# Show number of NaN values per column
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Show basics statistics/info on numeric columns

In [5]:
train_df.describe()



Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,,0.0,0.0,7.9104
50%,446.0,0.0,3.0,,0.0,0.0,14.4542
75%,668.5,1.0,3.0,,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


### Extract name titles

In [7]:
# Show the 10 first passenger names
train_df['Name'].head(10)

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
5                                     Moran, Mr. James
6                              McCarthy, Mr. Timothy J
7                       Palsson, Master. Gosta Leonard
8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                  Nasser, Mrs. Nicholas (Adele Achem)
Name: Name, dtype: object

In [8]:
# We clearly see that name titles are associated to passengers: Mr., Mrs., Master., ...
# Titles are located as first item after the comma
# Create a function to extract titles from names

####################################################################
#@brief: Extract title from full name
#@param name       (string)     full name
#@return title
def extractTitle(name):
        return name.split(', ')[1].split(' ')[0].split('.')[0]

In [9]:
# Retrieve all titles from both train and test datasets
name_col = pd.concat([train_df['Name'], test_df['Name']])
# Count the number of all titles found
name_col.apply(extractTitle).value_counts()

Mr          757
Miss        260
Mrs         197
Master       61
Dr            8
Rev           8
Col           4
Mlle          2
Ms            2
Major         2
Sir           1
Dona          1
Jonkheer      1
the           1
Capt          1
Don           1
Lady          1
Mme           1
Name: Name, dtype: int64

In [10]:
# Add a new column "Title" to data frame
train_df['Title'] = train_df['Name'].apply(extractTitle)

From the title extraction result we can note :
- Mrs, Ms, Mme, Dona can be grouped together
- Miss and Mlle can be grouped together
- "the" has been misextracted as title

In [11]:
train_df[(train_df['Title']=='Rev') | (train_df['Title']=='Dr')].sort_values('Title')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
245,246,0,1,"Minahan, Dr. William Edward",male,44.0,2,0,19928,90.0,C78,Q,Dr
317,318,0,2,"Moraweck, Dr. Ernest",male,54.0,0,0,29011,14.0,,S,Dr
398,399,0,2,"Pain, Dr. Alfred",male,23.0,0,0,244278,10.5,,S,Dr
632,633,1,1,"Stahelin-Maeglin, Dr. Max",male,32.0,0,0,13214,30.5,B50,C,Dr
660,661,1,1,"Frauenthal, Dr. Henry William",male,50.0,2,0,PC 17611,133.65,,S,Dr
766,767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C,Dr
796,797,1,1,"Leader, Dr. Alice (Farnham)",female,49.0,0,0,17465,25.9292,D17,S,Dr
149,150,0,2,"Byles, Rev. Thomas Roussel Davids",male,42.0,0,0,244310,13.0,,S,Rev
150,151,0,2,"Bateman, Rev. Robert James",male,51.0,0,0,S.O.P. 1166,12.525,,S,Rev
249,250,0,2,"Carter, Rev. Ernest Courtenay",male,54.0,1,0,244252,26.0,,S,Rev


In [12]:
# Check the 'The" title
train_df[(train_df['Title']=='the')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
759,760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dye...",female,33.0,0,0,110152,86.5,B77,S,the


In [13]:
# Group all occurences of 'Mrs', 'Ms', 'Mme', 'Dona' under the label 'Mrs'
train_df['Title'] = train_df['Title'].replace({'Ms':'Mrs', 'Mme':'Mrs', 'Dona': 'Mrs'}, regex=True)
# Do the same for 'Mlle' and 'Miss' under label 'Miss'
train_df['Title'] = train_df['Title'].replace({'Mlle':'Miss'}, regex=True)
# Replace the 'the' title by 'Countess'
train_df['Title'] = train_df['Title'].replace({'the':'Countess'}, regex=True)

In [14]:
# Check the newly computed titles
train_df['Title'].value_counts()

Mr          517
Miss        184
Mrs         127
Master       40
Dr            7
Rev           6
Major         2
Col           2
Jonkheer      1
Countess      1
Lady          1
Don           1
Capt          1
Sir           1
Name: Title, dtype: int64

Let's do another couple of operations on these titles:
- Regroup "higher" titles together
- Regroup rare titles together

In [15]:
# Regroup "higher" titles together
higherTitleLst = ['Sir', 'Lady', 'Countess', 'Col']
for title in higherTitleLst:
    train_df['Title'] = train_df['Title'].replace({title:'HigherTitle'}, regex=True)

# Regroup rare titles together
rareTitleLst = ['Dr', 'Major', 'Jonkheer', 'Don', 'Capt']
for title in rareTitleLst:
    train_df['Title'] = train_df['Title'].replace({title:'RareTitle'}, regex=True)

train_df['Title'].value_counts()

Mr             517
Miss           184
Mrs            127
Master          40
RareTitle       12
Rev              6
HigherTitle      5
Name: Title, dtype: int64

### Drop non pertinent columns
Now that we have extracted the passenger titles, the names dot carry any useful information per se. We can then drop them. 
We also make the assumption here that tickets do not carry any pertinent information.
The point of embarcation and cabin numbers could carry useful information. However for this notebook we do not dig any further. 

In [16]:
# Drop non pertinent columns
train_df = train_df.drop(['Name', 'Ticket', 'Cabin'] ,1)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,male,22.0,1,0,7.25,S,Mr
1,2,1,1,female,38.0,1,0,71.2833,C,Mrs
2,3,1,3,female,26.0,0,0,7.925,S,Miss
3,4,1,1,female,35.0,1,0,53.1,S,Mrs
4,5,0,3,male,35.0,0,0,8.05,S,Mr


In [17]:
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         2
Title            0
dtype: int64

In [18]:
pd.crosstab(train_df['Title'],train_df['Survived'],margins=True)

Survived,0,1,All
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HigherTitle,1,4,5
Master,17,23,40
Miss,55,129,184
Mr,436,81,517
Mrs,26,101,127
RareTitle,8,4,12
Rev,6,0,6
All,549,342,891


In [19]:
pd.crosstab(train_df['Pclass'],train_df['Survived'],margins=True)

Survived,0,1,All
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,80,136,216
2,97,87,184
3,372,119,491
All,549,342,891


In [20]:
from sklearn import preprocessing
encodeFeatureLst = ['Pclass', 'Sex']
for feature in encodeFeatureLst:
    train_df[feature] = preprocessing.LabelEncoder().fit_transform(train_df[feature])
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,2,1,22.0,1,0,7.25,S,Mr
1,2,1,0,0,38.0,1,0,71.2833,C,Mrs
2,3,1,2,0,26.0,0,0,7.925,S,Miss
3,4,1,0,0,35.0,1,0,53.1,S,Mrs
4,5,0,2,1,35.0,0,0,8.05,S,Mr


In [21]:
fareData = train_df['Fare']
#fareData = preprocessing.normalize(fareData, norm='l2')[0]
scaler = preprocessing.StandardScaler().fit(fareData)
scaler.transform(fareData)



array([ -5.02445171e-01,   7.86845294e-01,  -4.88854258e-01,
         4.20730236e-01,  -4.86337422e-01,  -4.78116429e-01,
         3.95813561e-01,  -2.24083121e-01,  -4.24256141e-01,
        -4.29555021e-02,  -3.12172378e-01,  -1.13845709e-01,
        -4.86337422e-01,  -1.87093118e-02,  -4.90279793e-01,
        -3.26266659e-01,  -6.19988892e-02,  -3.86670720e-01,
        -2.85997284e-01,  -5.02948539e-01,  -1.24919787e-01,
        -3.86670720e-01,  -4.86756223e-01,   6.63597416e-02,
        -2.24083121e-01,  -1.64441595e-02,  -5.02948539e-01,
         4.64700108e+00,  -4.89776426e-01,  -4.89442190e-01,
        -9.02720170e-02,   2.30172882e+00,  -4.92377828e-01,
        -4.37007438e-01,   1.00606170e+00,   3.98582080e-01,
        -5.02863973e-01,  -4.86337422e-01,  -2.85997284e-01,
        -4.22073541e-01,  -4.57645492e-01,  -2.25593223e-01,
        -4.89442190e-01,   1.88762532e-01,  -4.89776426e-01,
        -4.86337422e-01,  -3.36334002e-01,  -4.92377828e-01,
        -2.11917743e-01,

In [22]:
train_df['Fare'] = fareData

In [23]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,2,1,22.0,1,0,7.25,S,Mr
1,2,1,0,0,38.0,1,0,71.2833,C,Mrs
2,3,1,2,0,26.0,0,0,7.925,S,Miss
3,4,1,0,0,35.0,1,0,53.1,S,Mrs
4,5,0,2,1,35.0,0,0,8.05,S,Mr


In [24]:
# Determine for Sex, SibSp and Parch the average age
age_df = train_df[train_df['Age'].notnull()].copy()
age_df.drop(['PassengerId','Survived', 'SibSp', 'Parch', 'Fare'], inplace=True, axis=1)
age_group = age_df.groupby(['Sex', 'Pclass', 'Title']).mean()

age_group

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Age
Sex,Pclass,Title,Unnamed: 3_level_1
0,0,HigherTitle,40.5
0,0,Miss,29.744681
0,0,Mrs,40.4
0,0,RareTitle,49.0
0,1,Miss,22.390625
0,1,Mrs,33.547619
0,2,Miss,16.123188
0,2,Mrs,33.515152
1,0,HigherTitle,55.0
1,0,Master,5.306667


In [25]:
from itertools import product
import math
ageDic = {}
sexLst = age_df['Sex'].unique()
pclassLst = age_df['Pclass'].unique()
titleLst = age_df['Title'].unique()
lev2Dic={}
lev3Dic={}

for sex, pclass, title in product(sexLst, pclassLst, titleLst):
    age=age_df[(age_df['Sex']==sex)&(age_df['Pclass']==pclass) &(age_df['Title']==title)]['Age'].mean()
    if not math.isnan(age):
        lev3Dic[title]=age
        lev2Dic[pclass]=lev3Dic
    ageDic[sex]=lev2Dic
print ageDic

{0: {0: {'RareTitle': 49.0, 'Rev': 43.166666666666664, 'Mrs': 33.54761904761905, 'HigherTitle': 40.5, 'Master': 2.2588888888888885, 'Mr': 32.76829268292683, 'Miss': 22.390625}, 1: {'RareTitle': 49.0, 'Rev': 43.166666666666664, 'Mrs': 33.54761904761905, 'HigherTitle': 40.5, 'Master': 2.2588888888888885, 'Mr': 32.76829268292683, 'Miss': 22.390625}, 2: {'RareTitle': 49.0, 'Rev': 43.166666666666664, 'Mrs': 33.54761904761905, 'HigherTitle': 40.5, 'Master': 2.2588888888888885, 'Mr': 32.76829268292683, 'Miss': 22.390625}}, 1: {0: {'RareTitle': 49.0, 'Rev': 43.166666666666664, 'Mrs': 33.54761904761905, 'HigherTitle': 40.5, 'Master': 2.2588888888888885, 'Mr': 32.76829268292683, 'Miss': 22.390625}, 1: {'RareTitle': 49.0, 'Rev': 43.166666666666664, 'Mrs': 33.54761904761905, 'HigherTitle': 40.5, 'Master': 2.2588888888888885, 'Mr': 32.76829268292683, 'Miss': 22.390625}, 2: {'RareTitle': 49.0, 'Rev': 43.166666666666664, 'Mrs': 33.54761904761905, 'HigherTitle': 40.5, 'Master': 2.2588888888888885, 'Mr

In [26]:
def fillAge(sex, pclass, title, ageDic):
    return ageDic[sex][pclass][title]

In [27]:
ageNan_df = train_df[train_df['Age'].isnull()]
ageNan_df['Age'] = np.vectorize(fillAge)(ageNan_df['Sex'], ageNan_df['Pclass'], ageNan_df['Title'], ageDic)
ageNan_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
5,6,0,2,1,32.768293,0,0,8.4583,Q,Mr
17,18,1,1,1,32.768293,0,0,13.0,S,Mr
19,20,1,2,0,33.547619,0,0,7.225,C,Mrs
26,27,0,2,1,32.768293,0,0,7.225,C,Mr
28,29,1,2,0,22.390625,0,0,7.8792,Q,Miss


In [28]:
train_df[train_df['Age'].isnull()] = ageNan_df
train_df[train_df['PassengerId']==6]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
5,6,0,2,1,32.768293,0,0,8.4583,Q,Mr


In [29]:
titleLst = ['Mr', 'Miss', 'Mrs', 'Master', 'RareTitle', 'Rev', 'HigherTitle']
titleEncoder = pd.get_dummies(titleLst, dummy_na=False)
print titleEncoder

   HigherTitle  Master  Miss   Mr  Mrs  RareTitle  Rev
0          0.0     0.0   0.0  1.0  0.0        0.0  0.0
1          0.0     0.0   1.0  0.0  0.0        0.0  0.0
2          0.0     0.0   0.0  0.0  1.0        0.0  0.0
3          0.0     1.0   0.0  0.0  0.0        0.0  0.0
4          0.0     0.0   0.0  0.0  0.0        1.0  0.0
5          0.0     0.0   0.0  0.0  0.0        0.0  1.0
6          1.0     0.0   0.0  0.0  0.0        0.0  0.0


In [30]:
train_df = pd.concat([train_df, pd.get_dummies(train_df['Title'], dummy_na=False)], axis=1)
train_df = train_df.drop(['Title'], 1)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,HigherTitle,Master,Miss,Mr,Mrs,RareTitle,Rev
0,1,0,2,1,22.0,1,0,7.25,S,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2,1,0,0,38.0,1,0,71.2833,C,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,3,1,2,0,26.0,0,0,7.925,S,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,4,1,0,0,35.0,1,0,53.1,S,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,5,0,2,1,35.0,0,0,8.05,S,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [31]:
# One hot encode Embarked
embarkedLst = ['S', 'C', 'Q']
embarkedEncoder = pd.get_dummies(embarkedLst, dummy_na=False)
print embarkedEncoder

     C    Q    S
0  0.0  0.0  1.0
1  1.0  0.0  0.0
2  0.0  1.0  0.0


In [32]:
train_df = pd.concat([train_df, pd.get_dummies(train_df['Embarked'], dummy_na=False)], axis=1)
train_df = train_df.drop(['Embarked'], 1)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,HigherTitle,Master,Miss,Mr,Mrs,RareTitle,Rev,C,Q,S
0,1,0,2,1,22.0,1,0,7.25,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,1,0,0,38.0,1,0,71.2833,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,3,1,2,0,26.0,0,0,7.925,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,1,0,0,35.0,1,0,53.1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,5,0,2,1,35.0,0,0,8.05,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


# Preprocess test dataset

In [33]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [34]:
# Format title
test_df['Title'] = test_df['Name'].apply(extractTitle)
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs


In [35]:
# Get rid of unnecessary columns
test_df = test_df.drop(['Cabin', 'Ticket', 'Name'],1)

In [36]:
# Convert Sex and Pclass
for feature in encodeFeatureLst:
    test_df[feature] = preprocessing.LabelEncoder().fit_transform(test_df[feature])
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,892,2,1,34.5,0,0,7.8292,Q,Mr
1,893,2,0,47.0,1,0,7.0,S,Mrs
2,894,1,1,62.0,0,0,9.6875,Q,Mr
3,895,2,1,27.0,0,0,8.6625,S,Mr
4,896,2,0,22.0,1,1,12.2875,S,Mrs


In [37]:
print test_df['Title'].unique()

['Mr' 'Mrs' 'Miss' 'Master' 'Ms' 'Col' 'Rev' 'Dr' 'Dona']


In [38]:
test_df['Title'] = test_df['Title'].replace({'Ms':'Mrs', 'Mme':'Mrs', 'Dona': 'Mrs'}, regex=True)
for title in higherTitleLst:
    test_df['Title'] = test_df['Title'].replace({title:'HigherTitle'}, regex=True)
for title in rareTitleLst:
    test_df['Title'] = test_df['Title'].replace({title:'RareTitle'}, regex=True)

print test_df['Title'].unique()

['Mr' 'Mrs' 'Miss' 'Master' 'HigherTitle' 'Rev' 'RareTitle']


In [39]:
ageNan_df = test_df[test_df['Age'].isnull()].copy()
ageNan_df['Age'] = np.vectorize(fillAge)(ageNan_df['Sex'], ageNan_df['Pclass'], ageNan_df['Title'], ageDic)
test_df[test_df['Age'].isnull()] = ageNan_df

In [40]:
test_df = pd.concat([test_df, pd.get_dummies(test_df['Title'], dummy_na=False)], axis=1)
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,HigherTitle,Master,Miss,Mr,Mrs,RareTitle,Rev
0,892,2,1,34.5,0,0,7.8292,Q,Mr,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,893,2,0,47.0,1,0,7.0,S,Mrs,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,894,1,1,62.0,0,0,9.6875,Q,Mr,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,895,2,1,27.0,0,0,8.6625,S,Mr,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,896,2,0,22.0,1,1,12.2875,S,Mrs,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [41]:
test_df = test_df.drop(['Title'], 1)

In [42]:
test_df = pd.concat([test_df, pd.get_dummies(test_df['Embarked'], dummy_na=False)], axis=1)
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,HigherTitle,Master,Miss,Mr,Mrs,RareTitle,Rev,C,Q,S
0,892,2,1,34.5,0,0,7.8292,Q,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,893,2,0,47.0,1,0,7.0,S,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,894,1,1,62.0,0,0,9.6875,Q,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,895,2,1,27.0,0,0,8.6625,S,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,896,2,0,22.0,1,1,12.2875,S,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [43]:
X_test = test_df.drop(['PassengerId', 'Embarked'],1)
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,HigherTitle,Master,Miss,Mr,Mrs,RareTitle,Rev,C,Q,S
0,2,1,34.5,0,0,7.8292,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,0,47.0,1,0,7.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1,1,62.0,0,0,9.6875,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2,1,27.0,0,0,8.6625,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2,0,22.0,1,1,12.2875,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


## Train data with cross validation

In [44]:
X_train = train_df.drop(['PassengerId', 'Survived'],1)
y_train = train_df['Survived']
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,HigherTitle,Master,Miss,Mr,Mrs,RareTitle,Rev,C,Q,S
0,2,1,22.0,1,0,7.25,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,0,38.0,1,0,71.2833,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,2,0,26.0,0,0,7.925,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,0,35.0,1,0,53.1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,2,1,35.0,0,0,8.05,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [45]:
from sklearn.cross_validation import train_test_split
num_test = 0
X_train, X_cross, y_train, y_cross = train_test_split(X_train, y_train, test_size=num_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.grid_search import GridSearchCV

# Choose the type of classifier. 
clf = RandomForestClassifier(n_jobs = -1, oob_score = True)

# Choose some parameter combinations to try
parameters = {'n_estimators': [ 40, 100], #[8, 9, 10,11, 12, 20, 30]
              'max_features': ['log2', 'sqrt','auto', None], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [6, 8, 9,  10,  12], 
              'min_samples_split': [2, 3, 4,5],
              'min_samples_leaf': [1,4,5,6,8]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
clf = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
clf.fit(X_train, y_train)

## Test prediction

In [None]:
y_pred = clf.predict(X_test)