In [1]:
import pandas as pd
import numpy as np
import random as rnd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier

In [4]:
# Read the training and test CSVs. `combine` holds both frames so that
# later cells can apply the same transformation to each in one loop.
train_df, test_df = map(pd.read_csv, ("data/train.csv", "data/test.csv"))
combine = [train_df, test_df]

# Column names first, then a preview of the first training rows.
print(train_df.columns.values)
train_df.head()

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Remove the Ticket and Cabin columns from both frames, printing the shapes
# before and after so the effect of the drop is visible in the cell output.
print("Before", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)

train_df, test_df = (
    frame.drop(columns=["Ticket", "Cabin"]) for frame in (train_df, test_df)
)
# drop() returns new frames, so the list must be rebuilt to point at them.
combine = [train_df, test_df]
print("After", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)


Before (891, 12) (418, 11) (891, 12) (418, 11)
After (891, 10) (418, 9) (891, 10) (418, 9)


In [7]:
# Derive a "Title" column from Name: the run of ASCII letters that follows a
# space and is immediately terminated by a period (e.g. "Mr", "Mrs", "Miss").
# Names with no such run yield NaN.
for dataset in combine:
    # Raw string so that `\.` reaches the regex engine as an escaped dot; in a
    # normal string literal `\.` is an invalid escape sequence that Python
    # warns about (DeprecationWarning, SyntaxWarning from 3.12).
    dataset["Title"] = dataset["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)

# Cross-tabulate the extracted titles against Sex for the training frame.
pd.crosstab(train_df["Title"], train_df["Sex"])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [8]:
# Collapse the extracted titles into a small set of categories:
#   - a fixed list of uncommon titles becomes "Rare"
#   - "Mlle" and "Ms" become "Miss", "Mme" becomes "Mrs"
for frame in combine:
    frame["Title"] = (
        frame["Title"]
        .replace(
            ["Lady", "Countess", "Capt", "Col", "Don",
             "Dr", "Major", "Rev", "Sir", "Jonkheer", "Dona"],
            "Rare",
        )
        # None of the replacement values is itself a key, so one dict-based
        # replace gives the same result as three sequential replaces.
        .replace({"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"})
    )

# Mean of Survived per title category in the training frame.
train_df[["Title", "Survived"]].groupby("Title", as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


In [9]:
# Integer code for each title category.
title_mapping = {
    "Mr": 1,
    "Miss": 2,
    "Mrs": 3,
    "Master": 4,
    "Rare": 5,
}

# Replace each title with its code; titles that are absent from the mapping
# (or missing) come out of map() as NaN and are then set to 0.
for frame in combine:
    frame["Title"] = frame["Title"].map(title_mapping).fillna(0)

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1


In [10]:
# Name is dropped from both frames; PassengerId is dropped from the training
# frame only (the test frame keeps it).
train_df = train_df.drop(columns=["Name", "PassengerId"])
test_df = test_df.drop(columns=["Name"])
# Rebuild the list so it refers to the new frames returned by drop().
combine = [train_df, test_df]

(train_df.shape, test_df.shape)

((891, 9), (418, 9))

In [11]:
for dataset in combine:
    dataset["Sex"] = dataset["Sex"].map({"female":1,"male":0}).astype(int)
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,S,1
1,1,1,1,38.0,1,0,71.2833,C,3
2,1,3,1,26.0,0,0,7.925,S,2
3,1,1,1,35.0,1,0,53.1,S,3
4,0,3,0,35.0,0,0,8.05,S,1


In [12]:
# 2 x 3 zero-initialised table that the age-imputation cell fills in,
# indexed as [Sex code, Pclass - 1].
guess_ages = np.zeros(shape=(2, 3))

In [15]:
# Fill missing Age values separately for each frame, using the median age of
# the passengers in that frame who share the same Sex code and Pclass.
for frame in combine:
    # Pass 1: one guess per (Sex, Pclass) combination, computed from the known
    # ages only and stored in the shared guess_ages table.
    for sex_code, class_idx in np.ndindex(2, 3):
        in_group = (frame["Sex"] == sex_code) & (frame["Pclass"] == class_idx + 1)
        median_age = frame.loc[in_group, "Age"].dropna().median()
        # Round the median to the nearest 0.5 (int() truncates after the +0.5).
        guess_ages[sex_code, class_idx] = int(median_age / 0.5 + 0.5) * 0.5

    # Pass 2: write each group's guess into the rows whose Age is missing.
    for sex_code, class_idx in np.ndindex(2, 3):
        needs_fill = (
            frame["Age"].isnull()
            & (frame["Sex"] == sex_code)
            & (frame["Pclass"] == class_idx + 1)
        )
        frame.loc[needs_fill, "Age"] = guess_ages[sex_code, class_idx]

    # With no gaps left the column can be cast to int (fractions are truncated).
    frame["Age"] = frame["Age"].astype(int)

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22,1,0,7.25,S,1
1,1,1,1,38,1,0,71.2833,C,3
2,1,3,1,26,0,0,7.925,S,2
3,1,1,1,35,1,0,53.1,S,3
4,0,3,0,35,0,0,8.05,S,1


In [16]:
# Cut Age into 5 equal-width intervals (training frame only) and show the
# mean of Survived within each interval, ordered by interval.
train_df["AgeBand"] = pd.cut(train_df["Age"], bins=5)
(
    train_df[["AgeBand", "Survived"]]
    .groupby("AgeBand", as_index=False)
    .mean()
    .sort_values(by="AgeBand")
)

Unnamed: 0,AgeBand,Survived
0,"(-0.08, 16.0]",0.55
1,"(16.0, 32.0]",0.337374
2,"(32.0, 48.0]",0.412037
3,"(48.0, 64.0]",0.434783
4,"(64.0, 80.0]",0.090909


In [None]:
for dataset in combine:
    