# Titanic Survival Part 2: Machine Learning Predictions in Python

In [130]:
# import modules
import numpy as np 
import pandas as pd 
import os

# Load the training split; the path is relative to the notebook's working directory.
train = pd.read_csv("../input/train.csv")

In [131]:
# dtypes and non-null counts per column (Age, Cabin and Embarked contain missing values)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


## Name: 
 
 1. Extract titles with a regex
 2. Create the `Title` attribute
 3. Count the number of characters in each name
 4. Create the `NameLength` attribute
 5. Drop `Name`
 
 * NB: keep `PassengerId`; it is needed for the competition submission file.

In [132]:
import re 

# Capture the shortest text between ", " and the next "." --
# e.g. "Mrs" in "Cumings, Mrs. John Bradley".
pattern = r',\s(.+?)\.'

In [133]:
def extract_title(a_name, title_pattern=None):
    """Extract the title (e.g. 'Mr', 'Mrs') from an entry of the Name column.

    Parameters
    ----------
    a_name : str
        A raw name such as "Braund, Mr. Owen Harris".
    title_pattern : str, optional
        Regular expression whose capture group(s) hold the title. When
        omitted, the notebook-level ``pattern`` is used.

    Returns
    -------
    tuple
        ``(True, <captured groups...>)`` when the pattern matches, otherwise
        ``(False, a_name)`` so that unparsed entries can be inspected later.
    """
    regex = pattern if title_pattern is None else title_pattern
    try:
        match = re.search(regex, a_name)
    except TypeError:
        # a_name is not a string (e.g. a missing value); report it as unparsed
        return (False, a_name)
    if match is None:
        # explicit "no match" branch instead of relying on an AttributeError
        return (False, a_name)
    return (True, *match.groups())
    
# smoke-test the extractor on the passenger with row label 1
name_entry = train.loc[1, 'Name']
extract_title(name_entry)

(True, 'Mrs')

In [134]:
# run the extractor over every name; each element is a (success_flag, title) tuple
parsed_name = train['Name'].map(extract_title)

In [135]:
# peek at the first few (success_flag, title) tuples
parsed_name.head()

0      (True, Mr)
1     (True, Mrs)
2    (True, Miss)
3     (True, Mrs)
4      (True, Mr)
Name: Name, dtype: object

In [136]:
def sum_line(line):
    """Return 1 when the first element of `line` equals True, otherwise 0."""
    return 1 if line[0] == True else 0

# 1 for every successfully parsed name, 0 otherwise
passed = parsed_name.map(sum_line)

In [137]:
# every name was parsed iff the number of successes equals the number of rows
sum(passed) == len(parsed_name)

True

In [138]:
# pull the title out of each (flag, title) pair
def unpack_tuples(x):
    """Return the second item of the two-item iterable `x`."""
    _flag, title = x
    return title

# Series holding just the extracted title for every passenger
titles = parsed_name.map(unpack_tuples)

In [139]:
# distinct raw titles found in the training names
titles.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

In [140]:
all_titles = titles.unique().tolist()
# titles kept as-is by simplify_title; every other title is reported as 'Rare'
common_titles = ["Mr", "Mrs", "Miss"]

In [141]:
def simplify_title(x):
    """Collapse a parsed name into one of the common titles or 'Rare'.

    Parameters
    ----------
    x : tuple
        A two-item ``(success_flag, title)`` pair such as the ones stored in
        ``parsed_name``.

    Returns
    -------
    str
        The title itself when it is listed in ``common_titles``, ``'Rare'``
        for any other title, and ``''`` when `x` cannot be unpacked into
        exactly two items.
    """
    try:
        _flag, title = x
    except (TypeError, ValueError):
        # not a two-item iterable, so there is no title to classify
        return ''
    # Only the unpacking is guarded: a missing `common_titles` now raises
    # NameError instead of silently turning every title into ''.
    return title if title in common_titles else 'Rare'

# simplified title for every passenger
parsed_name_clean = parsed_name.map(simplify_title)

## Name Length

In [142]:
# number of characters in each raw name
train['NameLength'] = train['Name'].map(len)

In [143]:
# side-by-side view of the raw name and everything derived from it
data = {
    'original': train['Name'],
    'parsed': parsed_name,
    'clean': parsed_name_clean,
    'length': train['NameLength'],
}
pd.DataFrame(data).head(10)

Unnamed: 0,original,parsed,clean,length
0,"Braund, Mr. Owen Harris","(True, Mr)",Mr,23
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...","(True, Mrs)",Mrs,51
2,"Heikkinen, Miss. Laina","(True, Miss)",Miss,22
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)","(True, Mrs)",Mrs,44
4,"Allen, Mr. William Henry","(True, Mr)",Mr,24
5,"Moran, Mr. James","(True, Mr)",Mr,16
6,"McCarthy, Mr. Timothy J","(True, Mr)",Mr,23
7,"Palsson, Master. Gosta Leonard","(True, Master)",Rare,30
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)","(True, Mrs)",Mrs,49
9,"Nasser, Mrs. Nicholas (Adele Achem)","(True, Mrs)",Mrs,35


In [144]:
# store the simplified title as a new feature column
train['Title'] = parsed_name_clean

In [145]:
# Drop the raw Name column now that NameLength and Title have been derived from it.
# Keyword form is required: the positional `axis` argument of DataFrame.drop
# was deprecated in pandas 1.3 and removed in pandas 2.0.
train = train.drop(columns='Name')

In [146]:
# current state of the train dataset: Name is gone, NameLength and Title were added
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NameLength,Title
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,23,Mr
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,51,Mrs
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,22,Miss
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,44,Mrs
4,5,0,3,male,35.0,0,0,373450,8.05,,S,24,Mr
5,6,0,3,male,,0,0,330877,8.4583,,Q,16,Mr
6,7,0,1,male,54.0,0,0,17463,51.8625,E46,S,23,Mr
7,8,0,3,male,2.0,3,1,349909,21.075,,S,30,Rare
8,9,1,3,female,27.0,0,2,347742,11.1333,,S,49,Mrs
9,10,1,2,female,14.0,1,0,237736,30.0708,,C,35,Mrs


## Sex, SibSp & Parch

In [147]:
def is_male(x):
    """Encode a Sex value as a binary indicator.

    Returns 1 when `x` is the string 'male' and 0 for anything else,
    including missing or non-string values. The result is always an int;
    the function never falls through to an implicit None.
    """
    return int(isinstance(x, str) and x == 'male')
    
# binary indicator derived from the Sex column
train['IsMale'] = train['Sex'].map(is_male)

In [148]:
# Sex is now encoded in IsMale. Keyword form is required: the positional `axis`
# argument of DataFrame.drop was removed in pandas 2.0.
train = train.drop(columns='Sex')

In [149]:
# combine SibSp and Parch into a single relatives count
train['NumRelatives'] = train['SibSp'].add(train['Parch'])

In [150]:
# SibSp and Parch are summarised by NumRelatives. Keyword form is required: the
# positional `axis` argument of DataFrame.drop was removed in pandas 2.0.
train = train.drop(columns=['SibSp', 'Parch'])

In [151]:
# Sex, SibSp and Parch have been replaced by IsMale and NumRelatives
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,Ticket,Fare,Cabin,Embarked,NameLength,Title,IsMale,NumRelatives
0,1,0,3,22.0,A/5 21171,7.25,,S,23,Mr,1,1
1,2,1,1,38.0,PC 17599,71.2833,C85,C,51,Mrs,0,1
2,3,1,3,26.0,STON/O2. 3101282,7.925,,S,22,Miss,0,0
3,4,1,1,35.0,113803,53.1,C123,S,44,Mrs,0,1
4,5,0,3,35.0,373450,8.05,,S,24,Mr,1,0


## Ticket & Fare

In [152]:
# helper column of ones: summed per ticket, it gives the number of passengers sharing that ticket
train['count'] = 1

In [153]:
# sum() also adds up Fare across the passengers on a ticket, although Fare is the
# same on each of their rows; the following cell divides by count to undo that.
group = train[['Ticket','Fare','count']].groupby('Ticket').sum()

In [154]:
# The grouped sum added Fare once per passenger; dividing by the count recovers the ticket fare.
# NOTE: this overwrites group['Fare'], so run the cell exactly once after the groupby cell.
group['Fare'] = group['Fare'].div(group['count'])
# split the ticket fare evenly among the passengers sharing it
group['FarePerPerson'] = group['Fare'].div(group['count'])

In [155]:
# per-ticket Fare, passenger count and FarePerPerson
group.head()

Unnamed: 0_level_0,Fare,count,FarePerPerson
Ticket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
110152,86.5,3,28.833333
110413,79.65,3,26.55
110465,52.0,2,26.0
110564,26.55,1,26.55
110813,75.25,1,75.25


In [156]:
# order passengers by ticket so that rows sharing a ticket sit next to each other
sort_by_ticket = train.sort_values(by='Ticket')
sort_by_ticket.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Age,Ticket,Fare,Cabin,Embarked,NameLength,Title,IsMale,NumRelatives,count
504,505,1,1,16.0,110152,86.5,B79,S,21,Miss,0,0,1
257,258,1,1,30.0,110152,86.5,B77,S,20,Miss,0,0,1
759,760,1,1,33.0,110152,86.5,B77,S,56,Rare,0,0,1
262,263,0,1,52.0,110413,79.65,E67,S,17,Mr,1,2,1
558,559,1,1,39.0,110413,79.65,E67,S,38,Mrs,0,2,1
585,586,1,1,18.0,110413,79.65,E68,S,19,Miss,0,2,1
110,111,0,1,47.0,110465,52.0,C110,S,30,Mr,1,0,1
475,476,0,1,,110465,52.0,A14,S,27,Mr,1,0,1
430,431,1,1,28.0,110564,26.55,C52,S,41,Mr,1,0,1
366,367,1,1,60.0,110813,75.25,D37,C,48,Mrs,0,1,1


In [157]:
def map_fare_perperson(a_ticket):
    """Look up the per-person fare for one ticket.

    Parameters
    ----------
    a_ticket
        A value from the Ticket column; it must be a label of ``group``'s index.

    Returns
    -------
    The ``FarePerPerson`` entry of ``group`` for that ticket.

    Raises
    ------
    KeyError
        If the ticket is not present in ``group``.
    """
    # Direct label lookup. The previous version rebuilt a list and an array of
    # the whole index on every call and ended with `[0]` on a string-indexed
    # Series, which relies on pandas' deprecated positional fallback.
    return group['FarePerPerson'].at[a_ticket]
    
# attach the per-person fare to the ticket-sorted copy and inspect it
sort_by_ticket['FarePerPerson'] = sort_by_ticket['Ticket'].map(map_fare_perperson)
sort_by_ticket.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Age,Ticket,Fare,Cabin,Embarked,NameLength,Title,IsMale,NumRelatives,count,FarePerPerson
504,505,1,1,16.0,110152,86.5,B79,S,21,Miss,0,0,1,28.833333
257,258,1,1,30.0,110152,86.5,B77,S,20,Miss,0,0,1,28.833333
759,760,1,1,33.0,110152,86.5,B77,S,56,Rare,0,0,1,28.833333
262,263,0,1,52.0,110413,79.65,E67,S,17,Mr,1,2,1,26.55
558,559,1,1,39.0,110413,79.65,E67,S,38,Mrs,0,2,1,26.55
585,586,1,1,18.0,110413,79.65,E68,S,19,Miss,0,2,1,26.55
110,111,0,1,47.0,110465,52.0,C110,S,30,Mr,1,0,1,26.0
475,476,0,1,,110465,52.0,A14,S,27,Mr,1,0,1,26.0
430,431,1,1,28.0,110564,26.55,C52,S,41,Mr,1,0,1,26.55
366,367,1,1,60.0,110813,75.25,D37,C,48,Mrs,0,1,1,75.25


In [158]:
# same lookup, now applied to the real training frame
train['FarePerPerson'] = train['Ticket'].map(map_fare_perperson)

In [159]:
# Ticket, Fare and the helper count column are superseded by FarePerPerson.
# Keyword form is required: the positional `axis` argument of DataFrame.drop
# was removed in pandas 2.0.
train = train.drop(columns=['Ticket', 'Fare', 'count'])

In [160]:
# Ticket, Fare and the helper count column have been replaced by FarePerPerson
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,Cabin,Embarked,NameLength,Title,IsMale,NumRelatives,FarePerPerson
0,1,0,3,22.0,,S,23,Mr,1,1,7.25
1,2,1,1,38.0,C85,C,51,Mrs,0,1,71.2833
2,3,1,3,26.0,,S,22,Miss,0,0,7.925
3,4,1,1,35.0,C123,S,44,Mrs,0,1,26.55
4,5,0,3,35.0,,S,24,Mr,1,0,8.05


##  Cabin

In [161]:
# non-missing Cabin values among the first 100 rows (some entries list several cabins)
train['Cabin'].head(100).dropna()

1             C85
3            C123
6             E46
10             G6
11           C103
21            D56
23             A6
27    C23 C25 C27
31            B78
52            D33
54            B30
55            C52
61            B28
62            C83
66            F33
75          F G73
88    C23 C25 C27
92            E31
96             A5
97        D10 D12
Name: Cabin, dtype: object

In [162]:
def clean_cabin(x):
    """Reduce a cabin string to its first character.

    First characters that do not occur in 'ABCDEF' are reported as 'F'.
    """
    first_char = x[0]
    return first_char if first_char in 'ABCDEF' else 'F'
    
# keep only the leading letter of each cabin; missing values are passed through untouched
train['Cabin'] = train['Cabin'].map(clean_cabin, na_action='ignore')

In [163]:
# Cabin now holds only the leading letter; missing values stay NaN
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,Cabin,Embarked,NameLength,Title,IsMale,NumRelatives,FarePerPerson
0,1,0,3,22.0,,S,23,Mr,1,1,7.25
1,2,1,1,38.0,C,C,51,Mrs,0,1,71.2833
2,3,1,3,26.0,,S,22,Miss,0,0,7.925
3,4,1,1,35.0,C,S,44,Mrs,0,1,26.55
4,5,0,3,35.0,,S,24,Mr,1,0,8.05


## Embarked

In [176]:
# Impute the two missing Embarked values with the majority class.
# NOTE(review): 'S' is hard-coded and assumed to be the most frequent value --
# confirm with train['Embarked'].value_counts().
train['Embarked']=train['Embarked'].fillna('S')

In [None]:
# create dummies?

## Age

In [181]:
# number of passengers with a missing Age
int(train['Age'].isna().sum())

177

In [None]:
# impute missing values

In [182]:
# frame after the Embarked imputation; Age still contains missing values
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,Cabin,Embarked,NameLength,Title,IsMale,NumRelatives,FarePerPerson
0,1,0,3,22.0,,S,23,Mr,1,1,7.25
1,2,1,1,38.0,C,C,51,Mrs,0,1,71.2833
2,3,1,3,26.0,,S,22,Miss,0,0,7.925
3,4,1,1,35.0,C,S,44,Mrs,0,1,26.55
4,5,0,3,35.0,,S,24,Mr,1,0,8.05


* Title: 3 dummies
* Cabin: 5 dummies
* Embarked: 2 dummies
* Sex: 1 dummy (already created as `IsMale`)
* Pclass: 2 dummies
* Age: numeric or 4 dummies

In [184]:
# levels of Title that will need dummy encoding
train['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Rare'], dtype=object)

In [185]:
# too many missing values to impute; perhaps use Cabin only for a tree-based model?
train['Cabin'].unique()

array([nan, 'C', 'E', 'F', 'D', 'A', 'B'], dtype=object)

In [186]:
# number of passengers with a missing Cabin
int(train['Cabin'].isna().sum())

687