# Titanic Survival Part 2: Machine Learning Predictions in Python

In [97]:
# import modules
import numpy as np 
import pandas as pd 
import os
import re 

# read the training CSV from the input directory into a DataFrame
train = pd.read_csv(filepath_or_buffer="../input/train.csv")

In [98]:
# simplify Name
def extract_title(a_name):
    """Given a name from the Name column, extract the title.

    The title is the shortest text found between a comma followed by a
    whitespace character and the next period, e.g. "Braund, Mr. Owen Harris"
    gives "Mr".

    Returns a tuple: (True, title) when the pattern matched, otherwise
    (False, a_name) with the input handed back unchanged. Non-string input
    (such as a missing value) is reported as a failed parse.
    """
    try:
        match = re.search(r',\s(.+?)\.', a_name)
    except TypeError:
        # re.search rejects non-string input; report it as an unparsed name
        return (False, a_name)
    if match is None:
        return (False, a_name)
    return (True, match.group(1))
    
# one (flag, text) tuple per passenger, produced by extract_title
parsed_name = train['Name'].map(extract_title)

In [99]:
# ensure all names were parsed
def sum_line(line):
    """Return 1 when the flag stored in line[0] equals True, otherwise 0."""
    # equality test (not truthiness) is kept so only values equal to True count
    return 1 if line[0] == True else 0

# 1 for every successfully parsed name, 0 otherwise
passed = parsed_name.map(sum_line)
# every row must have parsed, so the number of 1s equals the number of rows
assert len(parsed_name) == sum(passed)

In [100]:
# create Title
def simplify_title(x):
    """Given a parsed name, reduce its title to a small set of categories.

    x is expected to be a two-element tuple (flag, title); the flag is
    ignored. Returns the title itself when it is "Mr", "Mrs" or "Miss",
    and 'Rare' for any other title. Returns '' when x cannot be unpacked
    into exactly two elements.
    """
    try:
        _, title = x
    except (TypeError, ValueError):
        # not iterable, or not exactly two elements
        return ''
    if title in ("Mr", "Mrs", "Miss"):
        return title
    return 'Rare'

# collapse each parsed name into its title category
train['Title'] = parsed_name.map(simplify_title)

In [101]:
# Create NameLength: number of characters in the full Name string
train['NameLength'] = train['Name'].map(len)

In [102]:
# Create NumRelatives: element-wise sum of the SibSp and Parch columns
train['NumRelatives'] = train['SibSp'].add(train['Parch'])

In [103]:
# Create FarePerPerson
# Per ticket: mean Fare over the rows sharing the ticket, divided by the number
# of those rows. NOTE(review): this presumes Fare is the price of the whole
# ticket repeated on each row -- confirm against the data dictionary.
train['count'] = 1  # helper column: summing it per ticket gives the row count
group = train[['Ticket','Fare','count']].groupby('Ticket').sum()
group['Fare'] = group['Fare']/group['count']  # summed fare -> mean fare per ticket
group['FarePerPerson'] = (group['Fare'])/group['count']

def map_fare_perperson(ticket):
    """Return the per-person fare for a ticket from the group helper table.

    Raises KeyError when the ticket is not in the table's index.
    """
    # label lookup; avoids rebuilding the index array and scanning it per call
    return group['FarePerPerson'].at[ticket]

# the helper table is indexed by ticket, so the Ticket column maps onto it directly
train['FarePerPerson'] = train['Ticket'].map(group['FarePerPerson'])

In [104]:
# Create Deck - just first letter (Deck)

#leaving NA values (will just be 0s in one-hot encoding)
def clean_cabin(x):
    """Reduce a cabin value to its deck, taken from the first character.

    Decks are limited to six groups, A through F; any other first
    character is binned into F.
    """
    deck = x[0]
    return deck if deck in 'ABCDEF' else 'F'
    
# na_action='ignore' skips missing cabins, so they stay missing in Deck
train['Deck'] = train['Cabin'].map(clean_cabin, na_action='ignore')

In [105]:
# Embarked
# impute the two missing values with the majority class ('S')
train['Embarked'] = train['Embarked'].fillna(value='S')

In [106]:
# Age
# fill missing ages with the median age of the passenger's
# (gender, passenger class, title) group
train['Age'] = (
    train
    .groupby(['Sex', 'Pclass', 'Title'])['Age']
    .transform(lambda ages: ages.fillna(ages.median()))
)

## Binary Indicators:

In [107]:
# Sex: keep only the 'male' indicator column of the one-hot encoding
train['IsMale'] = pd.get_dummies(train['Sex']).loc[:, 'male']

In [108]:
# Embarked
# build the one-hot table once and keep the 'S' and 'Q' indicators;
# the remaining category is implied when both are 0
embarked_dummies = pd.get_dummies(train['Embarked'])
train['Embarked_S'] = embarked_dummies['S']
train['Embarked_Q'] = embarked_dummies['Q']

In [109]:
# Title
# build the one-hot table once and keep the Mr / Mrs / Miss indicators;
# any other title is implied when all three are 0
title_dummies = pd.get_dummies(train['Title'])
train['Title_Mr'] = title_dummies['Mr']
train['Title_Mrs'] = title_dummies['Mrs']
train['Title_Miss'] = title_dummies['Miss']

In [110]:
# Pclass
# build the one-hot table once and keep the indicators for classes 1 and 2;
# any other class is implied when both are 0
pclass_dummies = pd.get_dummies(train['Pclass'])
train['Pclass_1'] = pclass_dummies[1]
train['Pclass_2'] = pclass_dummies[2]

In [111]:
# Deck
# Missing Deck values are left as NaN: get_dummies skips them by default, so
# those rows are 0 in every Deck_* column. (The previous fillna('None') call
# discarded its result and never created a separate category.)
deck_dummies = pd.get_dummies(train['Deck'])  # build the one-hot table once
for deck_letter in 'ABCDEF':
    train['Deck_' + deck_letter] = deck_dummies[deck_letter]

In [112]:
# remove, in place, the raw and helper columns that are unwanted or redundant
train.drop(
    columns=[
        'PassengerId', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket',
        'Fare', 'Cabin', 'count', 'Embarked', 'Title', 'Deck',
    ],
    inplace=True,
)

In [113]:
train.head()

Unnamed: 0,Survived,Age,NameLength,NumRelatives,FarePerPerson,IsMale,Embarked_S,Embarked_Q,Title_Mr,Title_Mrs,Title_Miss,Pclass_1,Pclass_2,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F
0,0,22.0,23,1,7.25,1,1,0,1,0,0,0,0,0,0,0,0,0,0
1,1,38.0,51,1,71.2833,0,0,0,0,1,0,1,0,0,0,1,0,0,0
2,1,26.0,22,0,7.925,0,1,0,0,0,1,0,0,0,0,0,0,0,0
3,1,35.0,44,1,26.55,0,1,0,0,1,0,1,0,0,0,1,0,0,0
4,0,35.0,24,0,8.05,1,1,0,1,0,0,0,0,0,0,0,0,0,0


In [114]:
# split the target (Survived) from the feature columns
survived_labels = train.loc[:, 'Survived'].copy()
train = train.drop(columns='Survived')

In [115]:
train.head()

Unnamed: 0,Age,NameLength,NumRelatives,FarePerPerson,IsMale,Embarked_S,Embarked_Q,Title_Mr,Title_Mrs,Title_Miss,Pclass_1,Pclass_2,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F
0,22.0,23,1,7.25,1,1,0,1,0,0,0,0,0,0,0,0,0,0
1,38.0,51,1,71.2833,0,0,0,0,1,0,1,0,0,0,1,0,0,0
2,26.0,22,0,7.925,0,1,0,0,0,1,0,0,0,0,0,0,0,0
3,35.0,44,1,26.55,0,1,0,0,1,0,1,0,0,0,1,0,0,0
4,35.0,24,0,8.05,1,1,0,1,0,0,0,0,0,0,0,0,0,0


In [116]:
survived_labels.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [117]:
# min-max scale Age, NameLength, NumRelatives, FarePerPerson
def min_max_scaler(X, x):
    """Rescale x relative to the minimum and maximum of X.

    X is the collection of reference values; x is the value to rescale
    (anything supporting subtraction and division, e.g. a number or a
    pandas Series). Returns (x - min(X)) / (max(X) - min(X)), which is 0
    at the minimum of X and 1 at its maximum.

    When every value in X is the same, the range is zero; it is treated
    as 1 so the result is 0 instead of a division by zero.
    """
    lowest = min(X)
    span = max(X) - lowest
    if span == 0:
        # constant X: avoid dividing by zero
        span = 1
    return (x - lowest) / span

for var in ['Age','NameLength','NumRelatives', 'FarePerPerson']:
    # pass the whole column as both the reference and the values to rescale,
    # so its min and max are found once per column rather than once per row
    train[var] = min_max_scaler(train[var], train[var])

In [118]:
train.head()

Unnamed: 0,Age,NameLength,NumRelatives,FarePerPerson,IsMale,Embarked_S,Embarked_Q,Title_Mr,Title_Mrs,Title_Miss,Pclass_1,Pclass_2,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F
0,0.271174,0.157143,0.1,0.03269,1,1,0,1,0,0,0,0,0,0,0,0,0,0
1,0.472229,0.557143,0.1,0.321416,0,0,0,0,1,0,1,0,0,0,1,0,0,0
2,0.321438,0.142857,0.0,0.035734,0,1,0,0,0,1,0,0,0,0,0,0,0,0
3,0.434531,0.457143,0.1,0.119714,0,1,0,0,1,0,1,0,0,0,1,0,0,0
4,0.434531,0.171429,0.0,0.036297,1,1,0,1,0,0,0,0,0,0,0,0,0,0


In [121]:
train['NumRelatives'].describe()

count    891.000000
mean       0.090460
std        0.161346
min        0.000000
25%        0.000000
50%        0.000000
75%        0.100000
max        1.000000
Name: NumRelatives, dtype: float64

In [122]:
train['FarePerPerson'].describe()

count    891.000000
mean       0.080210
std        0.095672
min        0.000000
25%        0.035001
50%        0.039905
75%        0.109515
max        1.000000
Name: FarePerPerson, dtype: float64

In [123]:
train['Age'].describe()

count    891.000000
mean       0.360950
std        0.169599
min        0.000000
25%        0.258608
50%        0.321438
75%        0.456522
max        1.000000
Name: Age, dtype: float64

In [124]:
train['NameLength'].describe()

count    891.000000
mean       0.213789
std        0.132594
min        0.000000
25%        0.114286
50%        0.185714
75%        0.257143
max        1.000000
Name: NameLength, dtype: float64

In [125]:
# use log for skewed variables?
# use standard scaling instead of min-max?