# Titanic Survival Part 2: Machine Learning Predictions in Python

In [193]:
# import modules
import numpy as np 
import pandas as pd 
import os
import re 

# Load the training data from the relative input directory.
# Every later cell adds columns to, or mutates, this `train` frame.
train = pd.read_csv("../input/train.csv")

In [194]:
# simplify Name
def extract_title(a_name):
    """Extract the title (honorific) from a value of the Name column.

    The title is the shortest run of characters found between a comma
    followed by whitespace and the next period, e.g.
    "Braund, Mr. Owen Harris" -> "Mr".

    Parameters
    ----------
    a_name : str
        A passenger name. Non-string values (None, NaN, bytes, ...) are
        accepted and reported as a failed parse.

    Returns
    -------
    tuple
        ``(True, title)`` when the pattern matched, otherwise
        ``(False, a_name)`` with the input handed back unchanged so the
        caller can see which value failed to parse.
    """
    # Explicit checks replace the former bare ``except:``, which also hid
    # unrelated errors (and KeyboardInterrupt) behind a "failed parse".
    if not isinstance(a_name, str):
        return (False, a_name)

    match = re.search(r',\s(.+?)\.', a_name)
    if match is None:
        return (False, a_name)

    return (True, match.group(1))
    
# One (parsed_ok, title_or_original_name) tuple per passenger.
parsed_name = train['Name'].map(extract_title)

In [195]:
# ensure all names were parsed
def sum_line(line):
    """Return 1 when the first element of ``line`` equals True, otherwise 0."""
    return 1 if line[0] == True else 0

# Each successfully parsed name contributes 1, so the total must equal the row count.
passed = parsed_name.map(sum_line)
assert len(parsed_name) == sum(passed)

In [196]:
# create Title
def simplify_title(x):
    """Collapse a parsed name into one of a few title categories.

    Parameters
    ----------
    x : tuple
        A two-item ``(parsed_ok, title)`` pair, as produced by
        ``extract_title``. Only the second item is used.

    Returns
    -------
    str
        The title itself when it is "Mr", "Mrs" or "Miss"; 'Rare' for any
        other title; '' when ``x`` cannot be unpacked into exactly two
        items.
    """
    try:
        _, title = x
        if title in ("Mr", "Mrs", "Miss"):
            return title
        return 'Rare'
    except (TypeError, ValueError):
        # TypeError: x is not iterable (e.g. None or NaN).
        # ValueError: x does not hold exactly two items.
        return ''

# Title is one of 'Mr', 'Mrs', 'Miss', 'Rare' (or '' for an unparsable entry).
train['Title'] = parsed_name.map(simplify_title)

In [197]:
# Create NameLength: number of characters in the full Name string
train['NameLength'] = train['Name'].map(len)

In [198]:
# Create NumRelatives: sum of the SibSp and Parch columns
train['NumRelatives'] = train['SibSp'].add(train['Parch'])

In [199]:
# Create FarePerPerson
# Helper column: summing it per ticket gives the number of passengers on that ticket.
train['count'] = 1
group = train.groupby('Ticket')[['Fare', 'count']].sum()
# Summed fare / passenger count = mean Fare recorded for the ticket.
group['Fare'] = group['Fare'].div(group['count'])
# Dividing by the count a second time yields each passenger's share.
# NOTE(review): this presumes Fare holds the whole ticket's price on every row - confirm.
group['FarePerPerson'] = group['Fare'].div(group['count'])

def map_fare_perperson(ticket):
    """Look up the per-person fare of a ticket in the ``group`` helper table.

    ``group`` is the module-level frame indexed by ticket that carries the
    precomputed 'FarePerPerson' column; this function only reads from it.

    Parameters
    ----------
    ticket : hashable
        A ticket identifier; expected to be one of the labels in
        ``group.index``.

    Returns
    -------
    scalar
        The 'FarePerPerson' value stored for ``ticket``.

    Raises
    ------
    IndexError
        If ``ticket`` is not present in ``group.index``.
    """
    try:
        # Label-based lookup. The previous slice-then-[0] form relied on
        # integer keys being treated as positions on a label index, which
        # recent pandas versions deprecate, and it rebuilt the whole index
        # array on every call.
        return group['FarePerPerson'].loc[ticket]
    except KeyError:
        # Keep the exception type the earlier implementation raised for
        # unknown tickets.
        raise IndexError(
            "ticket {!r} not found in group index".format(ticket)
        ) from None

# Attach each passenger's per-person fare, looked up by their ticket.
train['FarePerPerson'] = train['Ticket'].map(map_fare_perperson)

In [200]:
# Create Deck - just first letter (Deck)

#leaving NA values (will just be 0s in one-hot encoding)
def clean_cabin(x):
    """Reduce a cabin value to a single deck letter.

    The first character of ``x`` is taken as the deck. It is kept when it
    is one of A through F; any other leading character is binned into 'F'.
    """
    deck = x[0]
    return deck if deck in 'ABCDEF' else 'F'
    
# na_action='ignore' leaves missing Cabin values as NaN instead of passing them to clean_cabin.
train['Deck'] = train['Cabin'].map(clean_cabin, na_action='ignore')

In [201]:
# Embarked
# fill the missing entries with 'S', the value chosen as the majority class
train['Embarked'] = train['Embarked'].fillna(value='S')

In [202]:
# Age
# fill missing ages with the median age of the passengers that share the
# same gender, passenger class, and title
train['Age'] = (
    train
    .groupby(['Sex', 'Pclass', 'Title'])['Age']
    .transform(lambda ages: ages.fillna(ages.median()))
)

## Binary Indicators:

In [203]:
# Sex
# indicator column for Sex == 'male'
sex_dummies = pd.get_dummies(train['Sex'])
train['IsMale'] = sex_dummies['male']

In [204]:
# Embarked
# One-hot encode once and reuse the result instead of re-encoding per column.
# Only the 'S' and 'Q' indicators are kept; the remaining category is the
# implicit baseline (both indicators zero).
embarked_dummies = pd.get_dummies(train['Embarked'])
train['Embarked_S'] = embarked_dummies['S']
train['Embarked_Q'] = embarked_dummies['Q']

In [206]:
# Title
# One-hot encode once and reuse the result instead of re-encoding per column.
# Any title other than Mr/Mrs/Miss gets no indicator column of its own.
title_dummies = pd.get_dummies(train['Title'])
train['Title_Mr'] = title_dummies['Mr']
train['Title_Mrs'] = title_dummies['Mrs']
train['Title_Miss'] = title_dummies['Miss']

In [207]:
# Pclass
# One-hot encode once and reuse the result instead of re-encoding per column.
# Only classes 1 and 2 get an indicator; any other class is the implicit
# baseline (both indicators zero).
pclass_dummies = pd.get_dummies(train['Pclass'])
train['Pclass_1'] = pclass_dummies[1]
train['Pclass_2'] = pclass_dummies[2]

In [208]:
# Deck
# Put missing decks into an explicit 'None' category. fillna returns a new
# Series, so the result must be assigned back to take effect.
train['Deck'] = train['Deck'].fillna('None')

# One-hot encode once and reuse the result for every deck letter.
# The 'None' dummy column is not copied, so passengers without a deck are
# all-zero across Deck_A..Deck_F.
deck_dummies = pd.get_dummies(train['Deck'])
for deck_letter in 'ABCDEF':
    train['Deck_' + deck_letter] = deck_dummies[deck_letter]

In [209]:
# Drop the raw columns that the engineered features above were derived from,
# along with the 'count' helper column.
train.drop(
    labels=[
        'Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Fare',
        'Cabin', 'count', 'Embarked', 'Title', 'Deck',
    ],
    axis=1,
    inplace=True,
)

In [210]:
# Preview the first rows of the engineered feature table.
train.head()

Unnamed: 0,PassengerId,Survived,Age,NameLength,NumRelatives,FarePerPerson,IsMale,Embarked_S,Embarked_Q,Title_Mr,Title_Mrs,Title_Miss,Pclass_1,Pclass_2,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F
0,1,0,22.0,23,1,7.25,1,1,0,1,0,0,0,0,0,0,0,0,0,0
1,2,1,38.0,51,1,71.2833,0,0,0,0,1,0,1,0,0,0,1,0,0,0
2,3,1,26.0,22,0,7.925,0,1,0,0,0,1,0,0,0,0,0,0,0,0
3,4,1,35.0,44,1,26.55,0,1,0,0,1,0,1,0,0,0,1,0,0,0
4,5,0,35.0,24,0,8.05,1,1,0,1,0,0,0,0,0,0,0,0,0,0
