# Titanic Survival Part 2: Machine Learning Predictions in Python

In [1]:
# import modules
import numpy as np 
import pandas as pd 
import os
import re 

# load train dataset 
train = pd.read_csv("../input/train.csv")

In [2]:
# simplify Name
def extract_title(a_name):
    """Given a name from the Name column, extract the title by searching for the pattern, 
       returning a tuple that identifies whether the regex worked or not.
    """
    try:
        return (True, *tuple(re.search(r',\s(.+?)\.', a_name).groups()))
    except:
        return (False, a_name)
    
parsed_name = train['Name'].map(lambda x: extract_title(x))    

In [3]:
# ensure all names were parsed
def sum_line(line):
    ct = 0
    if line[0] == True:
        ct += 1
    return ct

passed = parsed_name.map(lambda x: sum_line(x))
assert sum(passed) == len(parsed_name)

In [4]:
# create Title
def simplify_title(x):
    """"Given a parsed name, extract the title from the tuple, 
       if it belongs to a common title, return that, otherwise, return 'rare'
    """
    try:
        a,b = x
        if b in ["Mr", "Mrs", "Miss"]:
            return (b)
        else:
            return ('Rare')
    except:
        return ('')

train['Title'] = parsed_name.map(lambda x: simplify_title(x))

In [5]:
# Create NameLength
train['NameLength'] = train['Name'].map(lambda x: len(x))

In [6]:
# Create Is Male
def is_male(x):
    try:
        if x == 'male':
            return(1)
        else:
            return(0)
    except:
        pass
    
train['IsMale'] = train['Sex'].map(lambda x: is_male(x))

In [7]:
# Create NumRelatives
train['NumRelatives'] = train['SibSp'] + train['Parch']

In [8]:
# Create FarePerPerson
train['count'] = 1
group = train[['Ticket','Fare','count']].groupby('Ticket').sum()
group['Fare'] = group['Fare']/group['count']
group['FarePerPerson'] = (group['Fare'])/group['count']

def map_fare_perperson(a_ticket):

    row_names=list(group.index.values)
    row_names_array=np.array(row_names)
    loc=np.where(row_names_array == a_ticket)[0][0]
    return(group['FarePerPerson'][loc:loc+1][0])

train['FarePerPerson'] = train['Ticket'].map(lambda x: map_fare_perperson(x))

In [9]:
train.drop(['Name', 'Sex','SibSp','Parch','Ticket','Fare','count'], axis=1, inplace=True, errors='ignore')

In [10]:
# Create Deck - just first letter (Deck)

#leaving NA values (will just be 0s in one-hot encoding)
def clean_cabin(x):
    
    letter=x[0]
    
    if letter not in 'ABCDEF':
        letter='F'
        
    return(letter)
    
train['Deck']=train['Cabin'].map(lambda x: clean_cabin(x), na_action='ignore')

In [11]:
# Embarked
# impute two missing with majority class
train['Embarked']=train['Embarked'].fillna('S')

In [12]:
# Age

# impute missing values using random values from the normal distribution
np.random.seed(123)
size = len(train['Age'][train['Age'].isna()])
mu, sigma = train['Age'].mean(), np.std(train['Age'])

# set seed
imputed_ages = np.round_(np.random.normal(mu, sigma, size))
imputed_ages[imputed_ages < 0] = 0
imputed_ages = pd.Series(imputed_ages)

In [13]:
indices=np.where(train['Age'].isna())[0].tolist()

for i in range(0, len(indices)):
    
    index=indices[i]
    train['Age'][index]=imputed_ages[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## One-Hot Encoding

* Title: 3 dummies
* Cabin: 5 dummies
* Embarked: 2 dummies
* Sex: 1 dummy
* Pclass: 2 dummies
* Age: numeric or 4 dummies
