# Titanic Survival Part 2: Machine Learning Predictions in Python

In [141]:
# import modules
import numpy as np 
import pandas as pd 
import os

# load the training dataset; the path is relative to the notebook's working directory
train = pd.read_csv("../input/train.csv")

In [142]:
# column dtypes and non-null counts, used to spot columns with missing values
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Name: 
 
 1. Extract titles with regex
 2. Create the `Title` attribute
 3. Extract the number of characters in each name
 4. Create the `NameLength` attribute
 5. Drop `Name`
 
 * NB: keep `PassengerId`; it will be needed for the competition submission file.

In [143]:
import re 

# Title pattern: after a comma and one whitespace character, lazily capture
# everything up to the first literal period, e.g. "Braund, Mr. Owen Harris" -> "Mr".
pattern = r',\s(.+?)\.'

In [144]:
def extract_title(a_name, title_pattern=None):
    """Extract the title from one entry of the Name column.

    Parameters
    ----------
    a_name : str
        A raw name, e.g. "Braund, Mr. Owen Harris".
    title_pattern : str or compiled pattern, optional
        Regular expression whose capture group(s) hold the title.
        Defaults to the notebook-level ``pattern``.

    Returns
    -------
    tuple
        ``(True, <captured group(s)>)`` when the pattern matches, otherwise
        ``(False, a_name)`` so that unparsed entries can be inspected later.
    """
    if title_pattern is None:
        # Resolved at call time; a missing global raises NameError instead of
        # being silently reported as a failed parse.
        title_pattern = pattern
    # Non-string entries (e.g. a missing value) cannot be searched: flag them
    # as unparsed rather than relying on a catch-all exception handler.
    if not isinstance(a_name, str):
        return (False, a_name)
    match = re.search(title_pattern, a_name)
    if match is None:
        return (False, a_name)
    return (True, *match.groups())
    
# testing the function on the name stored under index label 1
name_entry = train['Name'][1]
# NOTE: this bare expression is not displayed; only the cell's last expression is echoed
name_entry
extract_title(name_entry)

(True, 'Mrs')

In [145]:
# Parse every name; each element becomes a (success_flag, title_or_raw_name) tuple
# so that we can verify afterwards that all names were parsed successfully.
parsed_name = train['Name'].map(extract_title)

In [146]:
# preview the first few (success_flag, title) tuples
parsed_name.head()

0      (True, Mr)
1     (True, Mrs)
2    (True, Miss)
3     (True, Mrs)
4      (True, Mr)
Name: Name, dtype: object

In [147]:
def sum_line(line):
    """Return 1 if the first element of ``line`` equals True, otherwise 0."""
    return 1 if line[0] == True else 0

# 1 for every successfully parsed name, 0 otherwise
passed = parsed_name.map(sum_line)

In [148]:
# sanity check: True only if every name in the column was parsed successfully
sum(passed) == len(parsed_name)

True

In [149]:
# unpack tuples to get titles
def unpack_tuples(x):
    """Return the second item of the two-item iterable ``x``.

    Raises ValueError when ``x`` does not contain exactly two items.
    """
    _flag, second = x
    return second

# keep only the title part of each (success_flag, title) tuple
titles = parsed_name.map(unpack_tuples)

In [150]:
# distinct titles found in the training names
titles.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

In [151]:
# every distinct title found in the training names, as a plain list
all_titles = titles.unique().tolist()
# titles kept as-is; simplify_title() maps every other title (including 'Master') to 'Rare'
common_titles = ["Mr", "Mrs", "Miss"]
#rare_titles = list(set(all_titles) - set(common_titles))

In [152]:
def simplify_title(x, known_titles=None):
    """Collapse a parsed name into a common title or 'Rare'.

    Parameters
    ----------
    x : tuple
        A two-item ``(success_flag, title)`` tuple as produced by
        ``extract_title``. The flag is ignored; only the second item is used.
    known_titles : collection of str, optional
        Titles that are returned unchanged. Defaults to the notebook-level
        ``common_titles``.

    Returns
    -------
    str
        The title itself when it is in ``known_titles``, ``'Rare'`` for any
        other title, or ``''`` when ``x`` cannot be unpacked into two items.
    """
    if known_titles is None:
        # Looked up outside the try block so that a missing global raises
        # NameError instead of silently turning every title into ''.
        known_titles = common_titles
    try:
        _flag, title = x
    except (TypeError, ValueError):
        # Not a two-item iterable: keep the best-effort empty result.
        return ''
    return title if title in known_titles else 'Rare'

# one simplified title ('Mr', 'Mrs', 'Miss' or 'Rare') per passenger
parsed_name_clean = parsed_name.map(simplify_title)

## Name Length

In [153]:
# number of characters in each raw name
train['NameLength'] = train['Name'].map(len)

In [154]:
#pd.options.display.max_rows = 400

# side-by-side view of each raw name, its parsed tuple, simplified title and length
data = {
    'original': train['Name'],
    'parsed': parsed_name,
    'clean': parsed_name_clean,
    'length': train['NameLength'],
}
pd.DataFrame(data).head(10)

Unnamed: 0,original,parsed,clean,length
0,"Braund, Mr. Owen Harris","(True, Mr)",Mr,23
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...","(True, Mrs)",Mrs,51
2,"Heikkinen, Miss. Laina","(True, Miss)",Miss,22
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)","(True, Mrs)",Mrs,44
4,"Allen, Mr. William Henry","(True, Mr)",Mr,24
5,"Moran, Mr. James","(True, Mr)",Mr,16
6,"McCarthy, Mr. Timothy J","(True, Mr)",Mr,23
7,"Palsson, Master. Gosta Leonard","(True, Master)",Rare,30
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)","(True, Mrs)",Mrs,49
9,"Nasser, Mrs. Nicholas (Adele Achem)","(True, Mrs)",Mrs,35


In [155]:
# attach the simplified titles to the training frame as a new column
train['Title'] = parsed_name_clean

In [156]:
# Drop the raw Name column now that Title and NameLength have been derived from it.
# `axis` is passed by keyword: passing it positionally to DataFrame.drop was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
train = train.drop('Name', axis=1)

In [157]:
# current state of the train dataset: NameLength and Title added, Name dropped
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NameLength,Title
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,23,Mr
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,51,Mrs
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,22,Miss
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,44,Mrs
4,5,0,3,male,35.0,0,0,373450,8.05,,S,24,Mr
5,6,0,3,male,,0,0,330877,8.4583,,Q,16,Mr
6,7,0,1,male,54.0,0,0,17463,51.8625,E46,S,23,Mr
7,8,0,3,male,2.0,3,1,349909,21.075,,S,30,Rare
8,9,1,3,female,27.0,0,2,347742,11.1333,,S,49,Mrs
9,10,1,2,female,14.0,1,0,237736,30.0708,,C,35,Mrs
