In [1]:
import pandas as pd
import re
import numpy as np

In [2]:
# Title: Data Science Starter Project
# For: Incoming Data Science SWEs for Zest.ai
# Written By: Christopher Ong, Senior SWE
# Last Updated: Feb 9, 2021

In [3]:
# Welcome to the Starter Project for Data Science at Zest.ai! This project is designed to give you some introductory
# practice to the art of feature engineering.

# Let's begin. The year is 1912 and the Titanic has just sunk. Imagine that you are a data engineer on a team
# that is in charge of analyzing the disaster and investigating the patterns of survival: who got a lifeboat, and why?
# You have a raw dataset of a fraction of the passengers, and each entry contains the name of the passenger, 
# several other important pieces of personal information, and whether or not they survived the crash. 

# Let's take a look at the data:

In [4]:
# Relative path to the passenger CSV that the next cells load into a dataframe.
titanic_path = "./train.csv"

In [5]:
# Using the above file path, load the CSV data into a dataframe and take a look at the first and last few lines.
# Google any commands that you are not familiar with.

In [6]:
# Load the passenger records; index_col=0 uses the CSV's first column as the row index.
df = pd.read_csv(titanic_path, index_col=0)

In [7]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",22.0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",26.0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",35.0,373450,8.05,,S


In [8]:
# Preview the last five rows.
df.tail(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",27.0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",19.0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",26.0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",32.0,370376,7.75,,Q


In [9]:
# Now, in theory, you could just jam this dataset into your favorite classifier and get a result. But the result
# probably wouldn't be optimal. Why? Because there are a lot of useful features hidden within the raw data that
# need to be extracted before a classifier can make full use of them.

# For example, we know that in general, women and children were given first access to lifeboats... but we don't have
# any raw data that gives the gender of each passenger, right? How might we extract the gender of the passengers
# from the data set? Brainstorm this a little before moving on (b/c we're about to reveal the answer)

In [10]:
# You don't have the gender of each passenger, per se, but you do have their full name and title! And from those, you
# might be able to extract the gender of each passenger! Try it out: see if you can add another column on the dataset
# entitled "gender" containing the gender for each passenger. If the gender is unknown, you can mark that as well!

In [11]:
# Show the last ten rows; the Name column carries each passenger's title (Mr, Miss, Rev, ...).
df.tail(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,Ticket,Fare,Cabin,Embarked
881,882,0,3,"Markun, Mr. Johann",33.0,349257,7.8958,,S
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",22.0,7552,10.5167,,S
883,884,0,2,"Banfield, Mr. Frederick James",28.0,C.A./SOTON 34068,10.5,,S
884,885,0,3,"Sutehall, Mr. Henry Jr",25.0,SOTON/OQ 392076,7.05,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",39.0,382652,29.125,,Q
886,887,0,2,"Montvila, Rev. Juozas",27.0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",19.0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",26.0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",32.0,370376,7.75,,Q


In [12]:
# Names look like "Surname, Title. Given names": the title is the text between
# the first ", " and the "." that follows it.
# Plain str.split replaces the earlier regexes, whose non-raw '\,' and '\.'
# literals are invalid escape sequences (a SyntaxWarning on Python 3.12+) and
# were being recompiled on every iteration. The result is the same list of
# title strings, one per row of df, in row order.
titles = [name.split(', ')[1].split('.')[0] for name in df['Name']]

In [13]:
# Display the extracted titles for a visual check.
titles

['Mr',
 'Mrs',
 'Miss',
 'Mrs',
 'Mr',
 'Mr',
 'Mr',
 'Master',
 'Mrs',
 'Mrs',
 'Miss',
 'Miss',
 'Mr',
 'Mr',
 'Miss',
 'Mrs',
 'Master',
 'Mr',
 'Mrs',
 'Mrs',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Miss',
 'Mrs',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Don',
 'Mrs',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Miss',
 'Miss',
 'Mrs',
 'Mrs',
 'Mr',
 'Miss',
 'Miss',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Mrs',
 'Master',
 'Mr',
 'Mrs',
 'Mrs',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Miss',
 'Master',
 'Mr',
 'Miss',
 'Mr',
 'Master',
 'Mr',
 'Master',
 'Mrs',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Master',
 'Miss',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Miss',
 'Mrs',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mrs',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Miss',
 'Mr',
 'Miss',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Mr',
 'Mr',
 'Miss',
 'Mr',
 'Master',
 'Mr',
 

In [14]:
# Collapse to the distinct titles so each one can be assigned to a gender list below.
set(titles)

{'Capt',
 'Col',
 'Don',
 'Dr',
 'Jonkheer',
 'Lady',
 'Major',
 'Master',
 'Miss',
 'Mlle',
 'Mme',
 'Mr',
 'Mrs',
 'Ms',
 'Rev',
 'Sir',
 'the Countess'}

In [15]:
# Titles that the gender lookup treats as male.
male_titles = [
    'Capt',
    'Col',
    'Don',
    'Jonkheer',
    'Major',
    'Master',
    'Mr',
    'Rev',
    'Sir',
]

In [16]:
# Titles that the gender lookup treats as female.
female_titles = [
    'Lady',
    'Miss',
    'Mlle',
    'Mme',
    'Mrs',
    'Ms',
    'the Countess',
]

In [17]:
# Build a title -> gender-code lookup. Male entries are written last so that a
# title present in both lists would resolve to 'm', matching an "if male / elif
# female" check order.
gender_by_title = {female_title: 'f' for female_title in female_titles}
gender_by_title.update({male_title: 'm' for male_title in male_titles})

# Titles found in neither list (for example 'Dr') are labelled 'unknown'.
genders = [gender_by_title.get(passenger_title, 'unknown') for passenger_title in titles]

df['Genders'] = genders

In [18]:
# Confirm the new 'Genders' column is present.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,Ticket,Fare,Cabin,Embarked,Genders
0,1,0,3,"Braund, Mr. Owen Harris",22.0,A/5 21171,7.25,,S,m
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,PC 17599,71.2833,C85,C,f
2,3,1,3,"Heikkinen, Miss. Laina",26.0,STON/O2. 3101282,7.925,,S,f
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,113803,53.1,C123,S,f
4,5,0,3,"Allen, Mr. William Henry",35.0,373450,8.05,,S,m


In [19]:
# Let's take it a step further. The Titanic data commission wants to have the first and last name of each passenger
# extracted for record keeping. See if you can create two new columns and populate them with the first and last names
# of each passenger. If they have middle names, feel free to create additional columns.

# HINT: this might be especially tricky for married women. Remember that back in the 1910s, married women were
# still recorded under their husband's first and last names. See if you can find a way to get past that and populate
# the first and last name columns with their real names, not their husbands'.

In [20]:
# Compiled once, as a raw string: captures the text inside the first "(...)".
_PAREN_RE = re.compile(r'\(([^)]*)\)')


def _split_real_name(name):
    """Return ``(first_name, last_name)`` parsed from one raw ``Name`` value.

    Names are expected to look like ``"Surname, Title. Given names"``,
    optionally followed by ``"(Own given names Own surname)"``.

    * With a parenthetical of two or more words, the first and last words
      inside it are returned (e.g. ``"Futrelle, Mrs. Jacques Heath (Lily May
      Peel)"`` -> ``("Lily", "Peel")``).
    * With a one-word parenthetical, that word is taken as the first name and
      the leading surname as the last name, so the result never contains a
      stray ``"("``. NOTE(review): a lone word could also be a maiden surname;
      confirm this heuristic against the data.
    * A parenthetical that starts with a double quote is treated as an
      alias/nickname and ignored.
    * Otherwise the first word after the title and the leading surname are
      returned.

    Any part that cannot be determined is returned as ``None`` instead of
    raising ``IndexError``. Non-string input yields ``(None, None)``.
    """
    if not isinstance(name, str):
        return None, None

    surname, _, remainder = name.partition(',')
    surname = surname.strip() or None

    # Drop the title: everything up to and including the first period.
    _, period, after_title = remainder.partition('.')
    if not period:
        # No "Title." marker: fall back to skipping the first word.
        after_title = ' '.join(remainder.split()[1:])

    match = _PAREN_RE.search(after_title)
    own_words = match.group(1).split() if match else []
    if own_words and not own_words[0].startswith('"'):
        last = own_words[-1] if len(own_words) > 1 else surname
        return own_words[0], last

    # No usable parenthetical: use the given names written outside of it.
    outer_words = _PAREN_RE.sub(' ', after_title).split()
    return (outer_words[0] if outer_words else None), surname


last_names = []
first_names = []
for name in df['Name']:
    first_name, last_name = _split_real_name(name)
    first_names.append(first_name)
    last_names.append(last_name)

df['Last names'] = last_names
df['First names'] = first_names

In [21]:
# Now for the "and children" part... when determining whether or not they received a lifeboat, their actual age is
# probably less important than whether or not they were considered "a child." Add another column to the dataframe
# titled "is_child" that is a 1 if the passenger is 17 or younger and 0 if the passenger is 18 or older.

In [22]:
# 1 when Age is below 18, otherwise 0.
# NOTE(review): a missing Age compares False, so passengers with unknown age
# are recorded as 0 (not a child) - confirm that is the intended treatment.
df['is_child'] = (df['Age'] < 18).astype('int64')

In [23]:
# Finally, there's one last potential source of information. Take a look at the column "Cabin." It contains
# the passenger's cabin number. Less important than the specific cabin number, however, is the fact that they had
# a cabin at all! So let's create another column entitled "has_cabin" that is a 1 if the cabin number exists
# and 0 if it does not.

In [24]:
# 1 when a Cabin value is present, 0 when it is missing (null).
df['has_cabin'] = df['Cabin'].notna().astype('int64')

In [25]:
# Display the dataframe with all of the engineered columns added above.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,Ticket,Fare,Cabin,Embarked,Genders,Last names,First names,is_child,has_cabin
0,1,0,3,"Braund, Mr. Owen Harris",22.0,A/5 21171,7.2500,,S,m,Braund,Owen,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,PC 17599,71.2833,C85,C,f,Thayer,Florence,0,1
2,3,1,3,"Heikkinen, Miss. Laina",26.0,STON/O2. 3101282,7.9250,,S,f,Heikkinen,Laina,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,113803,53.1000,C123,S,f,Peel,Lily,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,373450,8.0500,,S,m,Allen,William,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",27.0,211536,13.0000,,S,m,Montvila,Juozas,0,0
887,888,1,1,"Graham, Miss. Margaret Edith",19.0,112053,30.0000,B42,S,f,Graham,Margaret,0,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",,W./C. 6607,23.4500,,S,f,Johnston,Catherine,0,0
889,890,1,1,"Behr, Mr. Karl Howell",26.0,111369,30.0000,C148,C,m,Behr,Karl,0,1


In [26]:
# Great job! You've completed the whole exercise! We're not going to go through the actual process of building the
# classifier. Feel free to do that if you are so inclined! But this exercise was meant to be illustrative of the
# importance of feature engineering and the creativity required. 

# For Zest.ai, we will be doing something very similar, except instead of engineering out names and cabins, we will
# be engineering out features like gender, congressional district, zip code. Many of these features will be crucial
# to the proper performance of Zest's classifier and some will require outside APIs and classifiers (for example
# for inferring gender from a name). We are excited to begin this journey with you and hope that this introductory
# project was rewarding and instructive!