# Titanic - Machine Learning from Disaster

The purpose of this notebook is to predict which passengers survived the Titanic disaster, using passenger attributes such as class, sex, age and fare.

## Section 1: Notebook set-up

### 1.1: Import the data science packages

In [19]:
import pandas as pd
import numpy as np
import random as rnd

### 1.2: Import the data visualisation packages

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### 1.3 Import machine learning packages

In [73]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

### 1.4: Set up the directory

In [18]:
import os

# Directory the notebook is launched from. "__file__" is deliberately a string
# literal here: abspath() resolves it against the current working directory,
# so dirname() yields that working directory.
ori_dir = os.path.dirname(os.path.abspath("__file__"))

# The data lives in the sibling folder '2. Data', one level above ori_dir.
# (Previously this line referenced an undefined name `cwd`, which raised a
# NameError on a fresh kernel.)
new_dir = os.path.abspath(os.path.join(ori_dir, '..', '2. Data'))

# Switch into the data folder so the CSVs can be read by bare file name,
# then display the resulting working directory as the cell output.
os.chdir(new_dir)
os.getcwd()

'C:\\Users\\Andy.quach\\Documents\\GitHub\\DataScience\\2. Data'

### 1.5 Import the data

In [23]:
# Read the three CSV files from the current working directory,
# in the same order as the names on the left-hand side
test_df, train_df, gender_df = (
    pd.read_csv(csv_name)
    for csv_name in ('test.csv', 'train.csv', 'gender_submission.csv')
)

### 1.6 Create copies of the dataframe

In [38]:
# Work on independent deep copies so the frames loaded from disk stay untouched
test_df1, train_df1, gender_df1 = (
    frame.copy(deep=True) for frame in (test_df, train_df, gender_df)
)

# Collect the two feature frames so cleaning steps can loop over both;
# the list holds the same objects as test_df1 / train_df1
combine = [test_df1, train_df1]

## Section 2: Exploratory Data Analysis (EDA)

### 2.1: Examine the data

In [29]:
## List the column names available in the training data
print(train_df.columns.values)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [32]:
## Preview the first rows of the training data
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 2.2 Examine the datatypes of the dataframes 

In [34]:
## Show dtypes and non-null counts for the train frame, then the test frame,
## separated by a 40-character rule
train_df.info()
print('_'*40)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null

### 2.3 Examine the properties of the columns

In [35]:
## Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [36]:
## Summary of the object-dtype columns: count, number of unique values,
## most frequent value and how often it occurs
train_df.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Aubart, Mme. Leontine Pauline",male,1601,G6,S
freq,1,577,7,4,644


### 2.4 Clean the data

There are four Cs of data cleaning:
1. Completing incomplete data
2. Correcting outliers
3. Creating new variables (i.e. feature engineering)
4. Converting variables into the right datatype

#### COMPLETE

In [43]:
# Count the missing values per column in each working copy
train_null_counts = train_df1.isnull().sum()
print('Train columns with null values:\n', train_null_counts)
print("-" * 10)

test_null_counts = test_df1.isnull().sum()
print('Test columns with null values:\n', test_null_counts)

Train columns with null values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
----------
Test columns with null values:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [55]:
## Completing the dataset
# Fill gaps with each dataset's own median (Age, Fare) or most frequent value
# (Embarked). The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on dataset[col]: that form operates on an intermediate
# Series, emits chained-assignment warnings on recent pandas, and leaves the
# frame unchanged when Copy-on-Write is enabled.
for dataset in combine:
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

## Drop unneeded columns
# Only the training copy is trimmed here; test_df1 keeps these columns.
drop_column = ['PassengerId','Cabin', 'Ticket']
try:
    train_df1.drop(drop_column, axis=1, inplace = True)
except KeyError:
    # drop() raises KeyError when the labels are missing, i.e. when this cell
    # is re-run after the columns were already removed. Any other error is
    # allowed to surface instead of being silently swallowed.
    print("Columns already dropped!\n")

## Check that there are no nulls left
print(train_df1.isnull().sum())
print("-"*10)
print(test_df1.isnull().sum())

Columns already dropped!

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64
----------
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


#### CREATE

In [62]:
for dataset in combine:
    # Identify how large the family is (siblings/spouses + parents/children + the passenger)
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # Identify if passenger is alone: start at 1 and clear the flag for anyone
    # travelling with family. A single .loc[rows, column] assignment on the frame
    # replaces the chained dataset['IsAlone'].loc[...] = 0, which raised
    # SettingWithCopyWarning and does not update the frame under Copy-on-Write.
    dataset['IsAlone'] = 1
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0

    # Extract title from name: the text between ", " and the first "."
    dataset['Title'] = dataset['Name'].str.split(", ", expand = True)[1].str.split(".", expand = True)[0]

    # Split fares into quartiles (i.e. splits based on frequency distribution)
    # NOTE(review): bin edges are computed per dataset, so train and test bins
    # may not line up — confirm before using FareBin/AgeBin as model features
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)

    # Split ages into 5 equal-width bins based on values
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)


# Collapse rare titles in the training data into a single 'Misc' category.
# title_names is a boolean Series indexed by title: True when the title occurs
# fewer than stat_min times in the training data.
# NOTE(review): test_df1 titles are not collapsed here — confirm this is
# handled before the test set is encoded
stat_min = 10
title_names = (train_df1['Title'].value_counts() < stat_min)
train_df1['Title'] = train_df1['Title'].apply(
    lambda title: 'Misc' if title_names.loc[title] else title
)


train_df1.info()
test_df1.info()
# Fixed seed so the displayed sample is the same on every run
train_df1.sample(10, random_state = 0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Name          891 non-null object
Sex           891 non-null object
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Embarked      891 non-null object
FamilySize    891 non-null int64
IsAlone       891 non-null int64
Title         891 non-null object
FareBin       891 non-null category
AgeBin        891 non-null category
dtypes: category(2), float64(2), int64(6), object(4)
memory usage: 85.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 16 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null in

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,FareBin,AgeBin
681,1,1,"Hassab, Mr. Hammad",male,27.0,0,0,76.7292,C,1,1,Mr,"(31.0, 512.329]","(16.0, 32.0]"
317,0,2,"Moraweck, Dr. Ernest",male,54.0,0,0,14.0,S,1,1,Misc,"(7.91, 14.454]","(48.0, 64.0]"
700,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18.0,1,0,227.525,C,2,0,Mrs,"(31.0, 512.329]","(16.0, 32.0]"
420,0,3,"Gheorgheff, Mr. Stanio",male,28.0,0,0,7.8958,C,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"
659,0,1,"Newell, Mr. Arthur Webster",male,58.0,0,2,113.275,C,3,0,Mr,"(31.0, 512.329]","(48.0, 64.0]"
596,1,2,"Leitch, Miss. Jessie Wills",female,28.0,0,0,33.0,S,1,1,Miss,"(31.0, 512.329]","(16.0, 32.0]"
723,0,2,"Hodges, Mr. Henry Price",male,50.0,0,0,13.0,S,1,1,Mr,"(7.91, 14.454]","(48.0, 64.0]"
244,0,3,"Attalah, Mr. Sleiman",male,30.0,0,0,7.225,C,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"
696,0,3,"Kelly, Mr. James",male,44.0,0,0,8.05,S,1,1,Mr,"(7.91, 14.454]","(32.0, 48.0]"
318,1,1,"Wick, Miss. Mary Natalie",female,31.0,0,2,164.8667,S,3,0,Miss,"(31.0, 512.329]","(16.0, 32.0]"


#### CONVERT

In [76]:
# Column to predict
target = ['Survived']

# Columns used as model inputs
features = [
    'Sex', 'Pclass', 'Embarked', 'Title',
    'SibSp', 'Parch', 'Age', 'Fare',
    'FamilySize', 'IsAlone',
]

# One-hot encode the selected feature columns of the training data
train_features = train_df1[features]
train_df1_dummies = pd.get_dummies(train_features)
train_df1_dummies.head()

Unnamed: 0,Pclass,SibSp,Parch,Age,Fare,FamilySize,IsAlone,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
0,3,1,0,22.0,7.25,2,0,0,1,0,0,1,0,0,0,1,0
1,1,1,0,38.0,71.2833,2,0,1,0,1,0,0,0,0,0,0,1
2,3,0,0,26.0,7.925,1,1,1,0,0,0,1,0,0,1,0,0
3,1,1,0,35.0,53.1,2,0,1,0,0,0,1,0,0,0,0,1
4,3,0,0,35.0,8.05,1,1,0,1,0,0,1,0,0,0,1,0


#### DOUBLE CHECK

In [71]:
# Final sanity check of the encoded training frame and the test frame.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line to the output.
print('Train columns with null values:\n', train_df1_dummies.isnull().sum())
print("-"*10)
train_df1_dummies.info()
print("-"*10)

print('Test/Validation columns with null values:\n', test_df1.isnull().sum())
print("-"*10)
test_df1.info()
print("-"*10)

train_df1_dummies.describe(include = 'all')

Train columns with null values:
 Pclass          0
SibSp           0
Parch           0
Age             0
Fare            0
FamilySize      0
IsAlone         0
Sex_female      0
Sex_male        0
Embarked_C      0
Embarked_Q      0
Embarked_S      0
Title_Master    0
Title_Misc      0
Title_Miss      0
Title_Mr        0
Title_Mrs       0
dtype: int64
----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 17 columns):
Pclass          891 non-null int64
SibSp           891 non-null int64
Parch           891 non-null int64
Age             891 non-null float64
Fare            891 non-null float64
FamilySize      891 non-null int64
IsAlone         891 non-null int64
Sex_female      891 non-null uint8
Sex_male        891 non-null uint8
Embarked_C      891 non-null uint8
Embarked_Q      891 non-null uint8
Embarked_S      891 non-null uint8
Title_Master    891 non-null uint8
Title_Misc      891 non-null uint8
Title_Miss      891 non-null uint8
Tit

Unnamed: 0,Pclass,SibSp,Parch,Age,Fare,FamilySize,IsAlone,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,0.523008,0.381594,29.361582,32.204208,1.904602,0.602694,0.352413,0.647587,0.188552,0.08642,0.725028,0.044893,0.030303,0.204265,0.580247,0.140292
std,0.836071,1.102743,0.806057,13.019697,49.693429,1.613459,0.489615,0.47799,0.47799,0.391372,0.281141,0.446751,0.207186,0.171516,0.40339,0.493796,0.347485
min,1.0,0.0,0.0,0.42,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,22.0,7.9104,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,0.0,0.0,28.0,14.4542,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,3.0,1.0,0.0,35.0,31.0,2.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
max,3.0,8.0,6.0,80.0,512.3292,11.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [77]:
# Hold out part of the labelled data for evaluation; random_state pins the shuffle
# NOTE(review): this splits the raw feature columns (Sex/Embarked/Title are still
# strings), not train_df1_dummies — confirm encoding happens before model fitting
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    train_df1[features],
    train_df1[target],
    random_state=0,
)

X_train.head()

Unnamed: 0,Sex,Pclass,Embarked,Title,SibSp,Parch,Age,Fare,FamilySize,IsAlone
105,male,3,S,Mr,0,0,28.0,7.8958,1,1
68,female,3,S,Miss,4,2,17.0,7.925,7,0
253,male,3,S,Mr,1,0,30.0,16.1,2,0
320,male,3,S,Mr,0,0,22.0,7.25,1,1
706,female,2,S,Mrs,0,0,45.0,13.5,1,1
