# Exploring the survival of Titanic passangers with Decision Trees

In [44]:
import warnings
warnings.simplefilter('ignore')

In [45]:
import numpy as np
import pandas as pd
from IPython.display import display

import random
random.seed(42)

data = pd.read_csv('titanic_data.csv')

display(data.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


- **Survived**: Outcome of survival (0 = No; 1 = Yes)
- **Pclass**: Socio-economic class (1 = Upper class; 2 = Middle class; 3 = Lower class)
- **Name**: Name of passenger
- **Sex**: Sex of the passenger
- **Age**: Age of the passenger (Some entries contain `NaN`)
- **SibSp**: Number of siblings and spouses of the passenger aboard
- **Parch**: Number of parents and children of the passenger aboard
- **Ticket**: Ticket number of the passenger
- **Fare**: Fare paid by the passenger
- **Cabin** Cabin number of the passenger (Some entries contain `NaN`)
- **Embarked**: Port of embarkation of the passenger (C = Cherbourg; Q = Queenstown; S = Southampton)

Feature 'Survived' needs to be removed from dataset and stored in a new variable, as it will be used to make predictions.

In [46]:
survived = data['Survived']
data = data.drop('Survived', axis = 1)

## Preprocessing of the data

We are going to remove 'Name' feature from the dataset as the survival of the person doesn't really depend on the persons name. Also, we are going to one-hot encode features and fill any missing data with zeros.

In [47]:
# Removing the names
data = data.drop(['Name'], axis=1)

# One-hot encoding
data = pd.get_dummies(data)

In [48]:
data = data.fillna(0.0)
display(data.head())

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Ticket_110152,Ticket_110413,...,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
0,1,3,22.0,1,0,7.25,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,1,38.0,1,0,71.2833,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,3,26.0,0,0,7.925,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,1,35.0,1,0,53.1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,3,35.0,0,0,8.05,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


## Training the model on preprocessed data

In [49]:
# splitting data in training and test dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, survived, test_size=0.2, random_state=42)

In [50]:
# defining model
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model = model.fit(X_train, y_train)

## Testing the model

In [51]:
# Making predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate the accuracy
from sklearn.metrics import accuracy_score
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'The training accuracy is {100*train_accuracy:.2f}%.')
print(f'The test accuracy is {100*test_accuracy:.2f}%.')

The training accuracy is 100.00%.
The test accuracy is 81.56%.


## Improving the model

It's seems that the model overfitts the data a bit: we have a high training accuracy, but not so high test accuracy. Now we are going to try to improve the model and use some of the following parameters: 
- `max_depth`
- `min_samples_leaf`
- `min_samples_split`

In order to find the best possible parameter, we are going to use Grid Search.

In [52]:
from sklearn.model_selection import GridSearchCV

criterion = ['gini', 'entropy']
depths = [5, 6, 7, 8, 9]
num_leafs = [5, 6, 7, 8, 9]
num_splits = [5, 6, 7, 8, 9]

param_grid = [{
    'criterion': criterion,
    'max_depth': depths,
    'min_samples_leaf': num_leafs,
    'min_samples_split': num_splits,
}]

grid = GridSearchCV(model, param_grid)
grid.fit(X_train, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'criterion': ['gini', 'entropy'], 'max_depth': [5, 6, 7, 8, 9], 'min_samples_leaf': [5, 6, 7, 8, 9], 'min_samples_split': [5, 6, 7, 8, 9]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [53]:
# Dictionary containing the parameters (min_samples_split) used to generate that score
best_params = grid.best_params_
print(best_params)

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print(grid.best_estimator_)

{'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 6, 'min_samples_split': 7}
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=6, min_samples_split=7,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [54]:
model = DecisionTreeClassifier(**best_params)
model = model.fit(X_train, y_train)

# Making predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate the accuracy
from sklearn.metrics import accuracy_score
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'The training accuracy is {100*train_accuracy:.2f}%.')
print(f'The test accuracy is {100*test_accuracy:.2f}%.')

The training accuracy is 87.50%.
The test accuracy is 86.03%.
