# ML Model Selection for the Titanic Survivor Problem
## Creating a model to predict the survivors of the Titanic

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector as SFS

In [45]:
# Tidy data

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

test_data['Survived'] = np.nan

data = pd.concat([train_data,test_data])

# Encode the Sex category
data['Male'] = data.Sex.apply(lambda x: 1 if x == 'male' else 0)
data['Female'] = data.Sex.apply(lambda x: 1 if x == 'female' else 0)
data.drop('Sex', axis=1, inplace=True)

# Extract Title from name - could be useful
data['Title'] = data.Name.apply(lambda x: x.split('.')[0].split(' ')[-1])

# I am going to fill in the people with NaN for age, using their title as an indication of age - i.e. NaN will be
## replaced with average age of people with the same title (e.g. Master and Miss usually refer to younger people)

title_vs_age = data.groupby('Title').Age.mean()

data.Age = data.apply(lambda row: title_vs_age[row.Title] if math.isnan(row.Age) else row.Age, axis=1)

# Remove 'Title' column now used
data.drop('Title',axis=1, inplace=True)

# Tidy ticket number and convert an integer - LINE values will be given the max ticket number +1 as these would have
## the final assigned tickets if they weren't pre-booked (from inspection this is 3,101,317)

data.Ticket = data.Ticket.apply(lambda x: 3101318 if x == 'LINE' else int(x.split(' ')[-1]))

# Remove the Cabin data as 687 out of 891 entries are nan - this will not be useful for analysis
data.drop('Cabin', axis=1, inplace=True)


# I can't see any clues for the Embarked location of the passengers with NaN, so I will assign these to the largest
## group and then encode these
# print(data.Embarked.value_counts())
data.Embarked = data.Embarked.apply(lambda x: 'S' if type(x) == float else x)

encoded_E = pd.get_dummies(data.Embarked)

data.join(encoded_E)

data.drop('Embarked',axis=1, inplace=True)

# 1 person has a Fare of NAN - replace with the mean Fare

data.Fare = data.Fare.apply(lambda x: data.Fare.mean() if pd.isna(x) == True else x)

data.drop('Name', axis=1, inplace=True)

# print(data.head())
train_data = data.iloc[:891]
test_data = data.iloc[891:].drop(['Survived'], axis=1)

y = train_data['Survived']
X = train_data.drop(['Survived'], axis=1)

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 10, train_size = 0.75)


In [3]:
# Coarse Model Selection

model_pipeline = Pipeline([('lda', LinearDiscriminantAnalysis()), ('clf', SVC())])

model_param_1 = {'lda': [LinearDiscriminantAnalysis(), None], 'clf': [SVC()], 
                 'clf__kernel': ['poly', 'rbf'], 'clf__degree': [2, 3, 4]}

model_param_2 = {'lda': [LinearDiscriminantAnalysis(), None], 'clf': [KNeighborsClassifier()], 
                 'clf__n_neighbors': list(range(4, 11))}

model_param_3 = {'lda': [LinearDiscriminantAnalysis(), None], 'clf': [RandomForestClassifier()], 
                 'clf__max_depth': list(range(5, 16))}

model_param_grid = [model_param_1, model_param_2, model_param_3]

model_grid = GridSearchCV(model_pipeline, model_param_grid, cv=5, verbose=3, error_score='raise')

model_grid.fit(X_train, y_train)

print(model_grid.best_estimator_)

print(model_grid.best_estimator_.score(X_val, y_val))

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5] END clf=SVC(), clf__degree=2, clf__kernel=poly, lda=LinearDiscriminantAnalysis();, score=0.724 total time=   0.0s
[CV 2/5] END clf=SVC(), clf__degree=2, clf__kernel=poly, lda=LinearDiscriminantAnalysis();, score=0.694 total time=   0.0s
[CV 3/5] END clf=SVC(), clf__degree=2, clf__kernel=poly, lda=LinearDiscriminantAnalysis();, score=0.687 total time=   0.0s
[CV 4/5] END clf=SVC(), clf__degree=2, clf__kernel=poly, lda=LinearDiscriminantAnalysis();, score=0.692 total time=   0.0s
[CV 5/5] END clf=SVC(), clf__degree=2, clf__kernel=poly, lda=LinearDiscriminantAnalysis();, score=0.654 total time=   0.0s
[CV 1/5] END clf=SVC(), clf__degree=2, clf__kernel=poly, lda=None;, score=0.604 total time=   0.0s
[CV 2/5] END clf=SVC(), clf__degree=2, clf__kernel=poly, lda=None;, score=0.604 total time=   0.0s
[CV 3/5] END clf=SVC(), clf__degree=2, clf__kernel=poly, lda=None;, score=0.597 total time=   0.0s
[CV 4/5] END clf=SVC(), cl

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mo

[CV 3/5] END clf=KNeighborsClassifier(), clf__n_neighbors=4, lda=None;, score=0.575 total time=   0.0s
[CV 4/5] END clf=KNeighborsClassifier(), clf__n_neighbors=4, lda=None;, score=0.579 total time=   0.0s
[CV 5/5] END clf=KNeighborsClassifier(), clf__n_neighbors=4, lda=None;, score=0.609 total time=   0.0s
[CV 1/5] END clf=KNeighborsClassifier(), clf__n_neighbors=5, lda=LinearDiscriminantAnalysis();, score=0.754 total time=   0.0s
[CV 2/5] END clf=KNeighborsClassifier(), clf__n_neighbors=5, lda=LinearDiscriminantAnalysis();, score=0.821 total time=   0.0s
[CV 3/5] END clf=KNeighborsClassifier(), clf__n_neighbors=5, lda=LinearDiscriminantAnalysis();, score=0.739 total time=   0.0s
[CV 4/5] END clf=KNeighborsClassifier(), clf__n_neighbors=5, lda=LinearDiscriminantAnalysis();, score=0.774 total time=   0.0s
[CV 5/5] END clf=KNeighborsClassifier(), clf__n_neighbors=5, lda=LinearDiscriminantAnalysis();, score=0.820 total time=   0.0s
[CV 1/5] END clf=KNeighborsClassifier(), clf__n_neighbor

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mo

[CV 5/5] END clf=KNeighborsClassifier(), clf__n_neighbors=6, lda=None;, score=0.602 total time=   0.0s
[CV 1/5] END clf=KNeighborsClassifier(), clf__n_neighbors=7, lda=LinearDiscriminantAnalysis();, score=0.746 total time=   0.0s
[CV 2/5] END clf=KNeighborsClassifier(), clf__n_neighbors=7, lda=LinearDiscriminantAnalysis();, score=0.821 total time=   0.0s
[CV 3/5] END clf=KNeighborsClassifier(), clf__n_neighbors=7, lda=LinearDiscriminantAnalysis();, score=0.731 total time=   0.0s
[CV 4/5] END clf=KNeighborsClassifier(), clf__n_neighbors=7, lda=LinearDiscriminantAnalysis();, score=0.782 total time=   0.0s
[CV 5/5] END clf=KNeighborsClassifier(), clf__n_neighbors=7, lda=LinearDiscriminantAnalysis();, score=0.842 total time=   0.0s
[CV 1/5] END clf=KNeighborsClassifier(), clf__n_neighbors=7, lda=None;, score=0.634 total time=   0.0s
[CV 2/5] END clf=KNeighborsClassifier(), clf__n_neighbors=7, lda=None;, score=0.657 total time=   0.0s
[CV 3/5] END clf=KNeighborsClassifier(), clf__n_neighbor

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mo

[CV 3/5] END clf=KNeighborsClassifier(), clf__n_neighbors=9, lda=LinearDiscriminantAnalysis();, score=0.731 total time=   0.0s
[CV 4/5] END clf=KNeighborsClassifier(), clf__n_neighbors=9, lda=LinearDiscriminantAnalysis();, score=0.774 total time=   0.0s
[CV 5/5] END clf=KNeighborsClassifier(), clf__n_neighbors=9, lda=LinearDiscriminantAnalysis();, score=0.812 total time=   0.0s
[CV 1/5] END clf=KNeighborsClassifier(), clf__n_neighbors=9, lda=None;, score=0.634 total time=   0.0s
[CV 2/5] END clf=KNeighborsClassifier(), clf__n_neighbors=9, lda=None;, score=0.642 total time=   0.0s
[CV 3/5] END clf=KNeighborsClassifier(), clf__n_neighbors=9, lda=None;, score=0.590 total time=   0.0s
[CV 4/5] END clf=KNeighborsClassifier(), clf__n_neighbors=9, lda=None;, score=0.586 total time=   0.0s
[CV 5/5] END clf=KNeighborsClassifier(), clf__n_neighbors=9, lda=None;, score=0.594 total time=   0.0s
[CV 1/5] END clf=KNeighborsClassifier(), clf__n_neighbors=10, lda=LinearDiscriminantAnalysis();, score=0

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


[CV 1/5] END clf=RandomForestClassifier(), clf__max_depth=5, lda=LinearDiscriminantAnalysis();, score=0.769 total time=   0.1s
[CV 2/5] END clf=RandomForestClassifier(), clf__max_depth=5, lda=LinearDiscriminantAnalysis();, score=0.836 total time=   0.1s
[CV 3/5] END clf=RandomForestClassifier(), clf__max_depth=5, lda=LinearDiscriminantAnalysis();, score=0.716 total time=   0.1s
[CV 4/5] END clf=RandomForestClassifier(), clf__max_depth=5, lda=LinearDiscriminantAnalysis();, score=0.782 total time=   0.1s
[CV 5/5] END clf=RandomForestClassifier(), clf__max_depth=5, lda=LinearDiscriminantAnalysis();, score=0.820 total time=   0.1s
[CV 1/5] END clf=RandomForestClassifier(), clf__max_depth=5, lda=None;, score=0.754 total time=   0.1s
[CV 2/5] END clf=RandomForestClassifier(), clf__max_depth=5, lda=None;, score=0.881 total time=   0.1s
[CV 3/5] END clf=RandomForestClassifier(), clf__max_depth=5, lda=None;, score=0.754 total time=   0.1s
[CV 4/5] END clf=RandomForestClassifier(), clf__max_dept

[CV 3/5] END clf=RandomForestClassifier(), clf__max_depth=12, lda=LinearDiscriminantAnalysis();, score=0.694 total time=   0.1s
[CV 4/5] END clf=RandomForestClassifier(), clf__max_depth=12, lda=LinearDiscriminantAnalysis();, score=0.767 total time=   0.1s
[CV 5/5] END clf=RandomForestClassifier(), clf__max_depth=12, lda=LinearDiscriminantAnalysis();, score=0.729 total time=   0.1s
[CV 1/5] END clf=RandomForestClassifier(), clf__max_depth=12, lda=None;, score=0.776 total time=   0.1s
[CV 2/5] END clf=RandomForestClassifier(), clf__max_depth=12, lda=None;, score=0.851 total time=   0.1s
[CV 3/5] END clf=RandomForestClassifier(), clf__max_depth=12, lda=None;, score=0.776 total time=   0.1s
[CV 4/5] END clf=RandomForestClassifier(), clf__max_depth=12, lda=None;, score=0.797 total time=   0.1s
[CV 5/5] END clf=RandomForestClassifier(), clf__max_depth=12, lda=None;, score=0.827 total time=   0.1s
[CV 1/5] END clf=RandomForestClassifier(), clf__max_depth=13, lda=LinearDiscriminantAnalysis();,

In [12]:
# Feature Selection

base_pipeline = model_grid.best_estimator_
scores = []
features = []

# print(base_pipeline)

for x in range(6,9):
    sbs = SFS(base_pipeline, n_features_to_select=x, direction='backward', scoring='accuracy', cv=10)

    sbs.fit(X_train, y_train)

    selected_features = [X_train.columns[i] for i in sbs.get_support(indices=True)]

    X_train_reduced = X_train[selected_features]

    X_val_reduced = X_val[selected_features]

    base_pipeline.fit(X_train_reduced, y_train)
    
    print(f'{x} Features:')
    print(base_pipeline.score(X_val_reduced, y_val))
    
    scores.append(base_pipeline.score(X_val_reduced, y_val))
    features.append(selected_features)

base_pipeline.fit(X_train, y_train)

print('9 Features:')
print(base_pipeline.score(X_val, y_val))

if max(scores) >= base_pipeline.score(X_val, y_val):
    selected_features = features[scores.index(max(scores))]

else:
    selected_features = X_train.columns

X_train_reduced = X_train[selected_features]
X_val_reduced = X_val[selected_features]


6 Features:
0.8878923766816144
7 Features:
0.8609865470852018
8 Features:
0.874439461883408
9 Features:
0.8609865470852018


In [47]:
# Model Fine-Tuning

fine_tune_params = {'clf__max_depth': list(range(best_pipeline['clf'].max_depth-2, best_pipeline['clf'].max_depth+3)),
                   'clf__criterion': ['gini', 'entropy'], 'clf__n_estimators': [100, 150, 200, 250]}

fine_tune_grid = GridSearchCV(best_pipeline, fine_tune_params, cv=10, verbose=1)


fine_tune_grid.fit(X_train_reduced, y_train)


tuned_model = fine_tune_grid.best_estimator_

print(tuned_model)

print(tuned_model.score(X_val_reduced, y_val))


Fitting 10 folds for each of 40 candidates, totalling 400 fits
Pipeline(steps=[('lda', None),
                ('clf',
                 RandomForestClassifier(criterion='entropy', max_depth=12))])
0.8654708520179372


In [64]:
# Predict outcomes
total_train_data = pd.concat([X_train_reduced, X_val_reduced])
total_train_results = pd.concat([y_train,y_val])

tuned_model.fit(total_train_data, total_train_results)

test_data_reduced = test_data[selected_features]

predictions = pd.DataFrame()

predictions['PassengerId'] = test_data.PassengerId
predictions['Survived'] = tuned_model.predict(test_data_reduced).astype('int')

print(predictions)

predictions.to_csv('predictions.csv', index=False)


     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         1

[418 rows x 2 columns]
