# I. Loading and splitting training data

In [1]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd
import utils

In [2]:
# The training CSV lives in the "data" directory next to the current working directory.
folder_path = Path.cwd().parent / "data/"
file_path = folder_path / 'train.csv'

training_data = pd.read_csv(file_path)

# Features are every column except the 'Survived' target; the target is kept
# as a one-column DataFrame.
is_feature_column = training_data.columns != 'Survived'
X = training_data.loc[:, is_feature_column]
y = training_data[['Survived']]

# Split via the project helper (see utils.balanced_data_split for its exact semantics).
X_train, X_test, y_train, y_test = utils.balanced_data_split(X, y, max_iter=5)

# II. Defining column transformer object + imputer

In [3]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer

from sklearn.compose import make_column_transformer

In [4]:
# Encoders for the categorical features.
one_hot_enc = OneHotEncoder()
# Pclass is encoded ordinally with its categories given in the order 1, 2, 3.
pclass_categories = [1, 2, 3]
ordinal_enc = OrdinalEncoder(categories=[pclass_categories])

# Column transformer: each tuple pairs a transformer with the columns it handles.
preprocess_col_transformer = make_column_transformer(
    (one_hot_enc, ['Embarked', 'Sex']),   # one-hot encode Embarked and Sex
    (ordinal_enc, ['Pclass']),            # ordinal-encode Pclass
    ('passthrough', ["Age", "Fare"]),     # keep Age and Fare unchanged
    remainder='drop',                     # discard every column not listed above
)

In [5]:
# Imputer applied after the column transformer: 9 neighbours, uniform weights,
# NaN-aware Euclidean distance.
imputer = KNNImputer(
    n_neighbors=9,
    weights='uniform',
    metric='nan_euclidean',
)

# III. Model configuration

In [6]:
from model_config import (
    create_logistic_regression_config,
    create_decision_tree_config,
    create_random_forest_config,
)

# One configuration per candidate model, built in a fixed order:
# logistic regression, decision tree, random forest.
model_configs = [
    build_config()
    for build_config in (
        create_logistic_regression_config,
        create_decision_tree_config,
        create_random_forest_config,
    )
]

In [7]:
# Example: display the first configuration in the list.
first_config = model_configs[0]
print(f"The first model configuration in the list is as follows:\n\n{first_config}")

The first model configuration in the list is as follows:

ModelConfig(
	Model name: Logistic Regression
	Model hyperparameters{
		logisticregression__penalty: ('l1', 'l2', 'elasticnet', None)
		logisticregression__C: [  0.    0.1   0.2   0.3   0.4   0.5   0.6   0.7   0.8   0.9   1.    2.
   3.    4.    5.    6.    7.    8.    9.   10.   20.   30.   40.   50.
  60.   70.   80.   90.  100. ]
		logisticregression__fit_intercept: (False, True)
		logisticregression__intercept_scaling: [0, 1, 2]
		logisticregression__class_weight: ('balanced', None, {False: 1, True: 1.5})
		logisticregression__solver: ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
		logisticregression__multi_class: ('auto', 'ovr', 'multinomial')
		logisticregression__l1_ratio: [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
	}
)


# IV. Pipeline parameter optimization

In [8]:
from sklearn.pipeline import make_pipeline

# To suppress RandomizedSearchCV warnings
import warnings

# Install an "ignore" filter instead of rebinding `warnings.warn` to a no-op:
# the output is silenced just the same, but the standard-library function stays
# intact for every other library running in this kernel.
warnings.filterwarnings("ignore")

### RandomizedSearchCV

In [9]:
from sklearn.model_selection import RandomizedSearchCV

In [10]:
# Optimizing multiple pipelines, each with a different model from 'model_config.py'

# Fixed seed so the randomized search samples the same hyperparameter candidates
# on every run. Any randomness inside the estimators themselves is controlled by
# the model configurations, not by this seed.
SEARCH_RANDOM_STATE = 42

# Maps model name -> {'score': best cross-validated score, 'grid': fitted search object}
optimal_grid = {}

for model_config in model_configs:
    name = model_config.name
    model = model_config.model
    params = model_config.hyperparameters

    # Full pipeline: preprocessing -> imputation -> estimator.
    pipeline = make_pipeline(preprocess_col_transformer, imputer, model)
    grid = RandomizedSearchCV(
        pipeline,
        param_distributions=params,
        n_iter=100,
        scoring='accuracy',
        n_jobs=1,
        cv=5,
        verbose=0,
        random_state=SEARCH_RANDOM_STATE,
    )
    # np.ravel flattens the one-column target into the 1-D array expected by fit.
    grid.fit(X_train, np.ravel(y_train))
    optimal_grid[name] = {'score': grid.best_score_, 'grid': grid}

In [12]:
# Report the best score found by the search for each model.
for model_name, search_result in optimal_grid.items():
    print(
        f"The '{model_name}' model got a training score of up to: {search_result.get('score')}"
    )

The 'Logistic Regression' model got a score of up to: 0.7964247020585048
The 'Decision Tree' model got a score of up to: 0.8244952230867725
The 'Random Forest' model got a score of up to: 0.8259529203191175


In [24]:
# Retaining the best performing model
best_model = None   # name of the highest-scoring model
best_score = 0      # its score
best_grid = None    # its fitted search object (used later for prediction)
for model_name in optimal_grid.keys():
    score = optimal_grid.get(model_name).get('score')
    if score > best_score:
        best_model = model_name
        best_score = score
        # Take the search object of the model that just won, so that
        # best_model, best_score and best_grid always describe the same model.
        best_grid = optimal_grid.get(model_name).get('grid')

In [25]:
# Result Example
print(f"The best score obtained so far: {best_score}")

# Look the search object up by `best_model` (the winner selected above), not by
# `model_name`, which only holds whatever key a previous loop visited last.
best_search = optimal_grid.get(best_model).get('grid')

print(f"Optimized hyperparameters of the '{best_model}' classifier:\n")
for key, value in best_search.best_params_.items():
    print(key,": ", value)

# Display the fitted search object of the best model.
best_search

The best score obtained so far: 0.8259529203191175
Optimized hyperparameters of the 'Random Forest' classifier:

randomforestclassifier__oob_score :  True
randomforestclassifier__n_estimators :  50
randomforestclassifier__min_samples_split :  12
randomforestclassifier__min_samples_leaf :  3
randomforestclassifier__max_features :  7
randomforestclassifier__max_depth :  13
randomforestclassifier__criterion :  entropy
randomforestclassifier__class_weight :  balanced


# V. Generate predictions

In [21]:
# The test CSV sits in the same "data" directory as the training file.
folder_path = Path.cwd().parent / "data/"
file_path = folder_path / 'test.csv'

testing_data = pd.read_csv(file_path)

# Preview the first ten rows.
testing_data.head(n=10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [22]:
# Predict on the raw test frame with the retained search object; the result is
# wrapped in a DataFrame only for display.
predictions = best_grid.predict(testing_data)
pd.DataFrame(data=predictions)

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
413,0
414,1
415,0
416,0


In [23]:
# Two-column submission frame: the passenger id and its predicted 'Survived' value.
submission = pd.DataFrame(
    data={
        'PassengerId': testing_data['PassengerId'],
        'Survived': predictions,
    }
)
submission.head(n=10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [29]:
# Write the submission CSV into the "submissions" directory next to the working directory.
folder_path = Path.cwd().parent / "submissions/"
# Create the directory on first use so that to_csv does not fail on a missing folder.
folder_path.mkdir(parents=True, exist_ok=True)

# Name the file after the model that was actually selected (`best_model`), not
# after `model_name`, which only holds the last key visited by an earlier loop.
file_path = folder_path / f'titanic_submission_{best_model}_classifier_v1.csv'

# index=False: do not write the DataFrame's row index as an extra column.
submission.to_csv(file_path, index=False)