In [357]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder

from ydata_profiling import ProfileReport
import optuna

In [358]:
# ML Algorithms used 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, cross_val_predict, cross_validate
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, log_loss, matthews_corrcoef
from sklearn.svm import SVC

In [382]:
data = 'Titanic-Dataset.csv'

type_by_field = {
    'Sex': 'category',
    'Embarked': 'category',
}

df = pd.read_csv(data, dtype=type_by_field)

# Extract cabin info
df.Cabin = df.Cabin.str[0]
df.Cabin.fillna('Unknown', inplace=True)
df.Cabin = df.Cabin.astype('category')

df.Sex = (df.Sex == 'male').astype(int)

# Extract name information
pattern = r'\b(\w+)\.'
df['Title'] = df.Name.str.extract(pattern, expand=False).astype('category')
df['NameLength'] = df.Name.str.len().astype(int)

# Extract family size info
df['FamilySize'] = 1 + df.SibSp + df.Parch

# Age
df.Age.fillna(-10, inplace=True)
df.Age = df.Age.astype(int)
# Create age bins
bins = [-float('inf'), -1, 18, 35, 50, float('inf')]
labels = ['Unknown','<18', '18-35', '36-50', '>50']
df['AgeGroup'] = pd.cut(df.Age, bins=bins, labels=labels)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,NameLength,FamilySize,AgeGroup
0,1,0,3,"Braund, Mr. Owen Harris",1,22,1,0,A/5 21171,7.25,Unknown,S,Mr,23,2,18-35
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38,1,0,PC 17599,71.2833,C,C,Mrs,51,2,36-50
2,3,1,3,"Heikkinen, Miss. Laina",0,26,0,0,STON/O2. 3101282,7.925,Unknown,S,Miss,22,1,18-35
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35,1,0,113803,53.1,C,S,Mrs,44,2,18-35
4,5,0,3,"Allen, Mr. William Henry",1,35,0,0,373450,8.05,Unknown,S,Mr,24,1,18-35


# EDA

In [None]:
profile = ProfileReport(df, title="Titanic Datset EDA")
profile

In [383]:
labels = ['PassengerId','Age', 'Ticket', 'Name']
df.drop(labels, inplace=True, axis=1)

# Create an ordinal encoder instance
cols_to_encode = ['Cabin']
encoder = OrdinalEncoder(categories=[df.Cabin.cat.categories])
df[cols_to_encode] = encoder.fit_transform(df[cols_to_encode])
df = pd.get_dummies(df, columns=['AgeGroup', 'Embarked', 'Title'])

y = df['Survived'].ravel()
X = df.drop(['Survived'], axis=1)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=13, stratify=y)

In [384]:
random_seed = 42
np.random.seed(random_seed)

params = {
    'n_estimators': 100,
    'max_depth': 2,
    'min_samples_leaf': 10,
}

rf_model = RandomForestClassifier(**params)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8071748878923767

In [385]:
def train_model(model, X_train, y_train, n_folds: int = 5, seed: int = 42):
    random_seed = 42
    np.random.seed(random_seed)
    
    cv = StratifiedKFold(n_splits= n_folds, random_state=seed, shuffle=True)
    scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
    return scores

def get_ab_params(trial: optuna.Trial):
     return {
        'n_estimators': trial.suggest_int('n_estimators', 200, 600),
        'learning_rate' : trial.suggest_float('learning_rate', 0.3, 0.9),
    }

def get_rf_params(trial: optuna.Trial):
    return {
        'n_estimators': trial.suggest_int('n_estimators', 200, 400),
        'max_depth': trial.suggest_int('max_depth', 2, 5),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 3, 8),
        'max_features' : 'sqrt',
    }

def get_et_params(trial: optuna.Trial):
    return {
        'n_estimators': trial.suggest_int('n_estimators', 200, 400),
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 6),
    }

def get_gb_params(trial: optuna.Trial):
    return {
        'n_estimators': trial.suggest_int('n_estimators', 200, 400),
        'max_depth': trial.suggest_int('max_depth', 5, 10),
    }

def get_sv_params(trial: optuna.Trial):
    return {
        'kernel' : 'linear',
        'C' : trial.suggest_float('C', 0.01, 0.05),
    }

def build_objective(model_type, parameter_factory):
    def objective(trial: optuna.Trial) -> float:
        params = parameter_factory(trial)
        model = model_type(**params)
        scores = train_model(model, X_train, y_train)
        return scores.mean()
    return objective

models = [
    ('Ada Boost', AdaBoostClassifier, get_ab_params),
    ('Extra Trees', ExtraTreesClassifier, get_et_params),
    ('Gradient Boosting', GradientBoostingClassifier, get_rf_params),
    ('Random Forest', RandomForestClassifier, get_rf_params),
    ('Support Vector', SVC, get_sv_params),
]

best_models = []
for model_name, model_type, param_factory in models:

    OPTUNA_TRIALS = 50
    
    print(f'Optimising hyperparameters for {model_name}...')
    obj = build_objective(model_type, param_factory)
    study = optuna.create_study(study_name=f'{model_name} Hyperparameter search', direction='maximize')
    study.optimize(obj, n_trials=OPTUNA_TRIALS, show_progress_bar=True)
    best_params = study.best_params
    best_model = model_type(**best_params)
    np.random.seed(random_seed)
    estimator = best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    best_models.append((model_name, best_model, best_params, accuracy))

best_models

[I 2023-06-26 19:45:17,161] A new study created in memory with name: Ada Boost Hyperparameter search


Optimising hyperparameters for Ada Boost...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2023-06-26 19:45:21,643] Trial 0 finished with value: 0.8083828975423634 and parameters: {'n_estimators': 343, 'learning_rate': 0.7206483549243712}. Best is trial 0 with value: 0.8083828975423634.
[I 2023-06-26 19:45:23,839] Trial 1 finished with value: 0.8158904724497813 and parameters: {'n_estimators': 207, 'learning_rate': 0.43567092766061866}. Best is trial 1 with value: 0.8158904724497813.
[I 2023-06-26 19:45:27,900] Trial 2 finished with value: 0.8098754348557963 and parameters: {'n_estimators': 419, 'learning_rate': 0.5596197345487345}. Best is trial 1 with value: 0.8158904724497813.
[I 2023-06-26 19:45:29,922] Trial 3 finished with value: 0.8128605094826618 and parameters: {'n_estimators': 353, 'learning_rate': 0.6474628448344659}. Best is trial 1 with value: 0.8158904724497813.
[I 2023-06-26 19:45:31,534] Trial 4 finished with value: 0.812894175737852 and parameters: {'n_estimators': 285, 'learning_rate': 0.5820824335392594}. Best is trial 1 with value: 0.8158904724497813.


[I 2023-06-26 19:46:51,485] A new study created in memory with name: Extra Trees Hyperparameter search


Optimising hyperparameters for Extra Trees...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2023-06-26 19:46:52,383] Trial 0 finished with value: 0.8174278981034677 and parameters: {'n_estimators': 236, 'max_depth': 5, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.8174278981034677.
[I 2023-06-26 19:46:53,657] Trial 1 finished with value: 0.8294243070362473 and parameters: {'n_estimators': 373, 'max_depth': 6, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.8294243070362473.
[I 2023-06-26 19:46:55,226] Trial 2 finished with value: 0.8309168443496802 and parameters: {'n_estimators': 394, 'max_depth': 10, 'min_samples_leaf': 5}. Best is trial 2 with value: 0.8309168443496802.
[I 2023-06-26 19:46:56,164] Trial 3 finished with value: 0.8323981595780496 and parameters: {'n_estimators': 251, 'max_depth': 10, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.8323981595780496.
[I 2023-06-26 19:46:56,955] Trial 4 finished with value: 0.8324206037481764 and parameters: {'n_estimators': 211, 'max_depth': 7, 'min_samples_leaf': 3}. Best is trial 4 with value: 0.832420

[I 2023-06-26 19:47:48,083] A new study created in memory with name: Gradient Boosting Hyperparameter search


Optimising hyperparameters for Gradient Boosting...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2023-06-26 19:47:48,649] Trial 0 finished with value: 0.8309056222646168 and parameters: {'n_estimators': 264, 'max_depth': 2, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.8309056222646168.
[I 2023-06-26 19:47:49,234] Trial 1 finished with value: 0.8173942318482774 and parameters: {'n_estimators': 214, 'max_depth': 3, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.8309056222646168.
[I 2023-06-26 19:47:50,017] Trial 2 finished with value: 0.826394344069128 and parameters: {'n_estimators': 335, 'max_depth': 3, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.8309056222646168.
[I 2023-06-26 19:47:50,995] Trial 3 finished with value: 0.8204129727303332 and parameters: {'n_estimators': 377, 'max_depth': 3, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.8309056222646168.
[I 2023-06-26 19:47:51,632] Trial 4 finished with value: 0.8233980473571989 and parameters: {'n_estimators': 201, 'max_depth': 5, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.830905622

[I 2023-06-26 19:48:22,300] A new study created in memory with name: Random Forest Hyperparameter search


Optimising hyperparameters for Random Forest...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2023-06-26 19:48:23,960] Trial 0 finished with value: 0.7949837279766581 and parameters: {'n_estimators': 350, 'max_depth': 2, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.7949837279766581.
[I 2023-06-26 19:48:24,916] Trial 1 finished with value: 0.7860172820109976 and parameters: {'n_estimators': 200, 'max_depth': 2, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.7949837279766581.
[I 2023-06-26 19:48:26,028] Trial 2 finished with value: 0.8248793625855683 and parameters: {'n_estimators': 217, 'max_depth': 5, 'min_samples_leaf': 7}. Best is trial 2 with value: 0.8248793625855683.
[I 2023-06-26 19:48:27,640] Trial 3 finished with value: 0.8084502300527439 and parameters: {'n_estimators': 343, 'max_depth': 3, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.8248793625855683.
[I 2023-06-26 19:48:29,464] Trial 4 finished with value: 0.8233868252721355 and parameters: {'n_estimators': 381, 'max_depth': 4, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.82487936

[I 2023-06-26 19:49:43,668] A new study created in memory with name: Support Vector Hyperparameter search


Optimising hyperparameters for Support Vector...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2023-06-26 19:49:43,899] Trial 0 finished with value: 0.8129390640781058 and parameters: {'C': 0.03722386400030778}. Best is trial 0 with value: 0.8129390640781058.
[I 2023-06-26 19:49:44,005] Trial 1 finished with value: 0.8099315452811131 and parameters: {'C': 0.02331241513494027}. Best is trial 0 with value: 0.8129390640781058.
[I 2023-06-26 19:49:44,091] Trial 2 finished with value: 0.808427785882617 and parameters: {'C': 0.018536295924339585}. Best is trial 0 with value: 0.8129390640781058.
[I 2023-06-26 19:49:44,160] Trial 3 finished with value: 0.8069240264841208 and parameters: {'C': 0.013869009650125701}. Best is trial 0 with value: 0.8129390640781058.
[I 2023-06-26 19:49:44,358] Trial 4 finished with value: 0.8114353046796096 and parameters: {'C': 0.03234190234625574}. Best is trial 0 with value: 0.8129390640781058.
[I 2023-06-26 19:49:44,466] Trial 5 finished with value: 0.8084390079676803 and parameters: {'C': 0.019417295487778846}. Best is trial 0 with value: 0.81293906

[('Ada Boost',
  AdaBoostClassifier(learning_rate=0.6232070096819665, n_estimators=221),
  {'n_estimators': 221, 'learning_rate': 0.6232070096819665},
  0.8071748878923767),
 ('Extra Trees',
  ExtraTreesClassifier(max_depth=9, min_samples_leaf=3, n_estimators=247),
  {'n_estimators': 247, 'max_depth': 9, 'min_samples_leaf': 3},
  0.8116591928251121),
 ('Gradient Boosting',
  GradientBoostingClassifier(max_depth=2, min_samples_leaf=4, n_estimators=244),
  {'n_estimators': 244, 'max_depth': 2, 'min_samples_leaf': 4},
  0.8161434977578476),
 ('Random Forest',
  RandomForestClassifier(max_depth=5, min_samples_leaf=3, n_estimators=365),
  {'n_estimators': 365, 'max_depth': 5, 'min_samples_leaf': 3},
  0.8161434977578476),
 ('Support Vector',
  SVC(C=0.04060697358950099),
  {'C': 0.04060697358950099},
  0.6636771300448431)]

In [386]:
accuracy_by_model = {model_name: [accuracy] for model_name, best_model, best_params, accuracy in best_models}
pd.DataFrame(data=accuracy_by_model, index=['Accuracy']).T

Unnamed: 0,Accuracy
Ada Boost,0.807175
Extra Trees,0.811659
Gradient Boosting,0.816143
Random Forest,0.816143
Support Vector,0.663677


In [388]:
def get_ensemble_params(trial: optuna.Trial):
    return {
        'n_estimators': trial.suggest_int('n_estimators', 200, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
    }

train_predictions = []
test_predictions = []
for model_name, best_model, best_params, accuracy in best_models:
    y_pred_train = best_model.predict(X_train)
    y_pred_test = best_model.predict(X_test)
    train_predictions.append(y_pred_train)
    test_predictions.append(y_pred_test)

x_train_ensemble = np.column_stack(train_predictions)
x_test_ensemble = np.column_stack(test_predictions)


obj = build_objective(GradientBoostingClassifier, get_ensemble_params)
study = optuna.create_study(study_name=f'Ensemble Hyperparameter search', direction='maximize')
study.optimize(obj, n_trials=OPTUNA_TRIALS, show_progress_bar=True)

best_params = study.best_params
best_model = GradientBoostingClassifier(**best_params)
np.random.seed(random_seed)
estimator = best_model.fit(x_train_ensemble, y_train)
y_pred = best_model.predict(x_test_ensemble)
accuracy = accuracy_score(y_test, y_pred)
accuracy

[I 2023-06-26 19:54:23,656] A new study created in memory with name: Ensemble Hyperparameter search


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2023-06-26 19:54:25,197] Trial 0 finished with value: 0.7994501178318931 and parameters: {'n_estimators': 245, 'max_depth': 5}. Best is trial 0 with value: 0.7994501178318931.
[I 2023-06-26 19:54:26,207] Trial 1 finished with value: 0.8174054539333409 and parameters: {'n_estimators': 270, 'max_depth': 3}. Best is trial 1 with value: 0.8174054539333409.
[I 2023-06-26 19:54:27,833] Trial 2 finished with value: 0.8024576366288857 and parameters: {'n_estimators': 240, 'max_depth': 5}. Best is trial 1 with value: 0.8174054539333409.
[I 2023-06-26 19:54:29,028] Trial 3 finished with value: 0.8129278419930424 and parameters: {'n_estimators': 244, 'max_depth': 4}. Best is trial 1 with value: 0.8174054539333409.
[I 2023-06-26 19:54:30,289] Trial 4 finished with value: 0.8129166199079789 and parameters: {'n_estimators': 247, 'max_depth': 4}. Best is trial 1 with value: 0.8174054539333409.
[I 2023-06-26 19:54:32,484] Trial 5 finished with value: 0.8024015262035686 and parameters: {'n_estimator

0.8116591928251121