# Dataset Preparation


### Setup

In [98]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import numpy as np
import warnings

warnings.filterwarnings("ignore")
seed = 1855

## Data Collection


In [99]:
dataset_name = 'titanic'
dataset = pd.DataFrame(sns.load_dataset(dataset_name))

dataset.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Handling missing values


In [100]:
from pandas.api.types import is_numeric_dtype 

dataset = dataset.apply(lambda x: x.fillna(x.median()) if is_numeric_dtype(x) else x.fillna(x.mode()[0]))

dataset.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,C,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,C,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,C,Southampton,no,True


## Encoding categorical features with One-Hot Encoding


In [101]:
binary_columns = [col for col in dataset.columns if len(dataset[col].unique()) == 2]
categorical_columns = [col for col in dataset.columns if not is_numeric_dtype(dataset[col])]
categorical_columns = list(set(categorical_columns) - set(binary_columns)) # removing binary columns

dataset = pd.get_dummies(dataset, columns=categorical_columns, drop_first=True)

dataset.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,adult_male,alive,alone,...,deck_F,deck_G,who_man,who_woman,class_Second,class_Third,embarked_Q,embarked_S,embark_town_Queenstown,embark_town_Southampton
0,0,3,male,22.0,1,0,7.25,True,no,False,...,False,False,True,False,False,True,False,True,False,True
1,1,1,female,38.0,1,0,71.2833,False,yes,False,...,False,False,False,True,False,False,False,False,False,False
2,1,3,female,26.0,0,0,7.925,False,yes,True,...,False,False,False,True,False,True,False,True,False,True
3,1,1,female,35.0,1,0,53.1,False,yes,False,...,False,False,False,True,False,False,False,True,False,True
4,0,3,male,35.0,0,0,8.05,True,no,True,...,False,False,True,False,False,True,False,True,False,True


## Encoding binary class label


In [102]:
binary_columns = [col for col in dataset.columns if len(dataset[col].unique()) == 2]

for col in binary_columns:
    dataset[col] = dataset[col].map({dataset[col].unique()[0]: -1, dataset[col].unique()[1]: 1})
    
    
# moving the target column to the last position for convience

target_column = 'survived'
dataset = dataset[[col for col in dataset.columns if col != target_column] + [target_column]]


# removing the 'alive' column as it is the same as the target column

dataset = dataset.drop('alive', axis=1)



dataset.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,adult_male,alone,deck_B,deck_C,...,deck_G,who_man,who_woman,class_Second,class_Third,embarked_Q,embarked_S,embark_town_Queenstown,embark_town_Southampton,survived
0,3,-1,22.0,1,0,7.25,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,1,1,38.0,1,0,71.2833,1,-1,-1,-1,...,-1,1,1,-1,1,-1,1,-1,1,1
2,3,1,26.0,0,0,7.925,1,1,-1,-1,...,-1,1,1,-1,-1,-1,-1,-1,-1,1
3,1,1,35.0,1,0,53.1,1,-1,-1,-1,...,-1,1,1,-1,1,-1,-1,-1,-1,1
4,3,-1,35.0,0,0,8.05,-1,1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


# Model Building


In [103]:
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.ensemble import *
from sklearn.svm import *

## Splitting the dataset into training and testing sets


In [104]:
x = dataset.drop('survived', axis=1)

y = dataset['survived']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)

## Evaluation function


In [105]:
def evaluate(expected, predicted):
  results = {
    'accuracy': accuracy_score(expected, predicted),
    'precision': precision_score(expected, predicted),
    'recall': recall_score(expected, predicted),
    'f1': f1_score(expected, predicted),
    'roc_auc': roc_auc_score(expected, predicted)
  }
  
  for metric, score in results.items():
    print(f'{metric}: {score:.2f}')

## Model Building


In [106]:
model = GradientBoostingClassifier()

## Model Evaluation


### Cross Validation

In [107]:
results = cross_validate(model, x_train, y_train, cv= 10, scoring=['accuracy', 'precision', 'recall', 'f1'])

for metric, scores in results.items():
  if metric != 'fit_time' and metric != 'score_time':
    print(f'{metric}: {np.mean(scores):.2f} +- {np.std(scores):.2f}')
  

test_accuracy: 0.83 +- 0.04
test_precision: 0.84 +- 0.06
test_recall: 0.68 +- 0.09
test_f1: 0.75 +- 0.06


### Cross Validation with KFold

In [108]:
k_fold = KFold(n_splits=10, shuffle=True, random_state=seed)

results = cross_validate(model, x_train, y_train, cv= 10, scoring=['accuracy', 'precision', 'recall', 'f1'])

for metric, scores in results.items():
  if metric != 'fit_time' and metric != 'score_time':
    print(f'{metric}: {np.mean(scores):.2f} +- {np.std(scores):.2f}')

test_accuracy: 0.82 +- 0.04
test_precision: 0.83 +- 0.05
test_recall: 0.68 +- 0.10
test_f1: 0.74 +- 0.07


### Cross Validation with StratifiedKFold

In [109]:
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

results = cross_validate(model, x_train, y_train, cv= 10, scoring=['accuracy', 'precision', 'recall', 'f1'])


for metric, scores in results.items():
  if metric != 'fit_time' and metric != 'score_time':
    print(f'{metric}: {np.mean(scores):.2f} +- {np.std(scores):.2f}')

test_accuracy: 0.82 +- 0.03
test_precision: 0.83 +- 0.05
test_recall: 0.68 +- 0.09
test_f1: 0.74 +- 0.06


### Model Performance

In [110]:
model = GradientBoostingClassifier()
model.fit(x_train, y_train)
predicted = model.predict(x_test)

evaluate(y_test, predicted)

accuracy: 0.85
precision: 0.83
recall: 0.77
f1: 0.80
roc_auc: 0.84


## Model Building with Hyperparameter Tuning


In [111]:
# GradientBoostingClassifier is already a perfect model for this dataset. But we can try to tinkering with the hyperparameters to see if we can change the results.

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
params  = {
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 5, 7],
}

grid_search = GridSearchCV(GradientBoostingClassifier(), param_grid=params, cv=10, scoring='accuracy')

grid_search.fit(x_train, y_train)


## Model Evaluation


In [112]:
best_model = grid_search.best_estimator_

predicted = best_model.predict(x_test)

evaluate(y_test, predicted)

accuracy: 0.85
precision: 0.84
recall: 0.76
f1: 0.80
roc_auc: 0.83


# Comparing Different Models Performance

## Model Building

In [113]:
models = {
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVC': SVC(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
}

## Model Comparison

In [114]:
results = {}

for name, model in models.items():
    results[name] = cross_val_score(model, x_train, y_train, cv=k_fold, scoring='accuracy')


result_df = pd.DataFrame(results).transpose()

result_df['mean'] = result_df.mean(axis=1)
result_df['std'] = result_df.std(axis=1)
result_df = result_df.sort_values(by=['mean', 'std'], ascending=False)
result_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,mean,std
GradientBoostingClassifier,0.833333,0.805556,0.816901,0.802817,0.830986,0.788732,0.774648,0.859155,0.802817,0.84507,0.816002,0.024786
AdaBoostClassifier,0.875,0.777778,0.816901,0.788732,0.788732,0.816901,0.746479,0.816901,0.788732,0.84507,0.806123,0.034464
RandomForestClassifier,0.847222,0.819444,0.774648,0.788732,0.760563,0.774648,0.71831,0.816901,0.816901,0.816901,0.793427,0.0356
DecisionTreeClassifier,0.777778,0.777778,0.732394,0.774648,0.774648,0.788732,0.746479,0.84507,0.802817,0.84507,0.786541,0.034777
KNeighborsClassifier,0.847222,0.694444,0.746479,0.746479,0.774648,0.71831,0.704225,0.732394,0.788732,0.732394,0.748533,0.042859
SVC,0.680556,0.722222,0.633803,0.704225,0.704225,0.661972,0.661972,0.661972,0.661972,0.661972,0.675489,0.02552
