# Competition: Titanic - Machine Learning from Disaster
URL: https://www.kaggle.com/competitions/titanic
# Model Training

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedKFold
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import accuracy_score
import os

----------
-----------
----------
----------

## Extracting Data

In [2]:
os.chdir("../training")
data_path = "../data/processed/"

In [3]:
train = pd.read_csv(data_path + "train.csv")
test = pd.read_csv(data_path + "test.csv")

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Pclass        891 non-null    float64
 1   Sex           891 non-null    float64
 2   Age           891 non-null    float64
 3   SibSp         891 non-null    float64
 4   Parch         891 non-null    float64
 5   Fare          891 non-null    float64
 6   FamilySize    891 non-null    float64
 7   Pc1           891 non-null    float64
 8   Pc2           891 non-null    float64
 9   Pc3           891 non-null    float64
 10  em1           891 non-null    float64
 11  em2           891 non-null    float64
 12  em3           891 non-null    float64
 13  PassengerId   891 non-null    int64  
 14  Survived      891 non-null    int64  
 15  Name          891 non-null    object 
 16  Ticket        891 non-null    object 
 17  Embarked      891 non-null    object 
 18  Title         891 non-null    

In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Pclass        418 non-null    float64
 1   Sex           418 non-null    float64
 2   Age           418 non-null    float64
 3   SibSp         418 non-null    float64
 4   Parch         418 non-null    float64
 5   Fare          418 non-null    float64
 6   FamilySize    418 non-null    float64
 7   Pc1           418 non-null    float64
 8   Pc2           418 non-null    float64
 9   Pc3           418 non-null    float64
 10  em1           418 non-null    float64
 11  em2           418 non-null    float64
 12  em3           418 non-null    float64
 13  PassengerId   418 non-null    int64  
 14  Name          418 non-null    object 
 15  Ticket        418 non-null    object 
 16  Embarked      418 non-null    object 
 17  Title         418 non-null    object 
 18  Age_tb        418 non-null    

----------
-----------
----------
-----------

## Model 1: Random Forest

In [6]:
columns = ['Sex','Age','SibSp','Parch','Fare','FamilySize','Pc1','Pc2','Pc3','em1','em2','em3','Age_tb','FareBin_Code','AgeBin_Code']

train_in = train[columns].copy()
test_in = test[columns].copy()
train_out = train[['Survived']].copy()

**Data Normalization:**

Normalizing the data based on the train set

In [7]:
cols_exc = [] 

train_max = train_in.drop(columns=cols_exc).max()
train_min = train_in.drop(columns=cols_exc).min()

A1 = (train_min - train_in.drop(columns=cols_exc))/(train_min - train_max)
A2 = (train_min - test_in.drop(columns=cols_exc))/(train_min - train_max)

train_in_nor = A1.join(train_in[cols_exc])
test_in_nor = A2.join(test_in[cols_exc])

In [8]:
train_in_nor.describe().round(2)

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,FamilySize,Pc1,Pc2,Pc3,em1,em2,em3,Age_tb,FareBin_Code,AgeBin_Code
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.35,0.37,0.07,0.06,0.06,0.09,0.24,0.21,0.55,0.19,0.09,0.72,0.37,0.5,0.5
std,0.48,0.17,0.14,0.13,0.1,0.16,0.43,0.41,0.5,0.39,0.28,0.45,0.17,0.35,0.38
min,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
25%,-0.0,0.27,-0.0,-0.0,0.02,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.27,0.25,0.33
50%,0.0,0.36,0.0,0.0,0.03,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.37,0.5,0.33
75%,1.0,0.45,0.12,-0.0,0.06,0.1,-0.0,-0.0,1.0,-0.0,-0.0,1.0,0.43,0.75,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
test_in_nor.describe().round(2)

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,FamilySize,Pc1,Pc2,Pc3,em1,em2,em3,Age_tb,FareBin_Code,AgeBin_Code
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,0.36,0.38,0.06,0.07,0.07,0.08,0.26,0.22,0.52,0.24,0.11,0.65,0.37,0.49,0.5
std,0.48,0.16,0.11,0.16,0.11,0.15,0.44,0.42,0.5,0.43,0.31,0.48,0.16,0.36,0.38
min,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
25%,0.0,0.27,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27,0.25,0.33
50%,-0.0,0.35,-0.0,-0.0,0.03,-0.0,-0.0,-0.0,1.0,-0.0,-0.0,1.0,0.37,0.5,0.33
75%,1.0,0.45,0.12,-0.0,0.06,0.1,1.0,-0.0,1.0,-0.0,-0.0,1.0,0.44,0.75,1.0
max,1.0,0.95,1.0,1.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.95,1.0,1.0


**Developing a Random Forest model:**

In [11]:
model_rf = RandomForestClassifier(n_estimators=200)
param = {'max_depth'    : Integer(1, 30),
         'max_features' : Integer(1, 15),
         'max_samples'  : [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.999]}
cv = RepeatedKFold(n_repeats=1, n_splits=5, random_state=1)
b_search = BayesSearchCV(model_rf, param, cv=cv, return_train_score=True, n_jobs=-1, verbose=0)
b_search.fit(train_in_nor, train_out.values.ravel())



BayesSearchCV(cv=RepeatedKFold(n_repeats=1, n_splits=5, random_state=1),
              estimator=RandomForestClassifier(n_estimators=200), n_jobs=-1,
              return_train_score=True,
              search_spaces={'max_depth': Integer(low=1, high=30, prior='uniform', transform='normalize'),
                             'max_features': Integer(low=1, high=15, prior='uniform', transform='normalize'),
                             'max_samples': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
                                             0.8, 0.9, 0.999]})

In [12]:
print(f"Mean train score = {b_search.cv_results_['mean_train_score'][b_search.best_index_].round(4)}")
print(f"Mean test score = {b_search.cv_results_['mean_test_score'][b_search.best_index_].round(4)}")

Mean train score = 0.9234
Mean test score = 0.8362


In [13]:
b_search.best_params_

OrderedDict([('max_depth', 8), ('max_features', 9), ('max_samples', 0.7)])

In [15]:
model_rf1 = RandomForestClassifier(max_depth=b_search.best_params_['max_depth'],
                                         max_features=b_search.best_params_['max_features'],
                                         max_samples=b_search.best_params_['max_samples'],
                                         random_state=1)
model_rf1.fit(train_in, train_out.values.ravel())

RandomForestClassifier(max_depth=8, max_features=9, max_samples=0.7,
                       random_state=1)

Checking how many predicted labels are different when using model trained with CV and model trained over full train set:

In [17]:
compare = pd.DataFrame([b_search.predict(test_in), model_rf1.predict(test_in)]).T
pd.DataFrame([compare[0] == compare[1]]).T.value_counts()

True     356
False     62
dtype: int64

In [20]:
os.chdir("../training")
data_path = "../data/raw/"

submission = pd.read_csv(data_path + "gender_submission.csv")
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [21]:
predictions = pd.DataFrame([test['PassengerId'], model_rf1.predict(test_in)]).T
predictions.columns = ['PassengerId','Survived']
predictions

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [23]:
os.chdir("../training")
data_path = "../data/prediction/"

predictions.to_csv(data_path + 'submission1.csv', index=False)

#### CV Train Accuracy: 0.923 / CV Val. Accuracy: 0.836 / Test Accuracy (website): 0.77751

------
--------
--------
--------