In [1]:
# load google drive and move to the current directory
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/My Drive/Titanic

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/Titanic


# Taitanic Data Dictionary

|**Variable**	|**Definition**	|**Key**|
|:------------|---------------|-------:|
|survival| Survival|	0 = No, 1 = Yes|
|pclass|	Ticket class|	1 = 1st, 2 = 2nd, 3 = 3rd|
|sex|	Sex	|
|Age|	Age in years|	
|sibsp|	# of siblings / spouses aboard the  Titanic|	
|parch|	# of parents / children aboard the Titanic|	
|ticket|	Ticket number|	
|fare|	Passenger fare|	
|cabin	|Cabin number|	
|embarked|	Port of Embarkation|	C = Cherbourg, Q = Queenstown, S = Southampton

In [2]:
# upgrade scikit learn version
# !pip install scikit-learn==0.24.2

In [3]:
# import dependencies
import pandas as pd
import numpy as np
import sklearn

In [4]:
# load the dataset
data = pd.read_csv('titanic.csv')

data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# get the descriptives of the data
data.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Duff Gordon, Lady. (Lucille Christiana Sutherl...",male,,,,1601.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


There are 891 passengers in the dataset. The `Age` and `Embarked` features have a few missing values, the `Cabin` feature has a lot of missing values.

# Handling Missing Values in Selected Features
### Age Feature

In [6]:
# check the dataype of all columns
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [7]:
# check the number of missing values in the age feature
data['Age'].isna().sum()

177

In [8]:
# fill the Age column with the average age in the dataset, round to nearest integer
data['Age'] = round(data['Age'].fillna(value = data['Age'].mean()).astype(int))

# check the data type
print('Age now has', data['Age'].dtype, 'data type \n')

# check the descriptives of Age
data['Age'].describe()

Age now has int64 data type 



count    891.000000
mean      29.544332
std       13.013778
min        0.000000
25%       22.000000
50%       29.000000
75%       35.000000
max       80.000000
Name: Age, dtype: float64

### Handle Missing Values for Embarked Column

In [9]:
# check the unique values in this object column
data['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [10]:
# Check the number of missing values
data['Embarked'].isna().sum()

2

In [11]:
# check these missing embarked rows
data[data['Embarked'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80.0,B28,


These rows have no other missing values in any of its columns, therefore we'll replace the Null values with the preceeding values.

In [12]:
# fill the Embarked column with the preceeding values
data['Embarked'] = data['Embarked'].fillna(method='ffill')

In [13]:
# check the unique values in this object column
data['Embarked'].value_counts()

S    644
C    169
Q     78
Name: Embarked, dtype: int64

### Handle Missing Values for Cabin Column

In [14]:
# check unique values counts
data['Cabin'].value_counts()

B96 B98        4
G6             4
C23 C25 C27    4
E101           3
F2             3
              ..
E38            1
E49            1
D9             1
E12            1
C103           1
Name: Cabin, Length: 147, dtype: int64

In [15]:
# check number of missing values
data['Cabin'].isna().sum()

687

The outputs shows that out of 891 rows in the column, 687 are missing while of the 204 non-missing values, 147 are unique values. Therefore, we'll drop this column because it does not carry any significant information

In [16]:
# drop the cabin column
data.drop('Cabin', axis=1, inplace=True)

# check
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,S


In [17]:
# Recheck the descriptives
data.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
count,891.0,891.0,891.0,891,891,891.0,891.0,891.0,891.0,891.0,891
unique,,,,891,2,,,,681.0,,3
top,,,,"Duff Gordon, Lady. (Lucille Christiana Sutherl...",male,,,,1601.0,,S
freq,,,,1,577,,,,7.0,,644
mean,446.0,0.383838,2.308642,,,29.544332,0.523008,0.381594,,32.204208,
std,257.353842,0.486592,0.836071,,,13.013778,1.102743,0.806057,,49.693429,
min,1.0,0.0,1.0,,,0.0,0.0,0.0,,0.0,
25%,223.5,0.0,2.0,,,22.0,0.0,0.0,,7.9104,
50%,446.0,0.0,3.0,,,29.0,0.0,0.0,,14.4542,
75%,668.5,1.0,3.0,,,35.0,1.0,0.0,,31.0,


Next, we'll drop the name, ticket and PassengerID columns  
Name and PassengerID are unique identifiers while the Ticket is a unique ID for passenger tickets

In [18]:
# Drop Name, Ticket and PassengerID columns
data.drop(['Name', 'PassengerId', 'Ticket'], axis=1, inplace=True)

# check
data.describe(include='all')

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891,891.0,891.0,891.0,891.0,891
unique,,,2,,,,,3
top,,,male,,,,,S
freq,,,577,,,,,644
mean,0.383838,2.308642,,29.544332,0.523008,0.381594,32.204208,
std,0.486592,0.836071,,13.013778,1.102743,0.806057,49.693429,
min,0.0,1.0,,0.0,0.0,0.0,0.0,
25%,0.0,2.0,,22.0,0.0,0.0,7.9104,
50%,0.0,3.0,,29.0,0.0,0.0,14.4542,
75%,1.0,3.0,,35.0,1.0,0.0,31.0,


# Preprocessing

In [19]:
# transform the survival class
from sklearn.preprocessing import LabelEncoder

# transform the output variable
label_enc = LabelEncoder()
label_enc.fit(data['Survived'])

label_enc

LabelEncoder()

In [20]:
Y = label_enc.transform(data['Survived'])

Y

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [21]:
data['Survived'] = Y
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22,1,0,7.25,S
1,1,1,female,38,1,0,71.2833,C
2,1,3,female,26,0,0,7.925,S
3,1,1,female,35,1,0,53.1,S
4,0,3,male,35,0,0,8.05,S


In [22]:
# convert the sex column to a binary column
data['Sex'] = pd.get_dummies(data['Sex'], drop_first=True, prefix='sex')

# check
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22,1,0,7.25,S
1,1,1,0,38,1,0,71.2833,C
2,1,3,0,26,0,0,7.925,S
3,1,1,0,35,1,0,53.1,S
4,0,3,1,35,0,0,8.05,S


from the above data, The sex column shows that `1 = male` and `0 = female`

Let's encode the `Embarked` column using label encoder

In [23]:
label_enc_2 = LabelEncoder()

label_enc_2.fit(data['Embarked'])

label_enc

LabelEncoder()

In [24]:
data['Embarked'] = label_enc_2.transform(data['Embarked'])

data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22,1,0,7.25,2
1,1,1,0,38,1,0,71.2833,0
2,1,3,0,26,0,0,7.925,2
3,1,1,0,35,1,0,53.1,2
4,0,3,1,35,0,0,8.05,2


In [25]:
# save the cleaned data
data.to_csv('titanic_clean.csv', index=False)

In [26]:
# #install latest version of pandas profiling
# !pip install pandas-profiling==3.0.0

In [27]:
# import pandas_profiling 
# #create a pandas profile report for the dataset
# profile = pandas_profiling.ProfileReport(data)

# #save the report in a html document
# profile.to_file('titanic_survivors_EDA.html')

In [28]:
X = data.drop('Survived', axis=1)

Y = data['Survived']

print(X.shape); Y.shape

(891, 7)


(891,)

From the above output, `S = 2`, `C = 0` and `Q = 1`

# Standardize and Split Data Into train and Test sets

In [29]:
# import package for splitting data
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV

# import joblib for saving and loading binary files
from  joblib import dump
from joblib import load

# import standard scalar
from sklearn.preprocessing import StandardScaler

# Scale the entire data
scaler = StandardScaler()

# fit the scaler
scaler.fit(X)

# save the scaler
dump(scaler, 'scaler.joblib')

['scaler.joblib']

In [30]:
# check the mean and standard deviation of the scaler
print(scaler.mean_, '\n\n', scaler.scale_)

[ 2.30864198  0.64758698 29.54433221  0.52300786  0.38159371 32.20420797
  1.53310887] 

 [ 0.83560193  0.47772176 13.00647335  1.10212444  0.80560476 49.66553444
  0.79262403]


In [31]:
# transform the data
trans_data = scaler.transform(X)

In [32]:
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(trans_data, Y, test_size=0.1, random_state=33)

# check
print(X_train.shape, '\n', X_test.shape, '\n', y_train.shape, '\n', y_test.shape)

(801, 7) 
 (90, 7) 
 (801,) 
 (90,)


# Modelling

### XGBoost

In [33]:
# install latest version of XGBoost
# !pip install xgboost==1.4.2

In [34]:
# get the ratio of classes in the default column
from collections import Counter

counter = Counter(Y)

estimate = counter[0] / counter[1]
estimate

1.605263157894737

In [35]:
import xgboost as xgb
xgb_model= xgb.XGBClassifier(scale_pos_weight=round(estimate, 1))

xgb_model.fit(X_train, y_train)

xgb_pred = xgb_model.predict(X_test)

xgb_acc = accuracy_score(y_test, xgb_pred)
xgb_f1 = f1_score(y_test, xgb_pred)
xgb_roc_auc = roc_auc_score(y_test, xgb_pred)


print(f'Accuracy score of XGB is {xgb_acc}', '\n')
print(f'ROC_AUC score of XGB is {xgb_roc_auc}', '\n')
print(f'The f1 score XGB is {xgb_f1}', '\n')
print('The confusion matrix of XGB is:', '\n', f'{confusion_matrix(y_test, xgb_pred)}',)

Accuracy score of XGB is 0.7555555555555555 

ROC_AUC score of XGB is 0.7459514170040485 

The f1 score XGB is 0.7027027027027027 

The confusion matrix of XGB is: 
 [[42 10]
 [12 26]]


# SVC

In [36]:
# import the SVM classifier
from sklearn.svm import SVC

# instantiate the SVC classifier with a hyperparameter to account for imbalanced classes
svc = SVC(class_weight='balanced', kernel='poly', degree=8, coef0=5.0)

# fit the model on the train data
svc.fit(X_train, y_train)
# predict on the test set
svc_pred = svc.predict(X_test)

svc_acc = accuracy_score(y_test, svc_pred)
svc_f1 = f1_score(y_test, svc_pred)
svc_roc_auc = roc_auc_score(y_test, svc_pred)


print(f'Accuracy score of SVC is {svc_acc}', '\n')
print(f'ROC_AUC score of SVC is {svc_roc_auc}', '\n')
print(f'The f1 score SVC is {svc_f1}', '\n')
print('The confusion matrix of SVC is:', '\n', f'{confusion_matrix(y_test, svc_pred)}',)

Accuracy score of SVC is 0.8222222222222222 

ROC_AUC score of SVC is 0.8071862348178137 

The f1 score SVC is 0.7714285714285714 

The confusion matrix of SVC is: 
 [[47  5]
 [11 27]]


#KNN

In [37]:
# Import the KNN classifier
from sklearn.neighbors import  KNeighborsClassifier

# instantiate the KNN classifier with the resultant hyperparameters
knn = KNeighborsClassifier(algorithm='brute', n_neighbors=2, weights='distance', n_jobs=-1, p=1)

# fit the model on the train data
knn.fit(X_train, y_train)

# predict on the test set
knn_pred = knn.predict(X_test)

knn_acc = accuracy_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred)
knn_roc_auc = roc_auc_score(y_test, knn_pred)


print(f'Accuracy score of KNN is {knn_acc}', '\n')
print(f'ROC_AUC score of KNN is {knn_roc_auc}', '\n')
print(f'The f1 score KNN is {knn_f1}', '\n')
print('The confusion matrix of KNN is:', '\n', f'{confusion_matrix(y_test, knn_pred)}',)

Accuracy score of KNN is 0.8444444444444444 

ROC_AUC score of KNN is 0.8299595141700407 

The f1 score KNN is 0.7999999999999999 

The confusion matrix of KNN is: 
 [[48  4]
 [10 28]]


#Logistic Regression

In [38]:
# import logistic regression classifier
from sklearn.linear_model import LogisticRegression

# instantiate the LR with the resultant hyperparameters and a balanced class weight to account for class imbalance
lr = LogisticRegression(solver='liblinear', class_weight='balanced')

# fit the model on the train data
lr.fit(X_train, y_train)

# predict on the test set
lr_pred = lr.predict(X_test)

lr_acc = accuracy_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred)
lr_roc_auc = roc_auc_score(y_test, lr_pred)

print(f'Accuracy score of Logistic Regression is {lr_acc}', '\n')
print(f'ROC_AUC score of Logistic Regression is {lr_roc_auc}', '\n')
print(f'The f1 score Logistic Regression is {lr_f1}', '\n')
print('The confusion matrix of Logistic Regression is:', '\n', f'{confusion_matrix(y_test, lr_pred)}',)

Accuracy score of Logistic Regression is 0.7555555555555555 

ROC_AUC score of Logistic Regression is 0.7388663967611336 

The f1 score Logistic Regression is 0.6857142857142857 

The confusion matrix of Logistic Regression is: 
 [[44  8]
 [14 24]]


# Decision Tree Classifier

In [39]:
# import decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# instantiate the DT classifier that accounts for class imbalance
dtc = DecisionTreeClassifier(class_weight='balanced', ccp_alpha=0.01)

# fit the model on the train data
dtc.fit(X_train, y_train)

# predict on the test set
dtc_pred = dtc.predict(X_test)

dtc_acc = accuracy_score(y_test, dtc_pred)
dtc_f1 = f1_score(y_test, dtc_pred)
dtc_roc_auc = roc_auc_score(y_test, dtc_pred)

print(f'Accuracy score of DTC is {dtc_acc}', '\n')
print(f'ROC_AUC score of DTC is {dtc_roc_auc}', '\n')
print(f'The f1 score DTC is {dtc_f1}', '\n')
print('The confusion matrix of DTC is:', '\n', f'{confusion_matrix(y_test, dtc_pred)}')

Accuracy score of DTC is 0.8111111111111111 

ROC_AUC score of DTC is 0.7940283400809717 

The f1 score DTC is 0.7536231884057972 

The confusion matrix of DTC is: 
 [[47  5]
 [12 26]]


# Random Forest Classifier

In [50]:
# import random forest classifier
from sklearn.ensemble import RandomForestClassifier

# instantiate the RF classifier with the default hyperparameters
rfc = RandomForestClassifier(max_features='sqrt', class_weight='balanced', n_jobs=-1, ccp_alpha=0.001, criterion='gini', n_estimators=50)

# fit the model on the train data
rfc.fit(X_train, y_train)

# predict on the test set
rfc_pred = rfc.predict(X_test)

rfc_acc = accuracy_score(y_test, rfc_pred)
rfc_f1 = f1_score(y_test, rfc_pred)
rfc_roc_auc = roc_auc_score(y_test, rfc_pred)

print(f'Accuracy score of RFC is {rfc_acc}', '\n')
print(f'ROC_AUC score of RFC is {rfc_roc_auc}', '\n')
print(f'The f1 score RFC is {rfc_f1}', '\n')
print('The confusion matrix of RFC is:', '\n', f'{confusion_matrix(y_test, rfc_pred)}')

Accuracy score of RFC is 0.8333333333333334 

ROC_AUC score of RFC is 0.8203441295546559 

The f1 score RFC is 0.7887323943661972 

The confusion matrix of RFC is: 
 [[47  5]
 [10 28]]


# AdaBoostClassifier

In [41]:
from sklearn.ensemble import AdaBoostClassifier

# instantiate the RF classifier with the default hyperparameters
ada = AdaBoostClassifier(n_estimators=10, learning_rate=1, algorithm='SAMME.R')

# fit the model on the train data
ada.fit(X_train, y_train)

# predict on the test set
ada_pred = ada.predict(X_test)

ada_acc = accuracy_score(y_test, ada_pred)
ada_f1 = f1_score(y_test, ada_pred)
ada_roc_auc = roc_auc_score(y_test, ada_pred)

print(f'Accuracy score of ADA is {ada_acc}', '\n')
print(f'ROC_AUC score of ADA is {ada_roc_auc}', '\n')
print(f'The f1 score ADA is {ada_f1}', '\n')
print('The confusion matrix of ADA is:', '\n', f'{confusion_matrix(y_test, ada_pred)}')

Accuracy score of ADA is 0.8 

ROC_AUC score of ADA is 0.7808704453441296 

The f1 score ADA is 0.7352941176470588 

The confusion matrix of ADA is: 
 [[47  5]
 [13 25]]


# Multi Layer Perceptron Classifier

In [42]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100, 50, 25), learning_rate='invscaling', power_t=1)

# fit the model on the train data
mlp.fit(X_train, y_train)

# predict on the test set
mlp_pred = mlp.predict(X_test)

mlp_acc = accuracy_score(y_test, mlp_pred)
mlp_f1 = f1_score(y_test, mlp_pred)
mlp_roc_auc = roc_auc_score(y_test, mlp_pred)

print(f'Accuracy score of MLP is {mlp_acc}', '\n')
print(f'ROC_AUC score of MLP is {mlp_roc_auc}', '\n')
print(f'The f1 score MLP is {mlp_f1}', '\n')
print('The confusion matrix of MLP is:', '\n', f'{confusion_matrix(y_test, mlp_pred)}')

Accuracy score of MLP is 0.7888888888888889 

ROC_AUC score of MLP is 0.7677125506072875 

The f1 score MLP is 0.716417910447761 

The confusion matrix of MLP is: 
 [[47  5]
 [14 24]]


# Model Selection

From all the algorithms used to train the data, the Random classifier performed best with better ROC_AUC score and F1 socres. We'll use this model to train the entire data with the same parameters.

### Train the entire dataset

In [43]:
# import random forest classifier
from sklearn.ensemble import RandomForestClassifier

# instantiate the RF classifier with the default hyperparameters
rfc_all = RandomForestClassifier(max_features='sqrt', class_weight='balanced', n_jobs=-1, ccp_alpha=0.001, criterion='gini', n_estimators=50)

# fit the model on the train data
rfc_all.fit(trans_data, Y)

# predict on the test set
rfc_all_pred = rfc_all.predict(X_test)

rfc_all_acc = accuracy_score(y_test, rfc_all_pred)
rfc_all_f1 = f1_score(y_test, rfc_all_pred)
rfc_all_roc_auc = roc_auc_score(y_test, rfc_all_pred)

print(f'Accuracy score of RFC Trained on the entire data is {rfc_all_acc}', '\n')
print(f'ROC_AUC score of RFC Trained on the entire data is {rfc_all_roc_auc}', '\n')
print(f'The f1 score RFC Trained on the entire data is {rfc_all_f1}', '\n')
print('The confusion matrix of RFC Trained on the entire data is:', '\n', f'{confusion_matrix(y_test, rfc_all_pred)}')

Accuracy score of RFC Trained on the entire data is 0.9555555555555556 

ROC_AUC score of RFC Trained on the entire data is 0.9473684210526316 

The f1 score RFC Trained on the entire data is 0.9444444444444444 

The confusion matrix of RFC Trained on the entire data is: 
 [[52  0]
 [ 4 34]]


# Save the Model

In [44]:
from joblib import dump, load

dump(rfc_all, 'random_forest_model.joblib')

['random_forest_model.joblib']