In [4]:
import numpy as np
import pandas as pd

from pycaret.classification import *

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV

In [14]:
# Read the Titanic training split from the project's data directory and
# preview its first five rows as the cell output.
dataTr = pd.read_csv(filepath_or_buffer='../data/titanictr.csv')
dataTr.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [18]:
# Configure the PyCaret classification experiment on the training frame,
# with `Survived` as the target column.
#
# `session_id` pins the experiment's random seed so the train/test split and
# cross-validation folds are the same on every Restart & Run All. Without it
# PyCaret draws a random id each run; 4233 is the id shown in the recorded
# output of this cell, so the seed is pinned to that value.
#
# NOTE(review): identifier-like columns (PassengerId, Name, Ticket) are passed
# in as features. Consider excluding them via `ignore_features=[...]` --
# confirm against the intended feature set before changing.
clf_setup = setup(data=dataTr, target='Survived', session_id=4233)

Unnamed: 0,Description,Value
0,Session id,4233
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 12)"
4,Transformed data shape,"(891, 14)"
5,Transformed train set shape,"(623, 14)"
6,Transformed test set shape,"(268, 14)"
7,Numeric features,6
8,Categorical features,5
9,Rows with missing values,79.5%


In [20]:
# Train and score PyCaret's candidate classifiers on the configured experiment;
# the comparison table is displayed as the cell output and the returned model
# is kept in `best_model`.
# NOTE(review): `best_model` is not referenced by any later visible cell -- the
# next step re-creates logistic regression by its ID instead.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8187,0.8655,0.6788,0.8166,0.7378,0.6021,0.6107,0.752
ridge,Ridge Classifier,0.7496,0.8674,0.4221,0.8548,0.5589,0.4131,0.4656,0.065
et,Extra Trees Classifier,0.7174,0.8053,0.2962,0.8976,0.4313,0.3116,0.3961,0.142
nb,Naive Bayes,0.6774,0.8033,0.213,0.8256,0.3305,0.2068,0.2917,0.062
knn,K Neighbors Classifier,0.6294,0.6154,0.3554,0.5314,0.4223,0.1652,0.1746,0.062
rf,Random Forest Classifier,0.626,0.8147,0.0333,0.3467,0.0584,0.0333,0.0703,0.179
svm,SVM - Linear Kernel,0.6243,0.6242,0.2504,0.5451,0.2869,0.1175,0.1494,0.065
lda,Linear Discriminant Analysis,0.6228,0.5255,0.0217,0.0833,0.0345,0.0226,0.0313,0.064
dt,Decision Tree Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.068
qda,Quadratic Discriminant Analysis,0.6164,0.5757,0.0,0.0,0.0,0.0,0.0,0.055


In [22]:
# Train a logistic regression by its PyCaret model ID ('lr'), the top row of
# the comparison table; per-fold scores are displayed as the cell output.
# NOTE(review): the ID is hard-coded rather than taken from `best_model`, so
# this step will not follow the leaderboard if the ranking changes.
lr = create_model('lr')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7619,0.8323,0.5833,0.7368,0.6512,0.4741,0.4816
1,0.7937,0.86,0.625,0.7895,0.6977,0.5442,0.5528
2,0.873,0.9177,0.75,0.9,0.8182,0.7219,0.7289
3,0.7742,0.8015,0.6667,0.7273,0.6957,0.5167,0.5179
4,0.7903,0.7993,0.5833,0.8235,0.6829,0.533,0.5507
5,0.9032,0.9375,0.875,0.875,0.875,0.7961,0.7961
6,0.7258,0.7741,0.5417,0.6842,0.6047,0.3991,0.4055
7,0.7903,0.8783,0.5417,0.8667,0.6667,0.5253,0.5562
8,0.8387,0.8969,0.7083,0.85,0.7727,0.6493,0.6558
9,0.9355,0.9576,0.913,0.913,0.913,0.8618,0.8618


In [24]:
# Tune the hyperparameters of the logistic regression trained above. The
# recorded run reports "Fitting 10 folds for each of 10 candidates", i.e. ten
# candidate configurations, each scored with 10-fold cross-validation.
tuned_lr = tune_model(lr)

# Backward-compatible alias: the tuned estimator is a logistic regression, not
# a decision tree, but later cells refer to it as `tuned_dt`.
tuned_dt = tuned_lr

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7619,0.8686,0.5833,0.7368,0.6512,0.4741,0.4816
1,0.8254,0.8675,0.7083,0.8095,0.7556,0.6207,0.624
2,0.9048,0.9348,0.7917,0.95,0.8636,0.7914,0.7992
3,0.8226,0.8235,0.6667,0.8421,0.7442,0.6112,0.6209
4,0.7903,0.8059,0.625,0.7895,0.6977,0.5405,0.5491
5,0.9194,0.943,0.875,0.913,0.8936,0.8287,0.8292
6,0.6935,0.7763,0.5417,0.619,0.5778,0.3389,0.3408
7,0.8226,0.8925,0.6667,0.8421,0.7442,0.6112,0.6209
8,0.8387,0.8904,0.7917,0.7917,0.7917,0.6601,0.6601
9,0.9032,0.9487,0.8261,0.9048,0.8636,0.7889,0.7908


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [28]:
# Read the Titanic test split (the preview shows no `Survived` column) and
# display its first five rows as the cell output.
dataTs = pd.read_csv(filepath_or_buffer='../data/titanicts.csv')
dataTs.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [34]:
# Score the unseen test passengers with the tuned model. `tuned_dt` holds the
# tuned logistic regression despite its name. The result is the input frame
# plus `prediction_label` and `prediction_score` columns; preview five rows.
predictions = predict_model(estimator=tuned_dt, data=dataTs)
predictions.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,prediction_label,prediction_score
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,0.8953
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0,0.6155
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,0.8203
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,0.7896
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1,0.8157
