In [1]:
# Import necessary libraries
from pycaret.datasets import get_data
from pycaret.classification import *
import pandas as pd

In [2]:
# Load Titanic dataset
data = get_data('titanic')

# Show the first few rows of the dataset
print(data.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [8]:
categorical_columns = data.select_dtypes(include=['object', 'category']).columns
print("Categorical columns:", categorical_columns)

Categorical columns: Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')


In [9]:
# Set up the environment in PyCaret
clf = setup(data, target='Survived', session_id=123, 
            normalize=True,  # Normalizes numeric features
            categorical_features=['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
            remove_outliers=True,
            ignore_features=['name', 'cabin', 'ticket'])


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 12)"
4,Transformed data shape,"(859, 14)"
5,Transformed train set shape,"(591, 14)"
6,Transformed test set shape,"(268, 14)"
7,Ignore features,3
8,Numeric features,6
9,Categorical features,5


In [10]:
# Compare all models and select the best one
best_model = compare_models()

# Print the best model
print(best_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.7606,0.809,0.5022,0.8106,0.6112,0.4544,0.486,0.304
svm,SVM - Linear Kernel,0.7412,0.7772,0.4551,0.7696,0.5452,0.3955,0.4322,0.24
et,Extra Trees Classifier,0.7303,0.7743,0.4562,0.7485,0.5627,0.3846,0.4114,0.464
lr,Logistic Regression,0.7078,0.8303,0.331,0.7834,0.4592,0.3057,0.3599,1.354
nb,Naive Bayes,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.272
dt,Decision Tree Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.326
ridge,Ridge Classifier,0.6164,0.8175,0.0,0.0,0.0,0.0,0.0,0.32
rf,Random Forest Classifier,0.6164,0.8139,0.0,0.0,0.0,0.0,0.0,0.463
qda,Quadratic Discriminant Analysis,0.6164,0.4586,0.0,0.0,0.0,0.0,0.0,0.266
ada,Ada Boost Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.296


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')


In [11]:
# Tune the hyperparameters of the best model
tuned_model = tune_model(best_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7778,0.8515,0.5,0.8571,0.6316,0.4878,0.5241
1,0.7778,0.8088,0.5833,0.7778,0.6667,0.5051,0.5168
2,0.8254,0.8985,0.5417,1.0,0.7027,0.594,0.65
3,0.8226,0.8729,0.5652,0.9286,0.7027,0.5867,0.6234
4,0.7419,0.7791,0.375,0.9,0.5294,0.3907,0.4618
5,0.7258,0.8152,0.375,0.8182,0.5143,0.3581,0.411
6,0.7258,0.8366,0.4167,0.7692,0.5405,0.3689,0.4041
7,0.8548,0.858,0.7083,0.8947,0.7907,0.6819,0.6928
8,0.7742,0.8328,0.5417,0.8125,0.65,0.493,0.5151
9,0.7742,0.8942,0.625,0.75,0.6818,0.509,0.5141


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [12]:
# Print the tuned model
print(tuned_model)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=-1, n_neighbors=16, p=2,
                     weights='uniform')


In [13]:
# Evaluate the tuned model using various plots (Precision-Recall, AUC, etc.)
evaluate_model(tuned_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [14]:
# Ensemble the tuned model using stacking
stacked_model = stack_models([best_model, tuned_model])

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7778,0.8467,0.5,0.8571,0.6316,0.4878,0.5241
1,0.8254,0.8189,0.6667,0.8421,0.7442,0.6144,0.624
2,0.8254,0.8996,0.5417,1.0,0.7027,0.594,0.65
3,0.8226,0.8657,0.5652,0.9286,0.7027,0.5867,0.6234
4,0.7258,0.7889,0.375,0.8182,0.5143,0.3581,0.411
5,0.7258,0.7955,0.4167,0.7692,0.5405,0.3689,0.4041
6,0.7419,0.8438,0.4167,0.8333,0.5556,0.401,0.4488
7,0.8065,0.8498,0.5833,0.875,0.7,0.5654,0.5908
8,0.7419,0.8213,0.4583,0.7857,0.5789,0.4109,0.442
9,0.7742,0.8832,0.7083,0.7083,0.7083,0.5241,0.5241


In [15]:
# Predict on unseen data (test set)
predictions = predict_model(stacked_model)

# Show the predictions
print(predictions.head())

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Stacking Classifier,0.75,0.8399,0.4757,0.7903,0.5939,0.429,0.4579


     PassengerId  Pclass                               Name     Sex   Age  \
178          179       2                 Hale, Mr. Reginald    male  30.0   
457          458       1  Kenyon, Mrs. Frederick R (Marion)  female   NaN   
16            17       3               Rice, Master. Eugene    male   2.0   
95            96       3        Shorney, Mr. Charles Joseph    male   NaN   
120          121       2        Hickman, Mr. Stanley George    male  21.0   

     SibSp  Parch        Ticket       Fare Cabin Embarked  Survived  \
178      0      0        250653  13.000000   NaN        S         0   
457      1      0         17464  51.862499   D21        S         1   
16       4      1        382652  29.125000   NaN        Q         0   
95       0      0        374910   8.050000   NaN        S         0   
120      2      0  S.O.C. 14879  73.500000   NaN        S         0   

     prediction_label  prediction_score  
178                 0            0.9869  
457                 1     