1. Install Pycaret & sanity check

In [2]:
from pycaret.classification import *

2. Load dataset in AutoML notebook & verify

In [3]:
import pandas as pd
df = pd.read_csv("../data/processed/stroke_cleaned.csv")

df.head()
df.info()
df['stroke'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                5110 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
 12  bmi_category       5110 non-null   object 
dtypes: float64(3), int64(4), object(6)
memory usage: 519.1+ KB


stroke
0    4861
1     249
Name: count, dtype: int64

3. Initialize PyCaret Setup

In [6]:
clf_setup = setup(
    data=df,
    target='stroke',
    session_id=42,
    train_size=0.8,
    fold=5,
    normalize=True,
    transformation=True,
    fix_imbalance=True,
    verbose=True
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,stroke
2,Target type,Binary
3,Original data shape,"(5110, 13)"
4,Transformed data shape,"(8800, 25)"
5,Transformed train set shape,"(7778, 25)"
6,Transformed test set shape,"(1022, 25)"
7,Numeric features,6
8,Categorical features,6
9,Preprocess,True


4. Compare Models

In [7]:
best_model = compare_models(sort='AUC')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9215,0.8366,0.2258,0.213,0.2158,0.1752,0.1768,0.416
ridge,Ridge Classifier,0.921,0.8365,0.2258,0.2103,0.2147,0.1738,0.1752,0.704
lr,Logistic Regression,0.931,0.8259,0.1355,0.2045,0.1595,0.1256,0.13,2.166
ada,Ada Boost Classifier,0.944,0.8174,0.0453,0.185,0.0724,0.0546,0.0695,1.05
gbc,Gradient Boosting Classifier,0.9499,0.8169,0.015,0.25,0.0281,0.0228,0.0502,2.12
lightgbm,Light Gradient Boosting Machine,0.9462,0.8131,0.02,0.1286,0.0346,0.0227,0.0334,1.028
nb,Naive Bayes,0.1957,0.7993,0.99,0.0566,0.107,0.0164,0.088,0.874
rf,Random Forest Classifier,0.9494,0.7972,0.0,0.0,0.0,-0.0037,-0.0088,1.384
et,Extra Trees Classifier,0.9418,0.7696,0.0506,0.1646,0.0765,0.056,0.0665,0.87
svm,SVM - Linear Kernel,0.943,0.755,0.025,0.0526,0.0339,0.0183,0.0157,1.274


5. Evaluate AutoML Model

In [8]:
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelinâ€¦

6. Generate predictions and metrics explicitly

In [9]:
# Generate Predictions
predictions = predict_model(best_model)
predictions.head()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.9237,0.8485,0.36,0.2812,0.3158,0.276,0.2784


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,bmi_category,stroke,prediction_label,prediction_score
3725,24202,Male,63.0,0,0,Yes,Private,Rural,78.230003,34.799999,never smoked,Obese,0,0,0.953
4481,66006,Female,43.0,0,0,Yes,Private,Urban,86.669998,33.299999,never smoked,Obese,0,0,0.983
1545,52089,Female,23.0,0,0,No,Private,Urban,126.669998,28.700001,smokes,Overweight,0,0,0.9891
1820,47608,Female,21.0,0,0,No,Private,Urban,208.169998,24.9,never smoked,Normal,0,0,0.996
1262,48781,Male,67.0,0,0,Yes,Private,Rural,113.339996,26.299999,formerly smoked,Overweight,0,0,0.7401


In [11]:
# Compute ROC-AUC Score
from sklearn.metrics import roc_auc_score

roc_auc_automl = roc_auc_score(
    predictions['stroke'],
    1 - predictions['prediction_score']
)

roc_auc_automl


0.8318724279835391

7. Explainability for LDA using coefficients

In [15]:
# Extract feature names
X_transformed = get_config('X_train_transformed')
feature_names = X_transformed.columns

In [21]:
# Extract LDA Coefficients
import pandas as pd
import numpy as np

lda_coefficients = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': best_model.coef_[0]
})

lda_coefficients['Absolute_Coefficient'] = np.abs(lda_coefficients['Coefficient'])

lda_coefficients = lda_coefficients.sort_values(
    by='Absolute_Coefficient',
    ascending=False
)

lda_coefficients.head(10)


Unnamed: 0,Feature,Coefficient,Absolute_Coefficient
1,gender_Female,8.68893,8.68893
2,gender_Male,8.651935,8.651935
18,bmi_category_Overweight,3.113188,3.113188
16,bmi_category_Obese,2.883401,2.883401
20,smoking_status_never smoked,2.571308,2.571308
23,smoking_status_formerly smoked,2.531425,2.531425
22,smoking_status_Unknown,2.397361,2.397361
17,bmi_category_Normal,2.272598,2.272598
21,smoking_status_smokes,2.223301,2.223301
8,work_type_Private,2.061805,2.061805
