In [4]:
# Load libraries

import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report

from imblearn.over_sampling import SMOTE

import shap
import matplotlib.pyplot as plt



In [5]:
# Data loading and inspection

# NOTE(review): absolute path (looks like a Colab working directory) --
# consider a configurable data directory so the notebook runs elsewhere.
df = pd.read_csv("/content/healthcare-dataset-stroke-data.csv")

# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() emits a stray "None".
df.info()

# Missing-value count per column
print(df.isna().sum())

df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
None
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
wo

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [6]:
# Basic cleaning

# Drop the row identifier column. errors="ignore" keeps this cell re-runnable:
# on a second execution "id" is already gone and a plain drop raises KeyError.
df = df.drop(columns=["id"], errors="ignore")

# Fill missing BMI values with the column median.
# NOTE(review): the median is computed on the full dataset, before the
# train/test split performed later in the notebook, so test rows influence
# the imputed value. Consider imputing with the training-set median instead.
df["bmi"] = df["bmi"].fillna(df["bmi"].median())


In [7]:
# Separate the target column ("stroke") from the predictor columns

y = df.loc[:, "stroke"]
X = df.drop("stroke", axis=1)


In [8]:
# One-hot encode the categorical columns

# drop_first=True omits the first level of every encoded column, so a
# column with k levels contributes k-1 indicator columns.
X = pd.get_dummies(data=X, drop_first=True)

print(X.shape)


(5110, 16)


In [9]:
# Train-test split

# Hold out 30% of the rows for testing. stratify=y makes the split
# class-stratified on the target; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

print(X_train.shape, X_test.shape)



(3577, 16) (1533, 16)


In [10]:
# Standardise the numeric columns; every other column is left untouched

num_cols = ["age", "avg_glucose_level", "bmi"]

# The scaler is fitted on the training partition only; the test partition
# is transformed with those same fitted parameters.
scaler = StandardScaler().fit(X_train[num_cols])

X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])


In [11]:
# Baseline models fitted on the original (imbalanced) training data

# fit() returns the estimator itself, so construction and fitting are chained.
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# ROC-AUC on the test partition, scored with the predicted probability of
# class 1 (second column of predict_proba).
for _label, _model in (
    ("Logistic Regression ROC-AUC:", lr),
    ("Random Forest ROC-AUC:", rf),
):
    _positive_prob = _model.predict_proba(X_test)[:, 1]
    print(_label, roc_auc_score(y_test, _positive_prob))


Logistic Regression ROC-AUC: 0.8378417924096937
Random Forest ROC-AUC: 0.8107178783721993


In [12]:
# Logistic regression with class weighting

# class_weight="balanced" asks scikit-learn to weight the classes inversely
# to their frequency in y_train.
lr_weighted = LogisticRegression(class_weight="balanced", max_iter=1000, random_state=42)
lr_weighted.fit(X_train, y_train)

# Predicted probability of class 1 for each test row
y_test_prob = lr_weighted.predict_proba(X_test)[:, 1]

print("Class-weighted LR ROC-AUC:", roc_auc_score(y_test, y_test_prob))

# Per-class report using hard labels at a 0.5 probability cut-off
print(classification_report(y_test, (y_test_prob >= 0.5).astype(int)))


Class-weighted LR ROC-AUC: 0.8388660265203475
              precision    recall  f1-score   support

           0       0.99      0.73      0.84      1458
           1       0.13      0.79      0.22        75

    accuracy                           0.73      1533
   macro avg       0.56      0.76      0.53      1533
weighted avg       0.94      0.73      0.81      1533



In [13]:
# Oversample with SMOTE -- applied to the training partition only

# NOTE(review): SMOTE synthesises rows by interpolating between neighbours,
# which is questionable for the 0/1 indicator columns in X_train; a variant
# aware of categorical features (e.g. SMOTENC) may be preferable -- confirm.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Class counts after resampling
pd.Series(y_train_bal).value_counts()


Unnamed: 0_level_0,count
stroke,Unnamed: 1_level_1
0,3403
1,3403


In [14]:
# Same two model families, now fitted on the SMOTE-balanced training data

# fit() returns the estimator itself, so construction and fitting are chained.
lr_smote = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_bal, y_train_bal)
rf_smote = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_bal, y_train_bal)

# Evaluation still uses the untouched test partition, scored with the
# predicted probability of class 1.
for _label, _model in (
    ("LR (SMOTE) ROC-AUC:", lr_smote),
    ("RF (SMOTE) ROC-AUC:", rf_smote),
):
    _positive_prob = _model.predict_proba(X_test)[:, 1]
    print(_label, roc_auc_score(y_test, _positive_prob))


LR (SMOTE) ROC-AUC: 0.8162962962962963
RF (SMOTE) ROC-AUC: 0.7909830818472794


In [16]:
# SHAP explainability (logistic regression, SMOTE)

# Cast every column to float so SHAP receives a purely numeric matrix
X_train_shap, X_test_shap = X_train.astype(float), X_test.astype(float)


In [17]:
# SHAP explainer

# Background data for the masker comes from the (scaled, non-resampled)
# training partition.
masker = shap.maskers.Independent(X_train_shap)

# Explain the probability output of the SMOTE-trained logistic regression
explainer = shap.Explainer(lr_smote.predict_proba, masker=masker)

# Explanations for every test row; index 1 on the last axis keeps the
# values for the class-1 output.
shap_values = explainer(X_test_shap)
shap_class1 = shap_values[..., 1]



PermutationExplainer explainer: 1534it [01:32, 15.00it/s]                          


In [18]:
# Global SHAP bar plot

# savefig does not create missing directories, so make sure the output
# folder exists before writing (a fresh kernel would otherwise fail here).
os.makedirs("figures", exist_ok=True)

# show=False keeps the figure open so it can be written to disk below
shap.plots.bar(shap_class1, max_display=10, show=False)

plt.savefig("figures/shap_global_bar.png", bbox_inches="tight", dpi=300)

# Close the current figure once it has been saved
plt.close()


In [19]:
# Global SHAP beeswarm plot

# savefig does not create missing directories, so make sure the output
# folder exists before writing (a fresh kernel would otherwise fail here).
os.makedirs("figures", exist_ok=True)

# show=False keeps the figure open so it can be written to disk below
shap.plots.beeswarm(shap_class1, max_display=10, show=False)

plt.savefig("figures/shap_global_beeswarm.png", bbox_inches="tight", dpi=300)

# Close the current figure once it has been saved
plt.close()
