In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [4]:
df['stroke'].value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

In [None]:
"""
the output from the cell above tells us a few things:
1. the dataset is highly imbalanced
2. testing the result with an accuracy score would be useless. why?
3. if we just predict no all the time, our model would be 4861/5110 = 95.1% accurate!
4. so to test our models performance, we will be using metrics like:
    - precision and recall
    - f1-score
    - roc-auc
"""

In [6]:
bmi_mean = df['bmi'].mean()
df['bmi'] = df['bmi'].fillna(bmi_mean)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                5110 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [16]:
df['smoking_status'].value_counts()

smoking_status
never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: count, dtype: int64

In [None]:
"""
there are too many rows with 'Unknown' as a smoking status.
so instead of dropping or manipulating the data, we will treat it as it's own category
"""

In [17]:
df.columns = df.columns.str.lower()

In [18]:
df = df.drop('id', axis=1)

In [20]:
from sklearn.model_selection import train_test_split

In [71]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

len(df_train), len(df_val), len(df_test)

(3066, 1022, 1022)

In [24]:
# Separate the target variable (y)
y_train = df_train.stroke.values
y_val = df_val.stroke.values
y_test = df_test.stroke.values

# Remove target from the feature dataframes
del df_train['stroke']
del df_val['stroke']
del df_test['stroke']

In [28]:
from sklearn.metrics import mutual_info_score

# We will calculate MI only for our categorical features
# using only the training data
# MI score helps us know which features have more say on the outcome
categorical = ['gender', 'ever_married', 'work_type', 'residence_type', 'smoking_status']
print("--- Mutual Information Scores ---")
for col in categorical:
    score = mutual_info_score(df_train[col], y_train)
    print(f"'{col}': {round(score, 2)}")

--- Mutual Information Scores ---
'gender': 0.0
'ever_married': 0.01
'work_type': 0.01
'residence_type': 0.0
'smoking_status': 0.0


In [None]:
"""
it seems no feature holds more importance than the rest
"""

In [29]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=True)

train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')

X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [30]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [31]:
from sklearn.metrics import roc_auc_score

# 7. the predicted probabilities for the validation set
# We need the probabilities for the "positive" class (1)
y_pred = model.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_pred)

print(auc)

0.8311721509574269


In [None]:
"""
our auc score of 0.83117 is our baseline score, from our logistic regression model
it's a good score, but we know that we can do better other tree-based models
next up, we'll be trying our random forest regression
"""

In [60]:
# we're back. and now finetuning logistic reg to see what value of c works best

# The list of C values to test
c_values = [0.001, 0.01, 0.1, 1, 10, 35, 50, 100, 200]
# c_values = [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 47, 49, 50]

print("--- Tuning the 'C' parameter ---")

# Loop through each C value
for c in c_values:
    # 1. Initialize the model with the specific C value
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    
    # 2. Train the model
    model.fit(X_train, y_train)
    
    # 3. Calculate auc score on the validation set
    y_pred = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    
    # 4. Print the result
    print(f"C = {c}: \t AUC score = {round(auc, 7)}") # \t adds a nice tab for alignment

--- Tuning the 'C' parameter ---
C = 0.001: 	 AUC score = 0.6402445
C = 0.01: 	 AUC score = 0.682097
C = 0.1: 	 AUC score = 0.7907371
C = 1: 	 AUC score = 0.8311722
C = 10: 	 AUC score = 0.8348206
C = 35: 	 AUC score = 0.8359128
C = 50: 	 AUC score = 0.8358199
C = 100: 	 AUC score = 0.8357037
C = 200: 	 AUC score = 0.8357037


In [None]:
"""
C=35 gives us the highest auc score
"""

In [46]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

print("Random Forest Classifier trained successfully!")

Random Forest Classifier trained successfully!


In [47]:
from sklearn.metrics import roc_auc_score

y_pred = rf_model.predict_proba(X_val)[:, 1]
rf_auc = roc_auc_score(y_val, y_pred)

print(rf_auc)

0.7673126975274214


In [None]:
"""
the default settings for random forest perform poorly, compared to logistic regression
but we can fine tune it later
"""

In [49]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

print("XGBoost Classifier trained successfully!")

XGBoost Classifier trained successfully!


In [50]:
from sklearn.metrics import roc_auc_score

y_pred = xgb_model.predict_proba(X_val)[:, 1]
rf_auc = roc_auc_score(y_val, y_pred)

print(rf_auc)

0.7836958542480014


In [None]:
"""
i have learnt that for an imbalanced dataset like ours, tuning the 'scale_pos_weight' parameter is important
currently, it weighs the negative(no stroke) and positive (has_stroke) data equally. the goal of using this parameter
is to ensure that the model weighs them equally
the standard way to calculate it is: count(negative_class)/count(positive_class)
in our case, that's 4861 / 249, which is ~19.5.
"""

In [61]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(random_state=42, scale_pos_weight=19.5)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict_proba(X_val)[:, 1]
rf_auc = roc_auc_score(y_val, y_pred)

print(rf_auc)

0.8051682468860383


In [None]:
"""
i have also learnt a better way to tune your xgboost model using GridSearchCV
so instead of manually tuning with multiple loops, we let gridsearchcv handle
all that work for us, and figure out the best set of parameters from our param_grid
"""

In [72]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction import DictVectorizer

# Create 'y' by copying the 'stroke' column
y_full_train = df_full_train['stroke'].values

# Create 'X' by dropping 'stroke'. This makes a new copy.
X_full_train_df = df_full_train.drop('stroke', axis=1)

# --- 2. Vectorize our new X_full_train_df ---
dv_full = DictVectorizer(sparse=True)
X_full_train = dv_full.fit_transform(X_full_train_df.to_dict(orient='records'))

print("Full training data (80%) is now vectorized.")

# --- 3. Define our parameter grid ---
param_grid = {
    'max_depth': [3, 5, 7],         # Try simple, medium, and complex trees
    'learning_rate': [0.1, 0.05],   # Try a slow and a very slow rate
    'min_child_weight': [1, 5, 10]    # Try different anti-overfitting values
}

# --- 4. Create our base XGBoost model ---
xgb_model_tuned = XGBClassifier(
    scale_pos_weight=19.5, 
    random_state=42,
    n_estimators=100  # We can add this to make sure it has enough trees
)

# --- 5. Set up the Grid Search ---
grid_search = GridSearchCV(
    estimator=xgb_model_tuned, 
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1  # This will print updates so you know it's working
)

# --- 6. Run the Grid Search! ---
# This will take a few minutes. It's training (3*2*3) * 5 = 90 models!
print("--- Starting Grid Search (this may take a minute...) ---")
grid_search.fit(X_full_train, y_full_train)

# --- 7. Print the Best Results ---
print("\n--- Grid Search Complete! ---")
print(f"Best AUC Score: {grid_search.best_score_:.4f}")
print("Best Parameters Found:")
print(grid_search.best_params_)

Full training data (80%) is now vectorized.
--- Starting Grid Search (this may take a minute...) ---
Fitting 5 folds for each of 400 candidates, totalling 2000 fits

--- Grid Search Complete! ---
Best AUC Score: 0.8437
Best Parameters Found:
{'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 10}


In [73]:
grid_search.best_score_

0.843650453187147

In [None]:
"""
we've tuned our xgboost model to have a better roc score than our logistic regression model.
next, we explore another validation approach to confirm which model performs better
Precision-Recall
"""

In [75]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# --- 1. Train our BEST Logistic Regression Model ---
# (We use the C=35 we found from tuning)
model_lr = LogisticRegression(solver='liblinear', C=35, max_iter=1000, random_state=42)
model_lr.fit(X_train, y_train)
y_pred_proba_lr = model_lr.predict_proba(X_val)[:, 1]

# --- 2. Train our BEST XGBoost Model ---
# (We use the parameters we found from the Grid Search)
model_xgb = XGBClassifier(
    learning_rate=0.1,
    max_depth=2,
    min_child_weight=10,
    scale_pos_weight=19.5,  # The key imbalance parameter
    n_estimators=100,
    random_state=42
)
model_xgb.fit(X_train, y_train)
y_pred_proba_xgb = model_xgb.predict_proba(X_val)[:, 1]


# --- 3. Convert probabilities to 0/1 predictions using the 0.5 threshold ---
y_pred_lr = (y_pred_proba_lr >= 0.5).astype(int)
y_pred_xgb = (y_pred_proba_xgb >= 0.5).astype(int)

# --- 4. Calculate and Print Metrics ---
print("--- Metrics @ 0.5 Threshold ---")

# Logistic Regression
precision_lr = precision_score(y_val, y_pred_lr)
recall_lr = recall_score(y_val, y_pred_lr)
f1_lr = f1_score(y_val, y_pred_lr)

print(f"\nLogistic Regression (C=35):")
print(f"  Precision: {precision_lr:.4f}")
print(f"  Recall:    {recall_lr:.4f}")
print(f"  F1-Score:  {f1_lr:.4f}")

# XGBoost
precision_xgb = precision_score(y_val, y_pred_xgb)
recall_xgb = recall_score(y_val, y_pred_xgb)
f1_xgb = f1_score(y_val, y_pred_xgb)

print(f"\nTuned XGBoost:")
print(f"  Precision: {precision_xgb:.4f}")
print(f"  Recall:    {recall_xgb:.4f}")
print(f"  F1-Score:  {f1_xgb:.4f}")

--- Metrics @ 0.5 Threshold ---

Logistic Regression (C=35):
  Precision: 1.0000
  Recall:    0.0227
  F1-Score:  0.0444

Tuned XGBoost:
  Precision: 0.1285
  Recall:    0.7273
  F1-Score:  0.2184


In [None]:
"""
the logistic regression model (c=35) is effectively useless because it is too cautious. 
while its 1.0 precision seems perfect, this score is deceptive. it's achieved by a model that almost never predicts "stroke."
this is confirmed by its extremely low recall of 0.0227, meaning it fails to identify over 97% of the actual stroke patients,
thereby failing at the project's main goal.

the tuned xgboost model, in sharp contrast, is optimized for finding positive cases. it achieves a very high recall of 0.7273,
successfully identifying nearly 73% of all actual stroke patients. this usefulness comes at the cost of low precision (0.1285),
as it flags many non-stroke patients as well. despite the high number of false positives, this model is far more practical because
it actually finds the at-risk individuals.

in summary, we need to find a good threshold for both models
"""

In [76]:
import numpy as np
from sklearn.metrics import f1_score

# A list to store scores for our LR model
scores_lr = []
thresholds = np.arange(0.0, 1.0, 0.01)

# Loop through all thresholds for Logistic Regression
for t in thresholds:
    y_pred_lr = (y_pred_proba_lr >= t).astype(int)
    f1 = f1_score(y_val, y_pred_lr)
    scores_lr.append((t, f1))

# Find the best LR score
scores_lr.sort(key=lambda x: x[1], reverse=True)
best_f1_lr = scores_lr[0][1]
best_thresh_lr = scores_lr[0][0]

# --- Now do the same for XGBoost ---
scores_xgb = []
for t in thresholds:
    y_pred_xgb = (y_pred_proba_xgb >= t).astype(int)
    f1 = f1_score(y_val, y_pred_xgb)
    scores_xgb.append((t, f1))

# Find the best XGB score
scores_xgb.sort(key=lambda x: x[1], reverse=True)
best_f1_xgb = scores_xgb[0][1]
best_thresh_xgb = scores_xgb[0][0]


# --- Print The Final Results ---
print("--- Best F1-Score (after tuning threshold) ---")

print(f"\nLogistic Regression (C=35):")
print(f"  Best F1-Score: {best_f1_lr:.4f}")
print(f"  At Threshold:  {best_thresh_lr:.2f}")

print(f"\nTuned XGBoost:")
print(f"  Best F1-Score: {best_f1_xgb:.4f}")
print(f"  At Threshold:  {best_thresh_xgb:.2f}")

--- Best F1-Score (after tuning threshold) ---

Logistic Regression (C=35):
  Best F1-Score: 0.2489
  At Threshold:  0.09

Tuned XGBoost:
  Best F1-Score: 0.2411
  At Threshold:  0.59


In [None]:
"""
Final Model Selection: Logistic Regression

After tuning the decision threshold to find the best F1-Score for both models, the Logistic Regression (C=35) is the clear winner.

1. Final Performance (Best F1-Score): The models' peak performances were virtually tied, with the Logistic Regression just slightly ahead.
    Logistic Regression (C=35):
        Best F1-Score: 0.2489
        At Threshold: 0.09
    Tuned XGBoost:
        Best F1-Score: 0.2411
        At Threshold: 0.59

2. The Threshold Story: This result shows both models achieving the same balance from opposite directions. The "cautious" Logistic Regression
    needed its threshold lowered to 0.09 to become more aggressive, while the "over-eager" XGBoost needed its threshold raised to 0.59 to become more
    conservative.

3. Final Decision: Why Logistic Regression Wins Despite the similar F1-Scores, Logistic Regression is the undisputed champion for four key reasons:
    Performance: It has the (slightly) higher F1-Score.
    Simplicity: It is a dramatically simpler model than XGBoost.
    Cost: It is thousands of times faster and cheaper to train and re-train.
    Interpretability: It is an interpretable "glass box" model. This is the most critical factor, as it allows us to explain why a risk score is high,
    which is essential for a medical application.
"""

In [77]:
import pickle
from sklearn.linear_model import LogisticRegression

final_model = LogisticRegression(solver='liblinear', C=35, max_iter=1000, random_state=42)
final_model.fit(X_full_train, y_full_train)
print("Final model trained successfully.")

# Save the vectorizer (dv_full)
with open('dv.bin', 'wb') as f_out:
    pickle.dump(dv_full, f_out)
    print("Vectorizer saved to dv.bin")

# Save the final model
with open('model.bin', 'wb') as f_out:
    pickle.dump(final_model, f_out)
    print("Model saved to model.bin")

Final model trained successfully.
Vectorizer saved to dv.bin
Model saved to model.bin
