In [44]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna  # Bayesian Optimization for XGBoost
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, classification_report, roc_auc_score, confusion_matrix,
    roc_curve, precision_score, recall_score, f1_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer  # Handle missing values
import joblib  # Save the best model

# ========================
# 1️⃣ Load & Preprocess Data
# ========================
print("\n🔹 Loading dataset...")
df = pd.read_csv("stroke_data.csv")  # Ensure the file is in the correct directory

# Standardize column names
df.columns = df.columns.str.strip().str.lower()

# 🔹 Check for missing values before handling them
print("\n🔹 Missing Values Before Processing:\n", df.isnull().sum())

# Handle missing values for BMI
df['bmi'] = df['bmi'].fillna(df['bmi'].median())

# 🔹 Feature Engineering: Create new meaningful features
df['bmi_category'] = pd.cut(df['bmi'], bins=[-np.inf, 18.5, 24.9, 29.9, np.inf], labels=[0, 1, 2, 3]).astype(int)
df['age_group'] = pd.cut(df['age'], bins=[0, 30, 50, 70, np.inf], labels=[0, 1, 2, 3]).astype(int)
df['hypertension_glucose'] = df['hypertension'] * df['avg_glucose_level']

# 🔹 Ensure smoking_status is properly mapped to numerical values (handle NaN)
df['smoking_status'] = df['smoking_status'].map({'never smoked': 0, 'formerly smoked': 1, 'smokes': 2}).fillna(0).astype(int)
df['heart_smoking'] = df['heart_disease'] * df['smoking_status']

# Convert categorical variables to numerical
df = pd.get_dummies(df, columns=['gender', 'work_type', 'residence_type', 'bmi_category', 'age_group'], drop_first=True)
df['ever_married'] = df['ever_married'].map({'Yes': 1, 'No': 0})

# 🔹 Handle any remaining missing values using SimpleImputer
imputer = SimpleImputer(strategy="median")
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# 🔹 Ensure categorical and integer columns retain their types after imputation
for col in df.columns:
    if df[col].dtype in ["int64", "bool"]:
        df_imputed[col] = df_imputed[col].astype(int)

df = df_imputed  # Overwrite the original DataFrame with cleaned data

# 🔹 Scale numerical features
scaler = StandardScaler()
numeric_features = ['age', 'bmi', 'avg_glucose_level', 'hypertension_glucose']
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# Define features (X) and target variable (y)
X = df.drop(columns=['stroke', 'id'], errors='ignore')  # Drop 'id' if it exists
y = df['stroke']

# 🔹 Check for any remaining missing values
print("\n✅ Missing Values Check (Post Processing):", X.isnull().sum().sum())

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to balance the dataset
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("\n✅ Data Preprocessing Complete! No missing values found.")
print(f"🔹 Training Set: {X_train_smote.shape}, Testing Set: {X_test.shape}")

# ========================
# 2️⃣ Train a Random Forest Model
# ========================
print("\n🔹 Training Random Forest...")
rf_model = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=10, class_weight="balanced")
rf_model.fit(X_train_smote, y_train_smote)

# Make predictions
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]  # Probability scores for ROC-AUC

# ========================
# 3️⃣ Fine-Tune XGBoost Using Optuna
# ========================

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 2, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10.0, log=True)
    }

    model = XGBClassifier(**params, random_state=42)
    model.fit(X_train_smote, y_train_smote)

    y_prob = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, y_prob)  # Optimize AUC-ROC

print("\n🔹 Optimizing XGBoost with Optuna...")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30, n_jobs=-1)

# Train XGBoost with best parameters
best_params_xgb = study.best_trial.params
print("\n✅ Best XGBoost Parameters:", best_params_xgb)

best_xgb = XGBClassifier(**best_params_xgb, random_state=42)
best_xgb.fit(X_train_smote, y_train_smote)

# Make predictions
y_pred_xgb = best_xgb.predict(X_test)
y_prob_xgb = best_xgb.predict_proba(X_test)[:, 1]

# ========================
# 4️⃣ Train a Logistic Regression Model
# ========================
print("\n🔹 Training Logistic Regression...")
log_reg_model = LogisticRegression(class_weight="balanced", max_iter=500, random_state=42)
log_reg_model.fit(X_train_smote, y_train_smote)

# Make predictions
y_pred_log = log_reg_model.predict(X_test)
y_prob_log = log_reg_model.predict_proba(X_test)[:, 1]

# ========================
# 5️⃣ Compare Model Performances
# ========================
models = {
    "Random Forest": (y_pred_rf, y_prob_rf),
    "XGBoost (Fine-Tuned)": (y_pred_xgb, y_prob_xgb),
    "Logistic Regression": (y_pred_log, y_prob_log),
}

print("\n🔹 Model Comparison Summary:")
for model_name, (y_pred, y_prob) in models.items():
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"\n{model_name} → Accuracy: {accuracy:.4f} | AUC-ROC: {auc:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-Score: {f1:.4f}")

# ========================
# 6️⃣ Save the Best Model
# ========================
joblib.dump(best_xgb, "best_stroke_prediction_model.pkl")
print("\n✅ Best XGBoost model saved as 'best_stroke_prediction_model.pkl'")



🔹 Loading dataset...

🔹 Missing Values Before Processing:
 id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

✅ Missing Values Check (Post Processing): 0

✅ Data Preprocessing Complete! No missing values found.
🔹 Training Set: (5833, 22), Testing Set: (1022, 22)

🔹 Training Random Forest...


[I 2025-03-16 01:09:34,749] A new study created in memory with name: no-name-621a3530-6e66-416b-b7c0-5a09203997af



🔹 Optimizing XGBoost with Optuna...


[I 2025-03-16 01:09:38,617] Trial 1 finished with value: 0.7931687242798353 and parameters: {'n_estimators': 169, 'max_depth': 4, 'learning_rate': 0.057913910462950906, 'subsample': 0.7643591601213698, 'colsample_bytree': 0.9148473410075804, 'scale_pos_weight': 10, 'gamma': 0.012676823119548698, 'reg_alpha': 0.4218085122719195, 'reg_lambda': 1.1946361204132034}. Best is trial 1 with value: 0.7931687242798353.
[I 2025-03-16 01:09:38,786] Trial 14 finished with value: 0.7598148148148148 and parameters: {'n_estimators': 230, 'max_depth': 10, 'learning_rate': 0.24312699743500799, 'subsample': 0.9518125582037013, 'colsample_bytree': 0.6479870799100271, 'scale_pos_weight': 4, 'gamma': 0.700227789172651, 'reg_alpha': 1.8418741237292857, 'reg_lambda': 0.015847037942310377}. Best is trial 1 with value: 0.7931687242798353.
[I 2025-03-16 01:09:38,951] Trial 4 finished with value: 0.7468724279835389 and parameters: {'n_estimators': 136, 'max_depth': 7, 'learning_rate': 0.1709845791154909, 'subsamp


✅ Best XGBoost Parameters: {'n_estimators': 240, 'max_depth': 3, 'learning_rate': 0.01905823892710973, 'subsample': 0.7875926707011945, 'colsample_bytree': 0.8939654191906314, 'scale_pos_weight': 6, 'gamma': 0.03307531182509639, 'reg_alpha': 3.9234203959821907, 'reg_lambda': 1.8990435775825594}

🔹 Training Logistic Regression...

🔹 Model Comparison Summary:

Random Forest → Accuracy: 0.8405 | AUC-ROC: 0.7781 | Precision: 0.1447 | Recall: 0.4600 | F1-Score: 0.2201

XGBoost (Fine-Tuned) → Accuracy: 0.6399 | AUC-ROC: 0.8256 | Precision: 0.1045 | Recall: 0.8400 | F1-Score: 0.1858

Logistic Regression → Accuracy: 0.7720 | AUC-ROC: 0.7948 | Precision: 0.1383 | Recall: 0.7000 | F1-Score: 0.2310

✅ Best XGBoost model saved as 'best_stroke_prediction_model.pkl'
