# Heart Stroke Risk Prediction
## Notebook 3: Model Training, Tuning & Evaluation
**Author:** Dev Kapania | IIT Roorkee Research Intern

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_auc_score, roc_curve, f1_score)
import warnings
# Install a global "ignore" filter that matches every warning category, so
# nothing emitted via the warnings module is shown after this point.
# NOTE(review): this also hides any convergence or deprecation warnings the
# libraries imported above may emit — consider narrowing the filter to a
# specific category once the notebook is stable.
warnings.filterwarnings('ignore')

# Confirmation that the import cell ran to completion.
print('Libraries loaded!')

## 1. Load Processed Data

In [None]:
# Read the four .npy arrays from ../data/processed/ in a fixed order
# (train features, test features, train labels, test labels) and bind them
# to the names used by the rest of the notebook.
X_train, X_test, y_train, y_test = (
    np.load(npy_path)
    for npy_path in (
        '../data/processed/X_train.npy',
        '../data/processed/X_test.npy',
        '../data/processed/y_train.npy',
        '../data/processed/y_test.npy',
    )
)

# Report the array shapes as a quick sanity check on the loaded splits.
print(
    f'X_train: {X_train.shape}, y_train: {y_train.shape}',
    f'X_test:  {X_test.shape},  y_test:  {y_test.shape}',
    sep='\n',
)

## 2. Train All 3 Models

In [None]:
# Candidate classifiers, each constructed with a fixed random_state of 42.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
}

# Per-model record: fitted estimator, test-set predictions, test F1,
# test ROC-AUC and mean 5-fold cross-validated F1 on the training data.
results = {}
for label, clf in models.items():
    # Fit on the full training split, then predict the held-out test split.
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Mean F1 over 5 cross-validation folds of the training data.
    fold_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1')
    cv_score = fold_scores.mean()

    # Test-set metrics; ROC-AUC uses column 1 of predict_proba.
    test_f1 = f1_score(y_test, y_pred)
    test_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

    results[label] = dict(
        model=clf,
        y_pred=y_pred,
        f1=test_f1,
        roc_auc=test_auc,
        cv_f1=cv_score,
    )
    print(f'{label}: F1={test_f1:.3f} | ROC-AUC={test_auc:.3f} | CV-F1={cv_score:.3f}')

## 3. Model Comparison

In [None]:
# Collect the three headline metrics for every trained model, keyed by
# model name, then lay them out as one row per model rounded to 3 decimals.
metric_table = {}
for model_name, scores in results.items():
    metric_table[model_name] = {
        'F1 Score': scores['f1'],
        'ROC-AUC': scores['roc_auc'],
        'CV F1': scores['cv_f1'],
    }
comparison_df = pd.DataFrame(metric_table).transpose().round(3)

print('Model Comparison:')
print(comparison_df)

# Grouped bar chart of the metrics per model, written to disk before display.
ax = comparison_df.plot.bar(figsize=(10, 5), colormap='RdYlGn')
ax.set_title('Model Performance Comparison', fontsize=13, fontweight='bold')
ax.set_ylabel('Score')
plt.xticks(rotation=15)
ax.legend(loc='lower right')
plt.tight_layout()
plt.savefig('../data/processed/model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Hyperparameter Tuning — XGBoost

In [None]:
# Search space: 2 values for each of 4 hyperparameters (16 combinations).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5],
    learning_rate=[0.05, 0.1],
    subsample=[0.8, 1.0],
)

# Exhaustive 5-fold grid search over the space above, scored by F1 and
# run on all available cores (n_jobs=-1).
xgb = XGBClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated F1.
print(f'\nBest parameters: {grid_search.best_params_}')
print(f'Best CV F1: {grid_search.best_score_:.3f}')

## 5. Final Model Evaluation

In [None]:
# Take the best estimator found by the grid search and score the test split:
# hard predictions plus the probability in column 1 of predict_proba.
best_model = grid_search.best_estimator_
y_pred_final = best_model.predict(X_test)
y_prob_final = best_model.predict_proba(X_test)[:, 1]

# Display names shared by the text report and the heatmap tick labels.
class_labels = ['No Disease', 'Heart Disease']
# ROC-AUC on the test split, reused by the printout and the plot legend.
final_auc = roc_auc_score(y_test, y_prob_final)

print('Classification Report (XGBoost - Tuned):')
print(classification_report(y_test, y_pred_final, target_names=class_labels))
print(f'ROC-AUC: {final_auc:.4f}')

# One figure, two panels: confusion matrix (left) and ROC curve (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_cm, ax_roc = axes

# Left panel: confusion matrix as an annotated heatmap of integer counts.
cm = confusion_matrix(y_test, y_pred_final)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax_cm,
    xticklabels=class_labels,
    yticklabels=class_labels,
)
ax_cm.set_title('Confusion Matrix')
ax_cm.set_ylabel('Actual')
ax_cm.set_xlabel('Predicted')

# Right panel: ROC curve with the diagonal drawn as a dashed reference line.
fpr, tpr, _ = roc_curve(y_test, y_prob_final)
ax_roc.plot(fpr, tpr, color='#e74c3c', lw=2, label=f'ROC AUC = {final_auc:.2f}')
ax_roc.plot([0, 1], [0, 1], 'k--', lw=1)
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend()

plt.tight_layout()
plt.savefig('../data/processed/final_evaluation.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Feature Importance

In [None]:
import pandas as pd

# Wrap the tuned model's importances in a Series (default integer index,
# no feature names are attached here), sort ascending and keep the last
# ten entries so the largest importance ends up last in plotting order.
importance_series = pd.Series(best_model.feature_importances_)
feat_imp = importance_series.sort_values().iloc[-10:]

# Horizontal bar chart of the selected importances, saved before display.
plt.figure(figsize=(8, 5))
ax = feat_imp.plot.barh(color='#3498db')
ax.set_title('Top 10 Feature Importances (XGBoost)', fontsize=12, fontweight='bold')
ax.set_xlabel('Importance Score')
plt.tight_layout()
plt.savefig('../data/processed/feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Save Final Model

In [None]:
import os

# joblib.dump opens the target path for writing and does not create missing
# parent directories, so make sure ../models/ exists first. exist_ok=True
# keeps this a no-op when the directory is already there.
os.makedirs('../models', exist_ok=True)

# Persist the tuned XGBoost estimator from the grid search together with
# the two baseline models fitted in the training loop.
joblib.dump(best_model, '../models/xgboost_model.pkl')
joblib.dump(results['Logistic Regression']['model'], '../models/logistic_regression.pkl')
joblib.dump(results['Random Forest']['model'], '../models/random_forest.pkl')

print('All models saved to ../models/')
print('Best model: XGBoost (Tuned)')