# 02 - Modeling

This notebook is a placeholder/sample for the modeling work that follows the EDA. Edit as necessary:
- Loads the cleaned dataset saved by the EDA notebook
- Prepares preprocessing pipelines
- Trains baseline models (Logistic Regression, Random Forest, XGBoost)
- Handles class imbalance (class weights and SMOTE)
- Uses Optuna to tune an XGBoost model
- Evaluates models (ROC, PR, confusion matrix) and saves plots to `images/`


In [None]:
# Standard imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, confusion_matrix, classification_report, RocCurveDisplay, PrecisionRecallDisplay
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib

sns.set(style='whitegrid')
%matplotlib inline

CLEAN_PATH = '/mnt/data/heart-disease-project/data/processed/heart_clean.csv'
IMAGES_DIR = os.path.join('/mnt/data/heart-disease-project', 'images')
os.makedirs(IMAGES_DIR, exist_ok=True)
print('Images will be saved to:', IMAGES_DIR)


In [None]:
# 1. Read the cleaned dataset from CLEAN_PATH and preview it
df = pd.read_csv(filepath_or_buffer=CLEAN_PATH)
print('Loaded shape:', df.shape)
display(df.head(5))


In [None]:
# 2. Prepare features and target
TARGET = 'target'
if TARGET not in df.columns:
    raise KeyError('Target column not found in cleaned data.')
X = df.drop(columns=[TARGET])
y = df[TARGET]

# Identify numeric and categorical columns by dtype.
# 'number' matches every numeric width (int32, float32, uint8, ...), not only
# int64/float64, so narrower numeric columns are no longer left out.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# A ColumnTransformer only keeps the columns it is given (default remainder='drop'),
# so report any column that landed in neither list instead of losing it silently.
unassigned_cols = [c for c in X.columns if c not in num_cols and c not in cat_cols]
if unassigned_cols:
    print('WARNING: columns in neither group (will not reach the model):', unassigned_cols)
print('Numeric cols:', num_cols)
print('Categorical cols:', cat_cols)


## 3. Train/test split
We keep a held-out test set for final evaluation.


In [None]:
# Stratified 80/20 split with a fixed seed; the test part is kept for final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)


## 4. Preprocessing pipelines
Numeric: median imputation + standard scaling. Categorical: mode imputation + one-hot encoding.


In [None]:
from sklearn.preprocessing import OrdinalEncoder
def _dense_one_hot_encoder():
    """Build a dense-output OneHotEncoder that works across scikit-learn versions.

    The `sparse` argument was renamed to `sparse_output` in scikit-learn 1.2 and
    the old name was removed in 1.4, so try the new spelling first and fall back
    to the old one on releases that do not know it.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # scikit-learn < 1.2 rejects the unknown keyword
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


# Numeric columns: fill gaps with the median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Categorical columns: fill gaps with the mode, then one-hot encode
# (categories unseen during fit are encoded as all zeros).
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', _dense_one_hot_encoder())
])
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])
print('Preprocessor ready')


## 5. Baseline models
We'll train Logistic Regression (with class weight) and Random Forest. We'll also try a SMOTE pipeline.


In [None]:
# Baseline 1: logistic regression with balanced class weights.
lr_pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)),
])
# fit() returns the pipeline itself; column 1 holds the probability of the second class.
lr_probs = lr_pipe.fit(X_train, y_train).predict_proba(X_test)[:, 1]
print('Logistic AUC:', roc_auc_score(y_test, lr_probs))

# Baseline 2: random forest with balanced class weights (no oversampling).
rf_pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(class_weight='balanced', n_estimators=200, random_state=42)),
])
rf_probs = rf_pipe.fit(X_train, y_train).predict_proba(X_test)[:, 1]
print('Random Forest AUC:', roc_auc_score(y_test, rf_probs))


## 6. SMOTE pipeline (oversampling) + Random Forest
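SMOTE operates only on numeric arrays, so it is placed after the preprocessor. imbalanced-learn's pipeline makes this simple and applies the resampling only during `fit`, never when predicting on the test set.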


In [None]:
# Preprocess, oversample with SMOTE, then fit a random forest on the resampled data.
smote = SMOTE(random_state=42)
smote_pipe = ImbPipeline(steps=[
    ('pre', preprocessor),
    ('smote', smote),
    ('clf', RandomForestClassifier(random_state=42, n_estimators=200)),
])
# fit() returns the pipeline itself, so fitting and scoring can be chained.
smote_probs = smote_pipe.fit(X_train, y_train).predict_proba(X_test)[:, 1]
print('SMOTE + RF AUC:', roc_auc_score(y_test, smote_probs))


## 7. Evaluate and plot ROC and PR curves for the models


In [None]:
from sklearn.metrics import roc_curve  # imported once instead of on every loop iteration

# name -> (fitted pipeline, test-set probabilities for the second class)
models = {
    'Logistic': (lr_pipe, lr_probs),
    'RandomForest': (rf_pipe, rf_probs),
    'SMOTE_RandomForest': (smote_pipe, smote_probs)
}
plt.figure(figsize=(8,6))
for name, (m, probs) in models.items():
    # Best effort: a failure for one model is reported and the others are still drawn.
    try:
        fpr, tpr, _ = roc_curve(y_test, probs)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f"{name} (AUC={roc_auc:.3f})")
    except Exception as e:
        print('Could not plot ROC for', name, e)
# Diagonal reference line (FPR == TPR).
plt.plot([0,1],[0,1],'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves')
plt.legend()
fn = os.path.join(IMAGES_DIR, 'roc_curves.png')
plt.savefig(fn, dpi=150, bbox_inches='tight')
plt.show()

# Precision-recall curves for every candidate model on one figure
plt.figure(figsize=(8, 6))
for name, (_pipe, probs) in models.items():
    # Best effort: report a failing model and keep drawing the rest.
    try:
        prec_vals, rec_vals, _ = precision_recall_curve(y_test, probs)
        area = auc(rec_vals, prec_vals)
        plt.plot(rec_vals, prec_vals, label=f"{name} (PR-AUC={area:.3f})")
    except Exception as e:
        print('Could not plot PR for', name, e)
plt.title('Precision-Recall Curves')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
fn = os.path.join(IMAGES_DIR, 'pr_curves.png')
plt.savefig(fn, dpi=150, bbox_inches='tight')
plt.show()


## 8. Confusion matrix & classification report for the best model (by AUC)


In [None]:
# Rank the candidates by test-set ROC AUC and keep the top one.
aucs = dict(
    (name, roc_auc_score(y_test, probs))
    for name, (_, probs) in models.items()
)
best_name = max(aucs, key=aucs.get)
print('AUCs:', aucs)
print('Best model by AUC:', best_name)
best_model, best_probs = models[best_name]

# Hard labels at a 0.5 probability threshold.
best_preds = (best_probs >= 0.5).astype(int)
cm = confusion_matrix(y_test, best_preds)
print('Confusion matrix:\n', cm)
print('\nClassification report:')
print(classification_report(y_test, best_preds))

# Heatmap of the confusion matrix, saved under the winning model's name.
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title(f'Confusion Matrix - {best_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
fn = os.path.join(IMAGES_DIR, f'confusion_matrix_{best_name}.png')
plt.savefig(fn, dpi=150, bbox_inches='tight')
plt.show()


## 9. Hyperparameter tuning with Optuna for XGBoost
We will run a simple Optuna search to maximize cross-validated AUC. This is beginner-friendly but still powerful.


In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of an XGBoost pipeline.

    Samples one hyperparameter set from `trial`, wraps an XGBClassifier in the
    shared preprocessing pipeline and scores it on the training split with
    4-fold stratified cross-validation. Returns the mean fold AUC as a float.
    """
    # Tunable hyperparameters; the suggest_* calls run in the order listed.
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 400),
        max_depth=trial.suggest_int('max_depth', 2, 8),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
    )
    # Settings held constant across trials.
    fixed = dict(use_label_encoder=False, eval_metric='logloss', random_state=42)

    candidate = Pipeline([
        ('pre', preprocessor),
        ('clf', XGBClassifier(**sampled, **fixed)),
    ])
    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    fold_aucs = cross_val_score(candidate, X_train, y_train, cv=folds, scoring='roc_auc')
    return float(np.mean(fold_aucs))

# TPE is Optuna's default sampler; passing it explicitly lets us fix its seed so
# the search (and the hyperparameters it selects) is repeatable across runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=25)
print('Best trial:')
print(study.best_trial.params)


In [None]:
# Train a final XGBoost model with the best params
# Copy the params so the study's own record of the best trial is never mutated.
best_params = dict(study.best_trial.params)
best_params.update({'use_label_encoder': False, 'eval_metric': 'logloss', 'random_state': 42})
final_xgb = Pipeline([
    ('pre', preprocessor),
    ('clf', XGBClassifier(**best_params))
])
final_xgb.fit(X_train, y_train)
xgb_probs = final_xgb.predict_proba(X_test)[:,1]
print('Final XGB AUC:', roc_auc_score(y_test, xgb_probs))
# Save model. joblib.dump does not create missing directories, so make sure
# models/ exists first instead of failing with FileNotFoundError.
models_dir = os.path.join('/mnt/data/heart-disease-project', 'models')
os.makedirs(models_dir, exist_ok=True)
model_path = os.path.join(models_dir, 'final_xgb.pkl')
joblib.dump(final_xgb, model_path)
print('Saved final model to:', model_path)


## 10. Compare final XGBoost to previous best and save results


In [None]:
finals = {
    'XGBoost_Optuna': (final_xgb, xgb_probs)
}
# Baselines plus the tuned model, each as name -> (pipeline, test-set probabilities).
all_models = {**models, **finals}
for name, (m, probs) in all_models.items():
    try:
        # precision_recall_curve returns (precision, recall, thresholds); all three
        # must be unpacked, otherwise the call raises and the model is skipped.
        precision, recall, _ = precision_recall_curve(y_test, probs)
        print(f"{name}: AUC={roc_auc_score(y_test, probs):.3f}, PR-AUC={auc(recall, precision):.3f}")
    except Exception as e:
        print('Skipped metrics for', name, e)


In [None]:
# Save a simple CSV of model scores
scores = []
for name, (m, probs) in all_models.items():
    try:
        scores.append({'model': name, 'auc': roc_auc_score(y_test, probs)})
    except Exception as e:
        # Still best effort, but say which model was left out and why.
        print('Skipped score for', name, e)
# Explicit columns keep sort_values working even if every model was skipped.
scores_df = pd.DataFrame(scores, columns=['model', 'auc']).sort_values('auc', ascending=False)
# to_csv does not create missing directories, so make sure reports/ exists.
reports_dir = os.path.join('/mnt/data/heart-disease-project', 'reports')
os.makedirs(reports_dir, exist_ok=True)
scores_df.to_csv(os.path.join(reports_dir, 'model_scores.csv'), index=False)
display(scores_df)


## 11. Save ROC curve for final XGBoost


In [None]:
from sklearn.metrics import roc_curve
# ROC curve of the tuned XGBoost pipeline on the held-out test set.
fpr, tpr, _ = roc_curve(y_test, xgb_probs)
xgb_roc_label = f'XGBoost (AUC={roc_auc_score(y_test, xgb_probs):.3f})'
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, label=xgb_roc_label)
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.title('Final XGBoost ROC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
fn = os.path.join(IMAGES_DIR, 'final_xgb_roc.png')
plt.savefig(fn, dpi=150, bbox_inches='tight')
plt.show()


## 12. Save Precision-Recall for final XGBoost


In [None]:
# Precision-recall curve of the tuned XGBoost pipeline on the held-out test set.
precision, recall, _ = precision_recall_curve(y_test, xgb_probs)
xgb_pr_label = f'XGBoost (PR-AUC={auc(recall, precision):.3f})'
plt.figure(figsize=(6, 5))
plt.plot(recall, precision, label=xgb_pr_label)
plt.title('Final XGBoost Precision-Recall')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
fn = os.path.join(IMAGES_DIR, 'final_xgb_pr.png')
plt.savefig(fn, dpi=150, bbox_inches='tight')
plt.show()


## 13. Feature importance (from XGBoost) — simple approach
We recover the feature names produced by the preprocessor (the numeric column names followed by the one-hot encoded category names) and pair them with the importances reported by the XGBoost model.


In [None]:
def get_feature_names(column_transformer):
    """Return the output feature names of a ColumnTransformer (works for our simple pipelines).

    Parameters
    ----------
    column_transformer : object with a `transformers` list of (name, transformer,
        columns) triples and, once fitted, a matching `transformers_` list.

    Returns
    -------
    list
        For a pipeline containing an 'ohe' step, the names reported by that
        encoder's `get_feature_names_out`; for any other transformer, the input
        column names unchanged. Entries are in transformer order.

    Notes
    -----
    The fitted `transformers_` list is preferred: the objects in `transformers`
    are the unfitted originals, and asking an unfitted encoder for its feature
    names fails. Dropped transformers and empty column selections contribute no
    features and are skipped.
    """
    fitted = getattr(column_transformer, 'transformers_', None)
    triples = fitted if fitted is not None else column_transformer.transformers

    out = []
    for name, trans, cols in triples:
        if name == 'remainder':
            continue
        if isinstance(trans, str) and trans == 'drop':
            continue
        # A single column may be given as a bare string; do not split it into characters.
        cols = [cols] if isinstance(cols, str) else list(cols)
        if not cols:
            # Nothing was selected, so this transformer was never fitted and emits nothing.
            continue
        if hasattr(trans, 'named_steps') and 'ohe' in trans.named_steps:
            ohe = trans.named_steps['ohe']
            out.extend(list(ohe.get_feature_names_out(cols)))
        else:
            out.extend(cols)
    return out

# Best-effort plot: any failure (name extraction, length mismatch, plotting) is
# reported instead of aborting the notebook, so the name lookup lives inside the try too.
try:
    feat_names = get_feature_names(preprocessor)
    booster = final_xgb.named_steps['clf']
    # The classifier was trained on the preprocessor's output, so its importances
    # line up with the transformed feature names, not the raw input columns.
    importances = booster.feature_importances_
    fi_df = pd.DataFrame({'feature': feat_names, 'importance': importances})
    fi_df = fi_df.sort_values('importance', ascending=False).head(20)
    plt.figure(figsize=(8,6))
    sns.barplot(x='importance', y='feature', data=fi_df)
    plt.title('Top 20 feature importances (XGBoost)')
    fn = os.path.join(IMAGES_DIR, 'xgb_feature_importance.png')
    plt.savefig(fn, dpi=150, bbox_inches='tight')
    plt.show()
except Exception as e:
    print('Could not compute feature importances:', e)


## 14. Save final model and report


In [None]:
# Project root: the same location the other cells use for images, models and reports.
base = '/mnt/data/heart-disease-project'
os.makedirs(os.path.join(base, 'models'), exist_ok=True)
joblib.dump(final_xgb, os.path.join(base, 'models', 'final_xgb_optuna.pkl'))
print('Saved final model to models/final_xgb_optuna.pkl')
print('Modeling notebook complete. Review images in images/ and model scores in reports/model_scores.csv')
