# 05 - Advanced Models

Goal: train more expressive models and perform light hyperparameter tuning (a small randomized search for Random Forest; XGBoost and LightGBM use fixed settings).

Models:
- Random Forest
- XGBoost (if available)
- LightGBM (if available)

We will compare performance (classification report and ROC AUC) on the same train/test split.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from pathlib import Path
import numpy as np

# Load the feature matrices and label vectors written to data/processed.
PROCESSED_DIR = Path('data/processed')
X_train = pd.read_csv(PROCESSED_DIR / 'X_train_scaled.csv')
X_test = pd.read_csv(PROCESSED_DIR / 'X_test_scaled.csv')
# Squeeze along the column axis only: a single-column label file always
# becomes a Series, whereas a bare squeeze() would also collapse a one-row
# file down to a scalar and break the downstream fit/score calls.
y_train = pd.read_csv(PROCESSED_DIR / 'y_train.csv').squeeze('columns')
y_test = pd.read_csv(PROCESSED_DIR / 'y_test.csv').squeeze('columns')

# Random Forest: light randomized search over a small hyperparameter grid,
# scored by ROC AUC with 3-fold cross-validation.
param_dist = {
    'n_estimators': [200, 400],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
rf = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=5,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
)
rf_search.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the test split.
rf_best = rf_search.best_estimator_
rf_probs = rf_best.predict_proba(X_test)[:, 1]
rf_auc = roc_auc_score(y_test, rf_probs)
rf_report = classification_report(
    y_test,
    rf_best.predict(X_test),
    output_dict=True,
)

# Metrics for every model are collected in this dict, keyed by model name.
results = {'random_forest': {'report': rf_report, 'roc_auc': rf_auc}}

# Optional: XGBoost
# Any failure inside the try (including the package not being installed) is
# recorded under results['xgboost'] instead of stopping the notebook.
try:
    from xgboost import XGBClassifier

    # Ratio of non-positive to positive labels, used to re-weight the
    # positive class (assumes y_train is 0/1-encoded -- TODO confirm).
    n_pos = y_train.sum()
    n_neg = len(y_train) - n_pos
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'scale_pos_weight': n_neg / n_pos,
        'random_state': 42,
        'n_estimators': 400,
        'learning_rate': 0.05,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'tree_method': 'hist',
    }
    xgb = XGBClassifier(**xgb_params)
    xgb.fit(X_train, y_train)

    xgb_probs = xgb.predict_proba(X_test)[:, 1]
    xgb_report = classification_report(
        y_test,
        xgb.predict(X_test),
        output_dict=True,
    )
    xgb_auc = roc_auc_score(y_test, xgb_probs)
    results['xgboost'] = {'report': xgb_report, 'roc_auc': xgb_auc}
except Exception as exc:
    results['xgboost'] = {'error': str(exc)}

# Optional: LightGBM
# Any failure inside the try (including the package not being installed) is
# recorded under results['lightgbm'] instead of stopping the notebook.
try:
    import lightgbm as lgb

    lgb_params = {
        'class_weight': 'balanced',
        'n_estimators': 400,
        'learning_rate': 0.05,
        'num_leaves': 64,
        'random_state': 42,
    }
    lgb_model = lgb.LGBMClassifier(**lgb_params)
    lgb_model.fit(X_train, y_train)

    lgb_probs = lgb_model.predict_proba(X_test)[:, 1]
    lgb_report = classification_report(
        y_test,
        lgb_model.predict(X_test),
        output_dict=True,
    )
    lgb_auc = roc_auc_score(y_test, lgb_probs)
    results['lightgbm'] = {'report': lgb_report, 'roc_auc': lgb_auc}
except Exception as exc:
    results['lightgbm'] = {'error': str(exc)}

# Display the collected metrics (or error messages) for all models.
results

Feature importance (where available) should be captured and saved to `results/metrics` for later visualization.