# 05 - Advanced Models

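Goal: train more expressive models and perform light hyperparameter tuning (a small randomized search for the Random Forest; XGBoost and LightGBM are trained with fixed settings).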

Models:
- Random Forest
- XGBoost (if available)
- LightGBM (if available)

We will compare performance on the same train/test split.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from pathlib import Path
import numpy as np
import os

PROCESSED_DIR = Path('data/processed')


def _locate_raw_file(raw_dir):
    """Return the first of ``data.csv`` / ``dataset.csv`` found in *raw_dir*.

    Prints guidance for the user and raises ``FileNotFoundError`` when
    neither candidate exists.
    """
    possible_raw_files = [raw_dir / 'data.csv', raw_dir / 'dataset.csv']
    for candidate in possible_raw_files:
        if candidate.exists():
            return candidate

    # Nothing usable: list whatever CSVs are present so the user can fix it.
    available_files = list(raw_dir.glob("*.csv")) if raw_dir.exists() else []
    print("Could not find the required raw data file 'data.csv' or 'dataset.csv' in the directory:", raw_dir)
    if available_files:
        print("Available csv files in data/raw/:")
        for f in available_files:
            print(f"  - {f}")
        print("Please rename the appropriate raw file to 'data.csv' or update the code to use the correct filename.")
    else:
        print("No CSV files found in the data/raw directory.")
        print("Please place your raw data file (with a 'target' column) named 'data.csv' in the data/raw directory.")
    raise FileNotFoundError(f"Raw data file not found. Checked: {[str(f) for f in possible_raw_files]}")


def _read_processed_split():
    """Read the four cached split files from ``PROCESSED_DIR``."""
    X_train = pd.read_csv(PROCESSED_DIR / 'X_train_scaled.csv')
    X_test = pd.read_csv(PROCESSED_DIR / 'X_test_scaled.csv')
    # squeeze("columns") collapses only the column axis, so a one-row target
    # file still yields a Series; a bare squeeze() would return a scalar.
    y_train = pd.read_csv(PROCESSED_DIR / 'y_train.csv').squeeze("columns")
    y_test = pd.read_csv(PROCESSED_DIR / 'y_test.csv').squeeze("columns")
    return X_train, X_test, y_train, y_test


def load_or_create_data():
    """Load the cached train/test split, creating it from raw data if needed.

    If any of the four files under ``PROCESSED_DIR`` is missing, the raw CSV
    (``data/raw/data.csv`` or ``data/raw/dataset.csv``) is read, split 80/20
    with stratification on the ``target`` column (``random_state=42``), and
    all four files are (re)written. The split is then always read back from
    disk so both paths return identically parsed data.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` -- two DataFrames followed by
        two Series.

    Raises:
        FileNotFoundError: the split must be generated but no raw file exists.
        ValueError: the raw file has no ``target`` column.
    """
    required_files = [
        PROCESSED_DIR / name
        for name in ('X_train_scaled.csv', 'X_test_scaled.csv', 'y_train.csv', 'y_test.csv')
    ]
    missing_files = [str(f) for f in required_files if not f.exists()]

    if missing_files:
        print("Warning: The following required files are missing:")
        for f in missing_files:
            print(f"  - {f}")
        print("Generating new train/test split from raw data...")

        raw_file = _locate_raw_file(Path('data/raw'))

        df = pd.read_csv(raw_file)
        if 'target' not in df.columns:
            raise ValueError("Raw data must contain a 'target' column.")

        X = df.drop(columns=['target'])
        y = df['target']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # NOTE: no scaling is applied here even though the files are named
        # '*_scaled.csv'. Fit a scaler on X_train (and apply it to X_test)
        # before saving if the downstream models require scaled features.
        PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
        X_train.to_csv(PROCESSED_DIR / 'X_train_scaled.csv', index=False)
        X_test.to_csv(PROCESSED_DIR / 'X_test_scaled.csv', index=False)
        y_train.to_csv(PROCESSED_DIR / 'y_train.csv', index=False)
        y_test.to_csv(PROCESSED_DIR / 'y_test.csv', index=False)

    # Single read path for both the cached and the freshly generated case.
    return _read_processed_split()

# Fetch the train/test split (generated from raw data when not cached on disk).
X_train, X_test, y_train, y_test = load_or_create_data()

# Random Forest: light randomized hyperparameter search.
rf = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
# Candidate values for each tuned hyperparameter.
param_dist = dict(
    n_estimators=[200, 400],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)
# 5 sampled configurations, 3-fold cross-validation each, ranked by ROC AUC.
rf_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=5,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
)
rf_search.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out test set.
rf_best = rf_search.best_estimator_
# Column 1 of predict_proba is the second class (the positive class for 0/1 labels).
rf_probs = rf_best.predict_proba(X_test)[:, 1]
rf_report = classification_report(y_test, rf_best.predict(X_test), output_dict=True)
rf_auc = roc_auc_score(y_test, rf_probs)

# Shared results dict; the optional models below add their own entries.
results = {}
results['random_forest'] = {'report': rf_report, 'roc_auc': rf_auc}

# Optional: XGBoost. Any failure (including the package not being installed)
# is recorded under results['xgboost']['error'] instead of stopping the run.
try:
    from xgboost import XGBClassifier

    # (n_samples - sum of labels) / sum of labels; this is negatives/positives
    # when y_train is encoded as 0/1 -- TODO confirm the label encoding.
    positive_total = y_train.sum()
    xgb_params = dict(
        objective='binary:logistic',
        eval_metric='auc',
        scale_pos_weight=(len(y_train) - positive_total) / positive_total,
        random_state=42,
        n_estimators=400,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method='hist',
    )
    xgb = XGBClassifier(**xgb_params)
    xgb.fit(X_train, y_train)

    # Score on the same held-out test set used for the other models.
    xgb_probs = xgb.predict_proba(X_test)[:, 1]
    xgb_report = classification_report(y_test, xgb.predict(X_test), output_dict=True)
    xgb_auc = roc_auc_score(y_test, xgb_probs)
    results['xgboost'] = {'report': xgb_report, 'roc_auc': xgb_auc}
except Exception as exc:
    results['xgboost'] = {'error': str(exc)}

# Optional: LightGBM. Any failure (including the package not being installed)
# is recorded under results['lightgbm']['error'] instead of stopping the run.
try:
    import lightgbm as lgb

    lgb_params = dict(
        class_weight='balanced',
        n_estimators=400,
        learning_rate=0.05,
        num_leaves=64,
        random_state=42,
    )
    lgb_model = lgb.LGBMClassifier(**lgb_params)
    lgb_model.fit(X_train, y_train)

    # Score on the same held-out test set used for the other models.
    lgb_probs = lgb_model.predict_proba(X_test)[:, 1]
    lgb_report = classification_report(y_test, lgb_model.predict(X_test), output_dict=True)
    lgb_auc = roc_auc_score(y_test, lgb_probs)
    results['lightgbm'] = {'report': lgb_report, 'roc_auc': lgb_auc}
except Exception as exc:
    results['lightgbm'] = {'error': str(exc)}

# Last expression of the cell: displays the collected metrics.
results

Next step: feature importances (for the models that expose them) should be captured and saved to `results/metrics` for later visualization; the cell above does not do this yet.