# Ensemble Models

This notebook tunes Random Forest, XGBoost, LightGBM, and CatBoost classifiers with randomized search, then compares them on the validation split.


In [None]:
import os, sys, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

sys.path.append('..')
from utils.metrics import *

# Single seed reused as `random_state` for the CV splitter, the estimators and
# the randomized searches in this notebook.
SEED = 42
# Also seed NumPy's global RNG for any code that draws from it implicitly.
np.random.seed(SEED)


## Load Data


In [None]:
DATA_PATH = '../data/processed/'

# Prefer the cached, already-preprocessed splits when they are present.
# NOTE(review): only X_train.pkl is probed; the other five pickles are assumed
# to sit next to it.
if os.path.exists(DATA_PATH + 'X_train.pkl'):
    # pd.read_pickle unpickles arbitrary objects -- only load trusted files.
    X_train, X_val, X_test, y_train, y_val, y_test = (
        pd.read_pickle(DATA_PATH + f'{split}.pkl')
        for split in ('X_train', 'X_val', 'X_test', 'y_train', 'y_val', 'y_test')
    )
else:
    # Temporary fallback: rebuild the splits straight from the raw CSV.
    from sklearn.preprocessing import StandardScaler
    df = pd.read_csv('../data/creditcard.csv')

    # Positional 70% / 15% / 15% split in file order (rows are not shuffled).
    train_end = int(len(df) * 0.70)
    val_end = int(len(df) * 0.85)

    train_df = df.iloc[:train_end].copy()
    val_df = df.iloc[train_end:val_end].copy()
    test_df = df.iloc[val_end:].copy()

    # Fit the scaler on the training rows only, then reuse it for val/test so
    # no statistics from the held-out rows leak into the transform.
    scaler = StandardScaler()
    train_df['Amount_scaled'] = scaler.fit_transform(train_df[['Amount']])
    val_df['Amount_scaled'] = scaler.transform(val_df[['Amount']])
    test_df['Amount_scaled'] = scaler.transform(test_df[['Amount']])

    # Features: columns V1..V28 plus the standardized amount.
    feat_cols = [f'V{i}' for i in range(1, 29)] + ['Amount_scaled']
    X_train, y_train = train_df[feat_cols], train_df['Class']
    X_val, y_val = val_df[feat_cols], val_df['Class']
    X_test, y_test = test_df[feat_cols], test_df['Class']

# Class-imbalance weight: ratio of negative (0) to positive (1) training labels.
n_neg = (y_train == 0).sum()
n_pos = (y_train == 1).sum()
if n_pos == 0:
    # Warnings are globally suppressed in this notebook, so a zero divisor
    # would otherwise yield a silent `inf` weight that is fed to the boosters.
    raise ValueError(
        "y_train contains no positive (label 1) samples; "
        "cannot compute scale_pos_weight"
    )
scale_pos_weight = n_neg / n_pos
print(f"Train: {len(X_train)}, scale_pos_weight: {scale_pos_weight:.1f}")


In [None]:
# One 3-fold stratified splitter (shuffled, fixed seed) shared by every
# randomized search so all models are scored on the same folds.
cv = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=SEED,
)


## Random Forest (Tuned)


In [None]:
# Candidate values sampled by the randomized search for the random forest.
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 15, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Time the whole search, including the final refit of the best candidate.
t0 = time.time()
# NOTE(review): both the forest and the search request n_jobs=-1 -- confirm
# this nesting does not oversubscribe the available cores.
rf_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(
        class_weight='balanced',
        random_state=SEED,
        n_jobs=-1,
    ),
    param_distributions=rf_params,
    n_iter=15,
    scoring='average_precision',
    cv=cv,
    random_state=SEED,
    n_jobs=-1,
)
rf_search.fit(X_train, y_train)
rf_time = time.time() - t0

print(f"RF done in {rf_time:.0f}s, best score: {rf_search.best_score_:.4f}")


In [None]:
# Evaluate the refit best random forest on the validation split.
rf = rf_search.best_estimator_
rf_pred = rf.predict(X_val)              # hard class labels
rf_prob = rf.predict_proba(X_val)[:, 1]  # second probability column (label 1 for 0/1 targets)
rf_metrics = calculate_all_metrics(y_val, rf_pred, rf_prob)
print(rf_metrics)


## XGBoost


In [None]:
# Candidate values sampled by the randomized search for XGBoost.
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8],
}

# Time the whole search, including the final refit of the best candidate.
t0 = time.time()
xgb_search = RandomizedSearchCV(
    # `use_label_encoder=False` was dropped: the argument is deprecated and
    # no longer has any effect in current XGBoost releases, where passing it
    # only produces an "unused parameter" warning on every fit. The labels
    # here are already 0/1, so no encoding is needed.
    XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=SEED,
                  eval_metric='aucpr', n_jobs=-1),
    xgb_params, n_iter=15, cv=cv, scoring='average_precision', random_state=SEED, n_jobs=-1
)
xgb_search.fit(X_train, y_train)
xgb_time = time.time() - t0

print(f"XGB done in {xgb_time:.0f}s, best: {xgb_search.best_score_:.4f}")


In [None]:
# Evaluate the refit best XGBoost model on the validation split.
xgb = xgb_search.best_estimator_
xgb_pred = xgb.predict(X_val)              # hard class labels
xgb_prob = xgb.predict_proba(X_val)[:, 1]  # second probability column (label 1 for 0/1 targets)
xgb_metrics = calculate_all_metrics(y_val, xgb_pred, xgb_prob)
print(xgb_metrics)


## LightGBM


In [None]:
# Candidate values sampled by the randomized search for LightGBM.
lgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 7, -1],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50],
    'subsample': [0.7, 0.8],
}

# Time the whole search, including the final refit of the best candidate.
t0 = time.time()
lgb_search = RandomizedSearchCV(
    # subsample_freq=1 turns row bagging on at every iteration. With the
    # default subsample_freq=0 LightGBM ignores `subsample`, which would make
    # that search dimension a no-op.
    LGBMClassifier(scale_pos_weight=scale_pos_weight, subsample_freq=1,
                   random_state=SEED, verbose=-1, n_jobs=-1),
    lgb_params, n_iter=15, cv=cv, scoring='average_precision', random_state=SEED, n_jobs=-1
)
lgb_search.fit(X_train, y_train)
lgb_time = time.time() - t0

print(f"LGB done in {lgb_time:.0f}s, best: {lgb_search.best_score_:.4f}")


In [None]:
# Evaluate the refit best LightGBM model on the validation split.
lgb = lgb_search.best_estimator_
lgb_pred = lgb.predict(X_val)              # hard class labels
lgb_prob = lgb.predict_proba(X_val)[:, 1]  # second probability column (label 1 for 0/1 targets)
lgb_metrics = calculate_all_metrics(y_val, lgb_pred, lgb_prob)
print(lgb_metrics)


## CatBoost


In [None]:
# Candidate values sampled by the randomized search for CatBoost.
cat_params = {
    'iterations': [100, 200],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5],
}

# Time the whole search, including the final refit of the best candidate.
t0 = time.time()
cat_search = RandomizedSearchCV(
    # allow_writing_files=False: the search fits many CatBoost models in
    # parallel (n_jobs=-1), and by default each one would write its training
    # logs into the same `catboost_info` directory.
    CatBoostClassifier(scale_pos_weight=scale_pos_weight, random_state=SEED,
                       verbose=False, allow_writing_files=False),
    cat_params, n_iter=15, cv=cv, scoring='average_precision', random_state=SEED, n_jobs=-1
)
cat_search.fit(X_train, y_train)
cat_time = time.time() - t0

print(f"Cat done in {cat_time:.0f}s, best: {cat_search.best_score_:.4f}")


In [None]:
# Evaluate the refit best CatBoost model on the validation split.
cat = cat_search.best_estimator_
cat_pred = cat.predict(X_val)              # hard class labels
cat_prob = cat.predict_proba(X_val)[:, 1]  # second probability column (label 1 for 0/1 targets)
cat_metrics = calculate_all_metrics(y_val, cat_pred, cat_prob)
print(cat_metrics)


## Compare All


In [None]:
# Gather each model's validation metrics under a short display label.
all_metrics = dict(
    RF=rf_metrics,
    XGB=xgb_metrics,
    LGB=lgb_metrics,
    Cat=cat_metrics,
)
print_metrics_table(all_metrics)


In [None]:
# Validation-set probability scores per model, keyed by display label.
probs = dict(
    RF=rf_prob,
    XGB=xgb_prob,
    LGB=lgb_prob,
    Cat=cat_prob,
)
# Overlay the precision-recall curves of all four models.
plot_multiple_pr_curves(probs, y_val)
plt.show()


In [None]:
# Overlay the ROC curves of all models, using the same validation-set scores
# collected in `probs`.
plot_multiple_roc_curves(probs, y_val)
plt.show()


In [None]:
# Confusion matrices for the four models, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
# axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
panels = (
    (rf_pred, 'RF'),
    (xgb_pred, 'XGB'),
    (lgb_pred, 'LGB'),
    (cat_pred, 'Cat'),
)
for ax, (preds, title) in zip(axes.flat, panels):
    plot_confusion_matrix(y_val, preds, title, ax)
plt.tight_layout()
plt.show()


In [None]:
# Select the model whose validation metrics report the highest 'auc_pr'.
# On ties, the first label in insertion order wins.
best_name = max(all_metrics, key=lambda label: all_metrics[label]['auc_pr'])
print(f"Best model: {best_name}")


## Save


In [None]:
# Persist the tuned estimators plus everything needed to redo the comparison.
os.makedirs('../models', exist_ok=True)

# One joblib file per tuned model.
for fitted_model, target_path in (
    (rf, '../models/rf_tuned.joblib'),
    (xgb, '../models/xgb_tuned.joblib'),
    (lgb, '../models/lgb_tuned.joblib'),
    (cat, '../models/cat_tuned.joblib'),
):
    joblib.dump(fitted_model, target_path)

# Validation labels, per-model scores and metrics, the winner, and search times.
results = {
    'y_val': y_val,
    'rf_prob': rf_prob,
    'xgb_prob': xgb_prob,
    'lgb_prob': lgb_prob,
    'cat_prob': cat_prob,
    'rf_metrics': rf_metrics,
    'xgb_metrics': xgb_metrics,
    'lgb_metrics': lgb_metrics,
    'cat_metrics': cat_metrics,
    'best': best_name,
    'times': {
        'rf': rf_time,
        'xgb': xgb_time,
        'lgb': lgb_time,
        'cat': cat_time,
    },
}
joblib.dump(results, '../models/ensemble_results.joblib')
print("done")
