# Ensemble Models

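This notebook tunes Random Forest, XGBoost, LightGBM, and CatBoost classifiers on SMOTE- and RUS-resampled training data, then compares them on a held-out test set.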


In [1]:
import os, sys, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

sys.path.append('..')
from utils.metrics import *

# Single seed reused below by the CV splitter, the randomized searches, and the estimators.
SEED = 42
# Also seed NumPy's legacy global RNG for anything that draws from it implicitly.
np.random.seed(SEED)


## Load Data


In [2]:
DATA_PATH = '../data/processed/'


def _read_pair(x_file, y_file):
    """Read one (features, labels) pair of pickles located under DATA_PATH."""
    return pd.read_pickle(DATA_PATH + x_file), pd.read_pickle(DATA_PATH + y_file)


# NOTE: unpickling can execute arbitrary code; only load artifacts this project produced.
# SMOTE training set, RUS training set, then the test set (features first, labels second).
X_train_smote, y_train_smote = _read_pair('X_train_smote.pkl', 'y_train_smote.pkl')
X_train_rus, y_train_rus = _read_pair('X_train_rus.pkl', 'y_train_rus.pkl')
X_test, y_test = _read_pair('X_test.pkl', 'y_test.pkl')

# Report the row count of each feature set as a quick sanity check.
for _tag, _frame in (('SMOTE', X_train_smote), ('RUS', X_train_rus), ('Test', X_test)):
    print(f"{_tag}: {len(_frame)} samples")


SMOTE: 454902 samples
RUS: 788 samples
Test: 56962 samples


In [3]:
# Shared 3-fold stratified splitter; shuffling is seeded so the splits are reproducible.
cv = StratifiedKFold(random_state=SEED, shuffle=True, n_splits=3)

# Metrics of every fitted model, keyed by labels such as 'RF-SMOTE'.
all_results = dict()


## Random Forest


In [None]:
# Search space: 2 * 3 * 2 * 2 = 24 combinations, of which n_iter=10 are sampled.
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 15, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}


def _tune_rf(X, y, label):
    """Run the randomized Random Forest search on one resampled training set.

    Parameters
    ----------
    X, y : training features and labels, passed unchanged to ``search.fit``.
    label : str
        Short tag of the resampling variant; only used in the progress messages.

    Returns
    -------
    The fitted ``RandomizedSearchCV`` instance.
    """
    print(f"Training RF on {label}...")
    # perf_counter() is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    # NOTE(review): n_jobs=-1 is set on both the estimator and the search, which may
    # oversubscribe the CPU - confirm whether only one level should be parallel.
    search = RandomizedSearchCV(
        RandomForestClassifier(random_state=SEED, n_jobs=-1),
        rf_params, n_iter=10, cv=cv, scoring='average_precision',
        random_state=SEED, n_jobs=-1,
    )
    # NOTE(review): the folds are cut from data that appears to be resampled already,
    # so CV scores may be optimistic - confirm whether resampling belongs inside each fold.
    search.fit(X, y)
    print(f"RF {label} done in {time.perf_counter()-t0:.0f}s")
    return search


# Same search on both resampled training sets, SMOTE first and RUS second.
rf_smote_search = _tune_rf(X_train_smote, y_train_smote, 'SMOTE')
rf_rus_search = _tune_rf(X_train_rus, y_train_rus, 'RUS')


Training RF on SMOTE...


In [None]:
# Evaluate RF: take the refit best estimator of each search and score it on the test set.
rf_smote = rf_smote_search.best_estimator_
rf_rus = rf_rus_search.best_estimator_

# Hard labels come from predict(); column 1 of predict_proba() is kept as the score.
rf_smote_pred, rf_smote_prob = rf_smote.predict(X_test), rf_smote.predict_proba(X_test)[:, 1]
rf_rus_pred, rf_rus_prob = rf_rus.predict(X_test), rf_rus.predict_proba(X_test)[:, 1]

all_results.update({
    'RF-SMOTE': calculate_all_metrics(y_test, rf_smote_pred, rf_smote_prob),
    'RF-RUS': calculate_all_metrics(y_test, rf_rus_pred, rf_rus_prob),
})

for _label in ('RF-SMOTE', 'RF-RUS'):
    print(_label + ":", all_results[_label])


## XGBoost


In [None]:
# Search space: 2 * 3 * 2 * 1 * 1 = 12 combinations, of which n_iter=10 are sampled.
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
}


def _tune_xgb(X, y, label):
    """Run the randomized XGBoost search on one resampled training set.

    Parameters
    ----------
    X, y : training features and labels, passed unchanged to ``search.fit``.
    label : str
        Short tag of the resampling variant; only used in the progress messages.

    Returns
    -------
    The fitted ``RandomizedSearchCV`` instance.
    """
    print(f"Training XGB on {label}...")
    # perf_counter() is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    # NOTE(review): n_jobs=-1 is set on both the estimator and the search, which may
    # oversubscribe the CPU - confirm whether only one level should be parallel.
    search = RandomizedSearchCV(
        XGBClassifier(random_state=SEED, eval_metric='aucpr', n_jobs=-1),
        xgb_params, n_iter=10, cv=cv, scoring='average_precision',
        random_state=SEED, n_jobs=-1,
    )
    # NOTE(review): the folds are cut from data that appears to be resampled already,
    # so CV scores may be optimistic - confirm whether resampling belongs inside each fold.
    search.fit(X, y)
    print(f"XGB {label} done in {time.perf_counter()-t0:.0f}s")
    return search


# Same search on both resampled training sets, SMOTE first and RUS second.
xgb_smote_search = _tune_xgb(X_train_smote, y_train_smote, 'SMOTE')
xgb_rus_search = _tune_xgb(X_train_rus, y_train_rus, 'RUS')


In [None]:
# Evaluate XGB: take the refit best estimator of each search and score it on the test set.
xgb_smote = xgb_smote_search.best_estimator_
xgb_rus = xgb_rus_search.best_estimator_

# Hard labels come from predict(); column 1 of predict_proba() is kept as the score.
xgb_smote_pred, xgb_smote_prob = xgb_smote.predict(X_test), xgb_smote.predict_proba(X_test)[:, 1]
xgb_rus_pred, xgb_rus_prob = xgb_rus.predict(X_test), xgb_rus.predict_proba(X_test)[:, 1]

all_results.update({
    'XGB-SMOTE': calculate_all_metrics(y_test, xgb_smote_pred, xgb_smote_prob),
    'XGB-RUS': calculate_all_metrics(y_test, xgb_rus_pred, xgb_rus_prob),
})

for _label in ('XGB-SMOTE', 'XGB-RUS'):
    print(_label + ":", all_results[_label])


## LightGBM


In [None]:
# Search space: 2 * 3 * 2 * 2 = 24 combinations, of which n_iter=10 are sampled.
lgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 7, -1],
    'learning_rate': [0.05, 0.1],
    'num_leaves': [31, 50],
}


def _tune_lgb(X, y, label):
    """Run the randomized LightGBM search on one resampled training set.

    Parameters
    ----------
    X, y : training features and labels, passed unchanged to ``search.fit``.
    label : str
        Short tag of the resampling variant; only used in the progress messages.

    Returns
    -------
    The fitted ``RandomizedSearchCV`` instance.
    """
    print(f"Training LGB on {label}...")
    # perf_counter() is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    # NOTE(review): n_jobs=-1 is set on both the estimator and the search, which may
    # oversubscribe the CPU - confirm whether only one level should be parallel.
    search = RandomizedSearchCV(
        LGBMClassifier(random_state=SEED, verbose=-1, n_jobs=-1),
        lgb_params, n_iter=10, cv=cv, scoring='average_precision',
        random_state=SEED, n_jobs=-1,
    )
    # NOTE(review): the folds are cut from data that appears to be resampled already,
    # so CV scores may be optimistic - confirm whether resampling belongs inside each fold.
    search.fit(X, y)
    print(f"LGB {label} done in {time.perf_counter()-t0:.0f}s")
    return search


# Same search on both resampled training sets, SMOTE first and RUS second.
lgb_smote_search = _tune_lgb(X_train_smote, y_train_smote, 'SMOTE')
lgb_rus_search = _tune_lgb(X_train_rus, y_train_rus, 'RUS')


In [None]:
# Evaluate LGB: take the refit best estimator of each search and score it on the test set.
lgb_smote = lgb_smote_search.best_estimator_
lgb_rus = lgb_rus_search.best_estimator_

# Hard labels come from predict(); column 1 of predict_proba() is kept as the score.
lgb_smote_pred, lgb_smote_prob = lgb_smote.predict(X_test), lgb_smote.predict_proba(X_test)[:, 1]
lgb_rus_pred, lgb_rus_prob = lgb_rus.predict(X_test), lgb_rus.predict_proba(X_test)[:, 1]

all_results.update({
    'LGB-SMOTE': calculate_all_metrics(y_test, lgb_smote_pred, lgb_smote_prob),
    'LGB-RUS': calculate_all_metrics(y_test, lgb_rus_pred, lgb_rus_prob),
})

for _label in ('LGB-SMOTE', 'LGB-RUS'):
    print(_label + ":", all_results[_label])


## CatBoost


In [None]:
# Search space: 2 * 2 * 2 = 8 combinations, so n_iter=8 covers the whole grid.
cat_params = {
    'iterations': [100, 200],
    'depth': [4, 6],
    'learning_rate': [0.05, 0.1],
}


def _tune_cat(X, y, label):
    """Run the randomized CatBoost search on one resampled training set.

    Parameters
    ----------
    X, y : training features and labels, passed unchanged to ``search.fit``.
    label : str
        Short tag of the resampling variant; only used in the progress messages.

    Returns
    -------
    The fitted ``RandomizedSearchCV`` instance.
    """
    print(f"Training CatBoost on {label}...")
    # perf_counter() is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    search = RandomizedSearchCV(
        CatBoostClassifier(random_state=SEED, verbose=False),
        cat_params, n_iter=8, cv=cv, scoring='average_precision',
        random_state=SEED, n_jobs=-1,
    )
    # NOTE(review): the folds are cut from data that appears to be resampled already,
    # so CV scores may be optimistic - confirm whether resampling belongs inside each fold.
    search.fit(X, y)
    print(f"Cat {label} done in {time.perf_counter()-t0:.0f}s")
    return search


# Same search on both resampled training sets, SMOTE first and RUS second.
cat_smote_search = _tune_cat(X_train_smote, y_train_smote, 'SMOTE')
cat_rus_search = _tune_cat(X_train_rus, y_train_rus, 'RUS')


In [None]:
# Evaluate Cat: take the refit best estimator of each search and score it on the test set.
cat_smote = cat_smote_search.best_estimator_
cat_rus = cat_rus_search.best_estimator_

# Hard labels come from predict(); column 1 of predict_proba() is kept as the score.
cat_smote_pred, cat_smote_prob = cat_smote.predict(X_test), cat_smote.predict_proba(X_test)[:, 1]
cat_rus_pred, cat_rus_prob = cat_rus.predict(X_test), cat_rus.predict_proba(X_test)[:, 1]

all_results.update({
    'Cat-SMOTE': calculate_all_metrics(y_test, cat_smote_pred, cat_smote_prob),
    'Cat-RUS': calculate_all_metrics(y_test, cat_rus_pred, cat_rus_prob),
})

for _label in ('Cat-SMOTE', 'Cat-RUS'):
    print(_label + ":", all_results[_label])


## Compare All


In [None]:
# Summarise the metrics gathered in all_results for every model/resampling pair.
# print_metrics_table is star-imported (presumably from utils.metrics); its layout is defined there.
print_metrics_table(all_results)


In [None]:
# Test-set scores of every model, keyed by the same labels used in all_results.
_scored_models = [
    ('RF-SMOTE', rf_smote_prob),
    ('RF-RUS', rf_rus_prob),
    ('XGB-SMOTE', xgb_smote_prob),
    ('XGB-RUS', xgb_rus_prob),
    ('LGB-SMOTE', lgb_smote_prob),
    ('LGB-RUS', lgb_rus_prob),
    ('Cat-SMOTE', cat_smote_prob),
    ('Cat-RUS', cat_rus_prob),
]
probs = dict(_scored_models)

# Overlay the precision-recall curves of all eight models.
plot_multiple_pr_curves(probs, y_test)
plt.show()


In [None]:
# ROC curves for the same score dictionary and test labels used for the PR curves.
# plot_multiple_roc_curves is star-imported (presumably from utils.metrics).
plot_multiple_roc_curves(probs, y_test)
plt.show()


In [None]:
# confusion matrices - SMOTE models, filled row by row on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
_smote_panels = (
    ('RF-SMOTE', rf_smote_pred),
    ('XGB-SMOTE', xgb_smote_pred),
    ('LGB-SMOTE', lgb_smote_pred),
    ('Cat-SMOTE', cat_smote_pred),
)
# axes.flat walks the grid in row-major order: [0,0], [0,1], [1,0], [1,1].
for _ax, (_title, _pred) in zip(axes.flat, _smote_panels):
    plot_confusion_matrix(y_test, _pred, _title, _ax)
plt.tight_layout()
plt.show()


In [None]:
# pick best by auc-pr (on ties, max() keeps the first label in insertion order)
# NOTE(review): the metrics in all_results are computed on the test set, so choosing the
# winner here makes its reported score optimistic - confirm this is acceptable.
best_name = max(all_results, key=lambda label: all_results[label]['auc_pr'])
best_metrics = all_results[best_name]
print(f"Best model: {best_name}")
print(f"Metrics: {best_metrics}")


## Save


In [None]:
os.makedirs('../models', exist_ok=True)

# save all models, one joblib file per fitted estimator
_model_files = [
    (rf_smote, '../models/rf_smote.joblib'),
    (rf_rus, '../models/rf_rus.joblib'),
    (xgb_smote, '../models/xgb_smote.joblib'),
    (xgb_rus, '../models/xgb_rus.joblib'),
    (lgb_smote, '../models/lgb_smote.joblib'),
    (lgb_rus, '../models/lgb_rus.joblib'),
    (cat_smote, '../models/cat_smote.joblib'),
    (cat_rus, '../models/cat_rus.joblib'),
]
for _estimator, _target in _model_files:
    joblib.dump(_estimator, _target)

# bundle the test labels, per-model scores, metrics and winning label into one file
results = dict(
    y_test=y_test,
    all_probs=probs,
    all_metrics=all_results,
    best=best_name,
)
joblib.dump(results, '../models/ensemble_results.joblib')
print("done")
