In [1]:
# ===============================================
# EXTRA MODELS FOR IMBALANCED DATA (EEC & BRF)
# ===============================================
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, balanced_accuracy_score,
    classification_report, confusion_matrix, precision_recall_curve
)

# Imbalanced-learn ensembles
from imblearn.ensemble import EasyEnsembleClassifier, BalancedRandomForestClassifier

# Single seed reused by the validation split and by both ensembles below so
# that reruns of this cell are reproducible.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# --------- Load artifacts & preprocess (same as your baseline) ----------
# NOTE(security): joblib.load unpickles arbitrary Python objects; only load
# these two files when they come from a trusted source.
preprocessor = joblib.load("preprocessor.pkl")
X_train, X_test, y_train, y_test = joblib.load("splits.pkl")

# NOTE(review): the loaded preprocessor is (re)fitted here on the whole
# training set, i.e. before the validation fold is carved out below, so the
# validation rows take part in the fit. Confirm this is intended.
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc  = preprocessor.transform(X_test)

# Validation split from TRAIN (stratified): 20% of the processed training
# rows are held out for threshold selection.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_proc, y_train, test_size=0.2, random_state=RANDOM_STATE, stratify=y_train
)

# --------- Models to try ----------
# Candidate ensembles keyed by the display name used in reports; both are
# seeded with RANDOM_STATE and configured with n_jobs=-1.
models = dict(
    EasyEnsembleClassifier=EasyEnsembleClassifier(
        n_estimators=10,  # number of balanced AdaBoost sub-ensembles
        n_jobs=-1,
        random_state=RANDOM_STATE,
    ),
    BalancedRandomForest=BalancedRandomForestClassifier(
        n_estimators=300,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    ),
)

_THRESHOLD_OBJECTIVES = ("F1", "BalancedAcc", "FixedRecall")


def _pick_threshold(y_true, scores, objective, fixed_recall):
    """Return the decision threshold on *scores* that best meets *objective*.

    Candidate thresholds are the ones reported by ``precision_recall_curve``.

    * ``"F1"``: threshold with the highest F1.
    * ``"BalancedAcc"``: threshold with the highest balanced accuracy.
    * ``"FixedRecall"``: highest-precision threshold whose recall is at least
      ``fixed_recall``; if none qualifies, the threshold with highest recall.

    Raises:
        ValueError: if *objective* is not one of the names above.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_true, scores)

    if objective == "F1":
        f1s = (2 * precisions * recalls) / (precisions + recalls + 1e-9)
        best_idx = int(np.argmax(f1s[:-1]))  # last PR point has no threshold
    elif objective == "BalancedAcc":
        # Balanced accuracy for every threshold at once instead of one
        # balanced_accuracy_score call per threshold (quadratic in the
        # validation size). Assumes labels are 0/1 with both classes present,
        # which the `(score >= thr).astype(int)` prediction rule already
        # relies on.
        scores = np.asarray(scores)
        is_pos = np.asarray(y_true) == 1
        pos_sorted = np.sort(scores[is_pos])
        neg_sorted = np.sort(scores[~is_pos])
        # side="left" counts samples scored strictly below thr, i.e. those
        # predicted negative under the `>= thr` rule.
        true_neg = np.searchsorted(neg_sorted, thresholds, side="left")
        true_pos = pos_sorted.size - np.searchsorted(pos_sorted, thresholds, side="left")
        bal_scores = (true_neg / neg_sorted.size + true_pos / pos_sorted.size) / 2
        best_idx = int(np.argmax(bal_scores))
    elif objective == "FixedRecall":
        cand = np.where(recalls[:-1] >= fixed_recall)[0]
        if len(cand):
            best_idx = int(cand[np.argmax(precisions[cand])])
        else:
            best_idx = int(np.argmax(recalls[:-1]))
    else:
        raise ValueError("Unknown objective")

    return float(thresholds[best_idx])


def evaluate_with_threshold(model, name, objective="F1", fixed_recall=0.80):
    """Fit *model*, tune its decision threshold on validation, score on test.

    Reads the module-level folds ``X_tr``/``y_tr`` (fit), ``X_val``/``y_val``
    (threshold selection) and ``X_test_proc``/``y_test`` (final metrics).
    The model is refitted on every call. A report is printed as a side effect.

    Args:
        model: estimator exposing ``fit`` and either ``predict_proba`` or
            ``decision_function``.
        name: label used in the printed report and the returned row.
        objective: ``"F1"``, ``"BalancedAcc"`` or ``"FixedRecall"``.
        fixed_recall: minimum recall required when ``objective`` is
            ``"FixedRecall"``; ignored otherwise.

    Returns:
        dict with the model name, objective, chosen threshold and test-set
        metrics (accuracy, balanced accuracy, precision, recall, F1, ROC AUC,
        PR AUC).

    Raises:
        ValueError: if *objective* is not recognised.
    """
    # Reject bad input before paying for the fit.
    if objective not in _THRESHOLD_OBJECTIVES:
        raise ValueError("Unknown objective")

    # Fit on the ORIGINAL (imbalanced) train fold — these models internally rebalance
    model.fit(X_tr, y_tr)

    # --- choose threshold on validation ---
    # NOTE: Some classifiers (like BRF/EEC) expose predict_proba; if not, use decision_function fallback.
    uses_proba = hasattr(model, "predict_proba")
    if uses_proba:
        val_proba = model.predict_proba(X_val)[:, 1]
    else:
        # fall back to decision_function (rare for these two, but safe)
        val_scores = model.decision_function(X_val)
        # map scores to pseudo-proba via min-max for thresholding (won't affect AUCs)
        smin, smax = val_scores.min(), val_scores.max()
        val_proba = (val_scores - smin) / (smax - smin + 1e-12)

    best_thr = _pick_threshold(y_val, val_proba, objective, fixed_recall)

    # --- evaluate on test ---
    if uses_proba:
        test_proba = model.predict_proba(X_test_proc)[:, 1]
    else:
        test_scores = model.decision_function(X_test_proc)
        # Reuse the VALIDATION min/max: rescaling with the test set's own
        # range would put test scores on a different scale than the one the
        # threshold was chosen on. Values may fall slightly outside [0, 1].
        test_proba = (test_scores - smin) / (smax - smin + 1e-12)

    y_pred = (test_proba >= best_thr).astype(int)

    print(f"\n=== {name} | Threshold objective: {objective} ===")
    print(f"Chosen threshold (val): {best_thr:.4f}")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    return {
        "Model": name,
        "Objective": objective,
        "Threshold": best_thr,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Balanced_Acc": balanced_accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1": f1_score(y_test, y_pred, zero_division=0),
        "ROC_AUC": roc_auc_score(y_test, test_proba),
        "PR_AUC": average_precision_score(y_test, test_proba),
    }

# Evaluate every model under each of the three thresholding rules, collecting
# one metrics row per (model, rule) pair in model-major order.
rows = []
for mname, model in models.items():
    for rule in ("F1", "BalancedAcc", "FixedRecall"):
        # fixed_recall only matters for the "FixedRecall" rule; 0.80 is also
        # the function's default, so passing it everywhere changes nothing.
        rows.append(
            evaluate_with_threshold(model, mname, objective=rule, fixed_recall=0.80)
        )

df_extra = pd.DataFrame(rows)

# Best PR_AUC first, ties broken by ROC_AUC.
summary_columns = [
    "Model", "Objective", "Accuracy", "Balanced_Acc", "Precision",
    "Recall", "F1", "ROC_AUC", "PR_AUC", "Threshold",
]
ranked = df_extra.sort_values(["PR_AUC", "ROC_AUC"], ascending=False)
print("\n=== Extra Models (EEC & BRF) — Summary (sorted by PR_AUC) ===")
print(ranked[summary_columns])



=== EasyEnsembleClassifier | Threshold objective: F1 ===
Chosen threshold (val): 0.4413
              precision    recall  f1-score   support

           0       0.88      0.04      0.07     17762
           1       0.10      0.95      0.18      1976

    accuracy                           0.13     19738
   macro avg       0.49      0.50      0.13     19738
weighted avg       0.80      0.13      0.08     19738

Confusion Matrix:
 [[  682 17080]
 [   94  1882]]

=== EasyEnsembleClassifier | Threshold objective: BalancedAcc ===
Chosen threshold (val): 0.5129
              precision    recall  f1-score   support

           0       0.90      0.62      0.74     17762
           1       0.10      0.38      0.16      1976

    accuracy                           0.60     19738
   macro avg       0.50      0.50      0.45     19738
weighted avg       0.82      0.60      0.68     19738

Confusion Matrix:
 [[11093  6669]
 [ 1217   759]]

=== EasyEnsembleClassifier | Threshold objective: FixedRec