In [16]:
# ==================================================
# SCRIPT 2: RESAMPLING VARIANTS (ALL MODELS/RESAMPLERS)
# ==================================================
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, balanced_accuracy_score,
    classification_report, confusion_matrix, precision_recall_curve
)

# ------------------
# Setup & Artifacts
# ------------------
# Single seed reused by the validation split, the models and the resamplers.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Pickled inputs: a 4-tuple train/test split and a preprocessing object
# (presumably written by an earlier script -- verify against the producer).
X_train, X_test, y_train, y_test = joblib.load("splits.pkl")
preprocessor = joblib.load("preprocessor.pkl")

# The preprocessor is fitted on the whole training split here; the test split
# is only transformed with that fit.
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

# Carve a stratified 20% validation set out of the processed training rows;
# it is used further down for decision-threshold tuning.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_proc,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=RANDOM_STATE,
)

# ----------------------
# Base models (plain; resampling handles imbalance)
# ----------------------
# Keys are the display names used in reports; insertion order is the order in
# which the models are evaluated.
models = {}
models["Logistic Regression"] = LogisticRegression(
    max_iter=1000,
    random_state=RANDOM_STATE,
)
models["Random Forest"] = RandomForestClassifier(
    n_estimators=300,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
models["Gradient Boosting"] = GradientBoostingClassifier(
    random_state=RANDOM_STATE,
)
models["XGBoost"] = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=RANDOM_STATE,
)

# ----------------------
# Resamplers to test
# ----------------------
# Every resampler is constructed with the same seed; only the class differs.
resamplers = {
    label: sampler_cls(random_state=RANDOM_STATE)
    for label, sampler_cls in (
        ("RandomOverSampler", RandomOverSampler),
        ("RandomUnderSampler", RandomUnderSampler),
        ("SMOTE-Tomek", SMOTETomek),
        ("SMOTEENN", SMOTEENN),
    )
}

# ----------------------
# Helper: train/evaluate
# ----------------------
def evaluate_resampled(model, model_name, resampler_name, Xtrain, ytrain):
    """Fit ``model`` on resampled training data and score it on the test set.

    The training data is rebalanced with ``resamplers[resampler_name]``, the
    model is fitted in place on the result, a decision threshold is picked on
    the validation set by maximising F1, and that threshold is then applied
    to the test-set probabilities.

    Besides its arguments, the function reads the module-level names
    ``resamplers``, ``X_val``, ``y_val``, ``X_test_proc`` and ``y_test``.
    A classification report and confusion matrix are printed as a side effect.

    Args:
        model: Estimator exposing ``fit`` and ``predict_proba``; it is
            (re)fitted by this call.
        model_name: Display name used in the printout and the result row.
        resampler_name: Key into the module-level ``resamplers`` dict.
        Xtrain: Training features handed to the resampler.
        ytrain: Training labels handed to the resampler.

    Returns:
        dict: One result row with the keys ``Resampler``, ``Model``,
        ``Threshold``, ``Accuracy``, ``Balanced_Acc``, ``Precision``,
        ``Recall``, ``F1``, ``ROC_AUC`` and ``PR_AUC``.
    """
    sampler = resamplers[resampler_name]
    X_bal, y_bal = sampler.fit_resample(Xtrain, ytrain)
    model.fit(X_bal, y_bal)

    # Threshold selection on validation data (F1-max, same rule as baseline).
    val_scores = model.predict_proba(X_val)[:, 1]
    prec, rec, thr_grid = precision_recall_curve(y_val, val_scores)
    # The small constant keeps the division defined when precision and recall
    # are both zero.
    f1_curve = (2 * prec * rec) / (prec + rec + 1e-9)
    # The precision/recall arrays hold one more entry than the threshold
    # array, so the final point is excluded before taking the argmax.
    chosen = int(np.argmax(f1_curve[:-1]))
    best_thr = float(thr_grid[chosen])

    # Apply the tuned threshold to the positive-class test probabilities.
    test_scores = model.predict_proba(X_test_proc)[:, 1]
    test_labels = (test_scores >= best_thr).astype(int)

    print(f"\n=== {model_name} + {resampler_name} ===")
    print(f"Chosen threshold (val F1-max): {best_thr:.4f}")
    print(classification_report(y_test, test_labels, zero_division=0))
    print("Confusion Matrix:\n", confusion_matrix(y_test, test_labels))

    summary = {
        "Resampler": resampler_name,
        "Model": model_name,
        "Threshold": best_thr,
    }
    # Metrics computed from the thresholded 0/1 predictions.
    summary["Accuracy"] = accuracy_score(y_test, test_labels)
    summary["Balanced_Acc"] = balanced_accuracy_score(y_test, test_labels)
    summary["Precision"] = precision_score(y_test, test_labels, zero_division=0)
    summary["Recall"] = recall_score(y_test, test_labels, zero_division=0)
    summary["F1"] = f1_score(y_test, test_labels, zero_division=0)
    # Metrics computed from the raw probabilities (threshold-independent).
    summary["ROC_AUC"] = roc_auc_score(y_test, test_scores)
    summary["PR_AUC"] = average_precision_score(y_test, test_scores)
    return summary

# -----------------
# Run: every (resampler x model)
# -----------------
# Outer loop over resamplers, inner loop over models, so rows are grouped by
# resampler in the order both dicts were defined.
rows = []
for rname in resamplers:
    for mname, model in models.items():
        rows.append(evaluate_resampled(model, mname, rname, X_tr, y_tr))

df_res = pd.DataFrame(rows)
print("\n=== Resampling Variants (All Models) — Summary (sorted by PR_AUC) ===")
# Rank by PR_AUC, with ROC_AUC as the tie-breaker, and show a fixed column order.
print(
    df_res.sort_values(by=["PR_AUC", "ROC_AUC"], ascending=False).loc[
        :,
        [
            "Resampler",
            "Model",
            "Accuracy",
            "Balanced_Acc",
            "Precision",
            "Recall",
            "F1",
            "ROC_AUC",
            "PR_AUC",
            "Threshold",
        ],
    ]
)



=== Logistic Regression + RandomOverSampler ===
Chosen threshold (val F1-max): 0.4730
              precision    recall  f1-score   support

           0       0.90      0.03      0.06     17762
           1       0.10      0.97      0.18      1976

    accuracy                           0.12     19738
   macro avg       0.50      0.50      0.12     19738
weighted avg       0.82      0.12      0.07     19738

Confusion Matrix:
 [[  530 17232]
 [   60  1916]]

=== Random Forest + RandomOverSampler ===
Chosen threshold (val F1-max): 0.0733
              precision    recall  f1-score   support

           0       0.93      0.00      0.01     17762
           1       0.10      1.00      0.18      1976

    accuracy                           0.10     19738
   macro avg       0.52      0.50      0.09     19738
weighted avg       0.85      0.10      0.02     19738

Confusion Matrix:
 [[   67 17695]
 [    5  1971]]

=== Gradient Boosting + RandomOverSampler ===
Chosen threshold (val F1-max): 