In [1]:
# =========================
# SCRIPT 1: COST-SENSITIVE LEARNING (NO RESAMPLING)
# =========================

import numpy as np
import pandas as pd
import joblib
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, balanced_accuracy_score,
    classification_report, confusion_matrix, precision_recall_curve
)

RANDOM_STATE = 42

# --- Load artifacts/data
preprocessor = joblib.load("preprocessor.pkl")
X_train, X_test, y_train, y_test = joblib.load("splits.pkl")

# --- Preprocess (fit on train only, transform both)
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc  = preprocessor.transform(X_test)

# --- Make validation split from train (stratified)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_proc, y_train, test_size=0.2,
    random_state=RANDOM_STATE, stratify=y_train
)

# --- Compute pos_weight for XGBoost
cnt = Counter(y_tr)
neg, pos = cnt[0], cnt[1]
scale_pos_weight = neg / max(pos, 1)

# --- Define cost-sensitive models (no SMOTE)
log_reg_cs = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE, class_weight="balanced")
rf_cs      = RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1, class_weight="balanced")
gb_cs      = GradientBoostingClassifier(random_state=RANDOM_STATE)  # no class_weight
xgb_cs     = XGBClassifier(
    n_estimators=300, learning_rate=0.1, max_depth=4,
    subsample=0.8, colsample_bytree=0.8, eval_metric="logloss",
    random_state=RANDOM_STATE, scale_pos_weight=scale_pos_weight
)

def evaluate_with_f1_threshold(model, name):
    """
    Train on ORIGINAL imbalanced train fold (no resampling).
    Choose a threshold on validation that maximizes F1.
    Evaluate on test with that threshold. Return metrics row.
    """
    model.fit(X_tr, y_tr)

    # --- tune threshold on validation by F1 ---
    val_proba = model.predict_proba(X_val)[:, 1]
    precisions, recalls, thresholds = precision_recall_curve(y_val, val_proba)
    f1s = (2 * precisions * recalls) / (precisions + recalls + 1e-9)
    best_idx = np.argmax(f1s[:-1])  # last P/R point has no threshold
    best_thr = float(thresholds[best_idx])

    # --- evaluate on test ---
    test_proba = model.predict_proba(X_test_proc)[:, 1]
    y_pred = (test_proba >= best_thr).astype(int)

    acc   = accuracy_score(y_test, y_pred)
    bal   = balanced_accuracy_score(y_test, y_pred)
    prec  = precision_score(y_test, y_pred, zero_division=0)
    rec   = recall_score(y_test, y_pred, zero_division=0)
    f1    = f1_score(y_test, y_pred, zero_division=0)
    roc   = roc_auc_score(y_test, test_proba)
    prauc = average_precision_score(y_test, test_proba)

    print(f"\n=== {name} (Cost-Sensitive) ===")
    print(f"Chosen threshold (val F1): {best_thr:.4f}")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    return {
        "Model": name,
        "Threshold": best_thr,
        "Accuracy": acc,
        "Balanced_Acc": bal,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
        "ROC_AUC": roc,
        "PR_AUC": prauc
    }

# --- Run/evaluate
results_cs = []
results_cs.append(evaluate_with_f1_threshold(log_reg_cs, "Logistic Regression (class_weight=balanced)"))
results_cs.append(evaluate_with_f1_threshold(rf_cs,      "Random Forest (class_weight=balanced)"))
results_cs.append(evaluate_with_f1_threshold(gb_cs,      "Gradient Boosting (no class_weight)"))
results_cs.append(evaluate_with_f1_threshold(xgb_cs,     "XGBoost (scale_pos_weight)"))

# --- Table
results_cs_df = pd.DataFrame(results_cs).set_index("Model")
print("\n=== Cost-Sensitive Models: Comparison ===\n",
      results_cs_df[["Accuracy","Balanced_Acc","Precision","Recall","F1","ROC_AUC","PR_AUC","Threshold"]])



=== Logistic Regression (class_weight=balanced) (Cost-Sensitive) ===
Chosen threshold (val F1): 0.4866
              precision    recall  f1-score   support

           0       0.90      0.20      0.32     17762
           1       0.10      0.81      0.18      1976

    accuracy                           0.26     19738
   macro avg       0.50      0.50      0.25     19738
weighted avg       0.82      0.26      0.31     19738

Confusion Matrix:
 [[ 3494 14268]
 [  382  1594]]

=== Random Forest (class_weight=balanced) (Cost-Sensitive) ===
Chosen threshold (val F1): 0.0400
              precision    recall  f1-score   support

           0       0.85      0.00      0.00     17762
           1       0.10      1.00      0.18      1976

    accuracy                           0.10     19738
   macro avg       0.48      0.50      0.09     19738
weighted avg       0.77      0.10      0.02     19738

Confusion Matrix:
 [[   34 17728]
 [    6  1970]]

=== Gradient Boosting (no class_weight) (Co