# In [None]:
# 3_classification.py

import pandas as pd
import numpy as np
import joblib
from utils.helpers import evaluate_basic_metrics, expected_loss, find_best_threshold, plot_confusion, plot_roc_curves, plot_precision_recall_curves
from sklearn.metrics import confusion_matrix

# Load test data.
# Features stay a DataFrame; the label file is collapsed to a 1-D Series.
X_test = pd.read_csv("data/X_test.csv")
# Squeeze only the column axis: a bare .squeeze() also collapses a length-1
# row axis, which would turn a single-row label file into a scalar instead
# of a Series.
y_test = pd.read_csv("data/y_test.csv").squeeze(axis="columns")

# Load models: each persisted estimator is keyed by the human-readable name
# that later appears in printed reports and plot titles.
# NOTE(review): joblib.load unpickles the file contents, so these artifacts
# should only ever come from a trusted source.
_MODEL_FILES = {
    "Logistic Regression": "outputs/models/logistic_regression.pkl",
    "Random Forest": "outputs/models/random_forest.pkl",
    "Gradient Boosting": "outputs/models/gradient_boosting.pkl",
}
models = {label: joblib.load(path) for label, path in _MODEL_FILES.items()}

# Misclassification costs passed to the threshold search: a false negative
# is weighted five times as heavily as a false positive.
fp_cost = 1
fn_cost = 5
# Model name -> metrics dict, extended with threshold, loss and predictions.
results = {}

# Entries that get their own dedicated line in the printed report and must
# therefore be skipped by the generic float printer below.
_REPORTED_SEPARATELY = ("threshold", "expected_loss")

# Evaluate and find optimal thresholds
for name, model in models.items():
    # Column 1 of predict_proba -- presumably the positive ("Fast Growth")
    # class; TODO confirm against the label encoding used in training.
    y_proba = model.predict_proba(X_test)[:, 1]

    # NOTE(review): the threshold appears to be selected on the same test
    # data the metrics below are reported on, which would make those metrics
    # optimistic. Consider tuning it on a separate validation split.
    best_thresh, min_loss = find_best_threshold(model, X_test, y_test, fp_cost, fn_cost)
    y_pred = (y_proba >= best_thresh).astype(int)

    metrics = evaluate_basic_metrics(y_test, y_pred, y_proba)
    # Keep the raw labels/predictions alongside the scores so the curve
    # plotting helpers can be fed from `results` alone.
    metrics.update({
        "threshold": best_thresh,
        "expected_loss": min_loss,
        "y_true": y_test,
        "y_pred": y_pred,
        "y_proba": y_proba
    })
    results[name] = metrics

    print(f"\n{name} (Threshold = {best_thresh:.4f}):")
    for k, v in metrics.items():
        # Only scalar float scores are printed here. The threshold is already
        # in the header and the expected loss has its own line below, so both
        # are skipped to avoid reporting them twice.
        if isinstance(v, float) and k not in _REPORTED_SEPARATELY:
            print(f"{k.capitalize():15}: {v:.4f}")
    print(f"Expected Loss        : ${min_loss}")

    plot_confusion(y_test, y_pred, ["Normal Growth", "Fast Growth"], f"Confusion Matrix - {name}")

# Identify best model: the one whose recorded expected loss is lowest.
# On a tie, the model that was inserted into `results` first wins.
best_model = min(results, key=lambda model_name: results[model_name]["expected_loss"])
print(f"\nBest model based on expected loss: {best_model}")

# Plot ROC and PR curves: each helper receives the full `results` dict,
# ROC first and precision-recall second.
for draw_curves in (plot_roc_curves, plot_precision_recall_curves):
    draw_curves(results)