# 4. Model Training and Comparison

In this notebook, we train multiple classification models to detect fraud and compare their performance using various metrics.

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from src.data.loading import load_fraud_data
from src.models.data_prep import prepare_model_data
from src.models.baseline import LogisticRegressionBaseline
from src.models.ensemble import RandomForestModel, XGBoostModel, LightGBMModel
from src.models.evaluation import calculate_metrics, stratified_kfold_cv, aggregate_cv_results, save_model
from src.models.comparison import compare_models, create_comparison_table, select_best_model
from src.visualization.model_viz import plot_confusion_matrix, plot_roc_curve, plot_precision_recall_curve, plot_feature_importance, plot_model_comparison

## 1. Prepare Data

Load the feature-engineered dataset, then preprocess it and split it into train and test sets.

In [None]:
# Read the processed, feature-engineered dataset from disk.
df = pd.read_csv('../data/processed/fraud_featured.csv')

# Name of the label column.
target = 'class'

# Columns passed to prepare_model_data as `drop_cols`.
drop_cols = [
    'user_id',
    'signup_time',
    'purchase_time',
    'device_id',
    'ip_address',
    'country',
]

# Feature columns, grouped by how they are declared to prepare_model_data.
numeric_cols = [
    'purchase_value',
    'age',
    'time_since_signup',
    'user_txn_count',
    'user_avg_amount',
]
categorical_cols = ['source', 'browser', 'sex', 'country_risk_level']

# Keyword options for the split.
# NOTE(review): apply_smote is meant to address class imbalance; confirm that
# prepare_model_data resamples only the training split, never the test split.
split_options = dict(
    drop_cols=drop_cols,
    test_size=0.2,
    apply_smote=True,
)

X_train, X_test, y_train, y_test = prepare_model_data(
    df, target, numeric_cols, categorical_cols, **split_options
)

## 2. Model Training & Evaluation

In [None]:
def _fit_and_score(model_cls):
    """Instantiate one model wrapper with its defaults, train it, and score it.

    Reads the X_train / y_train / X_test / y_test splits from the notebook
    namespace.

    Args:
        model_cls: A model wrapper class exposing ``train`` and ``evaluate``.

    Returns:
        A ``(model, metrics)`` pair: the trained wrapper and whatever its
        ``evaluate`` call returns for the test split.
    """
    model = model_cls()
    model.train(X_train, y_train)
    return model, model.evaluate(X_test, y_test)


# --- 1. Baseline: Logistic Regression ---
print("Training Logistic Regression...")
lr, lr_metrics = _fit_and_score(LogisticRegressionBaseline)

# --- 2. Random Forest ---
print("Training Random Forest...")
rf, rf_metrics = _fit_and_score(RandomForestModel)

# --- 3. XGBoost ---
print("Training XGBoost...")
xgb, xgb_metrics = _fit_and_score(XGBoostModel)

# --- 4. LightGBM ---
print("Training LightGBM...")
lgbm, lgbm_metrics = _fit_and_score(LightGBMModel)

# Per-model metrics, in training order, for the comparison step.
results = [lr_metrics, rf_metrics, xgb_metrics, lgbm_metrics]

## 3. Comparison and Selection

In [None]:
# One metric drives both the ranking and the winner selection, so name it once.
primary_metric = 'auc_pr'

# Build the comparison frame sorted by the chosen metric, plus a printable table.
comparison_df = compare_models(results, sort_by=primary_metric)
comparison_table = create_comparison_table(results)
print(comparison_table)

# Pick the winner on the same metric and show the stated justification.
best = select_best_model(results, primary_metric=primary_metric)
print(best['justification'])

In [None]:
# Metrics to show in the model comparison plot.
comparison_metrics = ['roc_auc', 'auc_pr', 'f1']
plot_model_comparison(comparison_df, metrics=comparison_metrics)

## 4. Visualization for Best Model

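The plots below use the XGBoost model, on the assumption that it performed best. If `select_best_model` picked a different model above, substitute that model in the next cell.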

In [None]:
# NOTE(review): the model inspected here is hard-coded to XGBoost rather than
# derived from the `best` selection result; change this one alias if another
# model won the comparison.
best_model = xgb

# Predictions on the held-out test split.
y_pred = best_model.predict(X_test)
# NOTE(review): the shape of predict_proba's output is not visible here;
# confirm it matches what the ROC / PR plotting helpers expect.
y_proba = best_model.predict_proba(X_test)

plot_confusion_matrix(y_test, y_pred)
plot_roc_curve(y_test, y_proba)
plot_precision_recall_curve(y_test, y_proba)

# Feature importances come from the underlying estimator held by the wrapper.
plot_feature_importance(best_model.model, X_train.columns)

## 5. Save the Best Model

In [None]:
# Persist the underlying estimator of the XGBoost wrapper under a fixed name.
# NOTE(review): the saved model is hard-coded to XGBoost, and the path in the
# message below is a literal; confirm both match the actual selection result
# and the location save_model writes to.
artifact_name = "best_fraud_model"
save_model(xgb.model, artifact_name)
print("Best model saved to models/best_fraud_model.joblib")