In [1]:
# 03_model_training.ipynb
# Training baseline models for PaySim fraud detection

In [1]:
# --- imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score
import xgboost as xgb
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import matplotlib.pyplot as plt
import seaborn as sns
import os
from dotenv import load_dotenv


In [2]:
# --- Load environment variables ---
load_dotenv()
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
mlflow.set_registry_uri(os.getenv("MLFLOW_ARTIFACT_URI"))

In [3]:
# --- Load Processed Data ---
path = "../data/processed/paysim_features.csv"
df = pd.read_csv(path)
print("Loaded processed dataset with shape: ", df.shape)

Loaded processed dataset with shape:  (6362620, 16)


In [4]:
# --- Features and Target ---
X = df.drop(columns=['isFraud'])
y = df['isFraud']

In [5]:
# --- Train/Test Split (Stratified) ---
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- Validation Dataset Split---
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=42
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")


Train: (4072076, 15), Val: (1018020, 15), Test: (1272524, 15)


In [6]:
# --- Setup MLflow ---
mlflow.set_experiment("fraud_detection_baseline")

<Experiment: artifact_location='mlflow-artifacts:/352602100880773315', creation_time=1754422319468, experiment_id='352602100880773315', last_update_time=1754422319468, lifecycle_stage='active', name='fraud_detection_baseline', tags={}>

In [9]:
# --- Baseline Model: Logistic Regression ---
with mlflow.start_run(run_name="LogisticRegression"):
    model = LogisticRegression(max_iter=1000, class_weight='balanced')
    model.fit(X_train, y_train)

    # Evaluate on validation set
    y_val_pred = model.predict(X_val)
    y_val_prob = model.predict_proba(X_val)[:, 1]

    val_auc = roc_auc_score(y_val, y_val_pred)
    val_ap = average_precision_score(y_val, y_val_prob)
    val_report = classification_report(y_val, y_val_pred, output_dict=True)

    # Log to MLflow
    mlflow.sklearn.log_model(model, "model")
    mlflow.log_metric("val_roc_auc", val_auc)
    mlflow.log_metric("val_avg_precision", val_ap)
    mlflow.log_metrics(
        {
        "val_precision": val_report["1"]["precision"],
        "val_recall": val_report["1"]["recall"],
        "val_f1": val_report["1"]["f1-score"]
    }
    )

    print("\nLogistic Regression Validation Results:")
    print("ROC AUC:", val_auc)
    print("Avg Precision:", val_ap)
    print(classification_report(y_val, y_val_pred))
    




Logistic Regression Validation Results:
ROC AUC: 0.9563304498967194
Avg Precision: 0.5856908999798426
              precision    recall  f1-score   support

           0       1.00      0.95      0.98   1016706
           1       0.03      0.96      0.05      1314

    accuracy                           0.95   1018020
   macro avg       0.51      0.96      0.51   1018020
weighted avg       1.00      0.95      0.98   1018020

🏃 View run LogisticRegression at: http://localhost:5000/#/experiments/352602100880773315/runs/7a3ed1c978db420295b8541457121ca1
🧪 View experiment at: http://localhost:5000/#/experiments/352602100880773315


In [10]:
# --- XGBoost Model Training ---
with mlflow.start_run(run_name="XGBoost"):
    model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        scale_pos_weight=(len(y_train)-sum(y_train)) / sum(y_train),
        randmo_state=42,
        use_label_encoder=False,
        eval_metrics="logloss"
    )
    model.fit(X_train, y_train)

    y_val_pred = model.predict(X_val)
    y_val_prob = model.predict_proba(X_val)[:, 1]

    val_auc = roc_auc_score(y_val, y_val_prob)
    val_ap = average_precision_score(y_val, y_val_prob)
    val_report = classification_report(y_val, y_val_pred, output_dict=True)

    # Log to MLflow
    mlflow.xgboost.log_model(model, "model")
    params = {
        "n_estimators": 100,
        "max_depth": 6,
        "learning_rate": 0.1
    }
    mlflow.log_params(params=params)
    metrics = {
        "val_roc_auc": val_auc,
        "val_avg_precision": val_ap,
        "val_precision": val_report["1"]["precision"],
        "val_recall": val_report["1"]["recall"],
        "val_f1": val_report["1"]["f1-score"],
    }

    mlflow.log_metrics(metrics=metrics)

    print("\nXGBoost Validation Results:")
    print("ROC AUC:", val_auc)
    print("Avg Precision:", val_ap)
    print(classification_report(y_val, y_val_pred))

    
    

Parameters: { "eval_metrics", "randmo_state", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  self.get_booster().save_model(fname)



XGBoost Validation Results:
ROC AUC: 0.9992022930074768
Avg Precision: 0.8979261867377035
              precision    recall  f1-score   support

           0       1.00      0.98      0.99   1016706
           1       0.07      0.99      0.13      1314

    accuracy                           0.98   1018020
   macro avg       0.54      0.99      0.56   1018020
weighted avg       1.00      0.98      0.99   1018020

🏃 View run XGBoost at: http://localhost:5000/#/experiments/352602100880773315/runs/83f1f6c85495482f9583119b154bcf05
🧪 View experiment at: http://localhost:5000/#/experiments/352602100880773315


In [7]:
# --- Random Forest Model Training ---
with mlflow.start_run(run_name="RandomForest"):
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=6,
        random_state=42,
        class_weight='balanced'
    )
    model.fit(X_train, y_train)

    # --- Validation predictions ---
    y_val_pred = model.predict(X_val)
    y_val_prob = model.predict_proba(X_val)[:, 1]

    # --- Evaluation metrics ---
    val_auc = roc_auc_score(y_val, y_val_prob)
    val_ap = average_precision_score(y_val, y_val_prob)
    val_report = classification_report(y_val, y_val_pred, output_dict=True)

    # --- Log model + params ---
    mlflow.sklearn.log_model(model, "model")
    mlflow.log_params({
        "n_estimators": 100,
        "max_depth": 6,
        "class_weight": 'balanced'
    })

    # --- Log metrics ---
    mlflow.log_metrics({
        "val_roc_auc": val_auc,
        "val_avg_precision": val_ap,
        "val_precision": val_report["1"]["precision"],
        "val_recall": val_report["1"]["recall"],
        "val_f1": val_report["1"]["f1-score"]
    })

    # --- Print results ---
    print("\nRandom Forest Validation Results:")
    print("ROC AUC:", val_auc)
    print("Avg Precision:", val_ap)
    print(classification_report(y_val, y_val_pred))





Random Forest Validation Results:
ROC AUC: 0.9987158480201443
Avg Precision: 0.826631532340178
              precision    recall  f1-score   support

           0       1.00      0.97      0.99   1016706
           1       0.05      1.00      0.09      1314

    accuracy                           0.97   1018020
   macro avg       0.52      0.99      0.54   1018020
weighted avg       1.00      0.97      0.99   1018020

🏃 View run RandomForest at: http://localhost:5000/#/experiments/352602100880773315/runs/bb9c20fa432a4ccab3bcb6fc6a060a20
🧪 View experiment at: http://localhost:5000/#/experiments/352602100880773315


In [9]:
# --- Neural Network Model Training ---
with mlflow.start_run(run_name="NeuralNetwork"):
    model = MLPClassifier(
        hidden_layer_sizes=(50, 30),
        max_iter=500,
        activation='relu',
        random_state=42,
        solver="adam"
    )

    model.fit(X_train, y_train)

    # --- Validation predictions ---
    y_val_pred = model.predict(X_val)
    y_val_prob = model.predict_proba(X_val)[:, 1]

    # --- Evaluation metrics ---
    val_auc = roc_auc_score(y_val, y_val_prob)
    val_ap = average_precision_score(y_val, y_val_prob)
    val_report = classification_report(y_val, y_val_pred, output_dict=True)

    # --- Log model + params ---
    mlflow.sklearn.log_model(model, "model")
    mlflow.log_params({
        "hidden_layer_sizes": (50, 30),
        "max_iter": 500,
        "activation": 'relu',
        "solver": 'adam'
    })

    # --- Log metrics ---
    mlflow.log_metrics({
        "val_roc_auc": val_auc,
        "val_avg_precision": val_ap,
        "val_precision": val_report["1"]["precision"],
        "val_recall": val_report["1"]["recall"],
        "val_f1": val_report["1"]["f1-score"]
    })

    # --- Print results ---
    print("\nNeural Network Validation Results:")
    print("ROC AUC:", val_auc)
    print("Avg Precision:", val_ap)
    print(classification_report(y_val, y_val_pred))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



Neural Network Validation Results:
ROC AUC: 0.4990628882653513
Avg Precision: 0.0012883303993261777
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1016706
           1       0.00      0.00      0.00      1314

    accuracy                           1.00   1018020
   macro avg       0.50      0.50      0.50   1018020
weighted avg       1.00      1.00      1.00   1018020

🏃 View run NeuralNetwork at: http://localhost:5000/#/experiments/352602100880773315/runs/3c9bdc7ffc9f473dbb93989cdf3338e9
🧪 View experiment at: http://localhost:5000/#/experiments/352602100880773315


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### 🔍 Model Performance Comparison (Validation Metrics – Fraud Class = 1)

| Model            | ROC AUC | Avg Precision | Precision | Recall | F1 Score |
|------------------|---------|----------------|-----------|--------|----------|
| **Logistic**     | 0.956   | 0.586          | 0.027     | 0.958  | 0.052    |
| **Random Forest**| 0.999   | 0.827          | 0.048     | 0.998  | 0.092    |
| **XGBoost**      | 0.999   | 0.898          | 0.070     | 0.988  | 0.131    |
| **Neural Net**   | 0.499   | 0.001          | 0.000     | 0.000  | 0.000    |


### Model Selection Rationale: Why We Chose XGBoost 
After evaluating multiple models on the PaySim dataset, including **Logistic Regression**, **Random Forest**, **Neural Network**, and **XGBoost**, we chose **XGBoost** as the primary model for further tuning and deployment based on the following reasons: 
* 📈 **Best Validation Metrics**: XGBoost outperformed other models in **ROC AUC (0.999)** and **Average Precision (0.898)**, which are critical for imbalanced fraud detection tasks.
* ⚖️ **Strong Recall on Minority Class**: Despite class imbalance, XGBoost achieved a **recall of 0.99** for fraudulent transactions, ensuring fewer false negatives.
* 🧠 **Built-in Handling of Class Imbalance**: Via `scale_pos_weight`, XGBoost handles skewed datasets efficiently without oversampling/undersampling.
* 🚀 **Faster Training Time**: Compared to Logistic Regression and Neural Network, XGBoost completed training noticeably faster, likely due to its optimized gradient boosting implementation.
* 🔁 **Tunable & Flexible**: XGBoost offers extensive hyperparameter control, making it ideal for performance tuning.

