In [1]:
# 04_model_xgboost.ipynb
# Clean and modular XGBoost training with hyperparameter optimization for PaySim fraud detection

In [14]:
# --- Imports ---
import pandas as pd
import numpy as np
import os
import pathlib
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

import xgboost as xgb
import mlflow
import mlflow.xgboost

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope

In [15]:
# Absolute location of the MLflow file store (../mlruns relative to the
# notebook's working directory).
local_mlruns = pathlib.Path("..", "mlruns").resolve()

In [16]:
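# --- Load Environment Variables (disabled) ---
# Env-based tracking configuration is switched off; the next cell points
# MLflow at the local ../mlruns file store instead.
# load_dotenv()
# mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
# mlflow.set_registry_uri(os.getenv("MLFLOW_ARTIFACT_URI"))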

In [17]:
# Build the file:// URI with Path.as_uri() rather than string formatting:
# it yields a well-formed URI on Windows (file:///C:/...) and percent-encodes
# spaces or other special characters in the path.
mlflow.set_tracking_uri(local_mlruns.as_uri())

In [5]:
# --- Load Data ---
# Read the processed PaySim feature table and report its (rows, columns) shape.
path = "../data/processed/paysim_features.csv"
df = pd.read_csv(filepath_or_buffer=path)
print("Loaded dataset: ", df.shape)


Loaded dataset:  (6362620, 16)


In [6]:
# Split the frame into the target column and the remaining feature columns.
y = df['isFraud']
X = df.drop(columns=["isFraud"])

In [7]:
# Two-stage stratified split: hold out 20% as the test set, then carve 20%
# of the remainder off as the validation set (roughly 64/16/20 overall).
split_opts = {"test_size": 0.2, "random_state": 42}
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, stratify=y, **split_opts
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, stratify=y_temp, **split_opts
)
print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

Train: (4072076, 15), Val: (1018020, 15), Test: (1272524, 15)


In [8]:
# --- Define Search Space ---
# Each entry is one XGBClassifier keyword argument. Quantized draws are wrapped
# in scope.int so the objective receives integers; learning_rate is sampled
# log-uniformly between exp(-3) and exp(0).
search_space = dict(
    max_depth=scope.int(hp.quniform('max_depth', 3, 10, 1)),
    learning_rate=hp.loguniform('learning_rate', -3, 0),
    n_estimators=scope.int(hp.quniform('n_estimators', 50, 300, 10)),
    gamma=hp.uniform('gamma', 0, 5),
    subsample=hp.uniform('subsample', 0.6, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.6, 1.0),
)

In [18]:
# --- Objective Function ---
def objective(params):
    """Train one XGBoost candidate and score it on the validation split.

    Intended to be passed to hyperopt's ``fmin``. Each call opens a nested
    MLflow run, fits an ``XGBClassifier`` on ``X_train``/``y_train`` with the
    given hyperparameters, and logs those hyperparameters together with the
    validation average precision.

    Args:
        params: Dict of ``XGBClassifier`` keyword arguments sampled from the
            search space.

    Returns:
        Dict with ``loss`` (the negated validation average precision, so that
        minimizing the loss maximizes average precision) and ``status`` set
        to ``STATUS_OK``.
    """
    # Series.sum() is vectorized; the builtin sum() iterates the Series
    # element by element in Python, and the original did that twice per trial.
    n_pos = y_train.sum()
    n_neg = len(y_train) - n_pos

    with mlflow.start_run(nested=True):
        model = xgb.XGBClassifier(
            eval_metric='logloss',
            # Negative-to-positive ratio, used to up-weight the fraud class.
            scale_pos_weight=n_neg / n_pos,
            random_state=42,
            **params
        )

        model.fit(X_train, y_train)
        # Column 1 of predict_proba is the probability of the positive class.
        y_val_prob = model.predict_proba(X_val)[:, 1]
        val_ap = average_precision_score(y_val, y_val_prob)

        mlflow.log_params(params)
        mlflow.log_metric("val_avg_precision", val_ap)

        return {'loss': -val_ap, 'status': STATUS_OK}

In [19]:
# --- Run Hyperparameter Optimization ---
# All trials are recorded as nested runs under one parent MLflow run.
mlflow.set_experiment("xgboost_hyperopt")
with mlflow.start_run(run_name="xgb_hyperopt_run"):
    trials = Trials()
    best_result = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=20,
        trials=trials,
        # Seed the TPE sampler so a Restart & Run All reproduces the same
        # sequence of trials (hyperopt >= 0.2.7 expects a numpy Generator).
        rstate=np.random.default_rng(42),
    )
    # NOTE: fmin returns the raw sampled values, so integer parameters such as
    # max_depth / n_estimators come back as floats and need casting before reuse.
    print("\nBest hyperparameters:")
    print(best_result)

2025/08/10 17:45:29 INFO mlflow.tracking.fluent: Experiment with name 'xgboost_hyperopt' does not exist. Creating a new experiment.




Best hyperparameters:
{'colsample_bytree': np.float64(0.791581269196184), 'gamma': np.float64(1.9331048512480469), 'learning_rate': np.float64(0.06417734245106829), 'max_depth': np.float64(9.0), 'n_estimators': np.float64(160.0), 'subsample': np.float64(0.8041961358281932)}


In [25]:
# Best trial reported by the hyperopt search, rounded to four decimals;
# max_depth and n_estimators are written as ints because XGBoost expects
# integer values for them.
best_params = {
    'colsample_bytree': 0.7916,
    'gamma': 1.9331,
    'learning_rate': 0.0642,
    'max_depth': 9,
    'n_estimators': 160,
    'subsample': 0.8042,
}

In [26]:
# Echo the dict so the exact values used for the final fit appear in the cell output.
best_params

{'colsample_bytree': 0.7916,
 'gamma': 1.9331,
 'learning_rate': 0.0642,
 'max_depth': 9,
 'n_estimators': 160,
 'subsample': 0.8042}

In [34]:
# --- Final Model Training with Best Hyperparameters ---
# Fit one model with `best_params` on the training split, evaluate it on the
# validation and test splits, and log the model, parameters and metrics to a
# single MLflow run.

# Series.sum() is vectorized; the builtin sum() iterates the Series element by
# element in Python, and the original evaluated it twice.
n_pos = y_train.sum()
n_neg = len(y_train) - n_pos

with mlflow.start_run(run_name="xgb_final_tuned"):
    model = xgb.XGBClassifier(
        eval_metric='logloss',
        # Negative-to-positive ratio, used to up-weight the fraud class.
        scale_pos_weight=n_neg / n_pos,
        random_state=42,
        **best_params
    )

    model.fit(X_train, y_train)

    # Validation: hard labels for the classification report, positive-class
    # probabilities (column 1) for the ranking metrics.
    y_val_pred = model.predict(X_val)
    y_val_prob = model.predict_proba(X_val)[:, 1]

    val_auc = roc_auc_score(y_val, y_val_prob)
    val_ap = average_precision_score(y_val, y_val_prob)
    val_report = classification_report(y_val, y_val_pred, output_dict=True)

    # Test: same procedure; only the ranking metrics are logged.
    y_test_pred = model.predict(X_test)
    y_test_prob = model.predict_proba(X_test)[:, 1]

    test_auc = roc_auc_score(y_test, y_test_prob)
    test_ap = average_precision_score(y_test, y_test_prob)

    # Log everything to MLflow
    mlflow.xgboost.log_model(model, "model")
    mlflow.log_params(best_params)
    # The report dict is keyed by the label rendered as a string, so "1" is
    # the fraud class.
    fraud_scores = val_report["1"]
    mlflow.log_metrics({
        "val_roc_auc": val_auc,
        "val_avg_precision": val_ap,
        "val_precision": fraud_scores["precision"],
        "val_recall": fraud_scores["recall"],
        "val_f1": fraud_scores["f1-score"],
        "test_roc_auc": test_auc,
        "test_avg_precision": test_ap
    })

    print("\nFinal XGBoost Validation Results:")
    print("ROC AUC:", val_auc)
    print("Avg Precision:", val_ap)
    print(classification_report(y_val, y_val_pred))





Final XGBoost Validation Results:
ROC AUC: 0.9989062239170021
Avg Precision: 0.8948402770731887
              precision    recall  f1-score   support

           0       1.00      0.99      1.00   1016706
           1       0.15      0.95      0.26      1314

    accuracy                           0.99   1018020
   macro avg       0.57      0.97      0.63   1018020
weighted avg       1.00      0.99      1.00   1018020

