In [1]:
# 03_model_training.ipynb
# Training baseline models for PaySim fraud detection

In [2]:
# --- imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import os
from dotenv import load_dotenv


In [3]:
# --- Load environment variables ---
load_dotenv()
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
mlflow.set_registry_uri(os.getenv("MLFLOW_ARTIFACT_URI"))

In [4]:
# --- Load Processed Data ---
path = "../data/processed/paysim_features.csv"
df = pd.read_csv(path)
print("Loaded processed dataset with shape: ", df.shape)

Loaded processed dataset with shape:  (6362620, 16)


In [5]:
# --- Features and Target ---
X = df.drop(columns=['isFraud'])
y = df['isFraud']

In [6]:
# --- Train/Test Split (Stratified) ---
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- Validation Dataset Split---
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=42
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")


Train: (4072076, 15), Val: (1018020, 15), Test: (1272524, 15)


In [8]:
# --- Setup MLflow ---
mlflow.set_experiment("fraud_detection_baseline")

2025/08/05 19:31:59 INFO mlflow.tracking.fluent: Experiment with name 'fraud_detection_baseline' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/352602100880773315', creation_time=1754422319468, experiment_id='352602100880773315', last_update_time=1754422319468, lifecycle_stage='active', name='fraud_detection_baseline', tags={}>

In [9]:
# --- Baseline Model: Logistic Regression ---
with mlflow.start_run(run_name="LogisticRegression"):
    model = LogisticRegression(max_iter=1000, class_weight='balanced')
    model.fit(X_train, y_train)

    # Evaluate on validation set
    y_val_pred = model.predict(X_val)
    y_val_prob = model.predict_proba(X_val)[:, 1]

    val_auc = roc_auc_score(y_val, y_val_pred)
    val_ap = average_precision_score(y_val, y_val_prob)
    val_report = classification_report(y_val, y_val_pred, output_dict=True)

    # Log to MLflow
    mlflow.sklearn.log_model(model, "model")
    mlflow.log_metric("val_roc_auc", val_auc)
    mlflow.log_metric("val_avg_precision", val_ap)
    mlflow.log_metrics(
        {
        "val_precision": val_report["1"]["precision"],
        "val_recall": val_report["1"]["recall"],
        "val_f1": val_report["1"]["f1-score"]
    }
    )

    print("\nLogistic Regression Validation Results:")
    print("ROC AUC:", val_auc)
    print("Avg Precision:", val_ap)
    print(classification_report(y_val, y_val_pred))
    




Logistic Regression Validation Results:
ROC AUC: 0.9563304498967194
Avg Precision: 0.5856908999798426
              precision    recall  f1-score   support

           0       1.00      0.95      0.98   1016706
           1       0.03      0.96      0.05      1314

    accuracy                           0.95   1018020
   macro avg       0.51      0.96      0.51   1018020
weighted avg       1.00      0.95      0.98   1018020

🏃 View run LogisticRegression at: http://localhost:5000/#/experiments/352602100880773315/runs/7a3ed1c978db420295b8541457121ca1
🧪 View experiment at: http://localhost:5000/#/experiments/352602100880773315
