Model Building and Training

In [1]:
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier  # Alternative if XGBoost issues
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

Load Processed Data

In [None]:
# Read the processed e-commerce fraud dataset
data = pd.read_csv('../data/processed/processed_fraud_data.csv')

print(data.shape)
# Normalized class frequencies make the class imbalance explicit
print(data['class'].value_counts(normalize=True))

# The 'class' column is the target; every remaining column is a feature
y = data['class']
X = data.drop(columns='class')

# Hold out 20% for testing; stratifying on y keeps the fraud rate
# the same in both partitions, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Sanity check: partition sizes and the share of positive labels in each
print("Train shape:", X_train.shape, "Fraud rate:", y_train.mean())
print("Test shape:", X_test.shape, "Fraud rate:", y_test.mean())

(151112, 16)
class
0    0.906354
1    0.093646
Name: proportion, dtype: float64
Train shape: (120889, 15) Fraud rate: 0.09364789186774644
Test shape: (30223, 15) Fraud rate: 0.09363729609899746


Baseline: Logistic Regression

In [3]:
# Baseline: logistic regression with the iteration cap raised to 1000 and a fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Column 1 of predict_proba is the positive-class probability (used for the PR curve);
# predict() gives the hard labels used for F1, the report and the confusion matrix.
y_prob_lr = lr_model.predict_proba(X_test)[:, 1]
y_pred_lr = lr_model.predict(X_test)

# Area under the precision-recall curve, plus F1 on the predicted labels
precision, recall, _ = precision_recall_curve(y_test, y_prob_lr)
auc_pr_lr = auc(recall, precision)
f1_lr = f1_score(y_test, y_pred_lr)

# One print call, one item per line
print(
    "Logistic Regression",
    f"AUC-PR: {auc_pr_lr:.4f}",
    f"F1-Score: {f1_lr:.4f}",
    classification_report(y_test, y_pred_lr),
    confusion_matrix(y_test, y_pred_lr),
    sep="\n",
)

Logistic Regression
AUC-PR: 0.6507
F1-Score: 0.6547
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     27393
           1       0.90      0.51      0.65      2830

    accuracy                           0.95     30223
   macro avg       0.93      0.75      0.81     30223
weighted avg       0.95      0.95      0.94     30223

[[27231   162]
 [ 1374  1456]]


Ensemble Model: XGBoost

In [4]:
# XGBoost classifier: 300 estimators, depth 6, learning rate 0.1
xgb_model = XGBClassifier(
    learning_rate=0.1,
    max_depth=6,
    n_estimators=300,
    # Weight the positive class by the negative/positive count ratio of the
    # training labels to compensate for the class imbalance
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
    eval_metric='logloss',
    random_state=42,
)

xgb_model.fit(X_train, y_train)

# Column 1 of predict_proba is the positive-class probability (used for the PR curve);
# predict() gives the hard labels used for F1, the report and the confusion matrix.
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]
y_pred_xgb = xgb_model.predict(X_test)

# Area under the precision-recall curve, plus F1 on the predicted labels
precision_x, recall_x, _ = precision_recall_curve(y_test, y_prob_xgb)
auc_pr_xgb = auc(recall_x, precision_x)
f1_xgb = f1_score(y_test, y_pred_xgb)

# One print call, one item per line
print(
    "XGBoost",
    f"AUC-PR: {auc_pr_xgb:.4f}",
    f"F1-Score: {f1_xgb:.4f}",
    classification_report(y_test, y_pred_xgb),
    confusion_matrix(y_test, y_pred_xgb),
    sep="\n",
)

XGBoost
AUC-PR: 0.7046
F1-Score: 0.6130
              precision    recall  f1-score   support

           0       0.97      0.94      0.96     27393
           1       0.56      0.68      0.61      2830

    accuracy                           0.92     30223
   macro avg       0.76      0.81      0.78     30223
weighted avg       0.93      0.92      0.92     30223

[[25858  1535]
 [  901  1929]]


Model Comparison and Selection

In [5]:
# Side-by-side summary of the test-set metrics computed for each model
print("Comparison:")
print(f"Logistic Regression - AUC-PR: {auc_pr_lr:.4f}, F1: {f1_lr:.4f}")
print(f"XGBoost - AUC-PR: {auc_pr_xgb:.4f}, F1: {f1_xgb:.4f}")

# Pick the winner from the computed metrics instead of hard-coding it, so the
# saved artifact cannot drift from the numbers printed above after a re-run.
# AUC-PR is the selection criterion because it is computed from the predicted
# probabilities across all thresholds, whereas F1 here is computed from the
# hard labels returned by predict(). Ties keep XGBoost, the previous default.
best_model = xgb_model if auc_pr_xgb >= auc_pr_lr else lr_model

# Save the best model; create the target directory first so a fresh checkout
# without ../models/ does not fail at dump time
model_path = Path('../models/best_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)
print("Best model saved to ../models/best_model.pkl")

Comparison:
Logistic Regression - AUC-PR: 0.6507, F1: 0.6547
XGBoost - AUC-PR: 0.7046, F1: 0.6130
Best model saved to ../models/best_model.pkl
