Model Building and Training

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier  # Alternative if XGBoost issues
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc, f1_score
import joblib
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given, so the 'ignore' action applies to all of them).
# NOTE(review): this presumably also hides library warnings that may matter
# here (e.g. solver convergence or deprecation notices) — confirm that a
# blanket filter is really intended.
warnings.filterwarnings('ignore')

Load Processed Data

In [3]:
# Load the processed e-commerce fraud data
data = pd.read_csv('../data/processed/processed_fraud_data.csv')

print(data.shape)
print(data['class'].value_counts(normalize=True))  # Confirm imbalance

# Target is the 'class' column; every remaining column is used as a feature
y = data['class']
X = data.drop(columns='class')

# Hold out 20% for testing; stratifying on y keeps the fraud rate the same
# in both splits, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Train shape:", X_train.shape, "Fraud rate:", y_train.mean())
print("Test shape:", X_test.shape, "Fraud rate:", y_test.mean())

(151112, 16)
class
0    0.906354
1    0.093646
Name: proportion, dtype: float64
Train shape: (120889, 15) Fraud rate: 0.09364789186774644
Test shape: (30223, 15) Fraud rate: 0.09363729609899746


Baseline: Logistic Regression

In [4]:
# Baseline: logistic regression fitted on the training split. No class
# weighting is passed, so nothing here compensates for the class imbalance.
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Hard labels plus the predicted probability of class 1 (second column)
y_prob_lr = lr_model.predict_proba(X_test)[:, 1]
y_pred_lr = lr_model.predict(X_test)

# Area under the precision-recall curve and F1 on the held-out test split
precision, recall, _ = precision_recall_curve(y_test, y_prob_lr)
f1_lr = f1_score(y_test, y_pred_lr)
auc_pr_lr = auc(recall, precision)

# One print call, one item per line: headline metrics, per-class report,
# then the confusion matrix
print(
    "Logistic Regression",
    f"AUC-PR: {auc_pr_lr:.4f}",
    f"F1-Score: {f1_lr:.4f}",
    classification_report(y_test, y_pred_lr),
    confusion_matrix(y_test, y_pred_lr),
    sep="\n",
)

Logistic Regression
AUC-PR: 0.6507
F1-Score: 0.6547
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     27393
           1       0.90      0.51      0.65      2830

    accuracy                           0.95     30223
   macro avg       0.93      0.75      0.81     30223
weighted avg       0.95      0.95      0.94     30223

[[27231   162]
 [ 1374  1456]]


Ensemble Model: XGBoost

In [5]:
# Ratio of negative to positive training labels, used to up-weight the
# positive class inside XGBoost
neg_to_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()

# Gradient-boosted trees with imbalance-aware weighting
xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=neg_to_pos_ratio,  # Handles imbalance
    random_state=42,
    eval_metric='logloss',
)
xgb_model.fit(X_train, y_train)

# Predicted probability of class 1 (second column) and hard labels
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]
y_pred_xgb = xgb_model.predict(X_test)

# Same test-set metrics as computed for the baseline
precision_x, recall_x, _ = precision_recall_curve(y_test, y_prob_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
auc_pr_xgb = auc(recall_x, precision_x)

print(
    "XGBoost",
    f"AUC-PR: {auc_pr_xgb:.4f}",
    f"F1-Score: {f1_xgb:.4f}",
    classification_report(y_test, y_pred_xgb),
    confusion_matrix(y_test, y_pred_xgb),
    sep="\n",
)

XGBoost
AUC-PR: 0.7046
F1-Score: 0.6130
              precision    recall  f1-score   support

           0       0.97      0.94      0.96     27393
           1       0.56      0.68      0.61      2830

    accuracy                           0.92     30223
   macro avg       0.76      0.81      0.78     30223
weighted avg       0.93      0.92      0.92     30223

[[25858  1535]
 [  901  1929]]


Model Comparison and Selection

In [6]:
# Side-by-side test-set metrics for the two candidate models
print("Comparison:")
print(f"Logistic Regression - AUC-PR: {auc_pr_lr:.4f}, F1: {f1_lr:.4f}")
print(f"XGBoost - AUC-PR: {auc_pr_xgb:.4f}, F1: {f1_xgb:.4f}")

# Pick the winner from the measured metrics instead of hard-coding it.
# AUC-PR is the selection criterion: it is threshold-independent and is the
# same quantity the hyperparameter search in this notebook optimises
# (scoring='average_precision'). F1 depends on the default 0.5 threshold,
# so it is reported above but not used to choose. Ties go to XGBoost.
best_model = xgb_model if auc_pr_xgb >= auc_pr_lr else lr_model

# Save the best model
joblib.dump(best_model, '../models/best_model.pkl')
print("Best model saved to ../models/best_model.pkl")

Comparison:
Logistic Regression - AUC-PR: 0.6507, F1: 0.6547
XGBoost - AUC-PR: 0.7046, F1: 0.6130
Best model saved to ../models/best_model.pkl


## Cross-Validation for Reliable Performance Estimation
Using StratifiedKFold (k=5) to preserve class distribution in each fold.

In [6]:
# Stratified 5-fold split over the complete dataset (X, y); shuffling is
# seeded so the folds are repeatable
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One entry per fold
auc_pr_scores, f1_scores = [], []

for train_idx, val_idx in cv.split(X, y):
    X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
    X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]

    # Fresh XGBoost model per fold; the class weight is recomputed from this
    # fold's training labels only
    model_cv = XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        scale_pos_weight=(y_train_fold == 0).sum() / (y_train_fold == 1).sum(),
        random_state=42,
        eval_metric='logloss',
    )
    model_cv.fit(X_train_fold, y_train_fold)

    # Hard labels and class-1 probabilities on the validation part
    y_pred_val = model_cv.predict(X_val_fold)
    y_prob_val = model_cv.predict_proba(X_val_fold)[:, 1]

    # Per-fold AUC-PR and F1
    precision, recall, _ = precision_recall_curve(y_val_fold, y_prob_val)
    auc_pr_scores.append(auc(recall, precision))
    f1_scores.append(f1_score(y_val_fold, y_pred_val))

# Mean and standard deviation across the folds
print("Cross-Validation Results (XGBoost)")
print(f"AUC-PR: {np.mean(auc_pr_scores):.4f} ± {np.std(auc_pr_scores):.4f}")
print(f"F1-Score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")

Cross-Validation Results (XGBoost)
AUC-PR: 0.7154 ± 0.0056
F1-Score: 0.6206 ± 0.0055


## Hyperparameter Tuning with GridSearchCV
Tuning key XGBoost parameters for better performance.

In [7]:
from sklearn.model_selection import GridSearchCV

# Compact search space: 2 x 2 x 2 = 8 candidate settings
param_grid = dict(
    n_estimators=[200, 400],
    max_depth=[5, 7],
    learning_rate=[0.05, 0.1],
)

# Template estimator; the search fits copies of it with each grid setting
base_xgb = XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    scale_pos_weight=10,  # ~1:10 imbalance ratio
)

grid_search = GridSearchCV(
    estimator=base_xgb,
    param_grid=param_grid,
    scoring='average_precision',  # optimizes AUC-PR
    cv=3,
    n_jobs=1,  # single process: prevents TerminatedWorkerError on Windows
    verbose=2,
)

# Runs the whole search on the training split only; must finish before the
# cells below can use grid_search
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best CV AUC-PR score:", grid_search.best_score_)

# The search refits the winning setting on all of X_train, so this estimator
# is ready to use as-is
best_xgb = grid_search.best_estimator_

# Confirm the tuned model on the held-out test split
y_pred = best_xgb.predict(X_test)
y_prob = best_xgb.predict_proba(X_test)[:, 1]

precision, recall, _ = precision_recall_curve(y_test, y_prob)
print(f"Tuned model Test AUC-PR: {auc(recall, precision):.4f}")
print(f"Tuned model Test F1: {f1_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=200; total time=   1.8s
[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=200; total time=   2.1s
[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=200; total time=   1.5s
[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=400; total time=   3.3s
[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=400; total time=   4.2s
[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=400; total time=   3.6s
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=200; total time=   4.5s
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=200; total time=   4.2s
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=200; total time=   3.2s
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=400; total time=  12.5s
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=400; total time=   5.9s
[CV] END ..learning_rate=0.05, max_depth=7, n_est

Retrain Final Model

In [8]:
# Build a NEW dict from the tuned parameters. Assigning
# grid_search.best_params_ directly and then adding keys would mutate the
# search object's own result dict in place, so later reads of
# grid_search.best_params_ (or a re-run of this cell) would see extra keys.
best_params = {
    **grid_search.best_params_,
    'scale_pos_weight': 10,  # Not part of the grid; same value the search used
    'random_state': 42,
}

# Refit on the full training split with the selected hyperparameters
final_model = XGBClassifier(**best_params)
final_model.fit(X_train, y_train)

# Evaluate on test: area under the precision-recall curve
y_prob = final_model.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_prob)
print(f"Tuned Test AUC-PR: {auc(recall, precision):.4f}")

# Save (left as the last expression so the written path is displayed)
joblib.dump(final_model, '../models/best_tuned_model.pkl')

Tuned Test AUC-PR: 0.7078


['../models/best_tuned_model.pkl']

In [10]:
# Persist the estimator refit by the grid search. This writes to the same
# path used for final_model, so any existing file there is overwritten.
tuned_model_path = '../models/best_tuned_model.pkl'
joblib.dump(best_xgb, tuned_model_path)
print("Tuned model saved!")

Tuned model saved!
