In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import randint, uniform
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
)
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Put the parent of the current working directory on the import path so that
# project-local packages can be imported from this notebook.
# NOTE(review): assumes the notebook is launched from a direct sub-directory of
# the project root - confirm.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guarded append: re-running this cell must not stack duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)

from data.load_data import download_creditcard_data, load_creditcard_df

# Project step 4: XGBoost on credit card fraud detection

In this notebook we use XGBoost, a gradient boosting algorithm that builds trees sequentially. Each new tree tries to fix the mistakes of the previous ones. It's known for being fast and accurate on tabular data like ours.

In [None]:
# run the project's download helper first, then load the frame and remove
# exact duplicate rows, renumbering the index from zero
download_creditcard_data()
df = (
    load_creditcard_df()
    .drop_duplicates()
    .reset_index(drop=True)
)

# quick sanity summary: frame dimensions and the mean of the Class column as a percentage
print(f"Dataset shape: {df.shape}")
print(f"Fraud percentage: {df['Class'].mean() * 100:.4f}%")

In [None]:
# target is the Class column; every other column is a predictor
y = df["Class"]
X = df.drop(columns="Class")

# 80/20 hold-out split, stratified on the target and seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# report the size of each partition and how many Class == 1 rows it holds
print(f"Train size: {len(X_train)}, Fraud cases: {y_train.sum()}")
print(f"Test size: {len(X_test)}, Fraud cases: {y_test.sum()}")

In [None]:
# only Time and Amount are standardised; every remaining column is passed
# through unchanged
numeric_to_scale = ["Time", "Amount"]
other_features = [name for name in X.columns if name not in numeric_to_scale]

scale_step = ("scale_time_amount", StandardScaler(), numeric_to_scale)
passthrough_step = ("pass_others", "passthrough", other_features)

preprocessor = ColumnTransformer(transformers=[scale_step, passthrough_step])

## 1. Baseline XGBoost

XGBoost handles imbalanced data with the scale_pos_weight parameter. We set it to the ratio of negative to positive samples.

In [None]:
# class imbalance ratio: number of label-0 rows per label-1 row in the training set
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
scale_pos = n_negative / n_positive
print(f"scale_pos_weight: {scale_pos:.2f}")

# baseline booster: 100 trees, everything else left at the library defaults
baseline_model = XGBClassifier(
    n_estimators=100,
    scale_pos_weight=scale_pos,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
)

baseline_xgb = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", baseline_model),
    ]
)
baseline_xgb.fit(X_train, y_train)

# keep both the hard labels and column 1 of predict_proba for the later comparison
y_proba_baseline = baseline_xgb.predict_proba(X_test)[:, 1]
y_pred_baseline = baseline_xgb.predict(X_test)

baseline_report = classification_report(y_test, y_pred_baseline, digits=4)
print("\nBaseline XGBoost results:")
print(baseline_report)

## 2. Hyperparameter tuning

XGBoost has many parameters. The most important ones are:
- n_estimators: number of trees
- max_depth: depth of each tree
- learning_rate: how much each tree contributes
- subsample: fraction of samples used per tree
- colsample_bytree: fraction of features used per tree

We use RandomizedSearchCV to explore different combinations.

In [None]:
# Pipeline template handed to the randomized search. The tuned hyperparameters
# are deliberately left unset here; the search fills them in through the
# "model__" prefix.
xgb_pipe = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", XGBClassifier(
            scale_pos_weight=scale_pos,
            random_state=42,
            n_jobs=-1,
            eval_metric="logloss",
        )),
    ]
)

# Sampling distributions for the search (randint / uniform are imported from
# scipy.stats in the notebook's import cell).
# scipy parameterisation: randint(low, high) draws integers in [low, high);
# uniform(loc, scale) draws floats in [loc, loc + scale].
param_distributions = {
    "model__n_estimators": randint(50, 200),        # 50 .. 199 trees
    "model__max_depth": randint(3, 10),             # depth 3 .. 9
    "model__learning_rate": uniform(0.01, 0.29),    # 0.01 .. 0.30
    "model__subsample": uniform(0.6, 0.4),          # 0.6 .. 1.0
    "model__colsample_bytree": uniform(0.6, 0.4),   # 0.6 .. 1.0
}

In [None]:
# randomized search: 20 sampled configurations, 3-fold CV, ranked by F1
search_settings = {
    "n_iter": 20,
    "scoring": "f1",
    "cv": 3,
    "n_jobs": -1,
    "verbose": 1,
    "random_state": 42,
}
random_search = RandomizedSearchCV(
    estimator=xgb_pipe,
    param_distributions=param_distributions,
    **search_settings,
)

random_search.fit(X_train, y_train)

# summary of the winning configuration and its cross-validated score
print("\nBest parameters:")
for param, value in random_search.best_params_.items():
    print(f"  {param}: {value}")
print(f"\nBest CV F1 score: {random_search.best_score_:.4f}")

In [None]:
# fitted pipeline exposed by the search for its best-scoring configuration
best_xgb = random_search.best_estimator_

# score the held-out set: column 1 of predict_proba plus the hard labels
y_proba_tuned = best_xgb.predict_proba(X_test)[:, 1]
y_pred_tuned = best_xgb.predict(X_test)

tuned_report = classification_report(y_test, y_pred_tuned, digits=4)
print("Tuned XGBoost results:")
print(tuned_report)

## 3. Comparison: baseline vs tuned

In [None]:
def compute_metrics(y_true, y_pred, y_proba, name):
    """Collect the evaluation metrics of one model into a flat dict.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted hard labels, compared against ``y_true``.
    y_proba : predicted scores, used only for the ROC AUC.
    name : label stored under the ``"model"`` key.

    Returns
    -------
    dict with the keys ``model``, ``accuracy``, ``precision``, ``recall``,
    ``f1`` and ``roc_auc``, in that order.
    """
    # these three scorers share the zero_division=0 setting
    label_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )

    metrics = {"model": name, "accuracy": accuracy_score(y_true, y_pred)}
    for key, scorer in label_scorers:
        metrics[key] = scorer(y_true, y_pred, zero_division=0)
    # ROC AUC is computed from the scores rather than the hard labels
    metrics["roc_auc"] = roc_auc_score(y_true, y_proba)
    return metrics

# one metrics row per model, baseline first, then tuned
results = [
    compute_metrics(y_test, preds, probas, label)
    for label, preds, probas in (
        ("XGB Baseline", y_pred_baseline, y_proba_baseline),
        ("XGB Tuned", y_pred_tuned, y_proba_tuned),
    )
]

results_df = pd.DataFrame(results)
results_df

In [None]:
# hard-label predictions to compare, in plotting order
models_preds = {
    "XGB Baseline": y_pred_baseline,
    "XGB Tuned": y_pred_tuned,
}

# one confusion-matrix heatmap per model, side by side
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

for ax, name in zip(axes, models_preds):
    sns.heatmap(
        confusion_matrix(y_test, models_preds[name]),
        annot=True,
        fmt="d",
        cmap="Blues",
        cbar=False,
        ax=ax,
    )
    ax.set(title=name, xlabel="Predicted", ylabel="True")

fig.tight_layout()
plt.show()

In [None]:
# bar chart of the F1 column of the results table, one bar per model
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(results_df["model"], results_df["f1"], color=["steelblue", "green"])
ax.set_ylabel("F1 score (fraud class)")
ax.set_title("F1 scores comparison")
fig.tight_layout()
plt.show()

## 4. Feature importance

XGBoost also provides feature importances; let's see which features matter most.

In [None]:
# importances come from the fitted booster inside the tuned pipeline
xgb_model = best_xgb.named_steps["model"]
importances = xgb_model.feature_importances_

# NOTE(review): assumes the preprocessor emits the scaled columns first and the
# passthrough columns after them, matching the transformer order - confirm.
feature_names = numeric_to_scale + other_features

importance_df = (
    pd.DataFrame({"feature": feature_names, "importance": importances})
    .sort_values("importance", ascending=False)
)

# reverse the top 15 so the largest bar is drawn at the top of the horizontal chart
top_15 = importance_df.head(15).iloc[::-1]

plt.figure(figsize=(8, 6))
plt.barh(top_15["feature"], top_15["importance"])
plt.xlabel("Importance")
plt.title("Top 15 most important features")
plt.tight_layout()
plt.show()

print("Top 10 features:")
print(importance_df.head(10).to_string(index=False))

## 5. Conclusion

XGBoost delivers strong results on our fraud detection problem. Both baseline and tuned models achieve an F1 score around 0.85-0.86, which is better than what we got with Random Forest (0.83).

Looking at the confusion matrices, the tuned model catches 74 out of 95 frauds (78% recall) while only generating 3 false alarms. The baseline is almost identical with 73 true positives. This shows that XGBoost with scale_pos_weight already works well out of the box, and tuning only gives a small improvement.

The precision is excellent at 96%, meaning when the model flags a transaction as fraud, it's almost always right. This is important in practice because too many false alerts would annoy customers and overload the fraud team.

Compared to our previous models, XGBoost gives the best balance between catching frauds and avoiding false positives. The gradient boosting approach where each tree corrects the errors of previous ones seems to work well for this type of tabular data.