In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score

import xgboost as xgb
import shap
import matplotlib.pyplot as plt
import warnings

# Suppress all warnings from this point on (no category/module restriction).
warnings.filterwarnings(action="ignore")

# Read the engineered dataset (per the original note, produced by Notebook 00).
dataset_path = "data/engineered_transactions.csv"
df = pd.read_csv(dataset_path)
print("Dataset shape: {}".format(df.shape))

# Last expression of the cell: renders the first five rows as the cell output.
df.head(n=5)

In [None]:
# Columns kept out of the feature matrix (judging by their names: an ID,
# a timestamp, and the target label itself).
non_feature_cols = ['transaction_id', 'transaction_time', 'is_fraud']
X = df.drop(columns=non_feature_cols)
y = df['is_fraud']

# Hold out 20% of the rows for testing. stratify=y preserves the class
# proportions of y in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Training samples: {n_train}, Test samples: {n_test}")

In [None]:
# Compute scale_pos_weight for XGBoost as (# negative labels) / (# positive labels)
# in the training split.
#
# The classes are counted with explicit boolean masks rather than
# `value_counts()[0]` / `value_counts()[1]`: an integer key on a Series is
# resolved by label or by position depending on the index dtype, so the old
# form silently relied on the labels being stored as integers 0/1. The mask
# form gives the same counts for int, float and bool labels, and only scans
# the labels once per class instead of building value_counts() twice.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()

# Fail with an explicit message instead of an opaque KeyError when a class is
# missing from the training split (the ratio would be undefined or zero).
if n_negative == 0 or n_positive == 0:
    raise ValueError(
        "Cannot compute scale_pos_weight: y_train must contain both classes "
        f"(negatives={n_negative}, positives={n_positive})"
    )

scale_pos_weight = n_negative / n_positive
print(f"scale_pos_weight: {scale_pos_weight:.2f}")

In [None]:
# Hyperparameters gathered in one mapping so they can be inspected in one place
# before being passed to the classifier.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 4,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "use_label_encoder": False,
    "eval_metric": 'logloss',
    "random_state": 42,
    # Class-imbalance weight taken from the scale_pos_weight variable.
    "scale_pos_weight": scale_pos_weight,
    # limit CPU usage to avoid freezing
    "n_jobs": 2,
}
model = xgb.XGBClassifier(**xgb_params)

# Fit on the training partition; as the cell's last expression this also
# displays the fitted estimator.
model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test rows.
y_pred = model.predict(X_test)

# Heading and matrix are printed on consecutive lines.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# Leading "\n" in the heading leaves a blank line before the report.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# Headline metrics (scikit-learn defaults), each rounded to two decimals.
headline_metrics = (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-score", f1_score),
)
for metric_label, metric_fn in headline_metrics:
    print(f"{metric_label}: {metric_fn(y_test, y_pred):.2f}")

In [None]:
# Use a smaller sample to reduce CPU usage.
# The sample size is capped at the number of available test rows:
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), which would break this cell on small datasets.
shap_sample_size = min(100, len(X_test))
shap_sample = X_test.sample(n=shap_sample_size, random_state=42)

# Build an explainer for the fitted model and compute SHAP values for the
# sampled rows only.
explainer = shap.Explainer(model)
shap_values = explainer(shap_sample)

# Summary plot (default plot type)
shap.summary_plot(shap_values, shap_sample)

# Bar plot (optional)
shap.summary_plot(shap_values, shap_sample, plot_type="bar")