# Step 5: Model Explainability with SHAP

In this step, we interpret our XGBoost model to understand what drives fraud predictions.  
We will generate global feature importance, individual prediction force plots, and actionable business insights.


In [4]:
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

import xgboost as xgb


AttributeError: module 'numpy' has no attribute '_no_nep50_warning'

In [None]:
# Read the processed fraud dataset from disk
fraud_df = pd.read_csv("../data/processed/Fraud_Data_cleaned.csv")

# Target is the `class` column. Features are whatever remains after removing
# the target and the listed columns; any listed column that is absent is
# skipped silently because of errors='ignore'.
y = fraud_df['class']
X = fraud_df.drop(
    columns=['class','signup_time','purchase_time','ip_address','user_id','device_id'],
    errors='ignore',
)

# Expand object/category columns into dummy indicators, dropping the first
# level of each encoded column
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()
X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Hold out 20% of the rows for testing, stratified on the target, with a
# fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Scale numeric features
# Work on explicit copies first: the frames returned by train_test_split are
# row selections of X_encoded, and assigning columns into such selections can
# trigger pandas' SettingWithCopyWarning. The names X_train / X_test are kept
# so later cells see the scaled frames exactly as before.
X_train = X_train.copy()
X_test = X_test.copy()

# Only float64/int64 columns are standardized; columns of any other dtype
# are left untouched.
num_cols = X_train.select_dtypes(include=['float64','int64']).columns
scaler = StandardScaler()

# Fit the scaler on the training split only, then reuse the fitted scaler
# to transform the test split.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols]  = scaler.transform(X_test[num_cols])

# Apply SMOTE to the training split only; the test split is not resampled
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Positive-class weight, derived from the labels the model is actually fit on.
# The previous version used the pre-SMOTE y_train ratio, which re-weighted the
# positive class a second time on top of the oversampling. With SMOTE's default
# sampling strategy this ratio is expected to be ~1.0.
# NOTE(review): if Step 4 trained with the pre-SMOTE ratio, confirm which
# configuration this notebook is meant to explain.
pos_weight = (y_train_res == 0).sum() / (y_train_res == 1).sum()

# Retrain the XGBoost classifier here. This cell does not load a saved model;
# it always fits a fresh one on the resampled training data.
# NOTE(review): `use_label_encoder` is kept as in the original; recent XGBoost
# releases appear to ignore it with a warning -- confirm against the installed
# version before removing.
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    scale_pos_weight=pos_weight,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)
xgb_model.fit(X_train_res, y_train_res)
