In [2]:
!pip install shap



In [3]:
import shap
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

In [4]:
# --- Step 1: Load your dataset ---
fraud_df = pd.read_csv("Fraud_Data.csv")

# Drop columns not needed for modeling
X_fraud = fraud_df.drop(columns=["class", "purchase_time", "signup_time", "ip_address"], errors='ignore')
y_fraud = fraud_df["class"]

# Since 'country' column was missing earlier, let's remove it if present in features
if 'country' in X_fraud.columns:
    X_fraud = X_fraud.drop(columns=['country'])

In [5]:
# Identify categorical and numerical columns
categorical_cols = ['browser', 'source', 'sex']
numerical_cols = [col for col in X_fraud.columns if col not in categorical_cols]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_fraud, y_fraud, stratify=y_fraud, test_size=0.2, random_state=42
)

In [6]:
# Detect numeric columns by dtype
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Define categorical columns as the rest
categorical_cols = [col for col in X_train.columns if col not in numerical_cols]

print("Numerical columns:", numerical_cols)
print("Categorical columns:", categorical_cols)

# Now, create the preprocessor
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
])

X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)



Numerical columns: ['user_id', 'purchase_value', 'age']
Categorical columns: ['device_id', 'source', 'browser', 'sex']


In [7]:
# --- Step 2: Train LightGBM model ---
lgbm = LGBMClassifier(class_weight='balanced', random_state=42)
lgbm.fit(X_train_proc, y_train)

[LightGBM] [Info] Number of positive: 11321, number of negative: 109568
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002708 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 444
[LightGBM] [Info] Number of data points in the train set: 120889, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


In [8]:
X_train_proc = preprocessor.fit_transform(X_train).toarray()
X_test_proc = preprocessor.transform(X_test).toarray()


In [None]:
# --- Step 3: SHAP Explainability ---
explainer = shap.Explainer(lgbm, X_train_proc)
shap_values = explainer(X_test_proc)


 26%|=====               | 7717/30223 [14:37<42:37]       

In [None]:
# --- Step 4: Plot SHAP Summary plot (global feature importance) ---
shap.summary_plot(shap_values, X_test_proc, feature_names=preprocessor.get_feature_names_out())