In [1]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Select seaborn's "whitegrid" plot style, then confirm the setup cells ran.
sns.set_style(style="whitegrid")
print("Libraries imported successfully.")

Libraries imported successfully.


Load the Dataset

In [None]:
# Read the raw transaction log into `df`. A missing file is reported instead
# of raised, and `df` is set to None so the cells below can skip themselves.
try:
    df = pd.read_csv('../data/raw/PS_20174392719_1491204439457_log.csv')
except FileNotFoundError:
    df = None
    print("Error: Dataset file not found.")
else:
    print("Dataset loaded successfully.")

Preprocessing and Feature Engineering

In [None]:
if df is not None:
    # Keep only CASH_OUT and TRANSFER rows, then one-hot encode `type`.
    # With drop_first=True one of the two indicator columns is dropped.
    df_filtered = pd.get_dummies(
        df.loc[df['type'].isin(['CASH_OUT', 'TRANSFER'])].copy(),
        columns=['type'],
        prefix='type',
        drop_first=True,
    )

    # Target is the fraud label; the features are everything except the two
    # label columns and the two name columns.
    y = df_filtered['isFraud']
    X = df_filtered.drop(columns=['isFraud', 'isFlaggedFraud', 'nameOrig', 'nameDest'])
    print("Preprocessing and feature engineering complete.")



Split Data into Training and Testing Sets

In [None]:
if 'X' in locals():
    # Hold out 20% of the rows for testing. The split is stratified on y and
    # seeded so that it is repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        stratify=y,
        random_state=42,
        test_size=0.2,
    )
    print("Data split complete.")

Handle Class Imbalance with SMOTE

In [None]:
if 'X_train' in locals():
    # Class counts before oversampling.
    print("\nOriginal training set distribution:\n", y_train.value_counts())

    # Oversample the training split only; X_test / y_test are not touched here.
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(
        X_train,
        y_train,
    )

    # Class counts after oversampling.
    print("\nResampled training set distribution:\n", y_train_resampled.value_counts())

# Cast every feature column to float64 so SHAP and XGBoost receive purely
# numeric input. Values that cannot be parsed as numbers become NaN
# (errors='coerce').
if 'X_train_resampled' in locals():
    X_train_resampled = (
        X_train_resampled
        .apply(pd.to_numeric, errors='coerce')
        .astype(np.float64)
    )
if 'X_test' in locals():
    X_test = (
        X_test
        .apply(pd.to_numeric, errors='coerce')
        .astype(np.float64)
    )


Train the XGBoost Model

In [None]:
if 'X_train_resampled' in locals():
    # Binary logistic XGBoost classifier, seeded for repeatability, with
    # 'aucpr' configured as the evaluation metric.
    # NOTE: `use_label_encoder` is intentionally not passed. It is a deprecated
    # XGBoost argument that recent releases ignore (with a warning), and the
    # labels used here are already the integers 0/1.
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='aucpr',
        random_state=42
    )
    print("\nTraining the XGBoost model...")
    # Fit on the SMOTE-resampled training data produced earlier.
    model.fit(X_train_resampled, y_train_resampled)
    print("Model training completed.")

Evaluate the Model 

In [None]:
if 'model' in locals():
    print("\nEvaluating model performance...")
    # Predict on the held-out test split and print the classification report
    # under its heading.
    y_pred = model.predict(X_test)
    print(
        "\nClassification Report:",
        classification_report(y_test, y_pred),
        sep="\n",
    )


Create SHAP Explainer

In [None]:
if 'model' in locals():
    print("\nCreating SHAP explainer...")
    # Background distribution for the explainer: a fixed-seed random sample of
    # the resampled training data rather than the whole set. With a background
    # dataset, the cost of each explanation grows with the number of background
    # rows, and SHAP's documentation suggests roughly 100-1000 rows.
    # shap.sample returns the data unchanged if it has fewer rows than requested.
    # NOTE(review): the explainer is pickled in the save cell; a small background
    # presumably also keeps that file small -- confirm.
    shap_background = shap.sample(X_train_resampled, 1000, random_state=42)
    explainer = shap.TreeExplainer(model, shap_background)
    print("SHAP explainer created successfully.")

    # Optional: visualize the explanation for the first test row
    shap_values = explainer.shap_values(X_test.iloc[:1])
    shap.initjs()
    display(shap.force_plot(explainer.expected_value, shap_values[0,:], X_test.iloc[0,:]))

Save the Model and the Explainer

In [None]:
if 'model' in locals() and 'explainer' in locals():
    # joblib.dump does not create missing directories, so make sure the output
    # folder exists before writing any artifact.
    os.makedirs('../models', exist_ok=True)

    # Save Model
    model_path = '../models/xgboost_model.pkl'
    joblib.dump(model, model_path)
    print(f"\nModel saved successfully to {model_path}")

    # Save SHAP Explainer
    explainer_path = '../models/shap_explainer.pkl'
    joblib.dump(explainer, explainer_path)
    print(f"SHAP explainer saved successfully to {explainer_path}")

    # Save Columns: the feature names, in the column order of X
    columns_path = '../models/model_columns.pkl'
    joblib.dump(X.columns.tolist(), columns_path)
    print(f"Model columns saved successfully to {columns_path}")