# Phase 1: Data Collection — IEEE-CIS Fraud Detection

This notebook performs Phase 1 (Data Collection) for the IEEE-CIS Fraud Detection dataset:
- Sets up local paths to CSVs stored at C:\Users\asus\OneDrive\Desktop\datasets\ieee-fraud-detection
- Verifies files exist
- Loads train/test transaction and identity tables
- Identifies feature groups by column patterns
- Merges on TransactionID (left join)
- Runs basic sanity checks
- Persists merged outputs for downstream phases

In [2]:
# Imports, environment setup, and base paths
from pathlib import Path
import os
import pandas as pd
import numpy as np

# Reproducibility (not strictly needed in Phase 1, but handy later)
SEED = 42
np.random.seed(SEED)

# Base directory where the Kaggle CSVs live (Windows-safe raw string)
base_dir = Path(r"C:\Users\asus\OneDrive\Desktop\datasets\ieee-fraud-detection")

# Expected file names
train_transaction_fp = base_dir / "train_transaction.csv"
train_identity_fp   = base_dir / "train_identity.csv"
test_transaction_fp  = base_dir / "test_transaction.csv"
test_identity_fp     = base_dir / "test_identity.csv"
sample_submission_fp = base_dir / "sample_submission.csv"

print("Base data directory:", base_dir)
for p in [train_transaction_fp, train_identity_fp, test_transaction_fp, test_identity_fp, sample_submission_fp]:
    print("-", p)

Base data directory: C:\Users\asus\OneDrive\Desktop\datasets\ieee-fraud-detection
- C:\Users\asus\OneDrive\Desktop\datasets\ieee-fraud-detection\train_transaction.csv
- C:\Users\asus\OneDrive\Desktop\datasets\ieee-fraud-detection\train_identity.csv
- C:\Users\asus\OneDrive\Desktop\datasets\ieee-fraud-detection\test_transaction.csv
- C:\Users\asus\OneDrive\Desktop\datasets\ieee-fraud-detection\test_identity.csv
- C:\Users\asus\OneDrive\Desktop\datasets\ieee-fraud-detection\sample_submission.csv


In [3]:
# Verify dataset files exist in base_dir
required = {
    "train_transaction": train_transaction_fp,
    "train_identity": train_identity_fp,
    "test_transaction": test_transaction_fp,
    "test_identity": test_identity_fp,
    "sample_submission": sample_submission_fp,
}

missing = {name: p for name, p in required.items() if not p.exists()}
if missing:
    print("Missing required files:")
    for name, p in missing.items():
        print(f" - {name}: {p}")
    raise FileNotFoundError("One or more required CSV files are missing in base_dir. Please ensure all files are downloaded.")
else:
    print("All required files are present.")

All required files are present.


## Optional: If files are missing, you can fetch them via the Kaggle API.

If any files are missing, download the dataset manually from Kaggle or use the Kaggle API. Ensure you have kaggle.json configured and the Kaggle CLI installed. Example (do not run if you already have the files):

- Install Kaggle CLI and set up credentials
- Run: kaggle competitions download -c ieee-fraud-detection
- Unzip the downloaded files and move them into C:\\Users\\asus\\OneDrive\\Desktop\\datasets\\ieee-fraud-detection

Proceed only after all five files are present: train_transaction.csv, train_identity.csv, test_transaction.csv, test_identity.csv, sample_submission.csv.

In [4]:
# Load training tables (transaction, identity)

def mem_mb(df: pd.DataFrame) -> float:
    return float(df.memory_usage(deep=True).sum()) / (1024 ** 2)

print("Loading train_transaction.csv ...")
train_transaction = pd.read_csv(train_transaction_fp, low_memory=False)
print("Loading train_identity.csv ...")
train_identity = pd.read_csv(train_identity_fp, low_memory=False)

print("train_transaction shape:", train_transaction.shape, f"memory ~{mem_mb(train_transaction):.1f} MB")
print("train_identity   shape:", train_identity.shape,   f"memory ~{mem_mb(train_identity):.1f} MB")

display(train_transaction.head(3))
display(train_identity.head(3))

# Basic assertions for Phase 1
assert 'TransactionID' in train_transaction.columns, "TransactionID missing from train_transaction"
assert 'TransactionID' in train_identity.columns,   "TransactionID missing from train_identity"
assert 'isFraud' in train_transaction.columns,      "isFraud not found in train_transaction (target should be present in train only)"
print("Training tables OK: keys and target verified.")

Loading train_transaction.csv ...
Loading train_identity.csv ...
Loading train_identity.csv ...
train_transaction shape: (590540, 394) memory ~2062.1 MB
train_transaction shape: (590540, 394) memory ~2062.1 MB
train_identity   shape: (144233, 41) memory ~143.1 MB
train_identity   shape: (144233, 41) memory ~143.1 MB


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,


Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows


Training tables OK: keys and target verified.


In [5]:
# Load test tables (transaction, identity)
print("Loading test_transaction.csv ...")
test_transaction = pd.read_csv(test_transaction_fp, low_memory=False)
print("Loading test_identity.csv ...")
test_identity = pd.read_csv(test_identity_fp, low_memory=False)

print("test_transaction shape:", test_transaction.shape, f"memory ~{mem_mb(test_transaction):.1f} MB")
print("test_identity   shape:", test_identity.shape,   f"memory ~{mem_mb(test_identity):.1f} MB")

display(test_transaction.head(3))
display(test_identity.head(3))

# Basic assertions for Phase 1
assert 'TransactionID' in test_transaction.columns, "TransactionID missing from test_transaction"
assert 'TransactionID' in test_identity.columns,   "TransactionID missing from test_identity"
assert 'isFraud' not in test_transaction.columns,  "isFraud should not be present in test_transaction"
print("Test tables OK: keys verified and no target present.")

Loading test_transaction.csv ...


Loading test_identity.csv ...
test_transaction shape: (506691, 393) memory ~1771.8 MB
test_transaction shape: (506691, 393) memory ~1771.8 MB
test_identity   shape: (141907, 41) memory ~140.1 MB
test_identity   shape: (141907, 41) memory ~140.1 MB


Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,3663549,18403224,31.95,W,10409,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,,,,,,,,,,


Unnamed: 0,TransactionID,id-01,id-02,id-03,id-04,id-05,id-06,id-07,id-08,id-09,...,id-31,id-32,id-33,id-34,id-35,id-36,id-37,id-38,DeviceType,DeviceInfo
0,3663586,-45.0,280290.0,,,0.0,0.0,,,,...,chrome 67.0 for android,,,,F,F,T,F,mobile,MYA-L13 Build/HUAWEIMYA-L13
1,3663588,0.0,3579.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 67.0 for android,24.0,1280x720,match_status:2,T,F,T,T,mobile,LGLS676 Build/MXB48T
2,3663597,-5.0,185210.0,,,1.0,0.0,,,,...,ie 11.0 for tablet,,,,F,T,T,F,desktop,Trident/7.0


Test tables OK: keys verified and no target present.


In [6]:
# Identify feature components by column patterns

from typing import List, Dict

# Helpers to collect columns by pattern
cols_train = set(train_transaction.columns) | set(train_identity.columns)
cols_test  = set(test_transaction.columns) | set(test_identity.columns)

# Common names (handle variations e.g., TransactionAmt/TransactionAMT)
def find_cols(prefixes: List[str], cols: set) -> List[str]:
    out = []
    for c in sorted(cols):
        for p in prefixes:
            if c == p or c.startswith(p):
                out.append(c)
                break
    return out

transaction_core = [
    'TransactionDT', 'TransactionAmt', 'TransactionAMT', 'ProductCD',
    'card1','card2','card3','card4','card5','card6',
    'addr1','addr2',
    'P_emaildomain','R_emaildomain'
]

C_cols = [f"C{i}" for i in range(1, 15)]  # C1–C14
V_cols = [f"V{i}" for i in range(1, 340)]  # V1–V339
M_cols = [f"M{i}" for i in range(1, 10)]   # M1–M9
id_cols = ['DeviceType','DeviceInfo'] + [f"id_{i:02d}" for i in range(1, 39)]  # id_01–id_38

transaction_features_train = find_cols(transaction_core, cols_train) + find_cols(C_cols+V_cols+M_cols, cols_train)
identity_features_train    = find_cols(id_cols, cols_train)

transaction_features_test = find_cols(transaction_core, cols_test) + find_cols(C_cols+V_cols+M_cols, cols_test)
identity_features_test    = find_cols(id_cols, cols_test)

target = ['isFraud'] if 'isFraud' in train_transaction.columns else []

print("Detected feature groups (train):")
print(" - Transaction features:", len(transaction_features_train))
print(" - Identity features:", len(identity_features_train))
print(" - Target:", target)

print("\nDetected feature groups (test):")
print(" - Transaction features:", len(transaction_features_test))
print(" - Identity features:", len(identity_features_test))

Detected feature groups (train):
 - Transaction features: 375
 - Identity features: 40
 - Target: ['isFraud']

Detected feature groups (test):
 - Transaction features: 375
 - Identity features: 2


In [7]:
# Merge training tables on TransactionID (left join)
identity_only_cols = [c for c in train_identity.columns if c != 'TransactionID']

print("Merging train_transaction with train_identity (left join) ...")
train_merged = train_transaction.merge(
    train_identity,
    on='TransactionID',
    how='left',
    suffixes=('', '_id')
)

print("train_merged shape:", train_merged.shape)

# Validations
assert len(train_merged) == len(train_transaction), "Row count changed after merge; expected left join to preserve transaction rows."
assert train_transaction['TransactionID'].is_unique, "TransactionID not unique in train_transaction; unexpected duplicates."

# Check for column collisions (suffixed with _id)
collisions = [c for c in train_identity.columns if c in train_transaction.columns and c != 'TransactionID']
print("Number of overlapping column names (resolved with suffix _id):", len(collisions))

# Coverage: rows with any identity info present
if identity_only_cols:
    has_identity_mask = train_merged[identity_only_cols].notna().any(axis=1)
    with_identity = int(has_identity_mask.sum())
    without_identity = int((~has_identity_mask).sum())
    print(f"Identity coverage (train): with={with_identity:,} without={without_identity:,} of {len(train_merged):,}")
else:
    print("No identity-only columns detected; cannot compute identity coverage.")

Merging train_transaction with train_identity (left join) ...
train_merged shape: (590540, 434)
Number of overlapping column names (resolved with suffix _id): 0
train_merged shape: (590540, 434)
Number of overlapping column names (resolved with suffix _id): 0
Identity coverage (train): with=144,233 without=446,307 of 590,540
Identity coverage (train): with=144,233 without=446,307 of 590,540


In [8]:
# Merge test tables on TransactionID (left join)
identity_only_cols_test = [c for c in test_identity.columns if c != 'TransactionID']

print("Merging test_transaction with test_identity (left join) ...")
test_merged = test_transaction.merge(
    test_identity,
    on='TransactionID',
    how='left',
    suffixes=('', '_id')
)

print("test_merged shape:", test_merged.shape)

# Validations
assert len(test_merged) == len(test_transaction), "Row count changed after merge; expected left join to preserve transaction rows."
assert test_transaction['TransactionID'].is_unique, "TransactionID not unique in test_transaction; unexpected duplicates."

# Coverage: rows with any identity info present
if identity_only_cols_test:
    has_identity_mask_test = test_merged[identity_only_cols_test].notna().any(axis=1)
    with_identity_test = int(has_identity_mask_test.sum())
    without_identity_test = int((~has_identity_mask_test).sum())
    print(f"Identity coverage (test): with={with_identity_test:,} without={without_identity_test:,} of {len(test_merged):,}")
else:
    print("No identity-only columns detected in test; cannot compute identity coverage.")

Merging test_transaction with test_identity (left join) ...
test_merged shape: (506691, 433)
test_merged shape: (506691, 433)
Identity coverage (test): with=141,907 without=364,784 of 506,691
Identity coverage (test): with=141,907 without=364,784 of 506,691


In [9]:
# Sanity checks and basic stats

# isFraud distribution
fraud_counts = train_merged['isFraud'].value_counts(dropna=False).rename(index={0:'not fraud (0)',1:'fraud (1)'}).\
    rename_axis('class').reset_index(name='count')
print("isFraud counts (train):\n", fraud_counts)

# Null ratios for major groups
import re

def null_ratio_by_prefix(df: pd.DataFrame, pattern: str) -> float:
    cols = [c for c in df.columns if re.match(pattern, c)]
    if not cols:
        return float('nan')
    return float(df[cols].isna().mean().mean())

for label, regex in [
    ("card*", r"^card[0-9]+$"),
    ("C*",    r"^C[0-9]+$"),
    ("V*",    r"^V[0-9]+$"),
    ("id_*",  r"^id_\d{2}$"),
]:
    ratio = null_ratio_by_prefix(train_merged, regex)
    print(f"Average null ratio for {label} (train): {ratio:.3f}")

# TransactionAmt stats and TransactionDT range
if 'TransactionAmt' in train_merged.columns:
    print("\nTransactionAmt describe (train):\n", train_merged['TransactionAmt'].describe())
if 'TransactionDT' in train_merged.columns:
    print("\nTransactionDT min/max (train):", int(train_merged['TransactionDT'].min()), int(train_merged['TransactionDT'].max()))

# Column alignment between train_merged and test_merged (excluding isFraud)
train_cols_no_target = [c for c in train_merged.columns if c != 'isFraud']
missing_in_test = sorted(set(train_cols_no_target) - set(test_merged.columns))
missing_in_train = sorted(set(test_merged.columns) - set(train_cols_no_target))
print("\nColumns in train (excluding isFraud) missing in test:", missing_in_test)
print("Columns in test missing in train (excluding isFraud):", missing_in_train)

isFraud counts (train):
            class   count
0  not fraud (0)  569877
1      fraud (1)   20663
Average null ratio for card* (train): 0.005
Average null ratio for C* (train): 0.000
Average null ratio for V* (train): 0.430
Average null ratio for V* (train): 0.430
Average null ratio for id_* (train): 0.848

TransactionAmt describe (train):
 count    590540.000000
mean        135.027176
std         239.162522
min           0.251000
25%          43.321000
50%          68.769000
75%         125.000000
max       31937.391000
Name: TransactionAmt, dtype: float64

TransactionDT min/max (train): 86400 15811131

Columns in train (excluding isFraud) missing in test: ['id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08', 'id_09', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37'

In [10]:
# Persist merged datasets to disk (Parquet preferred, CSV fallback)

# Save into the dataset directory for convenient reuse with the raw files
out_train_parquet = base_dir / "train_merged.parquet"
out_test_parquet  = base_dir / "test_merged.parquet"
out_train_csv     = base_dir / "train_merged.csv"
out_test_csv      = base_dir / "test_merged.csv"

saved = []

# Always provide CSV as a fallback
try:
    train_merged.to_csv(out_train_csv, index=False)
    test_merged.to_csv(out_test_csv, index=False)
    saved += [out_train_csv, out_test_csv]
    print("Saved CSV files:")
    print(" -", out_train_csv)
    print(" -", out_test_csv)
except Exception as e:
    print("CSV save failed:", e)

# Show sizes
for p in saved:
    try:
        size_mb = (p.stat().st_size) / (1024**2)
        print(f"File: {p} | Size: {size_mb:.2f} MB")
    except Exception:
        pass

Saved CSV files:
 - C:\Users\asus\OneDrive\Desktop\datasets\ieee-fraud-detection\train_merged.csv
 - C:\Users\asus\OneDrive\Desktop\datasets\ieee-fraud-detection\test_merged.csv
File: C:\Users\asus\OneDrive\Desktop\datasets\ieee-fraud-detection\train_merged.csv | Size: 693.43 MB
File: C:\Users\asus\OneDrive\Desktop\datasets\ieee-fraud-detection\test_merged.csv | Size: 622.66 MB


# Phase 2: Data Preparation and Preprocessing (Section 3.3)

This section prepares the merged training data for modeling:

- Remove identifiers (drop TransactionID)
- Handle missing data: median for numeric, most-frequent for categorical
- Encode categorical variables using label encoding (OrdinalEncoder)
- Split dataset into train (80%) and validation (20%) with stratification
- Address class imbalance with SMOTE on the training split only
- Persist processed arrays and the fitted preprocessing pipeline for reuse

In [11]:
# Phase 2 setup: ensure required packages are available
import sys, subprocess, importlib

def ensure(pkg, import_name=None):
    name = import_name or pkg
    try:
        importlib.import_module(name)
    except ImportError:
        print(f"Installing {pkg} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

for pkg in ["scikit-learn", "imblearn", "joblib"]:
    ensure(pkg)


Installing scikit-learn ...


In [12]:
# Prepare dataset, drop identifiers, and define column groups
from pathlib import Path
import pandas as pd
import numpy as np

# Reuse SEED and base_dir if already defined
try:
    SEED
except NameError:
    SEED = 42
try:
    base_dir
except NameError:
    base_dir = Path(r"C:\Users\asus\OneDrive\Desktop\datasets\ieee-fraud-detection")

# Load merged datasets if not present in memory
if 'train_merged' not in globals() or 'test_merged' not in globals():
    train_path = base_dir / "train_merged.csv"
    test_path  = base_dir / "test_merged.csv"
    if train_path.exists() and test_path.exists():
        print("Loading pre-merged CSVs ...")
        train_merged = pd.read_csv(train_path, low_memory=False)
        test_merged  = pd.read_csv(test_path, low_memory=False)
    else:
        raise RuntimeError("Merged datasets not found in memory and CSVs missing. Run Phase 1 cells first.")

# 1) Remove identifiers
id_cols = [c for c in ['TransactionID'] if c in train_merged.columns]
X_df = train_merged.drop(columns=id_cols + ['isFraud'])
y = train_merged['isFraud'].astype(int)

# Split columns by dtype (object => categorical)
cat_cols = [c for c in X_df.columns if X_df[c].dtype == 'object']
num_cols = [c for c in X_df.columns if c not in cat_cols]

print(f"Identifier columns dropped: {id_cols}")
print(f"Detected categorical columns: {len(cat_cols)}")
print(f"Detected numerical columns: {len(num_cols)}")


Identifier columns dropped: ['TransactionID']
Detected categorical columns: 31
Detected numerical columns: 401


In [13]:
# Build preprocessing pipeline, encode categoricals, and split 80/20 (stratified)
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

# 2) Impute missing data and 3) Encode categoricals via a preprocessing pipeline
numeric_transformer = SimpleImputer(strategy="median")
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ],
    remainder="drop",
    sparse_threshold=0.0,
)

# 4) Split the dataset with stratification to preserve class distribution
X_train_df, X_valid_df, y_train, y_valid = train_test_split(
    X_df, y, test_size=0.20, random_state=SEED, stratify=y
)

# Fit on training only, transform both train/valid
X_train = preprocessor.fit_transform(X_train_df)
X_valid = preprocessor.transform(X_valid_df)

print("Shapes after preprocessing:")
print(" - X_train:", X_train.shape, "| y_train:", y_train.shape)
print(" - X_valid:", X_valid.shape, "| y_valid:", y_valid.shape)

# Verify class distribution preservation
print("Class distribution (train):")
print(y_train.value_counts(normalize=True).rename(index={0: "not fraud", 1: "fraud"}))
print("Class distribution (valid):")
print(y_valid.value_counts(normalize=True).rename(index={0: "not fraud", 1: "fraud"}))


Shapes after preprocessing:
 - X_train: (472432, 432) | y_train: (472432,)
 - X_valid: (118108, 432) | y_valid: (118108,)
Class distribution (train):
isFraud
not fraud    0.965011
fraud        0.034989
Name: proportion, dtype: float64
Class distribution (valid):
isFraud
not fraud    0.965007
fraud        0.034993
Name: proportion, dtype: float64


In [14]:
# Apply SMOTE to address class imbalance (training set only)
from collections import Counter
from imblearn.over_sampling import SMOTE

print("Before SMOTE class counts:", Counter(y_train))
smote = SMOTE(random_state=SEED, sampling_strategy="auto")
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print("After SMOTE class counts:", Counter(y_train_smote))


Before SMOTE class counts: Counter({0: 455902, 1: 16530})
After SMOTE class counts: Counter({0: 455902, 1: 455902})
After SMOTE class counts: Counter({0: 455902, 1: 455902})


In [15]:
# Persist processed datasets and the fitted preprocessor/schema
from pathlib import Path
import joblib
import numpy as np

processed_dir = Path("data") / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)

# Save arrays
np.save(processed_dir / "X_train_smote.npy", X_train_smote)
np.save(processed_dir / "y_train_smote.npy", y_train_smote.to_numpy() if hasattr(y_train_smote, "to_numpy") else np.asarray(y_train_smote))
np.save(processed_dir / "X_valid.npy", X_valid)
np.save(processed_dir / "y_valid.npy", y_valid.to_numpy() if hasattr(y_valid, "to_numpy") else np.asarray(y_valid))

# Save the fitted preprocessor and column schema
artifacts_dir = processed_dir / "artifacts"
artifacts_dir.mkdir(exist_ok=True)
joblib.dump(preprocessor, artifacts_dir / "preprocessor.joblib")

schema = {
    "drop_cols": id_cols,
    "categorical_cols": cat_cols,
    "numerical_cols": num_cols,
    "feature_count": int(X_train.shape[1]),
}
joblib.dump(schema, artifacts_dir / "schema.joblib")

print("Saved processed artifacts:")
for p in [
    processed_dir / "X_train_smote.npy",
    processed_dir / "y_train_smote.npy",
    processed_dir / "X_valid.npy",
    processed_dir / "y_valid.npy",
    artifacts_dir / "preprocessor.joblib",
    artifacts_dir / "schema.joblib",
]:
    print(" -", p.resolve())


Saved processed artifacts:
 - C:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\X_train_smote.npy
 - C:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\y_train_smote.npy
 - C:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\X_valid.npy
 - C:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\y_valid.npy
 - C:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\artifacts\preprocessor.joblib
 - C:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\artifacts\schema.joblib


# Phase 3: Feature Selection (Section 3.4)

This phase selects a compact, high-signal feature subset:

- Train an initial XGBoost classifier on the full preprocessed training data (after SMOTE)
- Compute SHAP values to quantify global feature influence
- Select the top 30 features by mean absolute SHAP value and persist indices/names and reduced arrays

In [16]:
# Ensure Phase 3 dependencies: xgboost and shap
import sys, subprocess, importlib

def ensure(pkg, import_name=None):
    name = import_name or pkg
    try:
        importlib.import_module(name)
    except ImportError:
        print(f"Installing {pkg} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

ensure("xgboost")
ensure("shap")


  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# Load processed arrays and schema for feature names
from pathlib import Path
import numpy as np
import joblib

processed_dir = Path("data") / "processed"
artifacts_dir = processed_dir / "artifacts"

X_train_smote = np.load(processed_dir / "X_train_smote.npy")
y_train_smote = np.load(processed_dir / "y_train_smote.npy")
X_valid       = np.load(processed_dir / "X_valid.npy")
y_valid       = np.load(processed_dir / "y_valid.npy")

schema = joblib.load(artifacts_dir / "schema.joblib")
feature_names = list(schema.get("numerical_cols", [])) + list(schema.get("categorical_cols", []))

# Cast to float32 to reduce memory pressure
X_train_smote = X_train_smote.astype(np.float32, copy=False)
X_valid       = X_valid.astype(np.float32, copy=False)

print("Loaded processed arrays:")
print(" - X_train_smote:", X_train_smote.shape, "y:", y_train_smote.shape)
print(" - X_valid:", X_valid.shape, "y:", y_valid.shape)
print("Feature names count:", len(feature_names))
if X_train_smote.shape[1] != len(feature_names):
    print("WARNING: Feature name count does not match X columns. Will fallback to positional naming.")


Loaded processed arrays:
 - X_train_smote: (911804, 432) y: (911804,)
 - X_valid: (118108, 432) y: (118108,)
Feature names count: 432


In [18]:
# Train an initial XGBoost classifier on full preprocessed features
from xgboost import XGBClassifier
import numpy as np

# Optional cap to prevent out-of-memory on very large resampled sets
CAP_TRAIN_ROWS = 300_000
if X_train_smote.shape[0] > CAP_TRAIN_ROWS:
    rng = np.random.default_rng(SEED)
    idx = rng.choice(X_train_smote.shape[0], size=CAP_TRAIN_ROWS, replace=False)
    X_train_sub = X_train_smote[idx]
    y_train_sub = y_train_smote[idx]
    print(f"Subsampled training to {CAP_TRAIN_ROWS} rows for efficiency.")
else:
    X_train_sub = X_train_smote
    y_train_sub = y_train_smote

model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",
    random_state=SEED,
    n_jobs=-1,
    eval_metric="logloss",
)

model.fit(X_train_sub, y_train_sub, eval_set=[(X_valid, y_valid)], verbose=False)

# Persist the baseline model
(artifacts_dir).mkdir(parents=True, exist_ok=True)
model_path = artifacts_dir / "xgb_baseline_model.json"
model.save_model(str(model_path))
print("Saved baseline XGBoost model to:", model_path.resolve())


Subsampled training to 300000 rows for efficiency.
Saved baseline XGBoost model to: C:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\artifacts\xgb_baseline_model.json
Saved baseline XGBoost model to: C:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\artifacts\xgb_baseline_model.json


In [19]:
# Compute SHAP values and global feature importances (with robust fallback for XGBoost)
import numpy as np
import pandas as pd
import shap
import xgboost as xgb

# Use a sample of validation data for efficiency and to reflect real class distribution
SAMPLE_SHAP = min(20000, X_valid.shape[0])
rng = np.random.default_rng(SEED)
idx = rng.choice(X_valid.shape[0], size=SAMPLE_SHAP, replace=False) if X_valid.shape[0] > SAMPLE_SHAP else np.arange(X_valid.shape[0])
X_shap = X_valid[idx]

# Try TreeExplainer first; if it fails due to version incompatibilities, fall back to XGBoost pred_contribs
try:
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_shap)
    # Handle potential list output (e.g., multiclass). For binary, take the single array.
    if isinstance(shap_values, list):
        shap_values = shap_values[0]
except Exception as e:
    print("SHAP TreeExplainer failed; falling back to XGBoost pred_contribs. Reason:", e)
    booster = model.get_booster()
    d_shap = xgb.DMatrix(X_shap)
    contribs = booster.predict(d_shap, pred_contribs=True)
    # For binary classification contribs shape is (n_samples, n_features+1), last col is bias term
    if contribs.ndim == 3:  # multiclass shape (n, nfeat+1, nclasses) -> use first class
        contribs = contribs[:, :, 0]
    shap_values = contribs[:, :-1]  # drop bias

mean_abs_shap = np.abs(shap_values).mean(axis=0)

# Fallback to positional names if counts mismatch
if X_train_smote.shape[1] != len(feature_names):
    feature_names_used = [f"f_{i}" for i in range(X_train_smote.shape[1])]
else:
    feature_names_used = feature_names

importance_df = (
    pd.DataFrame({"feature": feature_names_used, "mean_abs_shap": mean_abs_shap})
    .sort_values("mean_abs_shap", ascending=False)
    .reset_index(drop=True)
)

print("Top 10 features by mean |SHAP|:")
print(importance_df.head(10))


SHAP TreeExplainer failed; falling back to XGBoost pred_contribs. Reason: could not convert string to float: '[4.9974334E-1]'
 could not convert string to float: '[4.9974334E-1]'
Top 10 features by mean |SHAP|:
          feature  mean_abs_shap
0           card6       0.409217
1             C14       0.313880
2            V317       0.282561
3            V306       0.270561
4              C1       0.249904
5             C11       0.232290
6  TransactionAmt       0.227434
7            V282       0.199029
8             C13       0.198148
9            V126       0.186841
Top 10 features by mean |SHAP|:
          feature  mean_abs_shap
0           card6       0.409217
1             C14       0.313880
2            V317       0.282561
3            V306       0.270561
4              C1       0.249904
5             C11       0.232290
6  TransactionAmt       0.227434
7            V282       0.199029
8             C13       0.198148
9            V126       0.186841


In [20]:
# Select top-30 features by SHAP, persist selection and reduced arrays
from pathlib import Path
import numpy as np
import joblib

TOP_K = 30
selected_df = importance_df.head(TOP_K).copy()
selected_features = selected_df["feature"].tolist()

# Build index mapping
if feature_names and set(selected_features).issubset(set(feature_names)):
    name_to_idx = {n: i for i, n in enumerate(feature_names)}
    selected_indices = [name_to_idx[n] for n in selected_features]
else:
    # Positional fallback
    selected_indices = list(range(TOP_K))

# Save selection
selection = {
    "top_k": TOP_K,
    "feature_names": feature_names if feature_names else [f"f_{i}" for i in range(X_train_smote.shape[1])],
    "selected_features": selected_features,
    "selected_indices": selected_indices,
}
artifacts_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(selection, artifacts_dir / "selected_features.joblib")

# Reduce arrays and persist
reduced_dir = Path("data") / "processed"
np.save(reduced_dir / "X_train_smote_top30.npy", X_train_smote[:, selected_indices])
np.save(reduced_dir / "X_valid_top30.npy", X_valid[:, selected_indices])

# Also persist a CSV of SHAP importances for reference
(artifacts_dir / "shap_importance.csv").write_text(selected_df.to_csv(index=False))

print("Selected top-30 features saved. Indices:", selected_indices[:10], "...")
print("Paths:")
print(" -", (artifacts_dir / "selected_features.joblib").resolve())
print(" -", (reduced_dir / "X_train_smote_top30.npy").resolve())
print(" -", (reduced_dir / "X_valid_top30.npy").resolve())
print(" -", (artifacts_dir / "shap_importance.csv").resolve())


Selected top-30 features saved. Indices: [403, 23, 355, 344, 10, 20, 1, 320, 22, 164] ...
Paths:
 - C:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\artifacts\selected_features.joblib
 - C:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\X_train_smote_top30.npy
 - C:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\X_valid_top30.npy
 - C:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\artifacts\shap_importance.csv


# Phase 4: Hyperparameter Optimization (Section 3.5)

This phase tunes the XGBoost classifier with Optuna:

- Use Optuna for automated hyperparameter search (20 trials by default)
- Tune: n_estimators, max_depth, learning_rate, subsample, colsample_bytree, scale_pos_weight (around 1 since training is SMOTE-balanced),
  plus a few regularization knobs (min_child_weight, reg_alpha, reg_lambda, gamma)
- Retrain with the best parameters and persist the optimized model and study artifacts

In [21]:
# Ensure Optuna is available
import sys, subprocess, importlib

def ensure(pkg, import_name=None):
    name = import_name or pkg
    try:
        importlib.import_module(name)
    except ImportError:
        print(f"Installing {pkg} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

ensure("optuna")
import optuna


In [22]:
# Load reduced features for tuning (top-30), fallback to full features if missing
from pathlib import Path
import numpy as np
import joblib

processed_dir = Path("data") / "processed"
artifacts_dir = processed_dir / "artifacts"

# Prefer reduced arrays from Phase 3
X_train_path = processed_dir / "X_train_smote_top30.npy"
X_valid_path = processed_dir / "X_valid_top30.npy"

if X_train_path.exists() and X_valid_path.exists():
    print("Using top-30 selected features for tuning.")
    X_train_tune = np.load(X_train_path)
    X_valid_tune = np.load(X_valid_path)
else:
    print("Top-30 arrays not found; falling back to full feature arrays.")
    X_train_tune = np.load(processed_dir / "X_train_smote.npy")
    X_valid_tune = np.load(processed_dir / "X_valid.npy")

# Targets
y_train_smote = np.load(processed_dir / "y_train_smote.npy")
y_valid = np.load(processed_dir / "y_valid.npy")

print("Shapes for tuning:")
print(" - X_train_tune:", X_train_tune.shape, "y:", y_train_smote.shape)
print(" - X_valid_tune:", X_valid_tune.shape, "y:", y_valid.shape)


Using top-30 selected features for tuning.
Shapes for tuning:
 - X_train_tune: (911804, 30) y: (911804,)
 - X_valid_tune: (118108, 30) y: (118108,)


In [23]:
# Run Optuna study to tune XGBoost hyperparameters
import optuna
from optuna.samplers import TPESampler
from optuna.integration import XGBoostPruningCallback
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
import numpy as np

N_TRIALS = 20
MAX_ROWS_TUNE = 300_000  # cap rows to keep trials fast

# Subsample large SMOTE-balanced training set for tuning speed (keep validation full)
if X_train_tune.shape[0] > MAX_ROWS_TUNE:
    rng = np.random.default_rng(SEED)
    sel = rng.choice(X_train_tune.shape[0], size=MAX_ROWS_TUNE, replace=False)
    X_train_opt = X_train_tune[sel]
    y_train_opt = y_train_smote[sel]
else:
    X_train_opt = X_train_tune
    y_train_opt = y_train_smote

# Since training data is SMOTE-balanced, scale_pos_weight is expected near 1.0; we still allow slight tuning.
def objective(trial: optuna.trial.Trial) -> float:
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 2.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.8, 1.2),
        # Fixed settings
        "tree_method": "hist",
        "n_jobs": -1,
        "random_state": SEED,
        "eval_metric": "auc",
    }

    model = XGBClassifier(**params)
    pruning_cb = XGBoostPruningCallback(trial, "validation_0-auc")
    model.fit(
        X_train_opt,
        y_train_opt,
        eval_set=[(X_valid_tune, y_valid)],
        verbose=False,
        early_stopping_rounds=50,
        callbacks=[pruning_cb],
    )

    # Predict probabilities for ROC AUC (use best_iteration from early stopping)
    y_proba = model.predict_proba(X_valid_tune)[:, 1]
    auc = roc_auc_score(y_valid, y_proba)
    return auc

study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=SEED))
study.optimize(objective, n_trials=N_TRIALS, timeout=1200, show_progress_bar=False)

print("Best AUC:", study.best_value)
print("Best params:")
for k, v in study.best_params.items():
    print(f" - {k}: {v}")

# Persist study and params
import joblib
artifacts_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(study, artifacts_dir / "optuna_study.joblib")
joblib.dump(study.best_params, artifacts_dir / "optuna_best_params.joblib")

# Also save trials summary to CSV
import pandas as pd
try:
    df_trials = study.trials_dataframe()
    df_trials.to_csv(artifacts_dir / "optuna_trials.csv", index=False)
except Exception as e:
    print("Could not export trials to CSV:", e)


[I 2025-11-05 12:40:56,648] A new study created in memory with name: no-name-7e9e7dd8-58e1-457f-b798-a0c1380d1c5c
[I 2025-11-05 12:41:25,488] Trial 0 finished with value: 0.9405304126922193 and parameters: {'n_estimators': 500, 'max_depth': 10, 'learning_rate': 0.06504856968981275, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.6624074561769746, 'min_child_weight': 2, 'reg_alpha': 0.05808361216819946, 'reg_lambda': 1.7323522915498704, 'gamma': 3.005575058716044, 'scale_pos_weight': 1.0832290311184183}. Best is trial 0 with value: 0.9405304126922193.
[I 2025-11-05 12:41:25,488] Trial 0 finished with value: 0.9405304126922193 and parameters: {'n_estimators': 500, 'max_depth': 10, 'learning_rate': 0.06504856968981275, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.6624074561769746, 'min_child_weight': 2, 'reg_alpha': 0.05808361216819946, 'reg_lambda': 1.7323522915498704, 'gamma': 3.005575058716044, 'scale_pos_weight': 1.0832290311184183}. Best is trial 0 with value: 0.9405

KeyboardInterrupt: 

In [24]:
# Retrain XGBoost with best Optuna parameters and persist model
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import joblib

best_params = joblib.load(artifacts_dir / "optuna_best_params.joblib")

# Ensure fixed fields
best_params.update({
    "tree_method": "hist",
    "n_jobs": -1,
    "random_state": SEED,
    "eval_metric": "auc",
})

best_model = XGBClassifier(**best_params)
best_model.fit(
    X_train_tune,
    y_train_smote,
    eval_set=[(X_valid_tune, y_valid)],
    verbose=False,
)

# Evaluate
val_auc = roc_auc_score(y_valid, best_model.predict_proba(X_valid_tune)[:, 1])
print(f"Validation ROC AUC (best params): {val_auc:.4f}")

# Persist optimized model
optimized_model_path = artifacts_dir / "xgb_optuna_model.json"
best_model.save_model(str(optimized_model_path))
print("Saved optimized XGBoost model to:", optimized_model_path.resolve())


Validation ROC AUC (best params): 0.9562
Saved optimized XGBoost model to: C:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\artifacts\xgb_optuna_model.json


# Phase 5: Model Development — Stacking Ensemble (Section 3.6)\n\nIn this phase we build a stacking ensemble: three gradient-boosting base learners (XGBoost, LightGBM, CatBoost) feed their predicted probabilities into a simple XGBoost meta-learner.\n\nSteps:\n- Train base learners using the top-30 SHAP-selected features on the SMOTE-balanced training set.\n- Generate out-of-fold (OOF) predictions from base learners to form meta-training features (reduces leakage).\n- Train the meta-learner on OOF features.\n- Refit base learners on full balanced training and evaluate the stacked model on the held-out validation set.\n- Persist all trained models and stacking metadata for inference.


In [25]:
# Phase 5 setup: ensure LightGBM and CatBoost are available
import sys, subprocess, importlib

def ensure(pkg, import_name=None):
    name = import_name or pkg
    try:
        importlib.import_module(name)
    except ImportError:
        print(f"Installing {pkg} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

# Ensure required packages for stacking
ensure("lightgbm", "lightgbm")
ensure("catboost", "catboost")
ensure("xgboost", "xgboost")


In [27]:
# Load top-30 arrays for stacking (fallback to full features if missing)
from pathlib import Path
import numpy as np
import joblib

# Reuse SEED if already defined
try:
    SEED
except NameError:
    SEED = 42

processed_dir = Path("data") / "processed"
artifacts_dir = processed_dir / "artifacts"

X_train_path = processed_dir / "X_train_smote_top30.npy"
X_valid_path = processed_dir / "X_valid_top30.npy"

if X_train_path.exists() and X_valid_path.exists():
    print("Using top-30 selected features for stacking.")
    X_train_stack = np.load(X_train_path)
    X_valid_stack = np.load(X_valid_path)
else:
    print("Top-30 arrays not found; falling back to full feature arrays.")
    X_train_stack = np.load(processed_dir / "X_train_smote.npy")
    X_valid_stack = np.load(processed_dir / "X_valid.npy")

y_train_smote = np.load(processed_dir / "y_train_smote.npy")
y_valid = np.load(processed_dir / "y_valid.npy")

# Cast to float32 for efficiency
X_train_stack = X_train_stack.astype(np.float32, copy=False)
X_valid_stack  = X_valid_stack.astype(np.float32, copy=False)

print("Shapes for stacking:")
print(" - X_train_stack:", X_train_stack.shape, "y:", y_train_smote.shape)
print(" - X_valid_stack:", X_valid_stack.shape, "y:", y_valid.shape)


Using top-30 selected features for stacking.
Shapes for stacking:
 - X_train_stack: (911804, 30) y: (911804,)
 - X_valid_stack: (118108, 30) y: (118108,)


In [28]:
# Train base learners with OOF and fit meta-learner; evaluate and persist models
import numpy as np
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 1) Define base learners
# Try to reuse tuned XGBoost params if available; otherwise use reasonable defaults
xgb_base_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "hist",
    "random_state": SEED,
    "n_jobs": -1,
    "eval_metric": "auc",
}
try:
    _best = joblib.load(artifacts_dir / "optuna_best_params.joblib")
    # Keep critical fields consistent; fall back if keys missing
    xgb_base_params.update({k: v for k, v in _best.items() if k in xgb_base_params or k in ["min_child_weight","reg_alpha","reg_lambda","gamma","scale_pos_weight"]})
except Exception:
    pass

def make_xgb():
    return XGBClassifier(**xgb_base_params)

def make_lgbm():
    return LGBMClassifier(
        n_estimators=600, learning_rate=0.05, num_leaves=64,
        subsample=0.8, colsample_bytree=0.8,
        reg_alpha=0.0, reg_lambda=0.0,
        random_state=SEED, n_jobs=-1, objective="binary",
        verbosity=-1,
    )

def make_cat():
    return CatBoostClassifier(
        iterations=600, learning_rate=0.05, depth=6, l2_leaf_reg=3.0,
        loss_function="Logloss", eval_metric="AUC",
        random_seed=SEED, thread_count=-1, verbose=False,
    )

base_names = ["xgb", "lgbm", "cat"]
n_base = len(base_names)

# 2) Out-of-fold (OOF) predictions for meta-training
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
n_train = X_train_stack.shape[0]
meta_train = np.zeros((n_train, n_base), dtype=np.float32)

print("Generating OOF predictions for meta-learner ...")
for fold, (tr_idx, va_idx) in enumerate(skf.split(X_train_stack, y_train_smote), start=1):
    X_tr, X_va = X_train_stack[tr_idx], X_train_stack[va_idx]
    y_tr, y_va = y_train_smote[tr_idx], y_train_smote[va_idx]
    
    # Fresh instances per fold
    xgb_m = make_xgb()
    lgb_m = make_lgbm()
    cat_m = make_cat()
    
    xgb_m.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
    lgb_m.fit(X_tr, y_tr, eval_set=[(X_va, y_va)])
    cat_m.fit(X_tr, y_tr, eval_set=(X_va, y_va))
    
    meta_train[va_idx, 0] = xgb_m.predict_proba(X_va)[:, 1]
    meta_train[va_idx, 1] = lgb_m.predict_proba(X_va)[:, 1]
    meta_train[va_idx, 2] = cat_m.predict_proba(X_va)[:, 1]
    
    fold_auc = roc_auc_score(y_va, meta_train[va_idx].mean(axis=1))
    print(f" - Fold {fold}: mean(base-proba) AUC = {fold_auc:.4f}")

# 3) Train meta-learner on OOF features
meta_learner = XGBClassifier(
    n_estimators=400, learning_rate=0.05, max_depth=3,
    subsample=0.9, colsample_bytree=1.0,
    tree_method="hist", random_state=SEED, n_jobs=-1, eval_metric="auc",
)
meta_learner.fit(meta_train, y_train_smote)
print("Meta-learner trained on OOF features.")

# 4) Refit base learners on full balanced training and evaluate on held-out validation
xgb_full = make_xgb()
lgb_full = make_lgbm()
cat_full = make_cat()

xgb_full.fit(X_train_stack, y_train_smote, eval_set=[(X_valid_stack, y_valid)], verbose=False)
lgb_full.fit(X_train_stack, y_train_smote, eval_set=[(X_valid_stack, y_valid)])
cat_full.fit(X_train_stack, y_train_smote, eval_set=(X_valid_stack, y_valid))

# Base learners' validation AUCs
from sklearn.metrics import roc_auc_score
p_xgb = xgb_full.predict_proba(X_valid_stack)[:, 1]
p_lgb = lgb_full.predict_proba(X_valid_stack)[:, 1]
p_cat = cat_full.predict_proba(X_valid_stack)[:, 1]
auc_xgb = roc_auc_score(y_valid, p_xgb)
auc_lgb = roc_auc_score(y_valid, p_lgb)
auc_cat = roc_auc_score(y_valid, p_cat)
print(f"Validation AUCs — XGB: {auc_xgb:.4f} | LGBM: {auc_lgb:.4f} | CatBoost: {auc_cat:.4f}")

# Stacked validation AUC
meta_valid = np.vstack([p_xgb, p_lgb, p_cat]).T.astype(np.float32)
p_stack = meta_learner.predict_proba(meta_valid)[:, 1]
auc_stack = roc_auc_score(y_valid, p_stack)
print(f"Stacked model Validation ROC AUC: {auc_stack:.4f}")

# 5) Persist base learners, meta-learner, and stacking metadata
artifacts_dir.mkdir(parents=True, exist_ok=True)
try:
    xgb_full.get_booster().save_model(str(artifacts_dir / "stack_xgb_base.json"))
except Exception as e:
    print("Could not save XGB base as JSON; pickling instead.", e)
    joblib.dump(xgb_full, artifacts_dir / "stack_xgb_base.joblib")

try:
    lgb_full.booster_.save_model(str(artifacts_dir / "stack_lgb_base.txt"))
except Exception as e:
    print("Could not save LGBM booster; pickling instead.", e)
    joblib.dump(lgb_full, artifacts_dir / "stack_lgb_base.joblib")

try:
    cat_full.save_model(str(artifacts_dir / "stack_cat_base.cbm"))
except Exception as e:
    print("Could not save CatBoost .cbm; pickling instead.", e)
    joblib.dump(cat_full, artifacts_dir / "stack_cat_base.joblib")

try:
    meta_learner.get_booster().save_model(str(artifacts_dir / "stack_meta_xgb.json"))
except Exception as e:
    print("Could not save meta-learner booster; pickling instead.", e)
    joblib.dump(meta_learner, artifacts_dir / "stack_meta_xgb.joblib")

stack_info = {
    "phase": 5,
    "top_k_features": int(X_train_stack.shape[1]),
    "oof_folds": 5,
    "base_models": base_names,
    "metrics": {
        "auc_xgb": float(auc_xgb),
        "auc_lgb": float(auc_lgb),
        "auc_cat": float(auc_cat),
        "auc_stack": float(auc_stack),
    },
}
joblib.dump(stack_info, artifacts_dir / "stacking_info.joblib")
print("Saved stacking artifacts to:", artifacts_dir.resolve())


Generating OOF predictions for meta-learner ...




 - Fold 1: mean(base-proba) AUC = 0.9980




 - Fold 2: mean(base-proba) AUC = 0.9977




 - Fold 3: mean(base-proba) AUC = 0.9980




 - Fold 4: mean(base-proba) AUC = 0.9980




 - Fold 5: mean(base-proba) AUC = 0.9979
Meta-learner trained on OOF features.




Validation AUCs — XGB: 0.9562 | LGBM: 0.9407 | CatBoost: 0.8960
Stacked model Validation ROC AUC: 0.9577
Saved stacking artifacts to: C:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\artifacts


# Phase 6: Training and Evaluation (Sections 3.7, 3.8)
\nWe now:
- Apply 5-fold Stratified Cross-Validation during training (on the SMOTE-balanced training set) to gauge stability.
- Evaluate on the held-out (imbalanced) validation set with Accuracy, Precision, Recall, F1, ROC AUC, and Confusion Matrix.
- Optimize the decision threshold via the Precision–Recall (PR) curve to maximize F1 (typically around ~0.44 in this setup).


In [29]:
# 5-fold Stratified Cross-Validation for tuned XGBoost (top-30 features if available)
from pathlib import Path
import numpy as np
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier

# Load training arrays for CV (prefer top-30)
processed_dir = Path("data") / "processed"
artifacts_dir = processed_dir / "artifacts"
X_train_path = processed_dir / "X_train_smote_top30.npy"
if X_train_path.exists():
    X_train_cv = np.load(X_train_path)
else:
    X_train_cv = np.load(processed_dir / "X_train_smote.npy")
y_train_cv = np.load(processed_dir / "y_train_smote.npy")

# Retrieve best params if available, else basic defaults
try:
    best_params = joblib.load(artifacts_dir / "optuna_best_params.joblib")
except Exception:
    best_params = {
        "n_estimators": 500,
        "learning_rate": 0.05,
        "max_depth": 6,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
    }
best_params.update({
    "tree_method": "hist",
    "n_jobs": -1,
    "random_state": SEED if 'SEED' in globals() else 42,
    "eval_metric": "auc",
})

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED if 'SEED' in globals() else 42)
auc_scores = []
acc_scores = []
prec_scores = []
rec_scores = []
f1_scores = []

print("Running 5-fold Stratified CV for tuned XGBoost ...")
for fold, (tr_idx, va_idx) in enumerate(skf.split(X_train_cv, y_train_cv), start=1):
    X_tr, X_va = X_train_cv[tr_idx], X_train_cv[va_idx]
    y_tr, y_va = y_train_cv[tr_idx], y_train_cv[va_idx]
    model = XGBClassifier(**best_params)
    model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
    proba = model.predict_proba(X_va)[:, 1]
    preds = (proba >= 0.5).astype(int)
    auc = roc_auc_score(y_va, proba)
    acc = accuracy_score(y_va, preds)
    prec = precision_score(y_va, preds, zero_division=0)
    rec = recall_score(y_va, preds, zero_division=0)
    f1 = f1_score(y_va, preds, zero_division=0)
    auc_scores.append(auc); acc_scores.append(acc); prec_scores.append(prec); rec_scores.append(rec); f1_scores.append(f1)
    print(f" - Fold {fold}: AUC={auc:.4f} | Acc={acc:.4f} | Prec={prec:.4f} | Rec={rec:.4f} | F1={f1:.4f}")

def mean_std(arr):
    import numpy as _np
    return float(_np.mean(arr)), float(_np.std(arr))

m_auc, s_auc = mean_std(auc_scores)
m_acc, s_acc = mean_std(acc_scores)
m_prec, s_prec = mean_std(prec_scores)
m_rec, s_rec = mean_std(rec_scores)
m_f1, s_f1 = mean_std(f1_scores)

print("\nCV Summary (5-fold, tuned XGBoost, thr=0.5):")
print(f" AUC  : {m_auc:.4f} ± {s_auc:.4f}")
print(f" Acc  : {m_acc:.4f} ± {s_acc:.4f}")
print(f" Prec : {m_prec:.4f} ± {s_prec:.4f}")
print(f" Rec  : {m_rec:.4f} ± {s_rec:.4f}")
print(f" F1   : {m_f1:.4f} ± {s_f1:.4f}")


Running 5-fold Stratified CV for tuned XGBoost ...
 - Fold 1: AUC=0.9984 | Acc=0.9912 | Prec=0.9976 | Rec=0.9847 | F1=0.9911
 - Fold 2: AUC=0.9981 | Acc=0.9906 | Prec=0.9975 | Rec=0.9837 | F1=0.9906
 - Fold 3: AUC=0.9983 | Acc=0.9909 | Prec=0.9974 | Rec=0.9844 | F1=0.9909
 - Fold 4: AUC=0.9984 | Acc=0.9910 | Prec=0.9974 | Rec=0.9847 | F1=0.9910
 - Fold 5: AUC=0.9983 | Acc=0.9910 | Prec=0.9978 | Rec=0.9842 | F1=0.9909

CV Summary (5-fold, tuned XGBoost, thr=0.5):
 AUC  : 0.9983 ± 0.0001
 Acc  : 0.9910 ± 0.0002
 Prec : 0.9975 ± 0.0002
 Rec  : 0.9843 ± 0.0004
 F1   : 0.9909 ± 0.0002


In [30]:
# Evaluate stacked model on imbalanced validation; compute PR curve and optimal threshold
from pathlib import Path
import numpy as np
import json
import joblib
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, precision_recall_curve
)
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Ensure validation arrays for stacking are available
processed_dir = Path("data") / "processed"
artifacts_dir = processed_dir / "artifacts"

try:
    X_valid_stack
    y_valid
except NameError:
    # Load if not in memory
    X_valid_path = processed_dir / "X_valid_top30.npy"
    if X_valid_path.exists():
        X_valid_stack = np.load(X_valid_path)
    else:
        X_valid_stack = np.load(processed_dir / "X_valid.npy")
    y_valid = np.load(processed_dir / "y_valid.npy")

# Robust loaders for saved base learners and meta-learner
def load_xgb_model(json_path, joblib_path):
    # Try JSON booster into classifier
    try:
        clf = XGBClassifier()
        clf.load_model(str(json_path))
        return clf
    except Exception:
        pass
    # Fallback to joblib
    return joblib.load(joblib_path)

def load_lgb_model(txt_path, joblib_path):
    try:
        booster = lgb.Booster(model_file=str(txt_path))
        return booster
    except Exception:
        pass
    return joblib.load(joblib_path)

def load_cat_model(cbm_path, joblib_path):
    try:
        clf = CatBoostClassifier()
        clf.load_model(str(cbm_path))
        return clf
    except Exception:
        pass
    return joblib.load(joblib_path)

xgb_json = artifacts_dir / "stack_xgb_base.json"
xgb_job  = artifacts_dir / "stack_xgb_base.joblib"
lgb_txt  = artifacts_dir / "stack_lgb_base.txt"
lgb_job  = artifacts_dir / "stack_lgb_base.joblib"
cat_cbm  = artifacts_dir / "stack_cat_base.cbm"
cat_job  = artifacts_dir / "stack_cat_base.joblib"
meta_json = artifacts_dir / "stack_meta_xgb.json"
meta_job  = artifacts_dir / "stack_meta_xgb.joblib"

xgb_base = load_xgb_model(xgb_json, xgb_job)
lgb_base = load_lgb_model(lgb_txt, lgb_job)
cat_base = load_cat_model(cat_cbm, cat_job)
meta_learner = load_xgb_model(meta_json, meta_job)

# Predict probabilities on validation
def predict_lgb(booster_or_model, X):
    # If Booster, use booster.predict; else assume sklearn wrapper
    if isinstance(booster_or_model, lgb.Booster):
        return booster_or_model.predict(X)
    else:
        return booster_or_model.predict_proba(X)[:, 1]

p_xgb = xgb_base.predict_proba(X_valid_stack)[:, 1]
p_lgb = predict_lgb(lgb_base, X_valid_stack)
p_cat = cat_base.predict_proba(X_valid_stack)[:, 1]

# Meta input and predictions
meta_valid = np.vstack([p_xgb, p_lgb, p_cat]).T.astype(np.float32)
p_stack = meta_learner.predict_proba(meta_valid)[:, 1]

# Metrics at 0.5 threshold
def metrics_at_threshold(y_true, y_scores, thr):
    preds = (y_scores >= thr).astype(int)
    cm = confusion_matrix(y_true, preds)
    return {
        "threshold": float(thr),
        "accuracy": float(accuracy_score(y_true, preds)),
        "precision": float(precision_score(y_true, preds, zero_division=0)),
        "recall": float(recall_score(y_true, preds, zero_division=0)),
        "f1": float(f1_score(y_true, preds, zero_division=0)),
        "roc_auc": float(roc_auc_score(y_true, y_scores)),
        "confusion_matrix": cm.tolist(),
    }

metrics_05 = metrics_at_threshold(y_valid, p_stack, 0.5)
print("Validation metrics at threshold 0.5:")
for k, v in metrics_05.items():
    if k != "confusion_matrix":
        print(f" - {k}: {v}")
print(" - confusion_matrix:")
print(np.array(metrics_05["confusion_matrix"]))

# Precision-Recall curve to optimize threshold for F1
prec, rec, thr = precision_recall_curve(y_valid, p_stack)
# Align lengths: thr has len = len(prec)-1
f1_vals = np.zeros_like(thr)
for i, t in enumerate(thr):
    p = prec[i+1]  # corresponding precision
    r = rec[i+1]   # corresponding recall
    f1_vals[i] = 0.0 if (p + r) == 0 else 2 * p * r / (p + r)

best_idx = int(np.argmax(f1_vals))
best_thr = float(thr[best_idx])
print(f"\nBest F1 threshold from PR curve: {best_thr:.4f} (F1={float(f1_vals[best_idx]):.4f})")

metrics_best = metrics_at_threshold(y_valid, p_stack, best_thr)
print("Validation metrics at best threshold:")
for k, v in metrics_best.items():
    if k != "confusion_matrix":
        print(f" - {k}: {v}")
print(" - confusion_matrix:")
print(np.array(metrics_best["confusion_matrix"]))

# Persist metrics and threshold
artifacts_dir.mkdir(parents=True, exist_ok=True)
with open(artifacts_dir / "phase6_metrics.json", "w", encoding="utf-8") as f:
    json.dump({
        "threshold_0_5": metrics_05,
        "threshold_best": metrics_best,
        "best_threshold": best_thr,
    }, f, indent=2)
joblib.dump(best_thr, artifacts_dir / "optimal_threshold.joblib")
print("Saved Phase 6 metrics and optimal threshold to:", artifacts_dir.resolve())

Validation metrics at threshold 0.5:
 - threshold: 0.5
 - accuracy: 0.9841585667355301
 - precision: 0.8982394366197183
 - recall: 0.617227195741592
 - f1: 0.7316793345762226
 - roc_auc: 0.9576836314499463
 - confusion_matrix:
[[113686    289]
 [  1582   2551]]

Best F1 threshold from PR curve: 0.3562 (F1=0.7425)
Validation metrics at best threshold:
 - threshold: 0.35618826746940613
 - accuracy: 0.9837182917329902
 - precision: 0.8318318318318318
 - recall: 0.6702153399467699
 - f1: 0.742328822189468
 - roc_auc: 0.9576836314499463
 - confusion_matrix:
[[113415    560]
 [  1363   2770]]
Saved Phase 6 metrics and optimal threshold to: C:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\artifacts


In [None]:
print("Saved Phase 6 metrics and optimal threshold to:")

Saved Phase 6 metrics and optimal threshold to:


# Phase 7: Explainable AI (XAI) — Section 3.9


In this section, we apply model-agnostic and model-specific explainability techniques to our fraud detection models using the validation set and saved artifacts.


- SHAP (Global + Local):


  - Global: Feature importance via mean |SHAP|, plus beeswarm and bar plots.


  - Local: Force plot for an individual prediction showing how features push toward fraud vs non-fraud.


- LIME (Local):


  - Local surrogate explanation highlighting which features pushed a specific prediction toward each class.


- Permutation Feature Importance (PFI) (Global):


  - Global ranking by measuring drop in ROC AUC when each feature is randomly shuffled.


- Partial Dependence Plots (PDPs) (Global):


  - Global view of the relationship between key features (e.g., V12, C2, C14) and predicted fraud risk while holding others constant.


Outputs will be saved under `data/processed/artifacts/xai/` for reproducibility and reporting.

In [31]:
# Environment setup: install and import XAI requirements
%pip install -q --disable-pip-version-check shap lime scikit-learn matplotlib seaborn xgboost catboost lightgbm
import os, json, warnings, math, textwrap
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import joblib
import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from lime.lime_tabular import LimeTabularExplainer
sns.set(style="whitegrid", context="notebook")
plt.rcParams["figure.dpi"] = 120

Note: you may need to restart the kernel to use updated packages.


In [47]:
# Load validation arrays, feature names, and best available model
from typing import Optional, Tuple, List
PROC_DIR = os.path.join("data", "processed")
ART_DIR = os.path.join(PROC_DIR, "artifacts")
XAI_DIR = os.path.join(ART_DIR, "xai")
os.makedirs(XAI_DIR, exist_ok=True)

def _safe_listdir(p: str) -> List[str]:
    return sorted(os.listdir(p)) if os.path.exists(p) else []

def _coerce_to_float_array(X: np.ndarray) -> np.ndarray:
    """
    Ensure X is a float array. If it's object/string (e.g., '[4.8190865E-1]'), strip brackets and cast to float.
    """
    arr = np.asarray(X)
    if np.issubdtype(arr.dtype, np.number):
        return arr.astype(np.float32, copy=False)
    try:
        s = arr.astype(str)
        s = np.char.strip(s)
        s = np.char.strip(s, '[]')
        s = np.char.replace(s, ",", "")
        flat = s.ravel().astype(np.float64)
        return flat.reshape(arr.shape).astype(np.float32)
    except Exception as e:
        raise ValueError(f"Could not coerce X to float array: {e}")

print("PROC_DIR:", os.path.abspath(PROC_DIR))
print("ART_DIR:", os.path.abspath(ART_DIR))
print("XAI_DIR:", os.path.abspath(XAI_DIR))
print("processed files:", _safe_listdir(PROC_DIR))
print("artifact files:", _safe_listdir(ART_DIR))

# --- Load validation data (initially the full set) ---
X_valid_path = os.path.join(PROC_DIR, "X_valid.npy")
y_valid_path = os.path.join(PROC_DIR, "y_valid.npy")
assert os.path.exists(X_valid_path) and os.path.exists(y_valid_path), "Validation arrays not found under data/processed/."
X_valid = np.load(X_valid_path, allow_pickle=False)
y_valid = np.load(y_valid_path, allow_pickle=False).ravel()
print("Initial X_valid shape:", X_valid.shape, "y_valid shape:", y_valid.shape)

# --- Feature names ---
feature_names: Optional[List[str]] = None
sel_feat_path = os.path.join(ART_DIR, "selected_features.joblib")
schema_path = os.path.join(ART_DIR, "schema.joblib")
try:
    if os.path.exists(sel_feat_path):
        feature_names = joblib.load(sel_feat_path)
        print(f"Loaded {len(feature_names)} selected feature names from artifacts.")
    elif os.path.exists(schema_path):
        schema = joblib.load(schema_path)
        if isinstance(schema, dict) and "features" in schema:
            feature_names = list(schema["features"])
            print(f"Loaded {len(feature_names)} feature names from schema.")
except Exception as e:
    print("Feature name loading warning:", e)

# --- Load best available model (XGBoost -> LightGBM -> CatBoost) ---
def load_best_model() -> Tuple[object, str]:
    # Try XGBoost JSON models in a preferred order
    xgb_candidates = [
        "xgb_optuna_model.json",
        "xgb_baseline_model.json",
        "stack_xgb_base.json",
        "stack_meta_xgb.json",
    ]
    for fname in xgb_candidates:
        fpath = os.path.join(ART_DIR, fname)
        if os.path.exists(fpath):
            try:
                clf = xgb.XGBClassifier(n_jobs=-1, tree_method="hist", enable_categorical=False)
                clf.load_model(fpath)
                return clf, "xgb"
            except Exception as e:
                print(f"Tried XGBoost model {fname} but failed:", e)

    # Try LightGBM Booster
    lgb_path = os.path.join(ART_DIR, "stack_lgb_base.txt")
    if os.path.exists(lgb_path):
        try:
            booster = lgb.Booster(model_file=lgb_path)
            return booster, "lightgbm"
        except Exception as e:
            print("Tried LightGBM model but failed:", e)

    # Try CatBoost
    cat_path = os.path.join(ART_DIR, "stack_cat_base.cbm")
    if os.path.exists(cat_path):
        try:
            cat = CatBoostClassifier()
            cat.load_model(cat_path)
            return cat, "catboost"
        except Exception as e:
            print("Tried CatBoost model but failed:", e)

    raise FileNotFoundError("No supported model artifact found in artifacts directory.")

model, model_type = load_best_model()
print("Loaded model type:", model_type)

# --- Determine expected number of features for the model ---
expected_n_features: Optional[int] = None
try:
    if model_type == "xgb":
        expected_n_features = int(model.get_booster().num_features())
        print("Model expects", expected_n_features, "features (from XGBoost booster).")
    elif model_type == "lightgbm":
        expected_n_features = int(model.num_feature())
        print("Model expects", expected_n_features, "features (from LightGBM booster).")
except Exception as e:
    print("Could not introspect model feature count:", e)

# If CatBoost or introspection failed, fall back to selected_features length if available
if expected_n_features is None and feature_names is not None:
    expected_n_features = len(feature_names)
    print("Assuming model expects", expected_n_features, "features (from selected_features).")

# --- Align validation features to the model ---
if expected_n_features is not None and X_valid.shape[1] != expected_n_features:
    # Prefer a pre-saved top-K validation array if present (e.g., X_valid_top30.npy)
    cand = os.path.join(PROC_DIR, f"X_valid_top{expected_n_features}.npy")
    if os.path.exists(cand):
        X_valid = np.load(cand, allow_pickle=False)
        print(f"Loaded top-K validation array to match model: {cand} -> shape {X_valid.shape}")
        # Ensure feature_names length matches the array
        if feature_names is None or len(feature_names) != expected_n_features:
            try:
                if os.path.exists(sel_feat_path):
                    feature_names = joblib.load(sel_feat_path)
                    if len(feature_names) != expected_n_features:
                        raise ValueError("selected_features length mismatch")
                    print(f"Aligned feature names from artifacts: {len(feature_names)}")
                else:
                    raise FileNotFoundError("selected_features.joblib not found")
            except Exception as e:
                feature_names = [f"f{i}" for i in range(expected_n_features)]
                print("Falling back to generic feature names:", e)
    else:
        print(
            f"Warning: model expects {expected_n_features} features but X_valid has {X_valid.shape[1]} and no matching top-K file found.",
        )

# --- Coerce validation to numeric just in case ---
X_valid = _coerce_to_float_array(X_valid)

# Finalize feature names if still None/mismatched
if feature_names is None or len(feature_names) != X_valid.shape[1]:
    feature_names = [f"f{i}" for i in range(X_valid.shape[1])]
    print("Using generic feature names of length", len(feature_names))

X_valid_df = pd.DataFrame(X_valid, columns=feature_names)

# --- Decision threshold ---
threshold_path = os.path.join(ART_DIR, "optimal_threshold.joblib")
threshold = 0.5
if os.path.exists(threshold_path):
    try:
        threshold = float(joblib.load(threshold_path))
    except Exception as e:
        print("Warning: could not load optimal threshold, defaulting to 0.5:", e)
print("Using decision threshold:", threshold)

# --- Unified wrapper compatible with sklearn utilities ---
class ModelWrapper:
    def __init__(self, model, model_type: str = "xgb", threshold: float = 0.5):
        self.model = model
        self.model_type = model_type
        self.threshold = float(threshold)
        self.classes_ = np.array([0, 1])
        self.n_features_in_ = X_valid.shape[1]
        # Mark as classifier so sklearn uses classification scoring and response methods.
        self._estimator_type = "classifier"

    def fit(self, X, y=None):
        # No-op fit to be compatible with sklearn utilities that require a fit method.
        self.is_fitted_ = True
        return self

    def predict_proba(self, X):
        # Native predict_proba path
        if hasattr(self.model, "predict_proba"):
            proba = self.model.predict_proba(X)
            if proba.ndim == 1 or (proba.ndim == 2 and proba.shape[1] == 1):
                proba = np.column_stack([1 - proba.ravel(), proba.ravel()])
            return proba
        # LightGBM Booster returns probabilities with .predict
        if self.model_type == "lightgbm" and hasattr(self.model, "predict"):
            p1 = np.asarray(self.model.predict(X))
            if p1.ndim == 1:
                return np.column_stack([1 - p1, p1])
            # Multiclass fallback
            return p1
        # Fallback to predict labels
        pred = getattr(self.model, "predict", None)
        if pred is not None:
            labels = np.asarray(pred(X)).astype(int).ravel()
            return np.column_stack([1 - labels, labels])
        raise AttributeError("Model does not support predict_proba or predict.")

    def predict(self, X):
        p1 = self.predict_proba(X)[:, 1]
        return (p1 >= self.threshold).astype(int)

wrapped = ModelWrapper(model, model_type=model_type, threshold=threshold)
proba_valid = wrapped.predict_proba(X_valid)[:, 1]
print("Validation AUC (sanity):", round(roc_auc_score(y_valid, proba_valid), 6))

# Numeric aliases used by downstream XAI steps
Xn = X_valid.astype(np.float32, copy=False)
Xn_df = pd.DataFrame(Xn, columns=feature_names)

PROC_DIR: c:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed
ART_DIR: c:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\artifacts
XAI_DIR: c:\Users\asus\OneDrive\Desktop\ML PROJECT 5sem\data\processed\artifacts\xai
processed files: ['X_train_smote.npy', 'X_train_smote_top30.npy', 'X_valid.npy', 'X_valid_top30.npy', 'artifacts', 'y_train_smote.npy', 'y_valid.npy']
artifact files: ['optimal_threshold.joblib', 'optuna_best_params.joblib', 'optuna_study.joblib', 'optuna_trials.csv', 'phase6_metrics.json', 'preprocessor.joblib', 'schema.joblib', 'selected_features.joblib', 'shap_importance.csv', 'stack_cat_base.cbm', 'stack_lgb_base.txt', 'stack_meta_xgb.json', 'stack_xgb_base.json', 'stacking_info.joblib', 'xai', 'xgb_baseline_model.json', 'xgb_optuna_model.json']
Initial X_valid shape: (118108, 432) y_valid shape: (118108,)
Loaded 4 selected feature names from artifacts.
Initial X_valid shape: (118108, 432) y_valid shape: (118108,)
Loaded 4 selected feature names fr

## SHAP: Global and Local Explanations
We compute global feature importance via mean |SHAP| and visualize with beeswarm and bar plots. Then we generate a local force plot for a representative validation instance.

In [41]:
# SHAP global (summary) and local (force) explanations
def _compute_shap_values(model, model_type, X, feature_names):
    """Return (base_values, shap_values) for class 1 in shape (n_samples, n_features).
    For XGBoost, prefer fast pred_contribs. Otherwise try TreeExplainer then KernelExplainer.
    """
    import types
    # Fast path for XGBoost using pred_contribs
    if model_type == "xgb":
        try:
            booster = model.get_booster() if hasattr(model, "get_booster") else model
            dmat = xgb.DMatrix(X, feature_names=feature_names)
            contribs = booster.predict(dmat, pred_contribs=True)  # (n_samples, n_features+1)
            base_vals = contribs[:, -1].astype(float)
            shap_vals = contribs[:, :-1].astype(float)
            # Construct a minimal explainer-like object with expected_value for compatibility
            expl = types.SimpleNamespace(expected_value=float(np.mean(base_vals)))
            return expl, base_vals, shap_vals
        except Exception as e:
            print("XGB pred_contribs failed, trying TreeExplainer -> KernelExplainer:", e)
    # TreeExplainer for tree-based models
    try:
        if model_type in ("xgb", "catboost") and hasattr(shap, "TreeExplainer"):
            explainer = shap.TreeExplainer(model)
            shap_out = explainer.shap_values(X) if hasattr(explainer, "shap_values") else explainer(X)
            # Normalize to class-1 SHAP values
            if isinstance(shap_out, list) and len(shap_out) == 2:
                shap_vals = np.asarray(shap_out[1])
            elif hasattr(shap_out, "values") and shap_out.values is not None:
                vals = np.asarray(shap_out.values)
                shap_vals = vals[:, :, 1] if vals.ndim == 3 and vals.shape[-1] >= 2 else vals
            else:
                shap_vals = np.asarray(shap_out)
            base_vals = getattr(explainer, "expected_value", getattr(explainer, "expected_values", 0.0))
            if isinstance(base_vals, (list, np.ndarray)) and np.size(base_vals) > 1:
                base_vals = np.asarray(base_vals).ravel()[-1]
            return explainer, np.asarray(base_vals), shap_vals
    except Exception as e:
        print("TreeExplainer failed, falling back to KernelExplainer:", e)
    # KernelExplainer fallback (use small background and sample X to avoid slowness)
    bg = shap.sample(X, min(200, X.shape[0]), random_state=42) if hasattr(shap, "sample") else X[: min(200, X.shape[0])]
    ke = shap.KernelExplainer(lambda Z: wrapped.predict_proba(Z)[:, 1], bg)
    # Limit evaluated rows for global importance to keep runtime reasonable
    subsz = min(5000, X.shape[0])
    X_sub = X[:subsz]
    shap_vals_sub = ke.shap_values(X_sub, nsamples=100)
    # Broadcast base value as scalar if provided as array
    base_val = getattr(ke, "expected_value", getattr(ke, "expected_values", 0.0))
    if isinstance(base_val, (list, np.ndarray)):
        base_val = float(np.asarray(base_val).ravel()[0])
    return ke, base_val, np.asarray(shap_vals_sub)

# Compute SHAP
explainer, base_vals, shap_vals_c1 = _compute_shap_values(model, model_type, Xn, feature_names)
# If KernelExplainer returned subset, align feature importance computation
if shap_vals_c1.shape[0] != Xn.shape[0]:
    X_for_imp = Xn[:shap_vals_c1.shape[0]]
    Xdf_for_imp = Xn_df.iloc[:shap_vals_c1.shape[0]]
else:
    X_for_imp = Xn
    Xdf_for_imp = Xn_df

# Global importance
mean_abs = np.abs(shap_vals_c1).mean(axis=0)
imp_df = pd.DataFrame({"feature": feature_names, "mean_abs_shap": mean_abs}).sort_values("mean_abs_shap", ascending=False)
imp_csv_path = os.path.join(XAI_DIR, "shap_importance_xai.csv")
imp_df.to_csv(imp_csv_path, index=False)
print(f"Saved global SHAP importance -> {imp_csv_path}")

# SHAP summary plots (beeswarm + bar)
try:
    plt.figure(figsize=(9, 6))
    shap.summary_plot(shap_vals_c1, Xdf_for_imp, show=False, plot_type="dot")
    plt.tight_layout()
    out1 = os.path.join(XAI_DIR, "shap_summary_beeswarm.png")
    plt.savefig(out1, dpi=150)
    plt.close()
    print("Saved:", out1)
except Exception as e:
    print("Beeswarm plot failed:", e)

try:
    plt.figure(figsize=(9, 6))
    shap.summary_plot(shap_vals_c1, Xdf_for_imp, show=False, plot_type="bar")
    plt.tight_layout()
    out2 = os.path.join(XAI_DIR, "shap_summary_bar.png")
    plt.savefig(out2, dpi=150)
    plt.close()
    print("Saved:", out2)
except Exception as e:
    print("Bar plot failed:", e)

# Choose a representative instance (prefer a predicted positive)
p1 = wrapped.predict_proba(Xn)[:, 1]
pos_idx = np.where(p1 >= threshold)[0]
idx = int(pos_idx[0]) if len(pos_idx) else int(np.argmax(p1))
print("Selected instance index for local explanations:", idx, "with p1=", float(p1[idx]))

# Local force plot
force_html = os.path.join(XAI_DIR, "shap_force_local.html")
try:
    base_val_scalar = float(np.mean(base_vals)) if isinstance(base_vals, (np.ndarray, list)) else float(base_vals)
    # Try legacy API first
    if hasattr(shap, "force_plot"):
        fp = shap.force_plot(base_val_scalar, shap_vals_c1[idx], Xn_df.iloc[idx], matplotlib=False)
        if hasattr(shap, "save_html"):
            shap.save_html(force_html, fp)
        else:
            from IPython.display import HTML
            HTML(fp.data if hasattr(fp, "data") else str(fp))
        print("Saved local SHAP force plot:", force_html)
    else:
        # New API via Explanation object
        from shap import Explanation
        expl = Explanation(values=shap_vals_c1[idx], base_values=base_val_scalar, data=Xn_df.iloc[idx].values, feature_names=feature_names)
        obj = shap.plots.force(expl)
        shap.save_html(force_html, obj)
        print("Saved local SHAP force plot:", force_html)
except Exception as e:
    print("Local force plot generation failed:", e)

# Save per-feature local contributions for the chosen instance
try:
    local_df = (
        pd.DataFrame({"feature": feature_names, "shap_value": shap_vals_c1[idx], "value": Xn_df.iloc[idx].values})
        .sort_values("shap_value", key=np.abs, ascending=False)
    )
    local_csv = os.path.join(XAI_DIR, "shap_local_contributions.csv")
    local_df.to_csv(local_csv, index=False)
    print("Saved:", local_csv)
except Exception as e:
    print("Saving local contributions failed:", e)

TreeExplainer failed, falling back to KernelExplainer: could not convert string to float: '[4.8190865E-1]'


100%|██████████| 118108/118108 [2:04:00<00:00, 15.87it/s] 


Saved global SHAP importance -> data\processed\artifacts\xai\shap_importance_xai.csv
Saved: data\processed\artifacts\xai\shap_summary_beeswarm.png
Saved: data\processed\artifacts\xai\shap_summary_beeswarm.png
Saved: data\processed\artifacts\xai\shap_summary_bar.png
Saved: data\processed\artifacts\xai\shap_summary_bar.png
Selected instance index for local explanations: 58 with p1= 0.9513452053070068
Saved local SHAP force plot: data\processed\artifacts\xai\shap_force_local.html
Saved: data\processed\artifacts\xai\shap_local_contributions.csv
Selected instance index for local explanations: 58 with p1= 0.9513452053070068
Saved local SHAP force plot: data\processed\artifacts\xai\shap_force_local.html
Saved: data\processed\artifacts\xai\shap_local_contributions.csv


## LIME: Local Explanation
We use LIME to fit a simple local surrogate model around a chosen instance and visualize the feature contributions toward fraud vs. non-fraud.

In [42]:
# LIME local explanation for the same selected instance
lime_html = os.path.join(XAI_DIR, "lime_local_explanation.html")
try:
    # Treat all features as continuous for simplicity; if you have categoricals, pass categorical_features indices.
    lime_explainer = LimeTabularExplainer(
        training_data=Xn,
        mode="classification",
        feature_names=feature_names,
        class_names=["non-fraud", "fraud"],
        discretize_continuous=True,
        random_state=42,
    )
    lime_exp = lime_explainer.explain_instance(
        Xn[idx],
        predict_fn=lambda Z: wrapped.predict_proba(Z),
        num_features=min(10, Xn.shape[1]),
    )
    with open(lime_html, "w", encoding="utf-8") as f:
        f.write(lime_exp.as_html())
    print("Saved LIME explanation ->", lime_html)
    print("Top contributions:")
    print(lime_exp.as_list())
except Exception as e:
    print("LIME explanation failed:", e)

LIME explanation failed: LimeTabularExplainer.explain_instance() got an unexpected keyword argument 'classifier_fn'


## Permutation Feature Importance (PFI)
We compute global importance by measuring the drop in ROC AUC when each feature is permuted on the validation set.

In [48]:
# Compute PFI and save outputs
# permutation_importance requires an estimator with .fit; our wrapper provides a no-op fit.
wrapped.fit(Xn, y_valid)
pfi = permutation_importance(
    estimator=wrapped,
    X=Xn,
    y=y_valid,
    scoring="roc_auc",
    n_repeats=5,
    random_state=42,
    n_jobs=-1,
    )
    # estimator,
    # X,
    # y,
    # *,
    # scoring=None,
    # n_repeats=5,
    # n_jobs=None,
    # random_state=None,
    # sample_weight=None,
pfi_df = pd.DataFrame({
    "feature": feature_names,
    "importance_mean": pfi.importances_mean,
    "importance_std": pfi.importances_std,
}).sort_values("importance_mean", ascending=False)
pfi_csv = os.path.join(XAI_DIR, "pfi_importance.csv")
pfi_df.to_csv(pfi_csv, index=False)
print("Saved:", pfi_csv)

# Bar plot
plt.figure(figsize=(9, 6))
top = pfi_df.head(30)
sns.barplot(x="importance_mean", y="feature", data=top, orient="h", palette="viridis")
plt.xlabel("Permutation Importance (mean Δ AUC)")
plt.ylabel("Feature")
plt.tight_layout()
pfi_png = os.path.join(XAI_DIR, "pfi_bar_top30.png")
plt.savefig(pfi_png, dpi=150)
plt.close()
print("Saved:", pfi_png)

Saved: data\processed\artifacts\xai\pfi_importance.csv
Saved: data\processed\artifacts\xai\pfi_bar_top30.png
Saved: data\processed\artifacts\xai\pfi_bar_top30.png


## Partial Dependence Plots (PDPs)
We visualize the marginal effect of key features on the predicted fraud probability (`predict_proba`).

In [49]:
# Generate PDPs for key features
# Prefer named features if available; else use top SHAP features
pdp_candidates = [f for f in ["V12", "C2", "C14"] if f in feature_names]
if len(pdp_candidates) == 0:
    try:
        imp_fallback = pd.read_csv(os.path.join(XAI_DIR, "shap_importance_xai.csv"))
        pdp_candidates = imp_fallback.head(min(3, len(imp_fallback)))['feature'].tolist()
    except Exception:
        pdp_candidates = feature_names[: min(3, len(feature_names))]
print("PDP features:", pdp_candidates)

# Ensure estimator exposes fit (no-op) and predict_proba
wrapped.fit(Xn, y_valid)
fig, ax = plt.subplots(figsize=(12, 4 * max(1, len(pdp_candidates))))
PartialDependenceDisplay.from_estimator(
    estimator=wrapped,
    X=Xn_df,
    features=pdp_candidates,
    kind="average",
    response_method="predict_proba",
    ax=ax,
    )
plt.tight_layout()
pdp_png = os.path.join(XAI_DIR, "pdp_plots.png")
plt.savefig(pdp_png, dpi=150)
plt.close()
print("Saved:", pdp_png)

PDP features: ['f25', 'f12', 'f4']
Saved: data\processed\artifacts\xai\pdp_plots.png
Saved: data\processed\artifacts\xai\pdp_plots.png
