In [1]:
# Install required packages (Colab)
!pip install xgboost shap scikit-learn pandas numpy matplotlib seaborn joblib reportlab --quiet


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/2.0 MB[0m [31m19.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/2.0 MB[0m [31m37.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
import json
import joblib
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
import shap

# One seed shared by the stdlib and NumPy global random generators
RND = 42
random.seed(RND)
np.random.seed(RND)

# Folder that receives the files written by later cells (created if missing)
ARTIFACTS_DIR = "artifacts_project3"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# Plot styling
sns.set(style="whitegrid")


In [3]:
# --- Size and time span of the synthetic dataset ---
N_TRANSACTIONS = 80_000
N_CUSTOMERS = 5_000   # far fewer customers than transactions, so each customer has several
start_date = datetime(2023, 1, 1)
end_date = datetime(2025, 12, 5)

# --- Category vocabularies used by the generators below ---
uk_countries = ["GB"]  # primary residence country
eu_countries = [
    "FR", "DE", "ES", "IT",
    "NL", "SE", "BE", "IE",
]
high_risk_countries = [  # synthetic "high risk" list
    "RU", "IR", "KP",
    "SY", "NG", "VE",
]
channels = [
    "ATM",
    "POS",
    "ONLINE",
    "BRANCH",
    "MOBILE",
]
merchant_categories = [
    "Retail",
    "Gambling",
    "Trading",
    "CryptoExchange",
    "Utilities",
    "Travel",
    "Salary",
    "P2P",
]
customer_kyc_categories = ["Low", "Medium", "High"]

# Build the synthetic customer base: one profile dict per customer, then a DataFrame
residence_options = uk_countries + eu_countries
# Half of the probability mass on the single UK entry, the other half spread evenly over the EU list
residence_probs = [0.5] + [0.5/len(eu_countries)]*len(eu_countries)


def _new_customer(seq):
    """Draw one customer profile; ``seq`` is the 1-based customer number.

    The random draws are made in a fixed order (account age, inflow, txn rate,
    KYC risk, residence, prior flag) because they all consume NumPy's global RNG.
    """
    account_age = int(np.random.exponential(scale=1200))  # account age in days
    inflow = round(max(200, np.random.normal(2500, 1500)), 2)  # floored at 200
    txn_rate = max(1, int(np.random.poisson(15)))  # at least one txn per month
    kyc = np.random.choice(customer_kyc_categories, p=[0.7,0.25,0.05])
    home = np.random.choice(residence_options, p=residence_probs)
    flagged = np.random.choice([0,1], p=[0.95,0.05])
    return {
        "customer_id": f"CUST_{seq:05d}",
        "account_open_days": account_age,
        "avg_monthly_inflow": inflow,
        "monthly_txn_avg": txn_rate,
        "kyc_risk": kyc,
        "residence": home,
        "has_prior_flags": flagged,
    }


customers = [_new_customer(seq) for seq in range(1, N_CUSTOMERS + 1)]
customers_df = pd.DataFrame(customers)

# Transaction generator
# Values that do not change between iterations are computed once, outside the loop.
days_range = (end_date - start_date).days
channel_probs = [0.15,0.2,0.35,0.1,0.2]
merchant_probs = [0.4,0.02,0.03,0.01,0.2,0.05,0.15,0.14]
foreign_countries = eu_countries + high_risk_countries
# 90% of foreign destinations are EU, 10% are on the synthetic high-risk list
foreign_probs = [0.9/len(eu_countries)]*len(eu_countries)+[0.1/len(high_risk_countries)]*len(high_risk_countries)

rows = []
for t in range(N_TRANSACTIONS):
    # Pick the customer for this transaction.
    # NOTE(review): calling DataFrame.sample once per transaction is slow; it is kept as-is
    # because swapping the sampling call would presumably change the random stream and
    # therefore the generated dataset — confirm before optimising.
    cust = customers_df.sample(1, random_state=None).iloc[0]
    customer_id = cust['customer_id']
    # timestamp: Beta(2, 1.8)-distributed day offset within the date range plus a uniform second of the day
    ts = start_date + timedelta(days=int(np.random.beta(2,1.8)*days_range),
                                seconds=int(np.random.uniform(0,86400)))
    # amount lognormal, scaled down by 10 and floored at 1
    amount = round(max(1, np.random.lognormal(mean=6.0, sigma=1.5)/10), 2)
    # channel and merchant category
    channel = np.random.choice(channels, p=channel_probs)
    merchant = np.random.choice(merchant_categories, p=merchant_probs)
    # By default the transaction happens in the customer's residence country.
    # With a flat 5% probability (independent of merchant category) a foreign country is drawn.
    is_international = 0
    country = cust['residence']
    if np.random.rand() < 0.05:
        country = np.random.choice(foreign_countries, p=foreign_probs)
        # the draw can land on the customer's own (EU) residence; that is not international
        is_international = 1 if country != cust['residence'] else 0
    # No suspicious pattern is injected here; labels are assigned in a later cell.
    rows.append({
        "transaction_id": f"TX_{t+1:07d}",
        "customer_id": customer_id,
        "timestamp": ts,
        "amount": amount,
        "channel": channel,
        "merchant_category": merchant,
        "country": country,
        "is_international": is_international,
        "kyc_risk": cust['kyc_risk'],
        "account_open_days": cust['account_open_days'],
        "avg_monthly_inflow": cust['avg_monthly_inflow'],
        "has_prior_flags": cust['has_prior_flags']
    })

# Chronological order with a fresh 0..n-1 index (built without in-place mutation)
df = pd.DataFrame(rows).sort_values("timestamp").reset_index(drop=True)
print("Generated transactions:", df.shape)
df.head()


Generated transactions: (80000, 12)


Unnamed: 0,transaction_id,customer_id,timestamp,amount,channel,merchant_category,country,is_international,kyc_risk,account_open_days,avg_monthly_inflow,has_prior_flags
0,TX_0075493,CUST_02790,2023-01-01 16:39:09,1.0,MOBILE,Retail,GB,0,Low,751,2561.95,0
1,TX_0073265,CUST_03383,2023-01-02 04:51:47,10.96,ONLINE,Retail,DE,0,Low,935,2932.62,0
2,TX_0055131,CUST_00966,2023-01-03 15:04:48,3.8,ONLINE,Travel,GB,0,Medium,2368,3204.37,0
3,TX_0054876,CUST_01574,2023-01-04 17:25:28,47.21,ONLINE,CryptoExchange,GB,0,Medium,101,200.0,0
4,TX_0068517,CUST_01706,2023-01-04 23:34:28,11.42,ATM,Travel,FR,0,Low,437,4160.98,0


In [4]:
# Labelling starts from a clean slate: every transaction is 'Legit' until a rule below overrides it
df['label'] = 'Legit'

# 1) Structuring (smurfing)
# Deposit proxy: the channel is BRANCH or ATM, or the merchant category is Salary
via_cash_channel = df['channel'].isin(['BRANCH','ATM'])
is_salary_txn = df['merchant_category']=='Salary'
df['is_deposit_proxy'] = via_cash_channel | is_salary_txn

# A customer with at least 5 deposit-proxy transactions below the threshold is selected with
# probability 0.15; 5 of those transactions (chosen at random) are then labelled.
structuring_threshold = 950  # GBP threshold
for _, cust_txns in df.groupby('customer_id'):
    below_threshold = (cust_txns['is_deposit_proxy']) & (cust_txns['amount'] < structuring_threshold)
    small_deposits = cust_txns[below_threshold]
    if len(small_deposits) < 5:
        continue
    # the random draw happens only for customers that passed the count check
    if np.random.rand() >= 0.15:
        continue
    idxs = small_deposits.sample(min(5, len(small_deposits))).index
    df.loc[idxs, 'label'] = 'Structuring'

# 2) High-velocity: many txns by the same customer on the same calendar day
df['date'] = df['timestamp'].dt.date
txn_counts = df.groupby(['customer_id','date']).size().reset_index(name='daily_count')
# Drop a daily_count column left over from a previous run of this cell; otherwise the merge
# would produce daily_count_x / daily_count_y and later lookups of 'daily_count' would fail.
df = df.drop(columns=['daily_count'], errors='ignore').merge(txn_counts, on=['customer_id','date'], how='left')

# Mark high-velocity: rows whose daily_count is at least 20, each kept with probability 0.6
# (probabilistic to avoid overlabeling).
# NOTE: in the recorded run no 'HighVelocity' label appears in the label distribution,
# i.e. no customer reached 20 transactions on a single day with the current parameters.
high_velocity_idx = df.index[df['daily_count'] >= 20]
# One uniform draw per candidate row, in row order
keep_mask = np.random.rand(len(high_velocity_idx)) < 0.6
df.loc[high_velocity_idx[keep_mask], 'label'] = 'HighVelocity'

# 3) International risky transfers
# A transaction is labelled when all three hold: it is international, its country is on the
# high-risk list, and its amount is above the 60th percentile of all amounts.
amount_cut = df['amount'].quantile(0.6)
is_cross_border = df['is_international']==1
in_high_risk_country = df['country'].isin(high_risk_countries)
is_large_amount = df['amount'] > amount_cut
intl_risky_idx = df.index[is_cross_border & in_high_risk_country & is_large_amount]
df.loc[intl_risky_idx, 'label'] = 'InternationalRisky'

# 4) Round-tripping (simulated)
# The data has no counterparty IDs or flow direction, so a round trip is approximated as two
# consecutive transactions of one customer whose amounts differ by less than 5% of the first
# amount and whose time gap, truncated to whole hours, is at most 72. Each matching pair is
# labelled with probability 0.5.
for _, cust_txns in df.groupby('customer_id'):
    ordered = cust_txns.sort_values('timestamp')
    amounts = ordered['amount'].values
    times = ordered['timestamp'].values
    for pos in range(len(ordered) - 1):
        amount_gap = abs(amounts[pos] - amounts[pos+1])
        if not amount_gap < amounts[pos]*0.05:
            continue
        hours_apart = (times[pos+1] - times[pos]).astype('timedelta64[h]').astype(int)
        if hours_apart > 72:
            continue
        # the random draw happens only for pairs that passed both checks
        if np.random.rand() >= 0.5:
            continue
        df.loc[ordered.index[[pos, pos+1]], 'label'] = 'RoundTripping'

# Inspect how many transactions ended up with each label
label_counts = df['label'].value_counts()
label_counts


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Legit,77237
Structuring,2511
InternationalRisky,162
RoundTripping,90


In [5]:
print("Label distribution (sample):")
display(df['label'].value_counts(normalize=False))
# Only the last expression of a cell is rendered automatically; this preview sits in the
# middle of the cell, so it has to be passed to display() to be shown at all.
display(df[['transaction_id','customer_id','timestamp','amount','channel','merchant_category','country','label']].head(20))
# Save sample CSV (capped at the number of available rows so a smaller dataset does not raise)
df.sample(min(5000, len(df)), random_state=RND).to_csv(os.path.join(ARTIFACTS_DIR, "transactions_sample_5k.csv"), index=False)


Label distribution (sample):


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Legit,77237
Structuring,2511
InternationalRisky,162
RoundTripping,90


In [6]:
# Feature engineering
# Transaction-level features: raw and log1p amount, hour of day, account age and
# 0/1 indicators for channel / salary. (daily_count is NOT part of the model features.)
df['hour'] = df['timestamp'].dt.hour
df['days_since_open'] = df['account_open_days']
channel_flags = {
    'is_online': 'ONLINE',
    'is_mobile': 'MOBILE',
    'is_pos': 'POS',
    'is_branch': 'BRANCH',
    'is_atm': 'ATM',
}
for flag_col, channel_name in channel_flags.items():
    df[flag_col] = (df['channel']==channel_name).astype(int)
df['is_salary'] = (df['merchant_category']=='Salary').astype(int)
df['amount_log'] = np.log1p(df['amount'])

# customer-level aggregates: total_txns, mean_amount, total amount, deposit count, prior flag
# NOTE(review): these aggregates are computed over every row, i.e. before the train/test
# split, so test-set transactions contribute to features seen in training — confirm this
# is acceptable for the evaluation.
cust_agg_cols = ['cust_txn_count','cust_mean_amount','cust_sum_amount','cust_deposit_count','cust_has_prior_flag']
cust_agg = (
    df.groupby('customer_id')
      .agg(
          cust_txn_count=('amount', 'count'),
          cust_mean_amount=('amount', 'mean'),
          cust_sum_amount=('amount', 'sum'),
          cust_deposit_count=('is_deposit_proxy', 'sum'),
          cust_has_prior_flag=('has_prior_flags', 'max'),
      )
      .reset_index()
)

# Drop aggregate columns left by a previous run of this cell so that re-running does not
# create *_x / *_y duplicates (which would break the feature selection below).
df = df.drop(columns=cust_agg_cols, errors='ignore').merge(cust_agg, on='customer_id', how='left')

# Label encode categorical cols
le_channel = LabelEncoder().fit(df['channel'])
df['channel_enc'] = le_channel.transform(df['channel'])
le_merchant = LabelEncoder().fit(df['merchant_category'])
df['merchant_enc'] = le_merchant.transform(df['merchant_category'])

# final feature list
features = [
    'amount','amount_log','hour','is_online','is_mobile','is_pos','is_branch','is_atm',
    'is_salary','is_international','days_since_open','avg_monthly_inflow',
    'cust_txn_count','cust_mean_amount','cust_sum_amount','cust_deposit_count','cust_has_prior_flag',
    'channel_enc','merchant_enc'
]

# Missing feature values are filled with 0; no rows are dropped
X = df[features].fillna(0)
y = df['label']

print("Feature matrix shape:", X.shape)
print("Labels shape:", y.shape)


Feature matrix shape: (80000, 19)
Labels shape: (80000,)


In [7]:
# Encode the string labels as integer codes; classes[k] is the label name behind code k
le_target = LabelEncoder().fit(y)
y_enc = le_target.transform(y)
classes = le_target.classes_
print("Classes:", classes)

# 80/20 split, stratified on the encoded label and seeded with RND
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.20,
    random_state=RND,
    stratify=y_enc,
)
print("Train / Test sizes:", X_train.shape, X_test.shape)


Classes: ['InternationalRisky' 'Legit' 'RoundTripping' 'Structuring']
Train / Test sizes: (64000, 19) (16000, 19)


In [8]:
# Standardise the continuous / count columns for the neural network; tree models are fed
# the unscaled frames. Indicator and label-encoded columns are left untouched.
numeric_cols = [
    'amount',
    'amount_log',
    'hour',
    'days_since_open',
    'avg_monthly_inflow',
    'cust_txn_count',
    'cust_mean_amount',
    'cust_sum_amount',
    'cust_deposit_count',
]

# Scaling statistics come from the training rows only and are then applied to both splits
scaler = StandardScaler().fit(X_train[numeric_cols])

X_train_scaled = X_train.copy()
X_train_scaled[numeric_cols] = scaler.transform(X_train[numeric_cols])

X_test_scaled = X_test.copy()
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Persist the fitted scaler alongside the other artifacts
joblib.dump(scaler, os.path.join(ARTIFACTS_DIR, "scaler_project3.joblib"))


['artifacts_project3/scaler_project3.joblib']

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Models trained and saved.




In [10]:
from sklearn.metrics import roc_auc_score
def evaluate_model(name, model, Xs, ys):
    """Print a classification report, confusion matrix and macro ROC AUC for one model.

    Parameters
    ----------
    name : str
        Title printed in the section header.
    model : fitted estimator
        Must provide ``predict``; ``predict_proba`` is used for ROC AUC when it works.
    Xs : feature matrix handed to the model.
    ys : true labels.
        Assumed to be integer codes ``0..len(classes)-1`` matching the module-level
        ``classes`` array — TODO confirm for labels not produced by ``le_target``.

    Returns
    -------
    None. Everything is printed.
    """
    preds = model.predict(Xs)

    # Probabilities are optional. Catch only ordinary exceptions so that
    # KeyboardInterrupt / SystemExit are no longer swallowed, and remember the reason.
    probs = None
    proba_error = None
    try:
        probs = model.predict_proba(Xs)
    except Exception as exc:
        proba_error = exc

    # Pin the label set so the report and matrix keep one row per known class even when a
    # rare class is absent from both ys and preds (otherwise target_names mismatches and raises).
    label_ids = np.arange(len(classes))

    print(f"\n=== {name} ===")
    # zero_division=0 keeps the 0.0000 entries for never-predicted classes without warnings
    print(classification_report(ys, preds, labels=label_ids, target_names=classes, digits=4, zero_division=0))
    cm = confusion_matrix(ys, preds, labels=label_ids)
    print("Confusion Matrix:\n", cm)
    if probs is not None:
        try:
            # compute macro ROC AUC (multi-class) against one-hot encoded true labels
            auc = roc_auc_score(pd.get_dummies(ys), probs, average='macro', multi_class='ovr')
            print("Macro ROC AUC:", round(auc,4))
        except Exception as e:
            print("ROC AUC error:", e)
    elif proba_error is not None:
        print("ROC AUC skipped (predict_proba failed):", proba_error)

# Evaluate every trained model on the test split; only the MLP gets the scaled features
for model_name, fitted_model, X_eval in (
    ("XGBoost", xgb, X_test),
    ("Random Forest", rf, X_test),
    ("MLP (scaled)", mlp, X_test_scaled),
):
    evaluate_model(model_name, fitted_model, X_eval, y_test)



=== XGBoost ===
                    precision    recall  f1-score   support

InternationalRisky     0.0000    0.0000    0.0000        32
             Legit     0.9670    0.9992    0.9828     15448
     RoundTripping     0.0000    0.0000    0.0000        18
       Structuring     0.7027    0.0518    0.0965       502

          accuracy                         0.9663     16000
         macro avg     0.4174    0.2627    0.2698     16000
      weighted avg     0.9557    0.9663    0.9520     16000

Confusion Matrix:
 [[    0    32     0     0]
 [    1 15435     1    11]
 [    0    18     0     0]
 [    0   476     0    26]]
Macro ROC AUC: 0.904

=== Random Forest ===
                    precision    recall  f1-score   support

InternationalRisky     0.0993    0.4688    0.1639        32
             Legit     0.9929    0.7832    0.8757     15448
     RoundTripping     0.0000    0.0000    0.0000        18
       Structuring     0.1208    0.8805    0.2124       502

          accuracy        

In [11]:
# SHAP explainer for the XGBoost model (may take a moment)
explainer = shap.TreeExplainer(xgb)

# Explain a random subset of at most 2000 test rows to keep the runtime down
n_explain = min(2000, X_test.shape[0])
sample_idx = np.random.choice(X_test.shape[0], size=n_explain, replace=False)
X_shap = X_test.iloc[sample_idx]
shap_values = explainer.shap_values(X_shap)

# Two summary plots are written to the artifacts folder: first the bar variant, then the
# plot with default settings. Each figure is closed after saving.
for extra_kwargs, png_name in (
    ({"plot_type": "bar"}, "shap_summary_bar.png"),
    ({}, "shap_summary_dot.png"),
):
    shap.summary_plot(shap_values, X_shap, show=False, **extra_kwargs)
    plt.tight_layout()
    plt.savefig(os.path.join(ARTIFACTS_DIR, png_name), dpi=200)
    plt.close()

print("SHAP plots saved to artifacts.")


  shap.summary_plot(shap_values, X_shap, plot_type="bar", show=False)
  shap.summary_plot(shap_values, X_shap, show=False)
  summary_legacy(
  summary_legacy(
  summary_legacy(
  summary_legacy(


SHAP plots saved to artifacts.


In [12]:
# Rule-based flags
def compute_rule_flags(row):
    """Return the four 0/1 rule flags for a single transaction row.

    ``row`` is indexed by column name and must provide ``is_deposit_proxy``, ``amount``,
    ``daily_count``, ``is_international`` and ``cust_has_prior_flag``. The amount
    comparisons read the module-level ``structuring_threshold`` and ``amount_cut``.
    """
    # deposit-like transaction below the structuring threshold
    small_deposit = (row['is_deposit_proxy']==1) and (row['amount'] < structuring_threshold)
    # at least 20 transactions by this customer on the same day
    busy_day = row['daily_count'] >= 20
    # international transaction above the amount cut-off
    large_foreign = (row['is_international']==1) and (row['amount'] > amount_cut)
    # customer carries a prior flag
    previously_flagged = row['cust_has_prior_flag']==1
    return {
        'flag_structuring': int(small_deposit),
        'flag_high_velocity': int(busy_day),
        'flag_international_high': int(large_foreign),
        'flag_prior': int(previously_flagged),
    }

# Evaluate the rule flags on the test rows only (df rows selected through X_test's index)
df_features = df.loc[X_test.index].copy()
rule_flags = df_features.apply(compute_rule_flags, axis=1, result_type='expand')

# ML suspicious score = probability of any class other than 'Legit' (XGBoost)
probs = xgb.predict_proba(X_test)
if 'Legit' in classes:
    idx_legit = list(classes).index('Legit')
    ml_susp_score = 1 - probs[:, idx_legit]
else:
    # Without a 'Legit' class every predicted class is suspicious, so the score is 1.
    # (Falling back to column 0 would compute 1 - P(some suspicious class) instead.)
    idx_legit = None
    ml_susp_score = np.ones(probs.shape[0])

hybrid = rule_flags.copy()
hybrid['ml_susp_score'] = ml_susp_score

# final alert score: weighted sum of the ML score and the rule flags.
# The weights add up to 1.1, so the raw sum is rescaled below.
ALERT_WEIGHTS = {
    'ml_susp_score': 0.4,
    'flag_structuring': 0.15,
    'flag_high_velocity': 0.25,
    'flag_international_high': 0.2,
    'flag_prior': 0.1,
}
raw_alert_score = sum(weight * hybrid[col] for col, weight in ALERT_WEIGHTS.items())

# normalize 0-1 by the largest score in this batch (scores are therefore relative to the
# batch); skip the division when the maximum is not positive to avoid 0/0 -> NaN
max_alert_score = raw_alert_score.max()
hybrid['alert_score'] = raw_alert_score / max_alert_score if max_alert_score > 0 else raw_alert_score

# attach final alert to test rows for inspection
alerts_df = df_features[['transaction_id','customer_id','timestamp','amount']].copy()
alerts_df = pd.concat([alerts_df.reset_index(drop=True), hybrid.reset_index(drop=True)], axis=1)
alerts_df.sort_values('alert_score', ascending=False).head(20).to_csv(os.path.join(ARTIFACTS_DIR, "top_alerts_sample.csv"), index=False)
print("Saved top alerts sample to artifacts.")
alerts_df.head()


Saved top alerts sample to artifacts.


Unnamed: 0,transaction_id,customer_id,timestamp,amount,flag_structuring,flag_high_velocity,flag_international_high,flag_prior,ml_susp_score,alert_score
0,TX_0016210,CUST_00475,2025-11-14 23:52:23,20.15,0,0,0,0,0.000624,0.00044
1,TX_0030625,CUST_01868,2024-01-16 04:07:16,41.44,0,0,0,0,0.000588,0.000415
2,TX_0000248,CUST_03115,2024-08-29 03:07:56,14.48,0,0,0,1,0.000264,0.176438
3,TX_0025741,CUST_02675,2023-02-28 19:03:54,50.21,0,0,0,0,0.000504,0.000355
4,TX_0047161,CUST_01294,2024-05-02 05:47:12,6.1,1,0,0,0,0.299846,0.475772
