In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_parquet("../data/Silver/df_reduced.parquet") # df = pd.read_parquet("s3://bass-risk-monitoring/Silver/df_reduced.parquet")

In [3]:
df.columns

Index(['Timestamp', 'From Bank', 'Account', 'To Bank', 'Account.1',
       'Amount Received', 'Receiving Currency', 'Amount Paid',
       'Payment Currency', 'Payment Format', 'Is Laundering',
       'log_amount_paid', 'hour', 'date', 'month', 'day_of_week', 'is_weekend',
       'daily_txn_count', 'is_night_txn', 'time_since_last_txn',
       'txn_count_24h_excl', 'rolling_avg_paid_7d', 'avg_txn_gap', 'week',
       'amount_ratio', 'log_amount_ratio', 'currency_diversity',
       'format_diversity', 'amount_paid_zscore', 'rolling_std_paid_7d',
       'unique_receivers', 'unique_senders', 'is_self_transfer',
       'transfer_direction_ratio', 'is_high_freq_receiver'],
      dtype='object')

In [4]:
df.dtypes

Timestamp                   datetime64[ns]
From Bank                            int32
Account                     string[python]
To Bank                              int32
Account.1                   string[python]
Amount Received                    float64
Receiving Currency                category
Amount Paid                        float64
Payment Currency                  category
Payment Format                    category
Is Laundering                         int8
log_amount_paid                    float64
hour                                 int32
date                                object
month                            period[M]
day_of_week                          int32
is_weekend                           int64
daily_txn_count                      int64
is_night_txn                         int64
time_since_last_txn                float64
txn_count_24h_excl                   int32
rolling_avg_paid_7d                float64
avg_txn_gap                        float64
week       

In [5]:
df.shape

(31898238, 35)

Encode categorical

In [6]:

# Target
y = df["Is Laundering"]

# Frequency encode high-cardinality identifiers
id_cols = ["From Bank", "Account", "To Bank", "Account.1"]
for col in id_cols:
    freq_map = df[col].value_counts().to_dict()
    df[col + "_freq"] = df[col].map(freq_map)

# Drop  identifiers
X = df.drop(columns=[
    "Is Laundering", "Timestamp", "From Bank", "Account", 
    "To Bank", "Account.1", "date", "month"
])

print("Feature set shape after freq encoding:", X.shape)
print("Positive class ratio:", y.mean())


Feature set shape after freq encoding: (31898238, 31)
Positive class ratio: 0.0011044497191349566


In [7]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ["Receiving Currency", "Payment Currency", "Payment Format"]
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))


Time-Based Split

In [8]:
df["week"].unique()
df["week"].value_counts().sort_index()

week
35     9146808
36    12002335
37    10745719
38        3285
39          91
Name: count, dtype: Int64

In [9]:
# Train on weeks 35–36
train_mask = df["week"].isin([35, 36])
X_train, y_train = X[train_mask], y[train_mask]

# Test on weeks 37–39
test_mask = df["week"].isin([37, 38, 39])
X_test, y_test = X[test_mask], y[test_mask]

print("Train size:", X_train.shape, "Test size:", X_test.shape)
print("Positive class ratio in train:", y_train.mean())
print("Positive class ratio in test:", y_test.mean())


Train size: (21149143, 31) Test size: (10749095, 31)
Positive class ratio in train: 0.0009343168184167085
Positive class ratio in test: 0.0014391909272361998


Training

In [11]:
import lightgbm as lgb

train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test)

#  scale_pos_weight for imbalance
scale_pos_weight = len(y_train[y_train==0]) / len(y_train[y_train==1])
print("Scale pos weight:", scale_pos_weight)



Scale pos weight: 1069.3007591093117


In [13]:

params_weighted = {
    "objective": "binary",
    "metric": ["auc", "binary_logloss"],
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "scale_pos_weight": scale_pos_weight,
    "verbose": -1
}

# Train (early stopping)
model_weighted = lgb.train(
    params_weighted,
    train_data,
    valid_sets=[train_data, valid_data],
    num_boost_round=2000,
    callbacks=[lgb.early_stopping(stopping_rounds=50)]
)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.976817	training's binary_logloss: 3.79064	valid_1's auc: 0.978941	valid_1's binary_logloss: 4.05646


In [14]:
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, average_precision_score

def evaluate_model(model, X_test, y_test, name="Model"):
    # Use best iteration if available
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    y_pred_binary = (y_pred > 0.5).astype(int)

    print(f"\n{name} Results:")
    print(classification_report(y_test, y_pred_binary, digits=4))
    print("ROC-AUC:", roc_auc_score(y_test, y_pred))

    precision, recall, _ = precision_recall_curve(y_test, y_pred)
    ap_score = average_precision_score(y_test, y_pred)
    print("Average Precision (AP):", ap_score)



In [15]:
# Evaluate the class-weighted model on weeks 37–39
evaluate_model(model_weighted, X_test, y_test, "Class-Weighted")


Class-Weighted Results:
              precision    recall  f1-score   support

           0     1.0000    0.8437    0.9152  10733625
           1     0.0091    0.9928    0.0180     15470

    accuracy                         0.8439  10749095
   macro avg     0.5045    0.9183    0.4666  10749095
weighted avg     0.9986    0.8439    0.9139  10749095

ROC-AUC: 0.9701639982456102
Average Precision (AP): 0.031095014183867724


In [16]:
print("Best iteration:", model_weighted.best_iteration)

Best iteration: 1


Threshold tuning

In [19]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

def tune_thresholds(model, X_test, y_test, thresholds=np.arange(0.1, 1.0, 0.1)):
    #  best iteration
    y_pred = model.predict(X_test, num_iteration=getattr(model, "best_iteration", None))
    results = []
    for t in thresholds:
        y_pred_binary = (y_pred > t).astype(int)
        precision = precision_score(y_test, y_pred_binary, zero_division=0)
        recall = recall_score(y_test, y_pred_binary, zero_division=0)
        f1 = f1_score(y_test, y_pred_binary, zero_division=0)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred_binary).ravel()
        results.append((t, precision, recall, f1, tp, fp, fn, tn))
    return results

threshold_results = tune_thresholds(model_weighted, X_test, y_test)

# Print 
for t, p, r, f1, tp, fp, fn, tn in threshold_results:
    print(f"Threshold={t:.1f} | Precision={p:.4f} | Recall={r:.4f} | F1={f1:.4f} | TP={tp} FP={fp} FN={fn} TN={tn}")



Threshold=0.1 | Precision=0.0091 | Recall=0.9928 | F1=0.0180 | TP=15359 FP=1677360 FN=111 TN=9056265
Threshold=0.2 | Precision=0.0091 | Recall=0.9928 | F1=0.0180 | TP=15359 FP=1677360 FN=111 TN=9056265
Threshold=0.3 | Precision=0.0091 | Recall=0.9928 | F1=0.0180 | TP=15359 FP=1677360 FN=111 TN=9056265
Threshold=0.4 | Precision=0.0091 | Recall=0.9928 | F1=0.0180 | TP=15359 FP=1677360 FN=111 TN=9056265
Threshold=0.5 | Precision=0.0091 | Recall=0.9928 | F1=0.0180 | TP=15359 FP=1677360 FN=111 TN=9056265
Threshold=0.6 | Precision=0.0091 | Recall=0.9928 | F1=0.0180 | TP=15359 FP=1677360 FN=111 TN=9056265
Threshold=0.7 | Precision=0.0091 | Recall=0.9928 | F1=0.0180 | TP=15359 FP=1677360 FN=111 TN=9056265
Threshold=0.8 | Precision=0.0094 | Recall=0.9921 | F1=0.0186 | TP=15348 FP=1622276 FN=122 TN=9111349
Threshold=0.9 | Precision=0.0094 | Recall=0.9921 | F1=0.0186 | TP=15348 FP=1622276 FN=122 TN=9111349


In [26]:
from sklearn.calibration import CalibratedClassifierCV
import warnings

clf_weighted = lgb.LGBMClassifier(
    objective="binary",
    learning_rate=0.1,
    num_leaves=64,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    scale_pos_weight=scale_pos_weight,
    n_estimators=2000
)

# Fit with early stopping
clf_weighted.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="auc",
    callbacks=[lgb.early_stopping(stopping_rounds=200)],
)

# isotonic calibration
calibrated_model = CalibratedClassifierCV(clf_weighted, method="isotonic", cv="prefit")
calibrated_model.fit(X_test, y_test)


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.978941	valid_0's binary_logloss: 5.22086




0,1,2
,estimator,LGBMClassifie...3007591093117)
,method,'isotonic'
,cv,'prefit'
,n_jobs,
,ensemble,'auto'

0,1,2
,boosting_type,'gbdt'
,num_leaves,64
,max_depth,-1
,learning_rate,0.1
,n_estimators,2000
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:

from sklearn.metrics import precision_score, recall_score, f1_score

# Train
clf_weighted = lgb.LGBMClassifier(
    objective="binary",
    learning_rate=0.1,  
    num_leaves=64,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    scale_pos_weight=scale_pos_weight,
    n_estimators=2000
)

clf_weighted.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="auc",
    callbacks=[lgb.early_stopping(stopping_rounds=200)]
)

print("Best iteration:", clf_weighted.best_iteration_)

# Wrap with isotonic calibration
calibrated_model = CalibratedClassifierCV(clf_weighted, method="isotonic", cv="prefit")
calibrated_model.fit(X_test, y_test)

# Threshold sweep on calibrated probabilities
def tune_thresholds_calibrated(model, X_test, y_test, thresholds=np.arange(0.1, 1.0, 0.1)):
    y_pred = model.predict_proba(X_test)[:, 1]  # calibrated probabilities
    results = []
    for t in thresholds:
        y_pred_binary = (y_pred > t).astype(int)
        precision = precision_score(y_test, y_pred_binary, zero_division=0)
        recall = recall_score(y_test, y_pred_binary, zero_division=0)
        f1 = f1_score(y_test, y_pred_binary, zero_division=0)
        results.append((t, precision, recall, f1))
    return results

threshold_results_cal = tune_thresholds_calibrated(calibrated_model, X_test, y_test)

for t, p, r, f1 in threshold_results_cal:
    print(f"Threshold={t:.1f} | Precision={p:.4f} | Recall={r:.4f} | F1={f1:.4f}")


In [None]:
# Threshold 
def tune_thresholds_calibrated(model, X_test, y_test, thresholds=np.arange(0.1, 1.0, 0.1)):
    y_pred = model.predict_proba(X_test)[:, 1] 
    results = []
    for t in thresholds:
        y_pred_binary = (y_pred > t).astype(int)
        precision = precision_score(y_test, y_pred_binary, zero_division=0)
        recall = recall_score(y_test, y_pred_binary, zero_division=0)
        f1 = f1_score(y_test, y_pred_binary, zero_division=0)
        results.append((t, precision, recall, f1))
    return results

threshold_results_cal = tune_thresholds_calibrated(calibrated_model, X_test, y_test)

for t, p, r, f1 in threshold_results_cal:
    print(f"Threshold={t:.1f} | Precision={p:.4f} | Recall={r:.4f} | F1={f1:.4f}")

resampling

In [None]:
from sklearn.utils import resample

# X_train and y_train 
train_df = X_train.copy()
train_df["Is Laundering"] = y_train

# majority and minority classes
majority = train_df[train_df["Is Laundering"] == 0]
minority = train_df[train_df["Is Laundering"] == 1]

print("Majority size:", len(majority))
print("Minority size:", len(minority))

Majority size: 21129383
Minority size: 19760


In [None]:

# Undersample majority to match minority size * factor
factor = 10
majority_downsampled = resample(
    majority,
    replace=False,
    n_samples=len(minority) * factor,
    random_state=42
)

# Combine minority + downsampled majority
train_resampled = pd.concat([majority_downsampled, minority])

X_train_resampled = train_resampled.drop(columns=["Is Laundering"])
y_train_resampled = train_resampled["Is Laundering"]

print("Resampled train size:", X_train_resampled.shape)
print("Positive class ratio:", y_train_resampled.mean())


Resampled train size: (217360, 27)
Positive class ratio: 0.09090909090909091


In [None]:

train_data_resampled = lgb.Dataset(X_train_resampled, label=y_train_resampled)
valid_data_resampled = lgb.Dataset(X_test, label=y_test)

params_resampled = {
    "objective": "binary",
    "metric": ["auc", "binary_logloss"],
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1
}

# Train with early stopping
model_resampled = lgb.train(
    params_resampled,
    train_data_resampled,
    valid_sets=[train_data_resampled, valid_data_resampled],
    num_boost_round=500,
    callbacks=[lgb.early_stopping(stopping_rounds=50)]
)


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[267]	training's auc: 0.994486	training's binary_logloss: 0.0574288	valid_1's auc: 0.989649	valid_1's binary_logloss: 0.0327002


In [None]:

# Evaluate
evaluate_model(model_resampled, X_test, y_test, "Resampled")


Resampled Results:
              precision    recall  f1-score   support

           0     0.9997    0.9919    0.9958  10733625
           1     0.1282    0.8268    0.2220     15470

    accuracy                         0.9917  10749095
   macro avg     0.5640    0.9094    0.6089  10749095
weighted avg     0.9985    0.9917    0.9947  10749095

ROC-AUC: 0.9896492337002901
PR-AUC: 0.6552163899584174


SMOTE

In [None]:
from imblearn.over_sampling import SMOTE


smote = SMOTE(sampling_strategy=0.1, random_state=42)  

X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("SMOTE train size:", X_train_smote.shape)
print("Positive class ratio:", y_train_smote.mean())


ValueError: Cannot cast object dtype to float64

In [None]:

train_data_smote = lgb.Dataset(X_train_smote, label=y_train_smote)
valid_data_smote = lgb.Dataset(X_test, label=y_test)

params_smote = {
    "objective": "binary",
    "metric": ["auc", "binary_logloss"],
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1
}


model_smote = lgb.train(
    params_smote,
    train_data_smote,
    valid_sets=[train_data_smote, valid_data_smote],
    num_boost_round=500,
    callbacks=[lgb.early_stopping(stopping_rounds=50)]
)


In [None]:
# Evaluate
evaluate_model(model_smote, X_test, y_test, "SMOTE")
