In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns


import warnings
# Silence every warning for the rest of the session: no category or module
# filter is given, so the 'ignore' action applies to all of them.
warnings.filterwarnings('ignore')

In [None]:
# Load the cleaned transactions parquet from the local Bronze data directory.
df = pd.read_parquet("../data/Bronze/cleaned_HI-Medium_Trans.parquet")
# Alternative source (disabled): the same file name under the S3 bucket.
# df = pd.read_parquet("s3://bass-risk-monitoring/Bronze/cleaned_HI-Medium_Trans.parquet") 

In [6]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering,date,hour
0,2022-09-01 00:17:00,20,800104D70,20,800104D70,6794.63,US Dollar,6794.63,US Dollar,Reinvestment,0,2022-09-01,0
1,2022-09-01 00:02:00,3196,800107150,3196,800107150,7739.29,US Dollar,7739.29,US Dollar,Reinvestment,0,2022-09-01,0


In [7]:
# (rows, columns) of the loaded frame.
df.shape

(31898218, 13)

Feature engineering


In [8]:
# Show the dtype of every column.
df.dtypes

Timestamp             datetime64[ns]
From Bank                      int64
Account                       object
To Bank                        int64
Account.1                     object
Amount Received              float64
Receiving Currency          category
Amount Paid                  float64
Payment Currency            category
Payment Format              category
Is Laundering                  int64
date                          object
hour                           int32
dtype: object

In [None]:
# Calendar features derived from the transaction timestamp.
timestamps = pd.to_datetime(df["Timestamp"])

df["day_of_week"] = timestamps.dt.dayofweek  # Monday = 0 ... Sunday = 6
df["is_weekend"] = df["day_of_week"].ge(5).astype("int8")  # 1 for Saturday/Sunday
df["month"] = timestamps.dt.month

In [11]:
# Display the frame (the rendering of a frame this large is truncated).
# NOTE(review): the saved output of this cell already lists log_amount_paid and
# log_amount_received, which are only created by the "Log amounts" cell further
# down, so the cells appear to have been executed out of order. Re-run top to
# bottom on a fresh kernel to confirm the outputs.
df

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering,date,hour,day_of_week,is_weekend,month,log_amount_paid,log_amount_received
0,2022-09-01 00:17:00,20,800104D70,20,800104D70,6.794630e+03,US Dollar,6.794630e+03,US Dollar,Reinvestment,0,2022-09-01,0,3,0,9,8.824035,8.824035
1,2022-09-01 00:02:00,3196,800107150,3196,800107150,7.739290e+03,US Dollar,7.739290e+03,US Dollar,Reinvestment,0,2022-09-01,0,3,0,9,8.954194,8.954194
2,2022-09-01 00:17:00,1208,80010E430,1208,80010E430,1.880230e+03,US Dollar,1.880230e+03,US Dollar,Reinvestment,0,2022-09-01,0,3,0,9,7.539681,7.539681
3,2022-09-01 00:03:00,1208,80010E650,20,80010E6F0,7.396688e+07,US Dollar,7.396688e+07,US Dollar,Cheque,0,2022-09-01,0,3,0,9,18.119128,18.119128
4,2022-09-01 00:02:00,1208,80010E650,20,80010EA30,4.586845e+07,US Dollar,4.586845e+07,US Dollar,Cheque,0,2022-09-01,0,3,0,9,17.641288,17.641288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31898213,2022-09-16 23:37:00,215117,84F680E11,223744,8521DCB81,5.675900e-02,Bitcoin,5.675900e-02,Bitcoin,Bitcoin,0,2022-09-16,23,4,0,9,0.055207,0.055207
31898214,2022-09-16 23:59:00,1209449,84E4F43D1,123,8521E3901,1.229300e-02,Bitcoin,1.229300e-02,Bitcoin,Bitcoin,0,2022-09-16,23,4,0,9,0.012218,0.012218
31898215,2022-09-16 23:47:00,214637,8521E8FF1,114,8521E8941,8.959400e-02,Bitcoin,8.959400e-02,Bitcoin,Bitcoin,0,2022-09-16,23,4,0,9,0.085805,0.085805
31898216,2022-09-16 23:38:00,114,8521E9E41,114,8521E8941,3.956390e-01,Bitcoin,3.956390e-01,Bitcoin,Bitcoin,0,2022-09-16,23,4,0,9,0.333352,0.333352


In [12]:
# List the column labels currently present on the frame.
df.columns

Index(['Timestamp', 'From Bank', 'Account', 'To Bank', 'Account.1',
       'Amount Received', 'Receiving Currency', 'Amount Paid',
       'Payment Currency', 'Payment Format', 'Is Laundering', 'date', 'hour',
       'day_of_week', 'is_weekend', 'month', 'log_amount_paid',
       'log_amount_received'],
      dtype='object')

In [None]:
# log(1 + x) transform of both amount columns, written to new log_* columns.
for raw_col, log_col in (
    ("Amount Paid", "log_amount_paid"),
    ("Amount Received", "log_amount_received"),
):
    df[log_col] = np.log1p(df[raw_col])

Insight: we add temporal signals and log-transform the heavy-tailed amounts.

features & target



In [13]:
# Label column.
TARGET = "Is Laundering"

# Numeric model inputs: log-transformed amounts plus the calendar features.
NUMERIC = [
    "log_amount_paid",
    "log_amount_received",
    "hour",
    "day_of_week",
    "month",
    "is_weekend",
]

# Categorical model inputs.
CATEGORICAL = [
    "Payment Currency",
    "Receiving Currency",
    "Payment Format",
]


In [14]:

# Keep only the modeling columns: numeric features, categorical features, label.
use_cols = [*NUMERIC, *CATEGORICAL, TARGET]

# Rebinds `df` to an independent copy restricted to those columns; the other
# original columns are no longer reachable through `df` after this cell runs.
df = df.loc[:, use_cols].copy()

print("Model frame shape:", df.shape)
df.head(2)

Model frame shape: (31898218, 10)


Unnamed: 0,log_amount_paid,log_amount_received,hour,day_of_week,month,is_weekend,Payment Currency,Receiving Currency,Payment Format,Is Laundering
0,8.824035,8.824035,0,3,9,0,US Dollar,US Dollar,Reinvestment,0
1,8.954194,8.954194,0,3,9,0,US Dollar,US Dollar,Reinvestment,0



We now have a clean modeling frame containing only the relevant numeric, categorical, and target columns, ready for preprocessing.



Train/Val/Test Split 60/20/20 

In [15]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictors.
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Stage 1: keep 60% for training and hold out the remaining 40%,
# stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.40, stratify=y, random_state=42
)

# Stage 2: halve the hold-out into validation and test (20% of the data each),
# again stratified on the label.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

# Report each split's shape and the mean of its label column.
for shape_label, split_X, split_y in (
    ("Train shape:", X_train, y_train),
    ("Val shape:", X_val, y_val),
    ("Test shape:", X_test, y_test),
):
    print(shape_label, split_X.shape, "Target dist:", split_y.mean())


Train shape: (19138930, 9) Target dist: 0.0011044504577842125
Val shape: (6379644, 9) Target dist: 0.0011044503423702013
Test shape: (6379644, 9) Target dist: 0.0011044503423702013


All splits preserve the same class imbalance ratio as the full dataset.

encoder & scaler

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# handle_unknown="ignore": a category that was not present when the encoder was
# fitted is encoded as all zeros instead of raising at transform time. The
# encoder is fitted on the training split only and persisted for later reuse,
# so it can meet categories it has never seen.
encoder = OneHotEncoder(handle_unknown="ignore")

# with_mean=False: scale to unit variance without centering.
scaler = StandardScaler(with_mean=False)

# Numeric columns are scaled, categorical columns are one-hot encoded, and any
# other column is dropped. sparse_threshold=1.0 keeps the stacked output sparse
# whenever its overall density is below 1.0.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", scaler, NUMERIC),
        ("cat", encoder, CATEGORICAL),
    ],
    remainder="drop",
    sparse_threshold=1.0,
)

# Fit on the training split only, so validation/test statistics do not leak
# into the scaler or the encoder's category lists.
preprocessor.fit(X_train)

encoder_names = preprocessor.named_transformers_["cat"].get_feature_names_out(CATEGORICAL).tolist()
print("Numeric feats:", len(NUMERIC), " | encoder feats:", len(encoder_names), " | Total ~=", len(NUMERIC)+len(encoder_names))


Numeric feats: 6  | encoder feats: 37  | Total ~= 43


transform to matrices

In [19]:
# Apply the fitted preprocessor to each split, in train / val / test order.
X_train_mat, X_val_mat, X_test_mat = (
    preprocessor.transform(split) for split in (X_train, X_val, X_test)
)

# Report the resulting matrix shapes.
for mat_label, mat in (
    ("X_train_mat:", X_train_mat),
    ("X_val_mat:  ", X_val_mat),
    ("X_test_mat: ", X_test_mat),
):
    print(mat_label, mat.shape)

X_train_mat: (19138930, 43)
X_val_mat:   (6379644, 43)
X_test_mat:  (6379644, 43)


In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Both label values listed explicitly, so the weights come back in 0, 1 order.
classes = np.array([0, 1], dtype="int8")
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)

# Map each class to its weight using plain Python int keys and float values.
CLASS_WEIGHT = dict(zip(classes.tolist(), weights.tolist()))
CLASS_WEIGHT

{0: 0.5005528358086541, 1: 452.71383290755983}

Insight:
- Class 0 (Not Laundering) =  0.5, each negative counts half as much.

- Class 1 (Laundering) = 452.7: each positive counts 453× more.

This reweights the loss so the model can’t ignore the rare class (0.11%); instead, it is pushed to learn patterns that catch laundering, improving recall (and PR-AUC).

We handled the extreme class imbalance using `class weights`   instead of `resampling`, giving laundering cases 453× more importance in the loss function.

 This preserved all data, avoided duplication, and forced the model to learn from the minority class.

Save the train/validation/test splits

In [None]:
from pathlib import Path

# TODO: point these paths at the S3 bucket once a reliable network connection
# is available.
SILVER_DIR = Path("../data/Silver")

# Create the output directory if it is missing; otherwise the parquet writes
# below fail on a fresh checkout.
SILVER_DIR.mkdir(parents=True, exist_ok=True)

# Features: one parquet file per split, written without the index.
for split_name, split_frame in (
    ("X_train", X_train),
    ("X_val", X_val),
    ("X_test", X_test),
):
    split_frame.to_parquet(SILVER_DIR / f"{split_name}.parquet", index=False)

# Labels: each Series is converted to a single-column frame and written the
# same way, so label rows stay in the same order as the feature rows.
for split_name, split_labels in (
    ("y_train", y_train),
    ("y_val", y_val),
    ("y_test", y_test),
):
    split_labels.to_frame().to_parquet(SILVER_DIR / f"{split_name}.parquet", index=False)


Save the modeling components

In [27]:
import json
from pathlib import Path

import joblib

GOLD_DIR = Path("../data/Gold")

# Create the output directory if it is missing; otherwise the dumps below fail
# on a fresh checkout.
GOLD_DIR.mkdir(parents=True, exist_ok=True)

# Fitted scaler and encoder, taken out of the fitted ColumnTransformer.
# NOTE(review): these are saved as two separate artifacts; whoever loads them
# has to apply them to the NUMERIC / CATEGORICAL columns in the same order the
# ColumnTransformer used. Consider also persisting `preprocessor` as a whole.
joblib.dump(preprocessor.named_transformers_["num"], GOLD_DIR / "scaler.joblib")
joblib.dump(preprocessor.named_transformers_["cat"], GOLD_DIR / "encoder.joblib")

# Class weights. JSON object keys are always strings, so the int keys 0 and 1
# are written as "0" and "1"; convert them back with int() after loading.
with open(GOLD_DIR / "class_weight.json", "w") as f:
    json.dump(CLASS_WEIGHT, f)