In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.utils.class_weight import compute_class_weight


import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_parquet("../data/Silver/df_reduced.parquet") 
# df = pd.read_parquet("s3://bass-risk-monitoring/Bronze/df_reduced.parquet")

In [3]:
df.shape

(31898238, 25)

In [4]:
df.dtypes

Timestamp              datetime64[ns]
From Bank                       int32
Account                string[python]
To Bank                         int32
Account.1              string[python]
Amount Received               float64
Receiving Currency           category
Amount Paid                   float64
Payment Currency             category
Payment Format               category
Is Laundering                    int8
log_amount_paid               float64
hour                            int32
date                           object
month                       period[M]
day_of_week                     int32
is_weekend                      int64
daily_txn_count                 int64
time_since_last_txn           float64
txn_count_24h_excl              int32
amount_ratio                  float64
currency_diversity              int64
format_diversity                int64
unique_receivers                int64
is_self_transfer                 int8
dtype: object

features for our models

In [5]:
UTIL = ['Timestamp']

TARGET = "Is Laundering"
CATEGORICAL = ['Receiving Currency', 'Payment Currency', 'Payment Format']

ID_LIKE = ['Account', 'Account.1', 'From Bank', 'To Bank']
DROP_RAW_AMOUNTS = ['Amount Paid', 'Amount Received']

#  numeric
num_all = df.select_dtypes(include=[np.number]).columns.tolist()
NUMERIC = [c for c in num_all if c not in ([TARGET] + ID_LIKE)]
NUMERIC = [c for c in NUMERIC if c not in DROP_RAW_AMOUNTS]


KEEP = list(dict.fromkeys(NUMERIC + CATEGORICAL + [TARGET] + UTIL))
df = df[KEEP].copy()
print( df.columns.tolist())
print( len(df.columns))


['log_amount_paid', 'hour', 'day_of_week', 'is_weekend', 'daily_txn_count', 'time_since_last_txn', 'txn_count_24h_excl', 'amount_ratio', 'currency_diversity', 'format_diversity', 'unique_receivers', 'is_self_transfer', 'Receiving Currency', 'Payment Currency', 'Payment Format', 'Is Laundering', 'Timestamp']
17


#### Time-based split (60/20/20) and laundering rates

AML patterns evolve over time.
we will train on earlier normal behavior and testing on later data simulates real monitoring conditions and avoids future information leakage.

In [6]:
# chronological order
df = df.sort_values("Timestamp").reset_index(drop=True)

t1 = df["Timestamp"].quantile(0.60)
t2 = df["Timestamp"].quantile(0.80)

train_df = df[df["Timestamp"] <= t1].copy()
val_df   = df[(df["Timestamp"] > t1) & (df["Timestamp"] <= t2)].copy()
test_df  = df[df["Timestamp"] >  t2].copy()


In [7]:
def summarize(name, d, target="Is Laundering"):
    n = len(d)
    pos = int(d[target].sum())
    rate = 100.0 * pos / n if n else 0.0
    print(f"{name:5s} | shape={d.shape} | positives={pos} | rate={rate:.1f}%")

print(" Laundering rates")
summarize("Train", train_df)
summarize("Val",   val_df)
summarize("Test",  test_df)

 Laundering rates
Train | shape=(19139453, 17) | positives=15536 | rate=0.1%
Val   | shape=(6379894, 17) | positives=9054 | rate=0.1%
Test  | shape=(6378891, 17) | positives=10640 | rate=0.2%


In [8]:

FEATURES = NUMERIC + CATEGORICAL 

X_train, y_train = train_df[FEATURES].copy(), train_df[TARGET].astype("int8")
X_val,   y_val   = val_df[FEATURES].copy(),   val_df[TARGET].astype("int8")
X_test,  y_test  = test_df[FEATURES].copy(),  test_df[TARGET].astype("int8")

print("Shapes -> Train:", X_train.shape, "| Val:", X_val.shape, "| Test:", X_test.shape)


Shapes -> Train: (19139453, 15) | Val: (6379894, 15) | Test: (6378891, 15)


Encoding and scalling training set 

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(with_mean=False), NUMERIC),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), CATEGORICAL),
    ],
    remainder="drop",
    sparse_threshold=1.0,  
    verbose_feature_names_out=False
)


In [10]:
X_train_pre = preprocessor.fit_transform(X_train)
X_val_pre   = preprocessor.transform(X_val)
X_test_pre  = preprocessor.transform(X_test)
try:
    feat_names = preprocessor.get_feature_names_out()
except Exception:
    feat_names = np.array([f"f{i}" for i in range(X_train_pre.shape[1])])


In [11]:
print("Prepared shapes:", X_train_pre.shape, X_val_pre.shape, X_test_pre.shape)
print("Sparse matrix? ->", hasattr(X_train_pre, "tocsr"))

Prepared shapes: (19139453, 49) (6379894, 49) (6378891, 49)
Sparse matrix? -> False


Class weights

In [12]:
clases_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)
classes = np.array([0, 1], dtype=int)
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))
class_weights


{np.int64(0): np.float64(0.5004061929363112),
 np.int64(1): np.float64(615.9710671987642)}

Save artifacts & splits

In [16]:
import joblib
joblib.dump(preprocessor, "../data/models/preprocessor.joblib")
joblib.dump(NUMERIC, "../data/models/numeric_cols.joblib")
joblib.dump(CATEGORICAL, "../data/models/categorical_cols.joblib")

joblib.dump(feat_names, "../data/models/prepared_feature_names.joblib")
joblib.dump(class_weights,"../data/models/class_weights.joblib")

pd.DataFrame(y_train).to_parquet("../data/Gold/y_train.parquet", index=False)
pd.DataFrame(y_val).to_parquet("../data/Gold/y_val.parquet",   index=False)
pd.DataFrame(y_test).to_parquet("../data/Gold/y_test.parquet",  index=False)

joblib.dump(X_train_pre,"../data/models/X_train_pre.joblib")
joblib.dump(X_val_pre, "../data/models/X_val_pre.joblib")
joblib.dump(X_test_pre, "../data/models/X_test_pre.joblib")



['../data/models/X_test_pre.joblib']