In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.layers import LSTM, Dense, Masking
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

print("Libraries imported successfully.")


Libraries imported successfully.


Load the New Dataset

In [2]:
# Read the raw transaction export. When the CSV is missing, `df` is set to
# None so the cells that check for it can skip their work instead of failing.
try:
    df = pd.read_csv('../data/raw/Big_Black_Money_Dataset.csv')
except FileNotFoundError:
    df = None
    print("Error: 'Big_Black_Money_Dataset.csv' not found. Please download it and place it in 'ml/data/raw/'.")
else:
    print("Global Black Money Dataset loaded successfully.")
    print(f"Dataset shape: {df.shape}")


Global Black Money Dataset loaded successfully.
Dataset shape: (10000, 14)


Preprocessing and Feature Engineering

In [5]:
# Cell 3: Preprocessing and Feature Engineering (Corrected for Big_Black_Money_Dataset.csv)

def select_and_label(raw_df, risk_threshold=7):
    """Select the tracing columns, clean them and derive the training target.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain the columns 'Country', 'Destination Country',
        'Amount (USD)', 'Tax Haven Country' and 'Money Laundering Risk Score'.
    risk_threshold : int or float, default 7
        A row is flagged when its risk score is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        A new frame (``raw_df`` is not modified) with the columns 'Country',
        'Destination Country', 'AmountUSD', 'TaxHavenCountry', 'RiskScore'
        and the 0/1 target 'isSuspiciousPath'. Rows lacking 'Country' or
        'Destination Country' are dropped; the original index is kept.
    """
    features = [
        'Country',
        'Destination Country',
        'Amount (USD)',
        'Tax Haven Country',
        'Money Laundering Risk Score',
    ]
    # Rename to identifiers without spaces or special characters.
    processed = raw_df[features].rename(columns={
        'Amount (USD)': 'AmountUSD',
        'Tax Haven Country': 'TaxHavenCountry',
        'Money Laundering Risk Score': 'RiskScore',
    })
    processed = processed.dropna(subset=['Country', 'Destination Country']).copy()

    # Assign the filled column back instead of calling
    # `processed[col].fillna(..., inplace=True)`: with pandas Copy-on-Write
    # the in-place call only changes a temporary, the NaNs survive, and
    # `NaN != 'None'` would then mark every such row as a tax-haven transfer.
    processed['TaxHavenCountry'] = processed['TaxHavenCountry'].fillna('None')

    # The target is built BEFORE the categorical columns are label-encoded,
    # while 'None' is still a readable sentinel.
    is_tax_haven = processed['TaxHavenCountry'] != 'None'
    is_high_risk_score = processed['RiskScore'] > risk_threshold
    processed['isSuspiciousPath'] = (is_tax_haven | is_high_risk_score).astype(int)
    return processed


if 'df' in locals() and df is not None:
    df_processed = select_and_label(df)

    # A target with a single class cannot be learned; surface it right away
    # rather than discovering it after training.
    if df_processed['isSuspiciousPath'].nunique() < 2:
        print("Warning: 'isSuspiciousPath' contains a single class; "
              "review the labelling rule before training.")

    # Encode the categorical features for the model's input. The fitted
    # encoders are kept so the same mapping can be reused later.
    encoders = {}
    for col in ['Country', 'Destination Country', 'TaxHavenCountry']:
        le = LabelEncoder()
        df_processed[col] = le.fit_transform(df_processed[col].astype(str))
        encoders[col] = le

    print("Data preprocessed and features engineered successfully.")
    display(df_processed.head())



Data preprocessed and features engineered successfully.


Unnamed: 0,Country,Destination Country,AmountUSD,TaxHavenCountry,RiskScore,isSuspiciousPath
0,0,9,3267530.0,4,6,1
1,1,5,4965767.0,0,9,1
2,8,6,94167.5,5,1,1
3,7,3,386420.1,3,7,1
4,5,9,643378.4,2,1,1


Create Transaction Sequences

In [6]:
def build_sequences(frame, sequence_length=10, label_col='isSuspiciousPath'):
    """Slice consecutive rows into overlapping windows with a next-row target.

    Window ``i`` holds rows ``i .. i + sequence_length - 1`` of every column
    except ``label_col``; its target is ``label_col`` of row
    ``i + sequence_length``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric frame that includes ``label_col``.
    sequence_length : int, default 10
        Number of rows per window.
    label_col : str, default 'isSuspiciousPath'
        Column used as the target and excluded from the features.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` of shape ``(n, sequence_length, n_features)`` and ``y`` of shape
        ``(n,)``, both float64, where ``n = max(len(frame) - sequence_length, 0)``.
    """
    feature_values = frame.drop(columns=[label_col]).to_numpy(dtype=float)
    label_values = frame[label_col].to_numpy(dtype=float)

    n_windows = len(frame) - sequence_length
    if n_windows <= 0:
        # Too few rows for even one window: return correctly shaped empties
        # so that later `X.shape[1]` / `X.shape[2]` lookups remain valid.
        return (
            np.empty((0, sequence_length, feature_values.shape[1])),
            np.empty((0,)),
        )

    # One strided view instead of a DataFrame slice + drop per window.
    # Result shape is (len - sequence_length + 1, n_features, sequence_length).
    windows = np.lib.stride_tricks.sliding_window_view(
        feature_values, sequence_length, axis=0
    )
    # The final window has no following row to take a target from, so it is
    # dropped; the window axis is then moved in front of the feature axis.
    window_array = np.ascontiguousarray(windows[:n_windows].transpose(0, 2, 1))
    targets = label_values[sequence_length:].copy()
    return window_array, targets


if 'df_processed' in locals():
    # No grouping key is applied here: the whole file is treated as one big
    # trace in row order. In a real scenario you'd group by a case ID or a
    # primary account before windowing.

    # We'll treat every 10 transactions as a potential sequence
    sequence_length = 10
    X, y = build_sequences(df_processed, sequence_length)

    print(f"Created {len(X)} sequences of length {sequence_length}.")
    print(f"Shape of X: {X.shape}")
    print(f"Shape of y: {y.shape}")


Created 9990 sequences of length 10.
Shape of X: (9990, 10, 5)
Shape of y: (9990,)


Scale Data (sequences are fixed-length, so no padding is applied)

In [7]:
if 'X' in locals():
    # Standardise every feature column of the windows (not only the amount).
    # The 3D array is flattened to 2D for the scaler, then restored to 3D
    # for the LSTM.
    #
    # NOTE: this cell overwrites X. Re-create X before running the cell a
    # second time, otherwise the scaler is refit on already-scaled values.

    # Keras takes validation samples from the END of the arrays, so the
    # statistics are fit on the leading training portion only; fitting on all
    # windows would leak validation data into preprocessing. Keep this equal
    # to the `validation_split` passed to `model.fit`.
    validation_fraction = 0.2
    n_train = max(1, int(len(X) * (1 - validation_fraction)))
    n_features = X.shape[-1]

    scaler = StandardScaler()
    scaler.fit(X[:n_train].reshape(-1, n_features))
    X = scaler.transform(X.reshape(-1, n_features)).reshape(X.shape)

    print("Data scaled successfully.")


Data scaled successfully.


Build the LSTM Model

In [8]:
if 'X' in locals():
    # Input dimensions are read from the prepared windows.
    timesteps, n_features = X.shape[1], X.shape[2]

    model = Sequential()
    # Masking is meant to skip padded timesteps (those equal to mask_value),
    # should any padding be introduced upstream.
    model.add(Masking(mask_value=0., input_shape=(timesteps, n_features)))
    # First recurrent layer returns the full sequence for the layer stacked on top.
    model.add(LSTM(64, return_sequences=True))
    model.add(LSTM(32))
    # Single sigmoid unit: the output is one probability between 0 and 1.
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking (Masking)           (None, 10, 5)             0         
                                                                 
 lstm (LSTM)                 (None, 10, 64)            17920     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 30,369
Trainable params: 30,369
Non-trainable params: 0
_________________________________________________________________


Train the Model

In [9]:
if 'model' in locals():
    # Training hyper-parameters gathered in one place for easy tuning.
    fit_options = dict(epochs=10, batch_size=64, validation_split=0.2)

    print("Training the LSTM trace model...")
    history = model.fit(X, y, **fit_options)
    print("Model training completed.")


Training the LSTM trace model...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model training completed.


Save the Model and Artifacts

In [10]:
if 'model' in locals():
    # Create the output folder first: neither `model.save` nor `joblib.dump`
    # creates missing directories, and failing here would discard the model
    # that was just trained.
    os.makedirs('../models', exist_ok=True)

    # Save the trained LSTM model
    model.save('../models/lstm_trace_model.h5')
    print("LSTM model saved successfully to ../models/lstm_trace_model.h5")

    # Save the scaler so the same standardisation can be reapplied later
    joblib.dump(scaler, '../models/trace_scaler.pkl')
    print("Scaler saved successfully to ../models/trace_scaler.pkl")

    # Save the label encoders (dict keyed by column name)
    joblib.dump(encoders, '../models/trace_encoders.pkl')
    print("Encoders saved successfully to ../models/trace_encoders.pkl")


LSTM model saved successfully to ../models/lstm_trace_model.h5
Scaler saved successfully to ../models/trace_scaler.pkl
Encoders saved successfully to ../models/trace_encoders.pkl
