In [5]:
pip install pandas numpy scikit-learn imblearn matplotlib plotly dash joblib

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting plotly
  Downloading plotly-6.1.2-py3-none-any.whl.metadata (6.9 kB)
Collecting dash
  Downloading dash-3.0.4-py3-none-any.whl.metadata (10 kB)
Collecting narwhals>=1.15.1 (from plotly)
  Downloading narwhals-1.42.1-py3-none-any.whl.metadata (11 kB)
Collecting retrying (from dash)
  Downloading retrying-1.4.0-py3-none-any.whl.metadata (7.5 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading plotly-6.1.2-py3-none-any.whl (16.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading dash-3.0.4-py3-none-any.whl (7.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading narwhals-1.42.1-py3-none-any.whl (359 kB)
Downloading retrying-1.4.0-py3-none-any.whl (11 kB)
Installing

In [4]:
!pip install hdbscan


Collecting hdbscan
  Downloading hdbscan-0.8.40-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Downloading hdbscan-0.8.40-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: hdbscan
Successfully installed hdbscan-0.8.40


1. Preprocessing Pipeline

This updates the preprocessing to exclude Is_laundering from scaling while keeping it as a feature.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib
import os

# Load data
df = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/data/raw/SAML-D.csv')

# Convert datetime
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Time'] = pd.to_datetime(df['Time'], format='%H:%M:%S', errors='coerce')

# Feature Engineering
sender_diversity = df.groupby('Sender_account')['Receiver_account'].nunique().rename('Recipient_diversity')
receiver_diversity = df.groupby('Receiver_account')['Sender_account'].nunique().rename('Sender_diversity')
df = df.merge(sender_diversity, on='Sender_account', how='left')
df = df.merge(receiver_diversity, on='Receiver_account', how='left')

daily_txn_count = df.groupby(['Sender_account', 'Date']).size().rename('Daily_frequency')
df = df.merge(daily_txn_count, on=['Sender_account', 'Date'], how='left')
avg_velocity = df.groupby('Sender_account')['Daily_frequency'].mean().rename('Avg_velocity')
df = df.merge(avg_velocity, on='Sender_account', how='left')

inflow = df.groupby('Receiver_account')['Amount'].sum().rename('Total_inflow')
outflow = df.groupby('Sender_account')['Amount'].sum().rename('Total_outflow')
df = df.merge(inflow, left_on='Sender_account', right_index=True, how='left')
df = df.merge(outflow, left_on='Receiver_account', right_index=True, how='left', suffixes=('_inflow', '_outflow'))
df['Inflow_Outflow_Ratio'] = df['Total_inflow'] / (df['Total_outflow'] + 1e-6)

df = df.sort_values(by=['Sender_account', 'Date', 'Time'])
df['Txn_sequence'] = df.groupby('Sender_account').cumcount() + 1
df['Rolling_avg_amt'] = df.groupby('Sender_account')['Amount'].rolling(window=3, min_periods=1).mean().reset_index(0, drop=True)

df['Hour'] = df['Time'].dt.hour
df['Minute'] = df['Time'].dt.minute
df['Weekday'] = df['Date'].dt.weekday
df['Day'] = df['Date'].dt.day
df['Month'] = df['Date'].dt.month
df = df.drop(columns=['Time', 'Laundering_type'])

# Downcast
df['Sender_account'] = df['Sender_account'].astype('int32')
df['Receiver_account'] = df['Receiver_account'].astype('int32')
df['Amount'] = df['Amount'].astype('float32')
df['Is_laundering'] = df['Is_laundering'].astype('int8')
new_features = ['Recipient_diversity', 'Sender_diversity', 'Daily_frequency', 
                'Avg_velocity', 'Total_inflow', 'Total_outflow', 
                'Inflow_Outflow_Ratio', 'Txn_sequence', 'Rolling_avg_amt']
for col in new_features:
    df[col] = df[col].astype('float32')

# Encode categoricals
categorical_cols = ['Payment_currency', 'Received_currency', 'Sender_bank_location', 
                   'Receiver_bank_location', 'Payment_type']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Scale numerical features
numerical_cols = ['Amount', 'Recipient_diversity', 'Sender_diversity', 'Daily_frequency', 
                  'Avg_velocity', 'Total_inflow', 'Total_outflow', 
                  'Inflow_Outflow_Ratio', 'Txn_sequence', 'Rolling_avg_amt', 
                  'Hour', 'Minute', 'Weekday', 'Day', 'Month', 'Is_laundering']
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
joblib.dump(scaler, '/home/students/Documents/AML CFT_dashboard_project/models/scaler.pkl')

# Save encoded columns
joblib.dump(df.columns.tolist(), '/home/students/Documents/AML CFT_dashboard_project/models/encoded_columns.pkl')

# Handle NaNs
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median(numeric_only=True))

# Save
output_dir = "/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed"
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir, "cleaned_data.csv"), index=False)
print("Cleaned data saved.")

# Verify
saved_df = pd.read_csv(os.path.join(output_dir, "cleaned_data.csv"))
print("Saved data shape:", saved_df.shape)
print("Saved data columns:", saved_df.columns.tolist())
print("Missing values:", saved_df.isnull().sum())

Cleaned data saved.
Saved data shape: (9504852, 83)
Saved data columns: ['Date', 'Sender_account', 'Receiver_account', 'Amount', 'Is_laundering', 'Recipient_diversity', 'Sender_diversity', 'Daily_frequency', 'Avg_velocity', 'Total_inflow', 'Total_outflow', 'Inflow_Outflow_Ratio', 'Txn_sequence', 'Rolling_avg_amt', 'Hour', 'Minute', 'Weekday', 'Day', 'Month', 'Payment_currency_Dirham', 'Payment_currency_Euro', 'Payment_currency_Indian rupee', 'Payment_currency_Mexican Peso', 'Payment_currency_Moroccan dirham', 'Payment_currency_Naira', 'Payment_currency_Pakistani rupee', 'Payment_currency_Swiss franc', 'Payment_currency_Turkish lira', 'Payment_currency_UK pounds', 'Payment_currency_US dollar', 'Payment_currency_Yen', 'Received_currency_Dirham', 'Received_currency_Euro', 'Received_currency_Indian rupee', 'Received_currency_Mexican Peso', 'Received_currency_Moroccan dirham', 'Received_currency_Naira', 'Received_currency_Pakistani rupee', 'Received_currency_Swiss franc', 'Received_currency

Unsupervised Models

In [5]:
 # HDBSCAN

import pandas as pd
import numpy as np
import hdbscan
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import os
import datetime
import logging
import json

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
log = logging.info

# Load cleaned data
data_path = '/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/cleaned_data.csv'
df = pd.read_csv(data_path)

# Sample data
sample_size = min(10000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)

# Features and target
y_true = df_sample['Is_laundering'].values
X = df_sample.drop(columns=['Is_laundering', 'Date'])

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit HDBSCAN
log(f" Fitting HDBSCAN on sample size: {sample_size}")
clusterer = hdbscan.HDBSCAN(min_cluster_size=100, prediction_data=True)
cluster_labels = clusterer.fit_predict(X_scaled)
anomaly_labels = (cluster_labels == -1).astype(int)

# Evaluation
report = classification_report(y_true, anomaly_labels, zero_division=0, output_dict=True)
conf_matrix = confusion_matrix(y_true, anomaly_labels)

log("HDBSCAN Classification Report:")
print(classification_report(y_true, anomaly_labels, zero_division=0))
log(f"Confusion Matrix:\n{conf_matrix}")

# Timestamped file names
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
base_path = '/home/students/Documents/AML CFT_dashboard_project/models'
model_path = os.path.join(base_path, f'hdbscan_model_{timestamp}.pkl')
pred_path = os.path.join(base_path, f'hdbscan_predictions_{timestamp}.csv')
conf_path = os.path.join(base_path, f'hdbscan_confusion_matrix_{timestamp}.csv')
json_path = os.path.join(base_path, f'hdbscan_report_{timestamp}.json')

# Save model
joblib.dump(clusterer, model_path)
log(f"Model saved: {model_path}")

# Save predictions
pred_df = pd.DataFrame({
    'True_Label': y_true,
    'HDBSCAN_Prediction': anomaly_labels,
    'Outlier_Score': clusterer.outlier_scores_
})
pred_df.to_csv(pred_path, index=False)
log(f"Predictions saved: {pred_path}")

# Save confusion matrix
pd.DataFrame(conf_matrix, index=['Actual 0', 'Actual 1'], columns=['Pred 0', 'Pred 1']).to_csv(conf_path)
log(f"Confusion matrix saved: {conf_path}")

# Save report as JSON (for dashboard or API)
with open(json_path, 'w') as f:
    json.dump(report, f, indent=4)
log(f"Classification report saved: {json_path}")


 Fitting HDBSCAN on sample size: 10000
HDBSCAN Classification Report:
Confusion Matrix:
[[8762 1226]
 [   6    6]]
Model saved: /home/students/Documents/AML CFT_dashboard_project/models/hdbscan_model_20250627_234910.pkl
Predictions saved: /home/students/Documents/AML CFT_dashboard_project/models/hdbscan_predictions_20250627_234910.csv
Confusion matrix saved: /home/students/Documents/AML CFT_dashboard_project/models/hdbscan_confusion_matrix_20250627_234910.csv
Classification report saved: /home/students/Documents/AML CFT_dashboard_project/models/hdbscan_report_20250627_234910.json


              precision    recall  f1-score   support

           0       1.00      0.88      0.93      9988
           1       0.00      0.50      0.01        12

    accuracy                           0.88     10000
   macro avg       0.50      0.69      0.47     10000
weighted avg       1.00      0.88      0.93     10000



In [10]:
## isolation forest 

import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
import joblib
import os

# Set random seed for reproducibility
np.random.seed(42)

# Load preprocessed data
df = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/cleaned_data.csv')

# Select features (include Is_laundering, exclude Date)
features = [col for col in df.columns if col not in ['Date']]
X = df[features]
y = df['Is_laundering']

# Load preprocessing artifacts
encoded_columns = joblib.load('/home/students/Documents/AML CFT_dashboard_project/models/encoded_columns.pkl')

# Align features with saved columns
X = X.reindex(columns=encoded_columns, fill_value=0)

print('Data shape:', X.shape)
print('Target shape:', y.shape)

# Sample 10% of data
X_sample = X.sample(frac=0.1, random_state=42)
y_sample = y[X_sample.index]

# Clear memory
del df, X, y

# Initialize Isolation Forest
iso_forest = IsolationForest(random_state=42)

# Hyperparameter grid
param_grid = {
    'n_estimators': [100, 200],
    'contamination': [0.001, 0.01]
}

# Fine-tune model
best_score = 0
best_params = None
best_model = None

for params in ParameterGrid(param_grid):
    model = IsolationForest(**params, random_state=42)
    model.fit(X_sample)
    y_pred = model.predict(X_sample)
    y_pred = np.where(y_pred == -1, 1, 0)  # Convert -1 (anomaly) to 1, 1 (normal) to 0
    report = classification_report(y_sample, y_pred, output_dict=True, zero_division=0)
    f1_score = report['1']['f1-score']
    if f1_score > best_score:
        best_score = f1_score
        best_params = params
        best_model = model

# Evaluate best model
y_pred = best_model.predict(X_sample)
y_pred = np.where(y_pred == -1, 1, 0)
print(f'Best Parameters: {best_params}')
print('Isolation Forest Results:')
print(classification_report(y_sample, y_pred, zero_division=0))

# Save model
output_dir = '/home/students/Documents/AML CFT_dashboard_project/models'
os.makedirs(output_dir, exist_ok=True)
joblib.dump(best_model, os.path.join(output_dir, 'isolation_forest.pkl'))
print(f'Model saved to: {os.path.join(output_dir, "isolation_forest.pkl")}')

# Clear memory
del X_sample, y_sample, best_model

Data shape: (9504852, 83)
Target shape: (9504852,)
Best Parameters: {'contamination': 0.01, 'n_estimators': 100}
Isolation Forest Results:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    949502
           1       0.01      0.11      0.02       983

    accuracy                           0.99    950485
   macro avg       0.51      0.55      0.51    950485
weighted avg       1.00      0.99      0.99    950485

Model saved to: /home/students/Documents/AML CFT_dashboard_project/models/isolation_forest.pkl


In [11]:
## One class SVM

import pandas as pd
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report
import joblib
import os
from sklearn.model_selection import ParameterGrid

# Set random seed for reproducibility
np.random.seed(42)

# Load preprocessed data
df = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/cleaned_data.csv')

# Select features (include Is_laundering, exclude Date)
features = [col for col in df.columns if col not in ['Date']]
X = df[features]
y = df['Is_laundering']

# Load preprocessing artifacts
encoded_columns = joblib.load('/home/students/Documents/AML CFT_dashboard_project/models/encoded_columns.pkl')

# Align features with saved columns
X = X.reindex(columns=encoded_columns, fill_value=0)

print('Data shape:', X.shape)
print('Target shape:', y.shape)

# Sample 1% of data
X_sample = X.sample(frac=0.01, random_state=42)
y_sample = y[X_sample.index]

# Clear memory
del df, X, y

# Initialize One-Class SVM
oc_svm = OneClassSVM(kernel='rbf')

# Hyperparameter grid
param_grid = {
    'nu': [0.001, 0.01],
    'gamma': ['scale', 'auto']
}

# Fine-tune model
best_score = 0
best_params = None
best_model = None

for params in ParameterGrid(param_grid):
    model = OneClassSVM(**params)
    model.fit(X_sample)
    y_pred = model.predict(X_sample)
    y_pred = np.where(y_pred == -1, 1, 0)  # Convert -1 (anomaly) to 1, 1 (normal) to 0
    report = classification_report(y_sample, y_pred, output_dict=True, zero_division=0)
    f1_score = report['1']['f1-score']
    if f1_score > best_score:
        best_score = f1_score
        best_params = params
        best_model = model

# Evaluate best model
y_pred = best_model.predict(X_sample)
y_pred = np.where(y_pred == -1, 1, 0)
print(f'Best Parameters: {best_params}')
print('One-Class SVM Results:')
print(classification_report(y_sample, y_pred, zero_division=0))

# Save model
output_dir = '/home/students/Documents/AML CFT_dashboard_project/models'
os.makedirs(output_dir, exist_ok=True)
joblib.dump(best_model, os.path.join(output_dir, 'one_class_svm.pkl'))
print(f'Model saved to: {os.path.join(output_dir, "one_class_svm.pkl")}')

# Clear memory
del X_sample, y_sample, best_model

Data shape: (9504852, 83)
Target shape: (9504852,)
Best Parameters: {'gamma': 'scale', 'nu': 0.01}
One-Class SVM Results:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     94958
           1       0.00      0.02      0.00        91

    accuracy                           0.99     95049
   macro avg       0.50      0.51      0.50     95049
weighted avg       1.00      0.99      0.99     95049

Model saved to: /home/students/Documents/AML CFT_dashboard_project/models/one_class_svm.pkl


In [14]:
# Local Outlier Factor (LOF)

import pandas as pd
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import classification_report
import joblib
import os
from sklearn.model_selection import ParameterGrid

# Set random seed for reproducibility
np.random.seed(42)

# Load preprocessed data
df = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/cleaned_data.csv')

# Select features (exclude Date)
features = [col for col in df.columns if col not in ['Date']]
X = df[features]
y = df['Is_laundering']

# Load preprocessing artifacts
encoded_columns = joblib.load('/home/students/Documents/AML CFT_dashboard_project/models/encoded_columns.pkl')

# Align features with saved columns
X = X.reindex(columns=encoded_columns, fill_value=0)

print('Data shape:', X.shape)
print('Target shape:', y.shape)

# Sample 1% of data due to LOF's computational intensity
X_sample = X.sample(frac=0.01, random_state=42)
y_sample = y[X_sample.index]

# Convert to NumPy array to avoid feature name warning
X_sample_np = X_sample.to_numpy()
feature_names = X_sample.columns  # Store for reference

# Clear memory
del df, X, y

# Initialize LOF with default parameters
lof = LocalOutlierFactor(novelty=True)  # novelty=True for dashboard predictions
default_params = {'n_neighbors': 20, 'contamination': 0.01}
best_model = LocalOutlierFactor(n_neighbors=20, contamination=0.01, novelty=True)
best_model.fit(X_sample_np)  # Default model to avoid NoneType
best_score = 0
best_params = default_params

# Hyperparameter grid
param_grid = {
    'n_neighbors': [20, 50],
    'contamination': [0.001, 0.01]
}

# Fine-tune model
for params in ParameterGrid(param_grid):
    try:
        model = LocalOutlierFactor(n_neighbors=params['n_neighbors'], contamination=params['contamination'], novelty=True)
        model.fit(X_sample_np)
        y_pred = model.predict(X_sample_np)
        y_pred = np.where(y_pred == -1, 1, 0)  # Convert -1 (outlier) to 1, 1 (inlier) to 0
        report = classification_report(y_sample, y_pred, output_dict=True, zero_division=0)
        f1_score = report['1']['f1-score']
        if f1_score > best_score:
            best_score = f1_score
            best_params = params
            best_model = model
    except Exception as e:
        print(f"Error with parameters {params}: {e}")
        continue

# Evaluate best model
y_pred = best_model.predict(X_sample_np)
y_pred = np.where(y_pred == -1, 1, 0)
print(f'Best Parameters: {best_params}')
print('Local Outlier Factor Results:')
print(classification_report(y_sample, y_pred, zero_division=0))

# Save model and feature names
output_dir = '/home/students/Documents/AML CFT_dashboard_project/models'
os.makedirs(output_dir, exist_ok=True)
joblib.dump(best_model, os.path.join(output_dir, 'local_outlier_factor.pkl'))
joblib.dump(feature_names, os.path.join(output_dir, 'lof_feature_names.pkl'))
print(f'Model saved to: {os.path.join(output_dir, "local_outlier_factor.pkl")}')
print(f'Feature names saved to: {os.path.join(output_dir, "lof_feature_names.pkl")}')

# Clear memory
del X_sample, X_sample_np, y_sample, best_model

Data shape: (9504852, 83)
Target shape: (9504852,)
Best Parameters: {'n_neighbors': 20, 'contamination': 0.01}
Local Outlier Factor Results:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     94958
           1       0.00      0.00      0.00        91

    accuracy                           0.99     95049
   macro avg       0.50      0.50      0.50     95049
weighted avg       1.00      0.99      0.99     95049

Model saved to: /home/students/Documents/AML CFT_dashboard_project/models/local_outlier_factor.pkl
Feature names saved to: /home/students/Documents/AML CFT_dashboard_project/models/lof_feature_names.pkl


In [15]:
# K-Means Clustering

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
import joblib
import os
from sklearn.model_selection import ParameterGrid

# Set random seed for reproducibility
np.random.seed(42)

# Load preprocessed data
df = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/cleaned_data.csv')

# Select features (include Is_laundering, exclude Date)
features = [col for col in df.columns if col not in ['Date']]
X = df[features]
y = df['Is_laundering']

# Load preprocessing artifacts
encoded_columns = joblib.load('/home/students/Documents/AML CFT_dashboard_project/models/encoded_columns.pkl')

# Align features with saved columns
X = X.reindex(columns=encoded_columns, fill_value=0)

print('Data shape:', X.shape)
print('Target shape:', y.shape)

# Sample 10% of data
X_sample = X.sample(frac=0.1, random_state=42)
y_sample = y[X_sample.index]

# Clear memory
del df, X, y

# Initialize K-Means
kmeans = KMeans(random_state=42)

# Hyperparameter grid
param_grid = {
    'n_clusters': [5, 10],
    'contamination': [0.001, 0.01]  # Threshold for flagging outliers
}

# Fine-tune model
best_score = 0
best_params = None
best_model = None
best_threshold = None

for params in ParameterGrid(param_grid):
    model = KMeans(n_clusters=params['n_clusters'], random_state=42)
    model.fit(X_sample)
    # Compute distances to nearest centroid
    distances = np.min(model.transform(X_sample), axis=1)
    # Determine threshold for outliers based on contamination
    threshold = np.percentile(distances, 100 * (1 - params['contamination']))
    y_pred = np.where(distances > threshold, 1, 0)  # Far from centroid = 1 (anomaly)
    report = classification_report(y_sample, y_pred, output_dict=True, zero_division=0)
    f1_score = report['1']['f1-score']
    if f1_score > best_score:
        best_score = f1_score
        best_params = params
        best_model = model
        best_threshold = threshold

# Evaluate best model
distances = np.min(best_model.transform(X_sample), axis=1)
y_pred = np.where(distances > best_threshold, 1, 0)
print(f'Best Parameters: {best_params}')
print(f'Best Threshold: {best_threshold}')
print('K-Means Clustering Results:')
print(classification_report(y_sample, y_pred, zero_division=0))

# Save model and threshold
output_dir = '/home/students/Documents/AML CFT_dashboard_project/models'
os.makedirs(output_dir, exist_ok=True)
joblib.dump(best_model, os.path.join(output_dir, 'kmeans_clustering.pkl'))
joblib.dump(best_threshold, os.path.join(output_dir, 'kmeans_threshold.pkl'))
print(f'Model saved to: {os.path.join(output_dir, "kmeans_clustering.pkl")}')
print(f'Threshold saved to: {os.path.join(output_dir, "kmeans_threshold.pkl")}')

# Clear memory
del X_sample, y_sample, best_model, distances

Data shape: (9504852, 83)
Target shape: (9504852,)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Best Parameters: {'contamination': 0.01, 'n_clusters': 10}
Best Threshold: 920900863.5641096
K-Means Clustering Results:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    949502
           1       0.00      0.01      0.00       983

    accuracy                           0.99    950485
   macro avg       0.50      0.50      0.50    950485
weighted avg       1.00      0.99      0.99    950485

Model saved to: /home/students/Documents/AML CFT_dashboard_project/models/kmeans_clustering.pkl
Threshold saved to: /home/students/Documents/AML CFT_dashboard_project/models/kmeans_threshold.pkl


Supervised


In [1]:
# supervised data preparation

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import joblib
import os
from sklearn.ensemble import RandomForestClassifier

# Set random seed for reproducibility
np.random.seed(42)

# Load preprocessed data
df = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/cleaned_data.csv')

# Select features (exclude Date, Is_laundering)
features = [col for col in df.columns if col not in ['Date', 'Is_laundering']]
X = df[features]
y = df['Is_laundering']

print('Data shape:', X.shape)
print('Target shape:', y.shape)

# Sample 10% of data
X_sample = X.sample(frac=0.1, random_state=42)
y_sample = y[X_sample.index]

# Apply SMOTE for class balance
smote = SMOTE(sampling_strategy=1.0, random_state=42)  # 1:1 balance
X_resampled, y_resampled = smote.fit_resample(X_sample, y_sample)

# Feature selection (train temporary Random Forest)
rf_temp = RandomForestClassifier(n_estimators=100, random_state=42)
rf_temp.fit(X_resampled, y_resampled)
importances = pd.Series(rf_temp.feature_importances_, index=X_resampled.columns)
top_features = importances.nlargest(20).index
X_resampled = X_resampled[top_features]
X_sample = X_sample[top_features]  # For consistency in later models

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42, stratify=y_resampled)

# Save train/test data
output_dir = '/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed'
os.makedirs(output_dir, exist_ok=True)
pd.DataFrame(X_train).to_csv(os.path.join(output_dir, 'X_train.csv'), index=False)
pd.DataFrame(X_test).to_csv(os.path.join(output_dir, 'X_test.csv'), index=False)
pd.Series(y_train).to_csv(os.path.join(output_dir, 'y_train.csv'), index=False)
pd.Series(y_test).to_csv(os.path.join(output_dir, 'y_test.csv'), index=False)

# Save top features for dashboard consistency
joblib.dump(top_features.tolist(), os.path.join(output_dir, 'top_features.pkl'))

print('Train shape:', X_train.shape)
print('Test shape:', X_test.shape)
print('Balanced class counts:', pd.Series(y_resampled).value_counts())
print('Top features:', top_features.tolist())

# Clear memory
del df, X, y, X_sample, y_sample, X_resampled, y_resampled, X_train, X_test, y_train, y_test, rf_temp

Data shape: (9504852, 81)
Target shape: (9504852,)
Train shape: (1329302, 20)
Test shape: (569702, 20)
Balanced class counts: Is_laundering
0    949502
1    949502
Name: count, dtype: int64
Top features: ['Total_outflow', 'Sender_diversity', 'Recipient_diversity', 'Inflow_Outflow_Ratio', 'Daily_frequency', 'Total_inflow', 'Avg_velocity', 'Payment_type_Cross-border', 'Txn_sequence', 'Payment_type_Cash Deposit', 'Payment_type_Cash Withdrawal', 'Weekday', 'Rolling_avg_amt', 'Amount', 'Day', 'Month', 'Received_currency_Euro', 'Receiver_bank_location_UK', 'Sender_account', 'Received_currency_Moroccan dirham']


In [2]:
# Random forest

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import joblib
import os

# Set random seed for reproducibility
np.random.seed(42)

# Load train/test data
X_train = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/X_train.csv')
X_test = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/X_test.csv')
y_train = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/y_train.csv')
y_test = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/y_test.csv')

# Convert to numpy arrays
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Load top features
top_features = joblib.load('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/top_features.pkl')
X_train = X_train[top_features]
X_test = X_test[top_features]

print('Train shape:', X_train.shape)
print('Test shape:', X_test.shape)

# Initialize Random Forest
rf = RandomForestClassifier(random_state=42)

# Hyperparameter distribution
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2]
}

# Fine-tune with RandomizedSearchCV
random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=12, scoring='f1', cv=3, n_jobs=-1, random_state=42, verbose=2)
random_search.fit(X_train, y_train)

# Evaluate best model
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test)
print(f'Best Parameters: {random_search.best_params_}')
print('Random Forest Results:')
print(classification_report(y_test, y_pred, zero_division=0))

# Create plots directory
plots_dir = '/home/students/Documents/AML CFT_dashboard_project/plots'
os.makedirs(plots_dir, exist_ok=True)

# Plot feature importance
importances = pd.Series(best_rf.feature_importances_, index=X_train.columns)
importances.sort_values(ascending=False).plot(kind='bar', title='Feature Importance')
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, 'rf_feature_importance.png'))
plt.close()

# Save model
output_dir = '/home/students/Documents/AML CFT_dashboard_project/models'
os.makedirs(output_dir, exist_ok=True)
joblib.dump(best_rf, os.path.join(output_dir, 'random_forest.pkl'))
print(f'Model saved to: {os.path.join(output_dir, "random_forest.pkl")}')

# Clear memory
del X_train, X_test, y_train, y_test, best_rf, random_search


Train shape: (1329302, 20)
Test shape: (569702, 20)
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=13.0min
[CV] END max_depth=30, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=13.1min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=13.2min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=13.3min
[CV] END max_depth=30, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.0min
[CV] END max_depth=30, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.3min
[CV] END max_depth=30, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.0min
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 5.6min
[CV] END max_depth=30, min_samples_leaf=2, min_samples_split=5, n_estimators=20

In [3]:
# Rogistic regression 

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import joblib
import os

# Set random seed for reproducibility
np.random.seed(42)

# Load train/test data
X_train = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/X_train.csv')
X_test = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/X_test.csv')
y_train = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/y_train.csv')
y_test = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/y_test.csv')

# Convert to numpy arrays
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Load top features
top_features = joblib.load('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/top_features.pkl')
X_train = X_train[top_features]
X_test = X_test[top_features]

print('Train shape:', X_train.shape)
print('Test shape:', X_test.shape)

# Initialize Logistic Regression
lr = LogisticRegression(random_state=42, max_iter=2000)

# Hyperparameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'liblinear', 'saga'],
    'penalty': ['l2']
}

# Fine-tune with RandomizedSearchCV
random_search = RandomizedSearchCV(lr, param_distributions=param_grid, n_iter=12, scoring='f1', cv=3, n_jobs=-1, random_state=42, verbose=2)
random_search.fit(X_train, y_train)

# Evaluate best model
best_lr = random_search.best_estimator_
y_pred = best_lr.predict(X_test)
print(f'Best Parameters: {random_search.best_params_}')
print('Logistic Regression Results:')
print(classification_report(y_test, y_pred, zero_division=0))

# Create plots directory
plots_dir = '/home/students/Documents/AML CFT_dashboard_project/plots'
os.makedirs(plots_dir, exist_ok=True)

# Plot coefficients
coefficients = pd.Series(best_lr.coef_[0], index=X_train.columns)
coefficients.sort_values(ascending=False).plot(kind='bar', title='Logistic Regression Coefficients')
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, 'lr_coefficients.png'))
plt.close()

# Save model
output_dir = '/home/students/Documents/AML CFT_dashboard_project/models'
os.makedirs(output_dir, exist_ok=True)
joblib.dump(best_lr, os.path.join(output_dir, 'logistic_regression.pkl'))
print(f'Model saved to: {os.path.join(output_dir, "logistic_regression.pkl")}')

# Clear memory
del X_train, X_test, y_train, y_test, best_lr, random_search


Train shape: (1329302, 20)
Test shape: (569702, 20)
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END ..................C=0.001, penalty=l2, solver=lbfgs; total time=   2.1s
[CV] END ..................C=0.001, penalty=l2, solver=lbfgs; total time=   2.5s
[CV] END ..................C=0.001, penalty=l2, solver=lbfgs; total time=   2.6s
[CV] END ..............C=0.001, penalty=l2, solver=liblinear; total time=   2.4s
[CV] END ..............C=0.001, penalty=l2, solver=liblinear; total time=   3.3s
[CV] END ..............C=0.001, penalty=l2, solver=liblinear; total time=   2.8s




[CV] END ....................C=0.01, penalty=l2, solver=saga; total time=28.4min
[CV] END .....................C=0.1, penalty=l2, solver=saga; total time=28.5min




[CV] END .....................C=0.1, penalty=l2, solver=saga; total time=28.5min
[CV] END ...................C=0.01, penalty=l2, solver=lbfgs; total time=   1.2s
[CV] END ...................C=0.01, penalty=l2, solver=lbfgs; total time=   1.2s
[CV] END ...................C=0.01, penalty=l2, solver=lbfgs; total time=   1.2s
[CV] END .................C=10, penalty=l2, solver=liblinear; total time=   1.7s
[CV] END .................C=10, penalty=l2, solver=liblinear; total time=   2.0s
[CV] END .................C=10, penalty=l2, solver=liblinear; total time=   1.7s
[CV] END ................C=100, penalty=l2, solver=liblinear; total time=   1.7s




[CV] END ................C=100, penalty=l2, solver=liblinear; total time=   2.0s
[CV] END .....................C=0.1, penalty=l2, solver=saga; total time=28.7min
[CV] END ....................C=100, penalty=l2, solver=lbfgs; total time=   1.3s
[CV] END ................C=100, penalty=l2, solver=liblinear; total time=   1.8s
[CV] END ....................C=100, penalty=l2, solver=lbfgs; total time=   1.3s
[CV] END ....................C=100, penalty=l2, solver=lbfgs; total time=   1.2s




[CV] END ....................C=0.01, penalty=l2, solver=saga; total time=27.9min




[CV] END ....................C=0.01, penalty=l2, solver=saga; total time=28.0min




[CV] END .......................C=1, penalty=l2, solver=saga; total time=27.8min




[CV] END .......................C=1, penalty=l2, solver=saga; total time=27.9min




[CV] END .......................C=1, penalty=l2, solver=saga; total time=37.7min
[CV] END ......................C=1, penalty=l2, solver=lbfgs; total time=   1.5s
[CV] END ......................C=1, penalty=l2, solver=lbfgs; total time=   1.5s
[CV] END ......................C=1, penalty=l2, solver=lbfgs; total time=   1.6s




[CV] END ...................C=0.001, penalty=l2, solver=saga; total time=37.8min




[CV] END ...................C=0.001, penalty=l2, solver=saga; total time=37.9min




[CV] END ...................C=0.001, penalty=l2, solver=saga; total time=38.1min




[CV] END .....................C=100, penalty=l2, solver=saga; total time=24.9min




[CV] END .....................C=100, penalty=l2, solver=saga; total time=24.7min




[CV] END .....................C=100, penalty=l2, solver=saga; total time=25.5min
Best Parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.001}
Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.51      0.57      0.54    284851
           1       0.51      0.45      0.48    284851

    accuracy                           0.51    569702
   macro avg       0.51      0.51      0.51    569702
weighted avg       0.51      0.51      0.51    569702

Model saved to: /home/students/Documents/AML CFT_dashboard_project/models/logistic_regression.pkl


In [1]:
# SVM with Kernel Variants

import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import joblib
import os

# Set random seed
np.random.seed(42)

# Load data
X_train = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/X_train.csv')
X_test = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/X_test.csv')
y_train = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/y_train.csv')
y_test = pd.read_csv('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/y_test.csv')

# Flatten target arrays
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Load top features
top_features = joblib.load('/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/top_features.pkl')
X_train = X_train[top_features]
X_test = X_test[top_features]

# Downsample
X_train = X_train.sample(n=100000, random_state=42)
y_train = y_train[X_train.index]
X_test = X_test.sample(n=50000, random_state=42)
y_test = y_test[X_test.index]

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize base SVM
svm = SVC(random_state=42)

# Expanded hyperparameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly']
}

# Use GridSearchCV to explore all combinations
grid_search = GridSearchCV(
    svm,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=2
)
grid_search.fit(X_train, y_train)

# Evaluate best model
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test)
print(f'Best Parameters: {grid_search.best_params_}')
print('SVM Results:')
print(classification_report(y_test, y_pred, zero_division=0))

# Save plot of coefficients (only works if linear)
if best_svm.kernel == 'linear':
    coefficients = pd.Series(best_svm.coef_[0], index=top_features)
    coefficients.sort_values(ascending=False).plot(kind='bar', title='SVM Linear Kernel Coefficients')
    plt.tight_layout()
    plots_dir = '/home/students/Documents/AML CFT_dashboard_project/plots'
    os.makedirs(plots_dir, exist_ok=True)
    plt.savefig(os.path.join(plots_dir, 'svm_coefficients.png'))
    plt.close()

# Save best model
output_dir = '/home/students/Documents/AML CFT_dashboard_project/models'
os.makedirs(output_dir, exist_ok=True)
joblib.dump(best_svm, os.path.join(output_dir, 'svm_classifier.pkl'))
print(f'Model saved to: {os.path.join(output_dir, "svm_classifier.pkl")}')

# Clean up
del X_train, X_test, y_train, y_test, best_svm, grid_search


Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV] END .............................C=0.001, kernel=linear; total time= 3.9min
[CV] END .............................C=0.001, kernel=linear; total time= 4.0min
[CV] END .............................C=0.001, kernel=linear; total time= 4.1min
[CV] END ................................C=0.001, kernel=rbf; total time=10.7min
[CV] END ................................C=0.001, kernel=rbf; total time= 9.9min
[CV] END ................................C=0.001, kernel=rbf; total time=10.0min
[CV] END ...............................C=0.001, kernel=poly; total time=10.1min
[CV] END ..............................C=0.01, kernel=linear; total time= 2.8min
[CV] END ..............................C=0.01, kernel=linear; total time= 2.7min
[CV] END ..............................C=0.01, kernel=linear; total time= 2.9min
[CV] END ...............................C=0.001, kernel=poly; total time= 9.7min
[CV] END .................................C=0.01

In [None]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.preprocessing import StandardScaler

# Set random seed for reproducibility
np.random.seed(42)

# Define Paths
data_path = "/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/cleaned_data.csv"
top_features_path = "/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed/top_features.pkl"
output_path = "/home/students/Documents/AML CFT_dashboard_project/app/sample_for_dashboard.csv"

# Load Cleaned Data
df = pd.read_csv(data_path)
print("✅ Loaded cleaned_data.csv:", df.shape)

# Load Top Features
top_features = joblib.load(top_features_path)
print("✅ Loaded top_features.pkl with", len(top_features), "features:", top_features)

# Select Columns (Date, top features, Is_laundering)
columns_to_keep = ['Date'] + top_features + ['Is_laundering'] if 'Is_laundering' in df.columns else ['Date'] + top_features
df = df[columns_to_keep]

# Verify Features
missing_cols = set(top_features) - set(df.columns)
if missing_cols:
    print(f"⚠️ Missing features in data: {missing_cols}")
    for col in missing_cols:
        df[col] = 0

# Ensure correct order
df = df[['Date'] + top_features + (['Is_laundering'] if 'Is_laundering' in df.columns else [])]

# Numerical columns for scaling (only those in top_features)
numerical_cols = [col for col in top_features if col in [
    'Amount', 'Recipient_diversity', 'Sender_diversity', 'Daily_frequency', 
    'Avg_velocity', 'Total_inflow', 'Total_outflow', 'Inflow_Outflow_Ratio', 
    'Txn_sequence', 'Rolling_avg_amt', 'Weekday', 'Day', 'Month'
]]
print("✅ Numerical columns for scaling:", numerical_cols)

# Fit a new scaler on the numerical columns in top_features
if numerical_cols:
    scaler = StandardScaler()
    scaler.fit(df[numerical_cols])
    df[numerical_cols] = scaler.transform(df[numerical_cols])
    # Save the new scaler for consistency
    joblib.dump(scaler, "/home/students/Documents/AML CFT_dashboard_project/models/scaler_sample.pkl")
    print("✅ Saved new scaler_sample.pkl")

# Handle any NaNs (should be none, but ensure robustness)
df = df.fillna(0)

# Sample 10,000 rows
sample_df = df.sample(n=10000, random_state=42)
print("✅ Sample shape:", sample_df.shape)

# Save to CSV (No Index)
sample_df.to_csv(output_path, index=False)
print(f"✅ Sample saved for dashboard upload: {output_path}")

# Verify
saved_df = pd.read_csv(output_path)
print("✅ Saved data columns:", saved_df.columns.tolist())
print("✅ Missing values:", saved_df.isnull().sum())