In [1]:
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

# Locations of the persisted models and of the processed dataset.
MODEL_PATH = "/home/students/Documents/AML CFT_dashboard_project/models"
DATA_PATH = "/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed"

# Column selection that is applied to X_test below.
TOP_FEATURES = joblib.load(f"{DATA_PATH}/top_features.pkl")

# Display name -> pickle file. Insertion order is the order in which the
# models are loaded here and evaluated later.
model_files = {
    "Random Forest": "random_forest.pkl",
    "Logistic Regression": "logistic_regression.pkl",
    "HDBSCAN": "hdbscan_model.pkl",
    "Isolation Forest": "isolation_forest.pkl",
}
models = {
    display_name: joblib.load(f"{MODEL_PATH}/{file_name}")
    for display_name, file_name in model_files.items()
}

# Test features, then the labels. The label file's first row is skipped and
# the remaining single column (position 0) is parsed as int.
X_test = pd.read_csv(f"{DATA_PATH}/X_test.csv")
y_frame = pd.read_csv(f"{DATA_PATH}/y_test.csv", header=None, skiprows=1, dtype={0: int})
y_test = y_frame[0]

# Keep only the selected feature columns, then cut both objects down to the
# shorter length when they disagree.
# NOTE(review): trimming equalises the lengths only; it does not check that
# the remaining rows of X_test and y_test actually correspond.
X_test = X_test[TOP_FEATURES]
min_length = min(len(X_test), len(y_test))
if len(X_test) != len(y_test):
    X_test = X_test.iloc[:min_length]
    y_test = y_test.iloc[:min_length]
    print(f"Trimmed data to match lengths: {min_length}")

# Make predictions with each model, store them as 0/1 flags in `predictions`
# (keyed by display name), and print accuracy plus the label distribution.
predictions = {}
for model_name, model in models.items():
    if model_name == "HDBSCAN":
        # The loaded clusterer is fitted again on the test rows; rows that
        # receive label -1 are flagged as 1, everything else as 0.
        labels = model.fit_predict(X_test)
        predictions[model_name] = (labels == -1).astype(int)
    elif model_name == "Isolation Forest":
        # The dashboard variant of this model takes only these two columns.
        # The pickle at this path is overwritten further down in this notebook
        # by a model fitted on every TOP_FEATURES column, so choose the input
        # by the number of features the loaded model reports. A model without
        # `n_features_in_` is treated as the two-column variant.
        if_cols = ['Amount', 'Recipient_diversity']
        n_expected = getattr(model, "n_features_in_", len(if_cols))
        X_if = X_test[if_cols].values if n_expected == len(if_cols) else X_test
        # predict() output -1 is mapped to 1, anything else to 0.
        predictions[model_name] = np.where(model.predict(X_if) == -1, 1, 0)
    else:
        predictions[model_name] = model.predict(X_test)

    # Accuracy is only meaningful when there is one prediction per label.
    if len(y_test) == len(predictions[model_name]):
        acc = accuracy_score(y_test, predictions[model_name])
        print(f"{model_name} Accuracy: {acc:.4f}")

    # Print unique predictions to check for uniformity
    unique_preds = np.unique(predictions[model_name])
    print(f"{model_name} Unique Predictions: {unique_preds}")
    print(f"{model_name} Prediction Distribution: {pd.Series(predictions[model_name]).value_counts()}\n")

Random Forest Accuracy: 1.0000
Random Forest Unique Predictions: [0 1]
Random Forest Prediction Distribution: 0    284862
1    284840
Name: count, dtype: int64

Logistic Regression Accuracy: 0.5120
Logistic Regression Unique Predictions: [0 1]
Logistic Regression Prediction Distribution: 0    318963
1    250739
Name: count, dtype: int64

HDBSCAN Accuracy: 0.5315
HDBSCAN Unique Predictions: [0 1]
HDBSCAN Prediction Distribution: 0    440996
1    128706
Name: count, dtype: int64

Isolation Forest Accuracy: 0.5000
Isolation Forest Unique Predictions: [0]
Isolation Forest Prediction Distribution: 0    569702
Name: count, dtype: int64



In [2]:
from sklearn.ensemble import IsolationForest
import joblib

# Refit the Isolation Forest on every TOP_FEATURES column and overwrite the
# saved model, so it can be scored on the same feature matrix as the other
# models. Uses pd, DATA_PATH, MODEL_PATH and TOP_FEATURES from the first cell;
# the paths built here are the same files the evaluation cells read.
X_train = pd.read_csv(f"{DATA_PATH}/X_train.csv")

# contamination=0.1 sets the share of samples the model treats as outliers;
# random_state pins the tree construction so the refit is repeatable.
iso_forest = IsolationForest(contamination=0.1, random_state=42)
iso_forest.fit(X_train[TOP_FEATURES])

# Last expression of the cell: joblib.dump returns the list of written files.
joblib.dump(iso_forest, f"{MODEL_PATH}/isolation_forest.pkl")

['/home/students/Documents/AML CFT_dashboard_project/models/isolation_forest.pkl']

In [3]:
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

# Locations of the persisted models and of the processed dataset.
MODEL_PATH = "/home/students/Documents/AML CFT_dashboard_project/models"
DATA_PATH = "/home/students/Documents/AML CFT_dashboard_project/notebooks/data/processed"

# Column selection that is applied to X_test below.
TOP_FEATURES = joblib.load(f"{DATA_PATH}/top_features.pkl")

# Display name -> pickle file. Reloading here picks up whatever is currently
# stored at each path; insertion order is the later evaluation order.
model_files = {
    "Random Forest": "random_forest.pkl",
    "Logistic Regression": "logistic_regression.pkl",
    "HDBSCAN": "hdbscan_model.pkl",
    "Isolation Forest": "isolation_forest.pkl",
}
models = {
    display_name: joblib.load(f"{MODEL_PATH}/{file_name}")
    for display_name, file_name in model_files.items()
}

# Test features, then the labels. The label file's first row is skipped and
# the remaining single column (position 0) is parsed as int.
X_test = pd.read_csv(f"{DATA_PATH}/X_test.csv")
y_frame = pd.read_csv(f"{DATA_PATH}/y_test.csv", header=None, skiprows=1, dtype={0: int})
y_test = y_frame[0]

# Keep only the selected feature columns, then cut both objects down to the
# shorter length when they disagree.
# NOTE(review): trimming equalises the lengths only; it does not check that
# the remaining rows of X_test and y_test actually correspond.
X_test = X_test[TOP_FEATURES]
min_length = min(len(X_test), len(y_test))
if len(X_test) != len(y_test):
    X_test = X_test.iloc[:min_length]
    y_test = y_test.iloc[:min_length]
    print(f"Trimmed data to match lengths: {min_length}")

# Make predictions with each model, store them in `predictions` (keyed by
# display name), and print accuracy plus the label distribution.
# NOTE(review): the recorded run reports 1.0000 accuracy for Random Forest;
# worth confirming the test rows did not leak into training.
predictions = {}
for model_name, model in models.items():
    if model_name == "HDBSCAN":
        # The loaded clusterer is fitted again on the test rows; rows that
        # receive label -1 are flagged as 1, everything else as 0.
        labels = model.fit_predict(X_test)
        predictions[model_name] = (labels == -1).astype(int)
    else:
        # Predict exactly once per model on the full feature matrix.
        raw_preds = model.predict(X_test)
        if model_name == "Isolation Forest":
            # Retrained on TOP_FEATURES, so it takes X_test as-is; its -1
            # output is mapped to 1 and anything else to 0.
            raw_preds = np.where(raw_preds == -1, 1, 0)
        predictions[model_name] = raw_preds

    # Accuracy is only meaningful when there is one prediction per label.
    if len(y_test) == len(predictions[model_name]):
        acc = accuracy_score(y_test, predictions[model_name])
        print(f"{model_name} Accuracy: {acc:.4f}")

    # Print unique predictions to check for uniformity
    unique_preds = np.unique(predictions[model_name])
    print(f"{model_name} Unique Predictions: {unique_preds}")
    print(f"{model_name} Prediction Distribution: {pd.Series(predictions[model_name]).value_counts()}\n")

Random Forest Accuracy: 1.0000
Random Forest Unique Predictions: [0 1]
Random Forest Prediction Distribution: 0    284862
1    284840
Name: count, dtype: int64

Logistic Regression Accuracy: 0.5120
Logistic Regression Unique Predictions: [0 1]
Logistic Regression Prediction Distribution: 0    318963
1    250739
Name: count, dtype: int64

HDBSCAN Accuracy: 0.5315
HDBSCAN Unique Predictions: [0 1]
HDBSCAN Prediction Distribution: 0    440996
1    128706
Name: count, dtype: int64

Isolation Forest Accuracy: 0.5381
Isolation Forest Unique Predictions: [0 1]
Isolation Forest Prediction Distribution: 0    512590
1     57112
Name: count, dtype: int64



In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Read the raw transaction export.
data = pd.read_csv("/home/students/Documents/AML CFT_dashboard_project/data/raw/SAML-D.csv")

# Parse the date and time text (unparseable values become NaT), then order the
# rows per sender by date and time. The intent stated for this cell is to
# reproduce the column layout of cleaned_data.csv.
data['Date'] = pd.to_datetime(data['Date'], errors='coerce')
data['Time'] = pd.to_datetime(data['Time'], format='%H:%M:%S', errors='coerce')
data = data.sort_values(by=['Sender_account', 'Date', 'Time'])

# Running amount totals per receiver and per sender, accumulated in the row
# order established above.
# NOTE(review): that order is sender-first, so the per-receiver running total
# is not accumulated chronologically - confirm this is intended.
running_inflow = data.groupby('Receiver_account')['Amount'].cumsum()
running_outflow = data.groupby('Sender_account')['Amount'].cumsum()
data['Total_inflow'] = running_inflow
data['Total_outflow'] = running_outflow

# The small constant keeps the division defined when the outflow total is 0.
data['Inflow_Outflow_Ratio'] = running_inflow / (running_outflow + 1e-6)

# Rolling behavioural features per account.
def rolling_nunique(x, window=100):
    """Count distinct non-missing values in a trailing window over ``x``.

    Parameters
    ----------
    x : pandas.Series
        Numeric series, evaluated in its current row order.
    window : int, default 100
        Number of trailing rows per window; shorter windows at the start are
        still evaluated (``min_periods=1``).

    Returns
    -------
    pandas.Series
        Float series on the same index as ``x`` holding the distinct count.
    """
    def _count_distinct(values):
        # `values` is the raw float window; drop NaN so the count matches
        # Series.nunique(), which ignores missing values.
        return np.unique(values[~np.isnan(values)]).size

    # raw=True hands each window over as an ndarray instead of building a
    # Series per window, which matters on a transaction-sized frame.
    return x.rolling(window=window, min_periods=1).apply(_count_distinct, raw=True)


# transform() returns a result aligned to data's own index, so the assignment
# does not depend on how groupby.apply lays out group keys in the result.
data['Recipient_diversity'] = data.groupby('Sender_account')['Receiver_account'].transform(rolling_nunique)
data['Sender_diversity'] = data.groupby('Receiver_account')['Sender_account'].transform(rolling_nunique)

# Number of rows sharing the same sender and date, broadcast back to each row.
# A single column is selected so the result is unambiguously a Series.
data['Daily_frequency'] = data.groupby(['Sender_account', 'Date'])['Amount'].transform('size')

# Mean of Daily_frequency over each sender's last 7 rows (rows, not days).
data['Avg_velocity'] = data.groupby('Sender_account')['Daily_frequency'].transform(
    lambda x: x.rolling(window=7, min_periods=1).mean()
)

# 1-based position of each row within its sender's rows.
data['Txn_sequence'] = data.groupby('Sender_account').cumcount() + 1

# Mean amount over each sender's last 3 rows; dropping the group-key level
# restores the original row index so the assignment aligns.
data['Rolling_avg_amt'] = data.groupby('Sender_account')['Amount'].rolling(
    window=3, min_periods=1).mean().reset_index(0, drop=True)

# Break the parsed timestamps into numeric calendar/time parts.
data['Hour'] = data['Time'].dt.hour
data['Minute'] = data['Time'].dt.minute
data['Weekday'] = data['Date'].dt.weekday
data['Day'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month

# NOTE(review): 'Date' itself is not dropped here and stays in the frame -
# confirm that matches the intended feature set.
data = data.drop(columns=['Time', 'Laundering_type'])

# Store account ids compactly, but only narrow to int32 when every id fits:
# int32 tops out at 2,147,483,647 and larger ids would not survive the cast.
int32_range = np.iinfo('int32')
for account_col in ('Sender_account', 'Receiver_account'):
    fits_int32 = data[account_col].between(int32_range.min, int32_range.max).all()
    data[account_col] = data[account_col].astype('int32' if fits_int32 else 'int64')

data['Amount'] = data['Amount'].astype('float32')
if 'Is_laundering' in data.columns:
    data['Is_laundering'] = data['Is_laundering'].astype('int8')

# Engineered numeric features are stored as float32.
float_cols = ['Recipient_diversity', 'Sender_diversity', 'Daily_frequency', 'Avg_velocity',
              'Total_inflow', 'Total_outflow', 'Inflow_Outflow_Ratio', 'Txn_sequence', 'Rolling_avg_amt']
data = data.astype({col: 'float32' for col in float_cols})

# One-hot encode the categorical columns; drop_first removes one level per
# column so the indicators are not redundant.
data = pd.get_dummies(data, columns=['Payment_currency', 'Received_currency', 'Sender_bank_location', 'Receiver_bank_location', 'Payment_type'], drop_first=True)

# The label column is the target; every other column is a feature.
y = data['Is_laundering']
X = data.drop('Is_laundering', axis=1)

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

# Write each part to its own CSV, with a header row and without the index.
# NOTE(review): this directory (.../data/processed) is not the one the
# evaluation cells read from (.../notebooks/data/processed) - confirm which
# location is intended.
split_outputs = (
    (X_train, "/home/students/Documents/AML CFT_dashboard_project/data/processed/X_train.csv"),
    (X_test, "/home/students/Documents/AML CFT_dashboard_project/data/processed/X_test.csv"),
    (y_train, "/home/students/Documents/AML CFT_dashboard_project/data/processed/y_train.csv"),
    (y_test, "/home/students/Documents/AML CFT_dashboard_project/data/processed/y_test.csv"),
)
for split_part, csv_path in split_outputs:
    split_part.to_csv(csv_path, index=False, header=True)