In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    confusion_matrix, classification_report, 
    roc_auc_score, roc_curve, accuracy_score, mean_squared_error, r2_score
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
import xgboost as xgb

# Apply the 'seaborn-v0_8' matplotlib style sheet to every figure drawn after this point.
plt.style.use('seaborn-v0_8')


In [None]:
# Load the processed parquet samples (paths are relative to the working directory).
alerts = pd.read_parquet("data/processed/synthetic_alerts_sample.parquet")
transactions = pd.read_parquet("data/processed/synthetic_transactions_sample.parquet")

print(f"Alerts dataset shape: {alerts.shape}")
print(f"Transactions dataset shape: {transactions.shape}")

# Merge transactions and alerts by AlertID. The inner join keeps only rows
# whose AlertID is present in both tables.
merged = transactions.merge(alerts, on="AlertID", how="inner")
print(f"Merged dataset shape: {merged.shape}")

# Convert text labels to binary format (Report -> 1, Dismiss -> 0).
# Series.map turns every value that is missing from the mapping (including
# NaN) into NaN. Left unchecked, those NaNs would flow silently into the
# target column, so fail fast here with a message that names the offenders.
outcome_to_label = {'Report': 1, 'Dismiss': 0}
label_series = merged['Outcome'].map(outcome_to_label)
unmapped_mask = label_series.isna()
if unmapped_mask.any():
    unexpected_outcomes = merged.loc[unmapped_mask, 'Outcome'].unique().tolist()
    raise ValueError(
        f"{int(unmapped_mask.sum())} merged rows have an 'Outcome' outside "
        f"{list(outcome_to_label)}: {unexpected_outcomes}"
    )

# Every value mapped, so the cast is lossless; it pins an integer dtype even
# when the merged frame is empty.
merged['Label'] = label_series.astype('int64')


In [None]:
# Feature engineering example: derive extra numeric columns from 'Size'.
# Each entry maps a new column name to the element-wise transform applied to
# the raw transaction size (log1p = log(1 + x), sqrt = square root).
# NOTE(review): both transforms presumably expect non-negative sizes; negative
# values would produce NaN features -- TODO confirm against the data.
size_transforms = {'Size_log': np.log1p, 'Size_sqrt': np.sqrt}
for column_name, transform in size_transforms.items():
    merged[column_name] = transform(merged['Size'])

# Only these numeric columns are used for the demonstration; time-based or
# categorical features could be appended to this list later.
features = ['Size', *size_transforms]
X, y = merged[features], merged['Label']

print("Feature sample:")
X.head()


In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise features for the models that need it (the tree-based models
# below are trained on the unscaled frames). The scaler is fitted on the
# training partition only and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")


In [None]:
# Random Forest: 100 trees with a fixed seed, trained on the unscaled features.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard class predictions, plus the second predict_proba column (the score for
# the class listed second in rf_model.classes_) for the ROC analysis.
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, y_prob_rf))

# Confusion matrix heatmap: rows are true classes, columns are predictions.
cm_rf = confusion_matrix(y_test, y_pred_rf)
heatmap_ax = sns.heatmap(
    cm_rf,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=["Pred Dismiss", "Pred Report"],
    yticklabels=["True Dismiss", "True Report"],
)
heatmap_ax.set_title("Random Forest Confusion Matrix")
plt.show()


In [None]:
# XGBoost: 100 boosting rounds with log-loss as the evaluation metric,
# trained on the unscaled features.
# NOTE(review): use_label_encoder looks deprecated in recent xgboost
# releases -- confirm against the installed version before removing it.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Hard class predictions, plus the second predict_proba column used as the
# positive-class score for the ROC analysis.
y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]

print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("ROC AUC:", roc_auc_score(y_test, y_prob_xgb))

# Confusion matrix heatmap: rows are true classes, columns are predictions.
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
heatmap_ax = sns.heatmap(
    cm_xgb,
    annot=True,
    fmt='d',
    cmap='Greens',
    cbar=False,
    xticklabels=["Pred Dismiss", "Pred Report"],
    yticklabels=["True Dismiss", "True Report"],
)
heatmap_ax.set_title("XGBoost Confusion Matrix")
plt.show()


In [None]:
# Decision Tree: a single tree with a fixed seed, trained on the unscaled features.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions, plus the second predict_proba column used as the
# positive-class score for the ROC analysis.
y_pred_dt = dt_model.predict(X_test)
y_prob_dt = dt_model.predict_proba(X_test)[:, 1]

print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("ROC AUC:", roc_auc_score(y_test, y_prob_dt))

# Confusion matrix heatmap: rows are true classes, columns are predictions.
cm_dt = confusion_matrix(y_test, y_pred_dt)
heatmap_ax = sns.heatmap(
    cm_dt,
    annot=True,
    fmt='d',
    cmap='Oranges',
    cbar=False,
    xticklabels=["Pred Dismiss", "Pred Report"],
    yticklabels=["True Dismiss", "True Report"],
)
heatmap_ax.set_title("Decision Tree Confusion Matrix")
plt.show()


In [None]:
# Linear Regression fitted directly on the 0/1 label, using the standardised
# features. Its output is a real-valued score for the "Report" class; it is
# not constrained to [0, 1], so treat it as a ranking score rather than a
# true probability.
lr_model = LinearRegression().fit(X_train_scaled, y_train)

y_pred_lr = lr_model.predict(X_test_scaled)
# Threshold the continuous score at 0.5 to obtain hard 0/1 labels.
y_pred_lr_label = np.greater(y_pred_lr, 0.5).astype(int)

print("Linear Regression Evaluation:")
print(classification_report(y_test, y_pred_lr_label))
print("Accuracy:", accuracy_score(y_test, y_pred_lr_label))
# ROC AUC is computed from the raw scores, not the thresholded labels.
print("ROC AUC:", roc_auc_score(y_test, y_pred_lr))

# Confusion matrix heatmap: rows are true classes, columns are predictions.
cm_lr = confusion_matrix(y_test, y_pred_lr_label)
heatmap_ax = sns.heatmap(
    cm_lr,
    annot=True,
    fmt='d',
    cmap='Purples',
    cbar=False,
    xticklabels=["Pred Dismiss", "Pred Report"],
    yticklabels=["True Dismiss", "True Report"],
)
heatmap_ax.set_title("Linear Regression Confusion Matrix")
plt.show()


In [None]:
# Overlay the ROC curves of all four models on a single figure.
plt.figure(figsize=(7,6))

# False/true positive rates per model; the thresholds returned by roc_curve
# are not needed here.
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_prob_xgb)
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_prob_dt)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_lr)

# One (fpr, tpr, legend label) entry per model, drawn in this order.
roc_series = [
    (fpr_rf, tpr_rf, f"Random Forest (AUC={roc_auc_score(y_test, y_prob_rf):.2f})"),
    (fpr_xgb, tpr_xgb, f"XGBoost (AUC={roc_auc_score(y_test, y_prob_xgb):.2f})"),
    (fpr_dt, tpr_dt, f"Decision Tree (AUC={roc_auc_score(y_test, y_prob_dt):.2f})"),
    (fpr_lr, tpr_lr, f"Linear Regression (AUC={roc_auc_score(y_test, y_pred_lr):.2f})"),
]
for false_pos_rate, true_pos_rate, legend_label in roc_series:
    plt.plot(false_pos_rate, true_pos_rate, label=legend_label)

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0,1], [0,1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend()
plt.show()
