In [2]:
import pandas as pd
import numpy as np
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Train, evaluate and persist a class-weighted Random Forest on the cleaned
# transactions table. Every intermediate (df, X, y, the four split parts, clf,
# y_pred, y_probs, roc_auc) is left at module level under its original name.

# Paths
# The project root can be redirected with the TRANSGUARD_ROOT environment
# variable so this cell is not tied to a single machine. When the variable is
# unset or empty the original absolute location is used, so the resolved
# paths are unchanged on the machine this notebook was written on.
PROJECT_ROOT = os.environ.get("TRANSGUARD_ROOT") or (
    "/Users/michealomotosho/Documents/EDUCATION DOCUMENTS/"
    "DATA SCIENCE SELF PROJECT/TransGuard-AI"
)
data_path = os.path.join(PROJECT_ROOT, "data", "Processed", "transactions_cleaned.csv")
model_path = os.path.join(PROJECT_ROOT, "models", "random_forest_model.pkl")

# Tunables gathered in one place; 42 seeds both the split and the forest.
TARGET_COL = 'Class'
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Load processed data
df = pd.read_csv(data_path)

# Fail early with a readable message instead of the less specific KeyError
# that df.drop would raise further down.
if TARGET_COL not in df.columns:
    raise KeyError(f"Target column {TARGET_COL!r} not found in {data_path}")

# Features & target
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Train-test split
# stratify=y keeps the class proportions the same in both partitions, which
# matters here because the positive class is very rare.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y
)

# Check class imbalance
print("Class distribution in training set:")
print(y_train.value_counts())

# Initialize Random Forest with class weights
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=RANDOM_STATE,
    class_weight='balanced',  # helps with imbalanced data
    n_jobs=-1                 # use every available core
)

# Train
clf.fit(X_train, y_train)

# Predict
# y_pred holds hard labels; y_probs holds the second predict_proba column,
# which is the score that roc_auc_score ranks on below.
y_pred = clf.predict(X_test)
y_probs = clf.predict_proba(X_test)[:, 1]

# Evaluation
print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# NOTE(review): with a positive class this rare, ROC-AUC can look optimistic;
# consider also reporting a precision-recall based metric -- TODO confirm.
roc_auc = roc_auc_score(y_test, y_probs)
print(f"\nROC-AUC Score: {roc_auc:.4f}")

# Save model
# Create the target directory first so the dump works on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(clf, model_path)
print(f"\n✅ Model saved to: {model_path}")


Class distribution in training set:
Class
0    227451
1       394
Name: count, dtype: int64

Classification Report:
              precision    recall  f1-score   support

           0     0.9995    0.9999    0.9997     56864
           1     0.9474    0.7347    0.8276        98

    accuracy                         0.9995     56962
   macro avg     0.9735    0.8673    0.9137     56962
weighted avg     0.9995    0.9995    0.9994     56962


Confusion Matrix:
[[56860     4]
 [   26    72]]

ROC-AUC Score: 0.9581

✅ Model saved to: /Users/michealomotosho/Documents/EDUCATION DOCUMENTS/DATA SCIENCE SELF PROJECT/TransGuard-AI/models/random_forest_model.pkl
