## Fraud Detection Modeling

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# import mlflow
import sys

# Data preparation

In [2]:
# Load datasets
fraud_data = pd.read_csv('../data/fraud_data_processed.csv')
creditcard_data = pd.read_csv('../data/credit_data_processed.csv')

# Data Preparation
# Drop the 'device_id' and 'ip_address' columns as they contain string values
fraud_data = fraud_data.drop(columns=['device_id', 'ip_address'])

# Convert datetime columns to numerical features
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time']).astype(int) / 10**9
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time']).astype(int) / 10**9

# Feature and Target Separation
X_fraud = fraud_data.drop('class', axis=1)
y_fraud = fraud_data['class']

X_credit = creditcard_data.drop('Class', axis=1)
y_credit = creditcard_data['Class']

# Train-Test Split
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)
X_credit_train, X_credit_test, y_credit_train, y_credit_test = train_test_split(X_credit, y_credit, test_size=0.2, random_state=42)

# Model Selection
models = {
  'Logistic Regression': LogisticRegression(max_iter=1000),
  'Decision Tree': DecisionTreeClassifier(),
  'Random Forest': RandomForestClassifier(),
  'Gradient Boosting': GradientBoostingClassifier(),
  'Multi-Layer Perceptron': MLPClassifier(max_iter=500)
}

# Model Training and Evaluation

In [3]:
# Training and Evaluating Models

# import a function that performs training and evaluating from scripts folder
sys.path.append("../scripts")
from model_train_evaluate import train_and_evaluate

print("--- Fraud Data ---")
train_and_evaluate(models, X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test, "Fraud_Data")

--- Fraud Data ---
Logistic Regression on Fraud_Data: Accuracy=0.9057, Precision=1.0000, Recall=0.0000, F1=0.0000, ROC-AUC=0.5000
Decision Tree on Fraud_Data: Accuracy=0.9033, Precision=0.4891, Recall=0.5674, F1=0.5253, ROC-AUC=0.7528
Random Forest on Fraud_Data: Accuracy=0.9559, Precision=0.9884, Recall=0.5389, F1=0.6975, ROC-AUC=0.7691
Gradient Boosting on Fraud_Data: Accuracy=0.9557, Precision=0.9834, Recall=0.5393, F1=0.6966, ROC-AUC=0.7692
Multi-Layer Perceptron on Fraud_Data: Accuracy=0.6892, Precision=0.1161, Recall=0.3470, F1=0.1740, ROC-AUC=0.5359


In [4]:
print("--- Credit Card Data ---")
train_and_evaluate(models, X_credit_train, X_credit_test, y_credit_train, y_credit_test, "CreditCard_Data")

--- Credit Card Data ---
Logistic Regression on CreditCard_Data: Accuracy=0.9992, Precision=0.8889, Recall=0.5333, F1=0.6667, ROC-AUC=0.7666
Decision Tree on CreditCard_Data: Accuracy=0.9990, Precision=0.6882, Recall=0.7111, F1=0.6995, ROC-AUC=0.8553
Random Forest on CreditCard_Data: Accuracy=0.9995, Precision=0.9697, Recall=0.7111, F1=0.8205, ROC-AUC=0.8555
Gradient Boosting on CreditCard_Data: Accuracy=0.9993, Precision=0.8906, Recall=0.6333, F1=0.7403, ROC-AUC=0.8166
Multi-Layer Perceptron on CreditCard_Data: Accuracy=0.9995, Precision=0.9412, Recall=0.7111, F1=0.8101, ROC-AUC=0.8555
