In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE

# Location of the raw transactions file, relative to this notebook
DATA_PATH = "../data/synthetic_fraud_dataset.csv"

# Read the CSV into a DataFrame and preview the first rows as the cell output
df = pd.read_csv(DATA_PATH)
df.head()


In [None]:
# Features & labels
# Target: the "is_fraud" column.
y = df["is_fraud"]

# Predictors: every other column except the "transaction_id" column, with the
# remaining categorical columns one-hot encoded (first level of each dropped).
X = pd.get_dummies(
    df.drop(columns=["is_fraud", "transaction_id"]),
    drop_first=True,
)


In [None]:
# Hold out part of the data for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,      # 20% of the rows go to the test set
    stratify=y,         # keep the class ratio the same in both splits
    random_state=42,    # fixed seed so the split is repeatable
)


In [None]:
# Class balancing with SMOTE
# Only the training split is resampled; X_test / y_test are not touched here.
feature_names = X_train.columns

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Re-wrap the resampled features with the original column names
# (avoids feature-name warnings downstream)
X_train_resampled = pd.DataFrame(X_train_resampled, columns=feature_names)

# Class counts before and after resampling
class_counts_before = y_train.value_counts()
class_counts_after = y_train_resampled.value_counts()
print("Before SMOTE:", class_counts_before)
print("After SMOTE:", class_counts_after)



In [None]:
# Standardise features
# The scaler learns its mean/variance from the resampled training data only;
# the same fitted transform is then applied to both training and test features.
scaler = StandardScaler()
scaler.fit(X_train_resampled)

X_train_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Model 1 - Logistic Regression
# Fitted on the scaled, SMOTE-resampled training data.
logreg = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train_resampled)

# Hard class predictions for the scaled test set
y_pred_log = logreg.predict(X_test_scaled)

# Second column of predict_proba, used later as the score for AUC
test_proba_log = logreg.predict_proba(X_test_scaled)
y_prob_log = test_proba_log[:, 1]



In [None]:
# Logistic Regression - test-set metrics
print("LOGISTIC REGRESSION RESULTS")

# Per-class precision / recall / F1 from the hard predictions
log_report = classification_report(y_test, y_pred_log)
print(log_report)

# ROC AUC from the predicted probabilities
log_auc = roc_auc_score(y_test, y_prob_log)
print("AUC:", log_auc)



In [None]:
#MODEL 2: Random Forest
# A random forest draws random bootstrap samples and feature subsets, so
# without a seed its predictions (and every metric derived from them) can
# change from one run to the next. Seed it with the same value already used
# for the train/test split and for SMOTE so a fresh Restart & Run All
# reproduces the reported numbers.
rf = RandomForestClassifier(n_estimators=200, random_state=42)

# Fit on the unscaled SMOTE-resampled training features
rf.fit(X_train_resampled, y_train_resampled)

# Predict on the original (non-resampled) test set
y_pred_rf = rf.predict(X_test)



In [None]:
# Random Forest - test-set metrics
# Second column of predict_proba, used as the score for AUC
test_proba_rf = rf.predict_proba(X_test)
y_prob_rf = test_proba_rf[:, 1]

print("RANDOM FOREST RESULTS")

# Per-class precision / recall / F1 from the hard predictions
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

# ROC AUC from the predicted probabilities
rf_auc = roc_auc_score(y_test, y_prob_rf)
print("AUC:", rf_auc)



In [None]:
# Confusion matrix heatmap - Logistic Regression
# Rows of the matrix are the actual labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred_log)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)  # integer counts in each cell
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix – Logistic Regression")
plt.show()


In [None]:
# Confusion matrix heatmap - Random Forest
# Rows of the matrix are the actual labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred_rf)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)  # integer counts in each cell
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix – Random Forest")
plt.show()
