In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

# Load the engineered dataset
df = pd.read_csv('engineered_transactions.csv')

# Sample a subset of the data for faster processing (adjust the sample size as needed)
df_sampled = df.sample(frac=0.1, random_state=42)  # Sample 10% of the data

# Define features and target
X = df_sampled[['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'amount_percentage', 'balance_diff', 'type']]
y = df_sampled['isFraud']

# One-hot encode 'type' column (since it's categorical)
X = pd.get_dummies(X, columns=['type'], drop_first=True)

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features (important for many models like Logistic Regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the models with default settings
logreg = LogisticRegression(max_iter=1000, random_state=42)  # Increased max_iter to ensure convergence
rf = RandomForestClassifier(random_state=42, n_jobs=-1)  # Use all CPUs for Random Forest

# Train the models (using a smaller sample will speed this up)
logreg.fit(X_train_scaled, y_train)
rf.fit(X_train_scaled, y_train)

# Make predictions
y_pred_logreg = logreg.predict(X_test_scaled)
y_pred_rf = rf.predict(X_test_scaled)

# Evaluate the models
print("Logistic Regression Evaluation:")
print(classification_report(y_test, y_pred_logreg))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logreg))

print("\nRandom Forest Evaluation:")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

# Accuracy score
print("\nAccuracy (Logistic Regression):", accuracy_score(y_test, y_pred_logreg))
print("Accuracy (Random Forest):", accuracy_score(y_test, y_pred_rf))

# You can omit the ROC curve for now to speed up the process




Logistic Regression Evaluation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    127079
           1       0.84      0.40      0.54       174

    accuracy                           1.00    127253
   macro avg       0.92      0.70      0.77    127253
weighted avg       1.00      1.00      1.00    127253

Confusion Matrix:
[[127066     13]
 [   105     69]]

Random Forest Evaluation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    127079
           1       0.97      0.73      0.83       174

    accuracy                           1.00    127253
   macro avg       0.98      0.86      0.92    127253
weighted avg       1.00      1.00      1.00    127253

Confusion Matrix:
[[127075      4]
 [    47    127]]

Accuracy (Logistic Regression): 0.9990727134134362
Accuracy (Random Forest): 0.9995992235939428
