In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import matplotlib.pyplot as plt
import seaborn as sns


In [6]:
# Load dataset
df = pd.read_csv('../data/creditcard.csv')

# Scale Time and Amount columns
scaler = StandardScaler()
df[['Time', 'Amount']] = scaler.fit_transform(df[['Time', 'Amount']])

# Separate features and labels
X = df.drop('Class', axis=1)
y = df['Class']

print("Original class distribution:")
print(y.value_counts())


Original class distribution:
Class
0    284315
1       492
Name: count, dtype: int64


In [7]:
# Balance the dataset using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("After SMOTE:")
print(pd.Series(y_resampled).value_counts())


After SMOTE:
Class
0    284315
1    284315
Name: count, dtype: int64


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)


In [9]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

best_model = None
best_score = 0
best_name = ""

for name, model in models.items():
    print(f"\n🔍 Training: {name}")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    score = model.score(X_test, y_test)

    print(f"✅ Accuracy: {score:.4f}")
    print(confusion_matrix(y_test, preds))
    print(classification_report(y_test, preds))

    if score > best_score:
        best_score = score
        best_model = model
        best_name = name

print(f"\n🏆 Best Model: {best_name} with Accuracy: {best_score:.4f}")



🔍 Training: Logistic Regression
✅ Accuracy: 0.9484
[[55446  1417]
 [ 4453 52410]]
              precision    recall  f1-score   support

           0       0.93      0.98      0.95     56863
           1       0.97      0.92      0.95     56863

    accuracy                           0.95    113726
   macro avg       0.95      0.95      0.95    113726
weighted avg       0.95      0.95      0.95    113726


🔍 Training: Random Forest
✅ Accuracy: 0.9999
[[56847    16]
 [    0 56863]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       1.00      1.00      1.00     56863

    accuracy                           1.00    113726
   macro avg       1.00      1.00      1.00    113726
weighted avg       1.00      1.00      1.00    113726


🔍 Training: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Accuracy: 0.9997
[[56828    35]
 [    0 56863]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       1.00      1.00      1.00     56863

    accuracy                           1.00    113726
   macro avg       1.00      1.00      1.00    113726
weighted avg       1.00      1.00      1.00    113726


🏆 Best Model: Random Forest with Accuracy: 0.9999


In [10]:
joblib.dump(best_model, '../models/fraud_detector.pkl')
print("Model saved to ../models/fraud_detector.pkl")


Model saved to ../models/fraud_detector.pkl
