In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE

# Read the raw credit-card transactions file into a DataFrame.
df_cc = pd.read_csv(filepath_or_buffer='../data/raw/creditcard.csv')

# Quick overview: frame dimensions, then the relative frequency of each
# value in the 'Class' column.
print(
    f"Dataset Shape: {df_cc.shape}",
    f"Class Distribution:\n{df_cc['Class'].value_counts(normalize=True)}",
    sep="\n",
)

Dataset Shape: (284807, 31)
Class Distribution:
Class
0    0.998273
1    0.001727
Name: proportion, dtype: float64


In [2]:
# Feature/Target Split
X = df_cc.drop('Class', axis=1)
y = df_cc['Class']

# Stratified Train-Test Split (BEFORE scaling and SMOTE to avoid data leakage)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling the only two non-transformed features.
# The scaler is fitted on the training rows only and then applied unchanged to
# the test rows, so no test-set statistics (mean / variance) influence the
# features the model is trained on. Both columns go through one scaler so the
# fitted object can be reused later to transform new data consistently.
scale_cols = ['Amount', 'Time']
scaler = StandardScaler()

# Work on explicit copies: df_cc and X stay untouched, which keeps this cell
# idempotent on re-run and avoids chained-assignment warnings from pandas.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

# Stratification should give both splits (almost) the same fraud rate.
print(f"Training Fraud Rate: {y_train.mean():.4f}")
print(f"Testing Fraud Rate: {y_test.mean():.4f}")

Training Fraud Rate: 0.0017
Testing Fraud Rate: 0.0017


In [3]:
# SMOTE is applied to the training split only; X_test / y_test are not resampled.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X=X_train, y=y_train)

# Report the size of the resampled training matrix.
print(f"Resampled Training Set: {X_train_res.shape}")

Resampled Training Set: (454902, 30)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, classification_report, confusion_matrix, precision_recall_curve

# --- Model setup -----------------------------------------------------------
# Tree depth is capped at 10; the intent is to keep the forest from simply
# memorizing the synthetic SMOTE samples.
rf_cc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)

# --- Fit on the SMOTE-resampled training data --------------------------------
print("Training Random Forest on Credit Card data... This may take a moment.")
rf_cc.fit(X_train_res, y_train_res)

# --- Score the original test split (it was never resampled) ------------------
y_pred_cc = rf_cc.predict(X_test)
y_probs_cc = rf_cc.predict_proba(X_test)[:, 1]  # second column of the probability matrix

# --- Evaluation --------------------------------------------------------------
print(
    "\n--- Credit Card Ensemble Evaluation ---",
    "Confusion Matrix:",
    confusion_matrix(y_test, y_pred_cc),
    sep="\n",
)

print(
    "\nClassification Report:",
    classification_report(y_test, y_pred_cc),
    sep="\n",
)

# --- Area under the precision-recall curve -----------------------------------
# Computed with the trapezoidal rule over the (recall, precision) points; the
# thresholds returned by precision_recall_curve are discarded.
precision, recall, _ = precision_recall_curve(y_test, y_probs_cc)
auc_pr_cc = auc(x=recall, y=precision)
print(f"\nEnsemble AUC-PR Score: {auc_pr_cc:.4f}")

Training Random Forest on Credit Card data... This may take a moment.
