 Load Preprocessed Data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the preprocessed credit card dataset into a DataFrame. The path is
# relative, so it is resolved against the notebook's working directory.
credit_df = pd.read_csv(filepath_or_buffer="../data/creditcard_ready.csv")
print("✅ Loaded credit card dataset.")


✅ Loaded credit card dataset.


Train-Test Split

In [2]:
# Separate the label column ('Class') from the remaining predictor columns.
y = credit_df['Class']
X = credit_df.drop('Class', axis=1)

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions approximately equal in both partitions, and the fixed seed
# makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the partition sizes and the mean of the label in each partition.
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print(f"Fraud ratio in train: {y_train.mean():.4f}, in test: {y_test.mean():.4f}")


Train shape: (226980, 30), Test shape: (56746, 30)
Fraud ratio in train: 0.0017, in test: 0.0017


Apply SMOTE

In [3]:
from imblearn.over_sampling import SMOTE

# Oversample using only the training partition; X_test / y_test are not
# passed to the sampler. The fixed seed makes the resampling repeatable.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X=X_train, y=y_train)

# Show the per-class row counts before and after resampling.
print("✅ SMOTE applied.")
print(f"Original train counts:\n{y_train.value_counts().to_dict()}")
print(f"Balanced train counts:\n{y_train_balanced.value_counts().to_dict()}")


✅ SMOTE applied.
Original train counts:
{0: 226602, 1: 378}
Balanced train counts:
{0: 226602, 1: 226602}
