# Train-Test Split

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Read the preprocessed dataset into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer='preprocessed_creditcard.csv')

In [3]:
# Pull the "Class" label column out of the frame; everything else is a feature
y = df["Class"]
X = df.drop(columns="Class")

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions (approximately) equal in both partitions, and the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# For each partition, print its shape plus the label sum and the percentage
# of rows that sum represents
for split_name, split_X, split_y in (
    ("Training set:", X_train, y_train),
    ("\nTest set:", X_test, y_test),
):
    print(split_name)
    print(f"  Shape: {split_X.shape}")
    print(f"  Fraud: {split_y.sum()} ({split_y.sum()/len(split_y)*100:.3f}%)")

Training set:
  Shape: (227845, 30)
  Fraud: 394 (0.173%)

Test set:
  Shape: (56962, 30)
  Fraud: 98 (0.172%)


# Class Imbalance Handling

### SMOTE (Synthetic Minority Over-sampling Technique)
1. Identify the minority class
2. Select a random data point from the minority class
3. Find its k nearest neighbors within the minority class
4. Generate a synthetic sample by interpolating between the original point and a randomly chosen neighbor

Applied only to the training set, after splitting, to avoid data leakage into the test set.

In [4]:
from imblearn.over_sampling import SMOTE

In [5]:
# Oversample the training split with SMOTE (5 nearest neighbours, fixed seed
# for repeatability). Only X_train / y_train are passed in, so the test split
# is not resampled.
smote = SMOTE(k_neighbors=5, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Report how many rows carry label 0 and label 1 after resampling
print(
    "SMOTE Resampled Training Data:",
    f"Legitimate: {(y_train_smote == 0).sum():,}",
    f"Fraud: {(y_train_smote == 1).sum():,}",
    sep="\n",
)



SMOTE Resampled Training Data:
Legitimate: 227,451
Fraud: 227,451


### Random Undersampling
Randomly remove instances from the majority class until its size matches that of the minority class.

In [6]:
from imblearn.under_sampling import RandomUnderSampler

In [7]:
# Randomly undersample the training split (fixed seed for repeatability).
# Only X_train / y_train are passed in, so the test split is not resampled.
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

# Report how many rows carry label 0 and label 1 after resampling
print(
    "Undersampled Training Data:",
    f"Legitimate: {(y_train_rus == 0).sum():,}",
    f"Fraud: {(y_train_rus == 1).sum():,}",
    sep="\n",
)

Undersampled Training Data:
Legitimate: 394
Fraud: 394




The undersampled training set is balanced but small: only 788 rows (394 per class) remain.