Cell 1: Imports & Split

In [1]:
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss

# Read the raw dataset from its path relative to the notebook.
df = pd.read_csv("../data/raw/Creditcard_data.csv")

# "Class" is the target column; every remaining column is used as a feature.
y = df["Class"]
X = df.drop(columns=["Class"])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


Cell 2: Apply Sampling

In [2]:
# Candidate resampling strategies, keyed by the label printed in the report.
# ``None`` marks the untouched training split, kept as the baseline row.
samplers = {
    "No Sampling": None,
    "Random Under": RandomUnderSampler(random_state=42),
    "Random Over": RandomOverSampler(random_state=42),
    "SMOTE": SMOTE(random_state=42),
    "NearMiss": NearMiss()
}

# Keep every resampled training set, keyed by strategy name, as
# (features, target) pairs. Without this the loop leaves only the last
# strategy's X_res / y_res behind, so a later cell reading those names would
# silently be working with the "NearMiss" data.
resampled = {}

for name, sampler in samplers.items():
    if sampler is None:
        # Baseline: the training split as-is, with no resampling applied.
        resampled[name] = (X_train, y_train)
        print(name, Counter(y_train))
    else:
        # Only the training split is resampled; the test split is not touched here.
        X_res, y_res = sampler.fit_resample(X_train, y_train)
        resampled[name] = (X_res, y_res)
        print(name, Counter(y_res))


No Sampling Counter({0: 610, 1: 7})
Random Under Counter({0: 7, 1: 7})
Random Over Counter({0: 610, 1: 610})
SMOTE Counter({0: 610, 1: 610})
NearMiss Counter({0: 7, 1: 7})
