In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek


In [3]:
df = pd.read_csv("Creditcard_data.csv")

X = df.drop("Class", axis=1)
y = df["Class"]


In [4]:
print(y.value_counts())


Class
0    763
1      9
Name: count, dtype: int64


In [5]:
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)


In [6]:
samples = {
    "Sample1": train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=1),
    "Sample2": train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=2),
    "Sample3": train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=3),
    "Sample4": train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=4),
    "Sample5": train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=5),
}


In [7]:
sampling_methods = {
    "Sampling1": RandomOverSampler(random_state=42),
    "Sampling2": RandomUnderSampler(random_state=42),
    "Sampling3": SMOTE(random_state=42),
    "Sampling4": SMOTETomek(random_state=42),
    "Sampling5": None   # No sampling (baseline)
}


In [8]:
models = {
    "M1": LogisticRegression(max_iter=1000),
    "M2": DecisionTreeClassifier(),
    "M3": RandomForestClassifier(),
    "M4": SVC(),
    "M5": GaussianNB()
}


In [9]:
results = pd.DataFrame(index=models.keys(), columns=sampling_methods.keys())

for s_name, (X_train, X_test, y_train, y_test) in samples.items():

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    for samp_name, sampler in sampling_methods.items():
        if sampler is not None:
            X_res, y_res = sampler.fit_resample(X_train, y_train)
        else:
            X_res, y_res = X_train, y_train

        for m_name, model in models.items():
            model.fit(X_res, y_res)
            preds = model.predict(X_test)
            acc = accuracy_score(y_test, preds)
            results.loc[m_name, samp_name] = round(acc * 100, 2)


In [10]:
print(results)


   Sampling1 Sampling2 Sampling3 Sampling4 Sampling5
M1     92.16     92.16     92.16     92.16     92.16
M2     97.39     98.69     98.37     98.04     98.37
M3     99.02     99.02     99.02     99.02     99.02
M4     98.04     98.04     98.04     98.04     98.04
M5     87.25     86.93     86.27     86.27      86.6


In [11]:
best_sampling = results.astype(float).idxmax(axis=1)
print(best_sampling)


M1    Sampling1
M2    Sampling2
M3    Sampling1
M4    Sampling1
M5    Sampling1
dtype: object
