In [2]:

# 1. IMPORT LIBRARIES

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import RandomOverSampler


# 2. LOAD DATASET

df = pd.read_csv("Creditcard_data.csv")

X = df.drop("Class", axis=1)
y = df["Class"]


# 3. BALANCE THE DATASET

ros = RandomOverSampler(random_state=42)
X_bal, y_bal = ros.fit_resample(X, y)

balanced_df = pd.concat([X_bal, y_bal], axis=1)


# 4. DEFINE SAMPLING TECHNIQUES


def simple_random_sampling(data, size=0.7):
    return data.sample(frac=size, random_state=42)

def systematic_sampling(data, step=2):
    return data.iloc[::step, :]

def stratified_sampling(data, size=0.7):
    X = data.drop("Class", axis=1)
    y = data["Class"]
    X_train, _, y_train, _ = train_test_split(
        X, y, train_size=size, stratify=y, random_state=42
    )
    return pd.concat([X_train, y_train], axis=1)

def cluster_sampling(data, n_clusters=10):
    data = data.copy()
    data["cluster"] = pd.qcut(data.index, n_clusters, labels=False)
    clusters = np.random.choice(data["cluster"].unique(), size=5, replace=False)
    sampled = data[data["cluster"].isin(clusters)]
    return sampled.drop("cluster", axis=1)

def bootstrap_sampling(data):
    return data.sample(frac=1, replace=True, random_state=42)

sampling_methods = {
    "Sampling1": simple_random_sampling,
    "Sampling2": systematic_sampling,
    "Sampling3": stratified_sampling,
    "Sampling4": cluster_sampling,
    "Sampling5": bootstrap_sampling
}

# 5. DEFINE ML MODELS

models = {
    "M1": LogisticRegression(max_iter=1000),
    "M2": DecisionTreeClassifier(),
    "M3": RandomForestClassifier(n_estimators=100),
    "M4": SVC(),
    "M5": KNeighborsClassifier()
}


# 6. RUN EXPERIMENTS

results = pd.DataFrame(index=models.keys(), columns=sampling_methods.keys())

for samp_name, samp_func in sampling_methods.items():
    sampled_data = samp_func(balanced_df)

    X_s = sampled_data.drop("Class", axis=1)
    y_s = sampled_data["Class"]

    X_train, X_test, y_train, y_test = train_test_split(
        X_s, y_s, test_size=0.3, random_state=42
    )

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        results.loc[model_name, samp_name] = round(acc * 100, 2)


# 7. DISPLAY RESULTS

print("\nAccuracy Table (%):\n")
print(results)


# 8. BEST SAMPLING PER MODEL

print("\nBest Sampling Technique per Model:\n")
for model in results.index:
    best_sampling = results.loc[model].astype(float).idxmax()
    best_acc = results.loc[model].astype(float).max()
    print(f"{model}: {best_sampling} ({best_acc}%)")



Accuracy Table (%):

   Sampling1 Sampling2 Sampling3 Sampling4 Sampling5
M1     90.97      91.7     91.28     84.28     93.67
M2     99.69     97.82     99.38     98.25     100.0
M3     100.0     100.0     100.0     100.0     100.0
M4     98.13     99.13      97.2     97.82     98.69
M5     98.13     96.07     96.88     97.38     98.25

Best Sampling Technique per Model:

M1: Sampling5 (93.67%)
M2: Sampling5 (100.0%)
M3: Sampling1 (100.0%)
M4: Sampling2 (99.13%)
M5: Sampling5 (98.25%)
