Cell 1 : Imports

In [1]:
from pathlib import Path

import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss


Cell 2 : Data

In [2]:
# Load the raw dataset; the "Class" column is the prediction target and every
# other column is used as a feature.
df = pd.read_csv("../data/raw/Creditcard_data.csv")

X = df.drop("Class", axis=1)
y = df["Class"]

# Hold out 20% of the rows for evaluation. `stratify=y` keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


Cell 3 : Models and Samplers

In [3]:
# Candidate classifiers, keyed by the label used in the results table.
# Every estimator that accepts a seed gets the same one so that
# Restart & Run All reproduces the reported metrics.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    # probability=True enables predict_proba; the probability estimates are
    # fitted on shuffled data, so the seed matters for ROC_AUC.
    "SVM": SVC(class_weight="balanced", probability=True, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
}

# Resampling strategies, keyed by the label used in the results table.
# `None` is the baseline: the model is trained on the data as-is.
samplers = {
    "No Sampling": None,
    "Random Under": RandomUnderSampler(random_state=42),
    "Random Over": RandomOverSampler(random_state=42),
    "SMOTE": SMOTE(random_state=42),
    "NearMiss": NearMiss(),
}


Cell 4 : Training Loop

In [4]:
# Fit and score every (sampler, model) combination on the same train/test split.
# The sampler is a step of an imblearn pipeline, so resampling happens as part
# of `fit` on the training data; the test set is scored untouched.
results = []

for s_name, sampler in samplers.items():
    for m_name, model in models.items():

        steps = []

        # Only these two models get standardized inputs.
        if m_name in ["LogisticRegression", "SVM"]:
            steps.append(("scaler", StandardScaler()))

        # Clone the templates so each pipeline fits its own unfitted copy
        # instead of refitting the shared objects held in `samplers`/`models`.
        if sampler is not None:
            steps.append(("sampler", clone(sampler)))

        steps.append(("model", clone(model)))

        pipe = ImbPipeline(steps)
        pipe.fit(X_train, y_train)

        y_pred = pipe.predict(X_test)

        # ROC AUC needs a continuous score. Prefer the positive-class
        # probability, then the decision function; hard labels are a last
        # resort because they collapse the ROC curve to a single point.
        if hasattr(pipe, "predict_proba"):
            y_score = pipe.predict_proba(X_test)[:, 1]
        elif hasattr(pipe, "decision_function"):
            y_score = pipe.decision_function(X_test)
        else:
            y_score = y_pred

        results.append({
            "Sampler": s_name,
            "Model": m_name,
            "Accuracy": accuracy_score(y_test, y_pred),
            # zero_division=0 reports F1 as 0 (without a warning) when the
            # model predicts no positives at all.
            "F1": f1_score(y_test, y_pred, zero_division=0),
            "ROC_AUC": roc_auc_score(y_test, y_score),
        })

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Sampler,Model,Accuracy,F1,ROC_AUC
0,No Sampling,LogisticRegression,0.987097,0.0,0.408497
1,No Sampling,SVM,0.948387,0.0,0.529412
2,No Sampling,RandomForest,0.987097,0.0,0.620915
3,No Sampling,GradientBoosting,0.987097,0.0,0.859477
4,Random Under,LogisticRegression,0.6,0.03125,0.539216
5,Random Under,SVM,0.522581,0.026316,0.535948
6,Random Under,RandomForest,0.63871,0.034483,0.71732
7,Random Under,GradientBoosting,0.374194,0.039604,0.650327
8,Random Over,LogisticRegression,0.909677,0.0,0.539216
9,Random Over,SVM,0.954839,0.0,0.531046


Cell 5 : Results

In [5]:
# Persist the full results table. Create the output directory first so the
# cell also works on a fresh checkout where "../results" does not exist yet.
results_path = Path("../results/final_results.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(results_path, index=False)

# Show the combinations ranked by accuracy, best first.
# NOTE(review): read Accuracy together with F1 and ROC_AUC; in the results
# table above, several high-accuracy rows have an F1 of 0.0.
results_df.sort_values("Accuracy", ascending=False)


Unnamed: 0,Sampler,Model,Accuracy,F1,ROC_AUC
15,SMOTE,GradientBoosting,0.993548,0.666667,0.552288
14,SMOTE,RandomForest,0.993548,0.666667,0.732026
11,Random Over,GradientBoosting,0.993548,0.666667,0.784314
10,Random Over,RandomForest,0.993548,0.666667,0.606209
2,No Sampling,RandomForest,0.987097,0.0,0.620915
3,No Sampling,GradientBoosting,0.987097,0.0,0.859477
0,No Sampling,LogisticRegression,0.987097,0.0,0.408497
13,SMOTE,SVM,0.96129,0.0,0.472222
9,Random Over,SVM,0.954839,0.0,0.531046
1,No Sampling,SVM,0.948387,0.0,0.529412
