In [1]:
import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the dataset from "creditcard.csv"; the relative path is resolved
# against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer="creditcard.csv")


In [3]:
# Separate the label column from the features:
#   y -> the "Class" column
#   X -> every other column of df ("Class" removed)
y = df["Class"]
X = df.drop(columns=["Class"])


In [4]:
# Hold out 20% of the rows as the test split. Stratifying on y keeps the
# class ratio the same in both splits, and the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [5]:
# Show the relative class frequencies of the training labels, then of the
# test labels, one table per line block.
print(
    y_train.value_counts(normalize=True),
    y_test.value_counts(normalize=True),
    sep="\n",
)


Class
0    0.998271
1    0.001729
Name: proportion, dtype: float64
Class
0    0.99828
1    0.00172
Name: proportion, dtype: float64


In [6]:
# Standardise the "Time" and "Amount" columns. The scaler is fitted on the
# training rows only and then applied to the test rows, so no test-set
# statistics enter the fit.
#
# The raw values are read from `X` (not modified by any visible cell) rather
# than from X_train / X_test, which this cell overwrites. That keeps the cell
# idempotent: re-running it re-fits on the original values instead of on
# already-scaled data, so "scaler.pkl" is never overwritten with a
# near-identity scaler that would be wrong for raw inputs.
# NOTE(review): relies on `X` having a unique index (true for the default
# index pd.read_csv produces) -- confirm if the loading step ever changes.
scale_cols = ['Time', 'Amount']
scaler = StandardScaler()

# Work on explicit copies so the column assignments below cannot trigger a
# SettingWithCopyWarning on the frames returned by train_test_split.
X_train = X_train.copy()
X_test = X_test.copy()

# .loc with the split's own index returns the raw rows in the same order as
# the split, so the scaled array lines up row-for-row on assignment.
X_train[scale_cols] = scaler.fit_transform(X.loc[X_train.index, scale_cols])
X_test[scale_cols] = scaler.transform(X.loc[X_test.index, scale_cols])

# Write the fitted scaler to "scaler.pkl"; kept as the last expression so the
# cell displays the list of written file names.
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [7]:
# Oversample with SMOTE (seeded for reproducibility). The sampler is fitted
# on the training split only; the test split is not passed to it.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X=X_train, y=y_train)


In [8]:
# Absolute class counts of the training labels before and after resampling.
counts_before = y_train.value_counts()
counts_after = y_train_res.value_counts()

print("Before SMOTE:", counts_before)
print("After SMOTE:", counts_after)


Before SMOTE: Class
0    227451
1       394
Name: count, dtype: int64
After SMOTE: Class
0    227451
1    227451
Name: count, dtype: int64


In [9]:
# Re-attach the labels as a trailing column and write both splits to CSV
# without the index column.

# Resampled training split -> "train_res.csv"
train_res = pd.concat(objs=[X_train_res, y_train_res], axis="columns")
train_res.to_csv(path_or_buf="train_res.csv", index=False)

# Test split (not resampled) -> "test_set.csv"
test_set = pd.concat(objs=[X_test, y_test], axis="columns")
test_set.to_csv(path_or_buf="test_set.csv", index=False)
