In [4]:
import pandas as pd

# load dataset
df = pd.read_csv("Creditcard_data.csv")
# show first 5 rows
df.head()
# check class distribution
df['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,763
1,9


In [6]:
# separate features and target
X = df.drop('Class', axis=1)
y = df['Class']

# check shapes
X.shape, y.shape

((772, 30), (772,))

In [7]:
from sklearn.model_selection import train_test_split

# split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# check shapes
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((540, 30), (232, 30), (540,), (232,))

In [8]:
from imblearn.over_sampling import SMOTE

# apply SMOTE on training data
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# check new class distribution
pd.Series(y_train_bal).value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,534
1,534


In [9]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek, SMOTEENN

# Sampling 1: Random Under Sampling
rus = RandomUnderSampler(random_state=42)
X_s1, y_s1 = rus.fit_resample(X_train, y_train)

# Sampling 2: Random Over Sampling
ros = RandomOverSampler(random_state=42)
X_s2, y_s2 = ros.fit_resample(X_train, y_train)

# Sampling 3: SMOTE
smote = SMOTE(random_state=42)
X_s3, y_s3 = smote.fit_resample(X_train, y_train)

# Sampling 4: SMOTE + Tomek Links
smt = SMOTETomek(random_state=42)
X_s4, y_s4 = smt.fit_resample(X_train, y_train)

# Sampling 5: SMOTE + ENN
sme = SMOTEENN(random_state=42)
X_s5, y_s5 = sme.fit_resample(X_train, y_train)

# check class counts
print("S1:", pd.Series(y_s1).value_counts())
print("S2:", pd.Series(y_s2).value_counts())
print("S3:", pd.Series(y_s3).value_counts())
print("S4:", pd.Series(y_s4).value_counts())
print("S5:", pd.Series(y_s5).value_counts())


S1: Class
0    6
1    6
Name: count, dtype: int64
S2: Class
0    534
1    534
Name: count, dtype: int64
S3: Class
0    534
1    534
Name: count, dtype: int64
S4: Class
0    518
1    518
Name: count, dtype: int64
S5: Class
1    418
0    346
Name: count, dtype: int64


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# define models
models = {
    "M1_LogisticRegression": LogisticRegression(max_iter=1000),
    "M2_DecisionTree": DecisionTreeClassifier(random_state=42),
    "M3_RandomForest": RandomForestClassifier(random_state=42),
    "M4_KNN": KNeighborsClassifier(),
    "M5_SVM": SVC()
}

models


{'M1_LogisticRegression': LogisticRegression(max_iter=1000),
 'M2_DecisionTree': DecisionTreeClassifier(random_state=42),
 'M3_RandomForest': RandomForestClassifier(random_state=42),
 'M4_KNN': KNeighborsClassifier(),
 'M5_SVM': SVC()}

In [11]:
from sklearn.metrics import accuracy_score

# store results
results = {}

# train and test on Sampling 1
for name, model in models.items():
    model.fit(X_s1, y_s1)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = {"Sampling1": acc}

results

{'M1_LogisticRegression': {'Sampling1': 0.5775862068965517},
 'M2_DecisionTree': {'Sampling1': 0.3879310344827586},
 'M3_RandomForest': {'Sampling1': 0.6681034482758621},
 'M4_KNN': {'Sampling1': 0.75},
 'M5_SVM': {'Sampling1': 0.7456896551724138}}

In [12]:
# train and test on Sampling 2
for name, model in models.items():
    model.fit(X_s2, y_s2)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name]["Sampling2"] = acc

results

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'M1_LogisticRegression': {'Sampling1': 0.5775862068965517,
  'Sampling2': 0.9181034482758621},
 'M2_DecisionTree': {'Sampling1': 0.3879310344827586,
  'Sampling2': 0.9698275862068966},
 'M3_RandomForest': {'Sampling1': 0.6681034482758621,
  'Sampling2': 0.9913793103448276},
 'M4_KNN': {'Sampling1': 0.75, 'Sampling2': 0.978448275862069},
 'M5_SVM': {'Sampling1': 0.7456896551724138, 'Sampling2': 0.875}}

In [13]:
# train and test on Sampling 3
for name, model in models.items():
    model.fit(X_s3, y_s3)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name]["Sampling3"] = acc

results


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'M1_LogisticRegression': {'Sampling1': 0.5775862068965517,
  'Sampling2': 0.9181034482758621,
  'Sampling3': 0.9353448275862069},
 'M2_DecisionTree': {'Sampling1': 0.3879310344827586,
  'Sampling2': 0.9698275862068966,
  'Sampling3': 0.9870689655172413},
 'M3_RandomForest': {'Sampling1': 0.6681034482758621,
  'Sampling2': 0.9913793103448276,
  'Sampling3': 0.9870689655172413},
 'M4_KNN': {'Sampling1': 0.75,
  'Sampling2': 0.978448275862069,
  'Sampling3': 0.7241379310344828},
 'M5_SVM': {'Sampling1': 0.7456896551724138,
  'Sampling2': 0.875,
  'Sampling3': 0.4353448275862069}}

In [14]:
# train and test on Sampling 4
for name, model in models.items():
    model.fit(X_s4, y_s4)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name]["Sampling4"] = acc

results

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'M1_LogisticRegression': {'Sampling1': 0.5775862068965517,
  'Sampling2': 0.9181034482758621,
  'Sampling3': 0.9353448275862069,
  'Sampling4': 0.9267241379310345},
 'M2_DecisionTree': {'Sampling1': 0.3879310344827586,
  'Sampling2': 0.9698275862068966,
  'Sampling3': 0.9870689655172413,
  'Sampling4': 0.978448275862069},
 'M3_RandomForest': {'Sampling1': 0.6681034482758621,
  'Sampling2': 0.9913793103448276,
  'Sampling3': 0.9870689655172413,
  'Sampling4': 0.9870689655172413},
 'M4_KNN': {'Sampling1': 0.75,
  'Sampling2': 0.978448275862069,
  'Sampling3': 0.7241379310344828,
  'Sampling4': 0.7370689655172413},
 'M5_SVM': {'Sampling1': 0.7456896551724138,
  'Sampling2': 0.875,
  'Sampling3': 0.4353448275862069,
  'Sampling4': 0.4353448275862069}}

In [15]:
# train and test on Sampling 5
for name, model in models.items():
    model.fit(X_s5, y_s5)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name]["Sampling5"] = acc

results


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'M1_LogisticRegression': {'Sampling1': 0.5775862068965517,
  'Sampling2': 0.9181034482758621,
  'Sampling3': 0.9353448275862069,
  'Sampling4': 0.9267241379310345,
  'Sampling5': 0.9353448275862069},
 'M2_DecisionTree': {'Sampling1': 0.3879310344827586,
  'Sampling2': 0.9698275862068966,
  'Sampling3': 0.9870689655172413,
  'Sampling4': 0.978448275862069,
  'Sampling5': 0.875},
 'M3_RandomForest': {'Sampling1': 0.6681034482758621,
  'Sampling2': 0.9913793103448276,
  'Sampling3': 0.9870689655172413,
  'Sampling4': 0.9870689655172413,
  'Sampling5': 0.9827586206896551},
 'M4_KNN': {'Sampling1': 0.75,
  'Sampling2': 0.978448275862069,
  'Sampling3': 0.7241379310344828,
  'Sampling4': 0.7370689655172413,
  'Sampling5': 0.6810344827586207},
 'M5_SVM': {'Sampling1': 0.7456896551724138,
  'Sampling2': 0.875,
  'Sampling3': 0.4353448275862069,
  'Sampling4': 0.4353448275862069,
  'Sampling5': 0.3620689655172414}}

In [16]:
# convert results to DataFrame
results_df = pd.DataFrame(results).T
results_df

Unnamed: 0,Sampling1,Sampling2,Sampling3,Sampling4,Sampling5
M1_LogisticRegression,0.577586,0.918103,0.935345,0.926724,0.935345
M2_DecisionTree,0.387931,0.969828,0.987069,0.978448,0.875
M3_RandomForest,0.668103,0.991379,0.987069,0.987069,0.982759
M4_KNN,0.75,0.978448,0.724138,0.737069,0.681034
M5_SVM,0.74569,0.875,0.435345,0.435345,0.362069


In [17]:
# find best sampling technique for each model
best_sampling = results_df.idxmax(axis=1)
best_sampling


Unnamed: 0,0
M1_LogisticRegression,Sampling3
M2_DecisionTree,Sampling3
M3_RandomForest,Sampling2
M4_KNN,Sampling2
M5_SVM,Sampling2
