<a href="https://colab.research.google.com/github/Agamjot12/Sampling/blob/main/Sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
import pandas as pd
import random
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier ,AdaBoostClassifier,BaggingClassifier
from imblearn.under_sampling import ClusterCentroids,RandomUnderSampler ,EditedNearestNeighbours
from imblearn.over_sampling import ADASYN ,RandomOverSampler , SMOTE , BorderlineSMOTE

In [16]:
df = pd.read_csv('https://raw.githubusercontent.com/Agamjot12/Sampling/main/Creditcard_data.csv')

In [17]:
df.Class.value_counts()

0    763
1      9
Name: Class, dtype: int64

In [18]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [20]:
os = RandomOverSampler(sampling_strategy='minority')
x_over, y_over = os.fit_resample(x_train,y_train)

In [21]:
df_over = pd.concat([x_over,y_over],axis=1)

In [22]:
# Determine the required sample size for the minority class using Cochran's formula
p = 0.5   #p=0.5 as exactly half of our dataset lies in strata 0 and the rest in strata 1
z = 1.96 # z-score for 95% confidence level
m = 0.05 # margin of error
n1 = int(np.ceil((z**2 * 0.1 * (1-0.1)) / (m**2)))
n2 = int(np.ceil((z**2 * 0.06 * (1-0.06)) / (m**2)))
n3 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))
n4 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))
n5 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))

In [33]:
samplings = {
    'Sampling1': ADASYN(),
    'Sampling2': SMOTE(),
    'Sampling3': RandomUnderSampler(),
    'Sampling4': BorderlineSMOTE(),
    'Sampling5': ClusterCentroids()
    
}
models = {

    'M1': LogisticRegression(),
    'M2': AdaBoostClassifier(),
    'M3': RandomForestClassifier(),
    'M4': SVC(),
    'M5': BaggingClassifier()
    
}
samples = {}    
for name, sampler in samplings.items():
    x_resampled, y_resampled = sampler.fit_resample(x_train, y_train)
    samples[name] = (x_resampled, y_resampled)

In [None]:
# Evaluate each model on each sampling technique
results = []
for sampler_name, sampler in samplings.items():
    if sampler_name == 'Sampling1':
        n = n1
    elif sampler_name == 'Sampling2':
        n = n2
    elif sampler_name == 'Sampling3':
        n = n3
    elif sampler_name == 'Sampling4':
        n = n4
    else:
        n = n5
            
results = []
for model_name, model in models.items():
    model_results = []
    for name, (x_resampled, y_resampled) in samples.items():
        model.fit(x_resampled, y_resampled)
        y_pred = model.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        model_results.append(accuracy)
    results.append(model_results)

In [36]:
# Print the results in a table
columns = ['ADASYN', 'SMOTE', 'RandomUnderSampler', 'BorderlineSMOTE', 'ClusterCentroids']
df_results = pd.DataFrame(results, index=['LogisticRegression', 'AdaBoostClassifier', 'RandomForestClassifier', 'Support Vector Classifier', 'BaggingClassifier'], columns=columns)

formatted_results = df_results.applymap(lambda x: "{:.5f}".format(x))
display(formatted_results)

Unnamed: 0,ADASYN,SMOTE,RandomUnderSampler,BorderlineSMOTE,ClusterCentroids
LogisticRegression,0.89637,0.87565,0.6114,0.90674,0.76166
AdaBoostClassifier,0.98446,0.97927,0.81347,0.97927,0.41969
RandomForestClassifier,0.98446,0.98446,0.78238,0.98964,0.52332
Support Vector Classifier,0.66839,0.6943,0.68394,0.70984,0.50259
BaggingClassifier,0.98964,0.98446,0.84456,0.98446,0.62176
