In [1]:
# To handle the data
import pandas as pd

# Sampling
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

# To meausure accuracy/recall
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import recall_score

In [2]:
credit_df = pd.read_csv('Creditcard_data.csv')

In [3]:
# Checking for unbalanced column - 'Class'
sum(credit_df['Class']==1)/(sum(credit_df['Class']==0)+sum(credit_df['Class']==1)) 


0.011658031088082901

In [4]:
def train_model(model_name , train_X , train_Y ):
    if model_name == 'LogisticRegression':
        model = LogisticRegression(max_iter=1000 , solver='newton-cg')
    elif model_name == 'RandomForestClassifier':
        model = RandomForestClassifier()
    elif model_name == 'SVC':
        model = SVC()
    elif model_name == 'KNeighborsClassifier':
        model = KNeighborsClassifier()
    elif model_name == 'XGBClassifier':
        model = xgb.XGBClassifier()
    else:
        raise ValueError(f"Unknown model: {model_name}")
    
    model.fit(train_X,train_Y)
    
    return model

def performance(measure , test_Y , predictions):
    if measure == 'recall':
        score = recall_score(test_Y, predictions)
    elif measure == 'accuracy':
        score = accuracy_score(test_Y, predictions)
    else:
        raise ValueError(f"Unknown model: {measure}")
    return score

In [5]:
# Creating columns for our resultant data frame
result_df = pd.DataFrame()
# For reproducibility
seed_value = 42 
# Defining a function to evaluate the accuracy of a model based on five different sampling techniques
def evaluate(models , measure):
    s1=[]
    s2=[]
    s3=[]
    s4=[]
    s5=[]
    #result_df = pd.DataFrame()
    for model_name in models:
        # A) Random Under-Sampling with imblearn

        rus_df = RandomUnderSampler(random_state=42, replacement=True)# fit predictor and target variable
        train_X, train_Y = rus_df.fit_resample(credit_df.drop(columns=['Class']) , credit_df['Class'])

        test_X = credit_df.drop(columns=['Class'])
        test_Y = credit_df['Class']
    
        model = train_model(model_name,train_X,train_Y)
        predictions = model.predict(test_X)
        s1.append(performance(measure , test_Y , predictions))

        # B) Random Over-Sampling with imblearn
        ros_df = RandomOverSampler(random_state=42)
        train_X, train_Y = ros_df.fit_resample(credit_df.drop(columns=['Class']) , credit_df['Class'])

        #test_X = credit_df.drop(columns=['Class'])
        #test_Y = credit_df['Class']
        
        model = train_model(model_name,train_X,train_Y)
        predictions = model.predict(test_X)
        s2.append(performance(measure , test_Y , predictions))

        # C) Under-Sampling using Tomek Links
        tl_df = RandomOverSampler(sampling_strategy='minority')
        train_X, train_Y = tl_df.fit_resample(credit_df.drop(columns=['Class']) , credit_df['Class'])

        #test_X = credit_df.drop(columns=['Class'])
        #test_Y = credit_df['Class']
        
        model = train_model(model_name,train_X,train_Y)
        predictions = model.predict(test_X)
        s3.append(performance(measure , test_Y , predictions))

        # D) Synthetic Minority Oversampling Technique (SMOTE)
        smote = SMOTE()
        train_X, train_Y = smote.fit_resample(credit_df.drop(columns=['Class']) , credit_df['Class'])

        #test_X = credit_df.drop(columns=['Class'])
        #test_Y = credit_df['Class']
        
        model = train_model(model_name,train_X,train_Y)
        predictions = model.predict(test_X)
        s4.append(performance(measure , test_Y , predictions))

        # E) NearMiss
        nm = NearMiss()
        train_X, train_Y = nm.fit_resample(credit_df.drop(columns=['Class']) , credit_df['Class'])

        #test_X = credit_df.drop(columns=['Class'])
        #test_Y = credit_df['Class']
        
        model = train_model(model_name,train_X,train_Y)
        predictions = model.predict(test_X)
        s5.append(performance(measure , test_Y , predictions))


    result_df = pd.DataFrame({'S1':s1,'S2':s2,'S3':s3,'S4':s4,'S5':s5} , index=models)
    return result_df


In [6]:
models = ['LogisticRegression' , 'RandomForestClassifier' , 'SVC' , 'KNeighborsClassifier' ,'XGBClassifier']
recall_result_df = evaluate(models , 'recall')
accuracy_result_df = evaluate(models , 'accuracy')
print(recall_result_df)
print(accuracy_result_df)
recall_result_df.to_csv('recall_result.csv')
accuracy_result_df.to_csv('accuracy_result.csv')

                              S1        S2        S3        S4        S5
LogisticRegression      0.888889  1.000000  1.000000  0.888889  0.888889
RandomForestClassifier  1.000000  1.000000  1.000000  1.000000  1.000000
SVC                     0.666667  0.666667  0.777778  0.666667  0.444444
KNeighborsClassifier    0.555556  1.000000  1.000000  1.000000  0.444444
XGBClassifier           1.000000  1.000000  1.000000  1.000000  1.000000
                              S1        S2        S3        S4        S5
LogisticRegression      0.461140  0.866580  0.865285  0.896373  0.217617
RandomForestClassifier  0.722798  1.000000  1.000000  1.000000  0.560881
SVC                     0.554404  0.748705  0.742228  0.642487  0.362694
KNeighborsClassifier    0.597150  0.980570  0.980570  0.806995  0.190415
XGBClassifier           0.693005  1.000000  1.000000  1.000000  0.098446
