In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("Data/preprocessed_df.csv")
X = df.drop("Class", axis=1)
y = df["Class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

Feature Scaling

In [3]:
pt = PowerTransformer(copy=False)

columns = X_train.columns
X_train[columns] = pt.fit_transform(X_train)
X_test[columns] = pt.transform(X_test)

SMOTE - Synthetic Minority Over-sampling Technique

In [4]:
sm = SMOTE()

X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
print('Before sampling class distribution:-',Counter(y_train))
print('New class distribution:-',Counter(y_train_sm))

Before sampling class distribution:- Counter({0: 227448, 1: 397})
New class distribution:- Counter({0: 227448, 1: 227448})


In [5]:
def evaluate_model(model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, train_res=True):
    if train_res:
        y_train_pred = model.predict(X_train)
        tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()
        print("False Positive Rate: {}".format((fp / (fp + tn))))
        print("False Negative Rate: {}".format((fn / (tp + fn))))
        print("True Negative Rate (Specificity): {}".format((tn / (tn + fp))))
        print("True Positive Rate (Sensitivity/Recall): {}".format(metrics.recall_score(y_train, y_train_pred)))
        print("ROC-AUC Score: {}".format(metrics.roc_auc_score(y_train, y_train_pred)))
        print("Accuracy: {}".format(metrics.accuracy_score(y_train, y_train_pred)))
        print("Log-loss: {}".format(metrics.log_loss(y_train, y_train_pred)))
        print("F1 Score: {}".format(f1_score(y_train, y_train_pred)))
        print("F2 Score: {}".format(metrics.fbeta_score(y_train, y_train_pred, beta=2)))
        print(metrics.confusion_matrix(y_train, y_train_pred))
        print(classification_report(y_train, y_train_pred))
        
        print("------------------------------------------------------------------------------------")
    # Evaluation on Test Set
    y_test_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
    print("False Positive Rate: {}".format((fp / (fp + tn))))
    print("False Negative Rate: {}".format((fn / (tp + fn))))
    print("True Negative Rate (Specificity): {}".format((tn / (tn + fp))))
    print("True Positive Rate (Sensitivity/Recall): {}".format(metrics.recall_score(y_test, y_test_pred)))
    print("ROC-AUC Score: {}".format(metrics.roc_auc_score(y_test, y_test_pred)))
    print("Accuracy: {}".format(metrics.accuracy_score(y_test, y_test_pred)))
    print("Log-loss: {}".format(metrics.log_loss(y_test, y_test_pred)))
    print("F1 Score: {}".format(f1_score(y_test, y_test_pred)))
    print("F2 Score: {}".format(metrics.fbeta_score(y_test, y_test_pred, beta=2)))
    print(metrics.confusion_matrix(y_test, y_test_pred))
    print(classification_report(y_test, y_test_pred))

    return None

In [6]:
dt = DecisionTreeClassifier().fit(X_train_sm, y_train_sm)
evaluate_model(dt, train_res=False)

False Positive Rate: 0.0025498092039319816
False Negative Rate: 0.24210526315789474
True Negative Rate (Specificity): 0.997450190796068
True Positive Rate (Sensitivity/Recall): 0.7578947368421053
ROC-AUC Score: 0.8776724638190867
Accuracy: 0.9970506653558513
Log-loss: 0.10186844520860273
F1 Score: 0.46153846153846156
F2 Score: 0.6030150753768844
[[56722   145]
 [   23    72]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56867
           1       0.33      0.76      0.46        95

    accuracy                           1.00     56962
   macro avg       0.67      0.88      0.73     56962
weighted avg       1.00      1.00      1.00     56962



In [7]:
rf = RandomForestClassifier().fit(X_train_sm, y_train_sm)
evaluate_model(rf, train_res=False)

False Positive Rate: 0.0002110186927391985
False Negative Rate: 0.18947368421052632
True Negative Rate (Specificity): 0.9997889813072608
True Positive Rate (Sensitivity/Recall): 0.8105263157894737
ROC-AUC Score: 0.9051576485483672
Accuracy: 0.9994733330992591
Log-loss: 0.018190598767889813
F1 Score: 0.8369565217391305
F2 Score: 0.8208955223880596
[[56855    12]
 [   18    77]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56867
           1       0.87      0.81      0.84        95

    accuracy                           1.00     56962
   macro avg       0.93      0.91      0.92     56962
weighted avg       1.00      1.00      1.00     56962



In [8]:
ada = AdaBoostClassifier().fit(X_train_sm, y_train_sm)
evaluate_model(ada, train_res=False)

False Positive Rate: 0.02625424235496861
False Negative Rate: 0.09473684210526316
True Negative Rate (Specificity): 0.9737457576450314
True Positive Rate (Sensitivity/Recall): 0.9052631578947369
ROC-AUC Score: 0.9395044577698842
Accuracy: 0.9736315438362417
Log-loss: 0.9107551691323896
F1 Score: 0.10274790919952209
F2 Score: 0.2194997447677386
[[55374  1493]
 [    9    86]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     56867
           1       0.05      0.91      0.10        95

    accuracy                           0.97     56962
   macro avg       0.53      0.94      0.54     56962
weighted avg       1.00      0.97      0.99     56962



In [9]:
lr = LogisticRegression().fit(X_train_sm, y_train_sm)
evaluate_model(lr, train_res=False)

False Positive Rate: 0.029384352963933387
False Negative Rate: 0.07368421052631578
True Negative Rate (Specificity): 0.9706156470360666
True Positive Rate (Sensitivity/Recall): 0.9263157894736842
ROC-AUC Score: 0.9484657182548755
Accuracy: 0.9705417646852288
Log-loss: 1.0174748589931186
F1 Score: 0.09492988133764833
F2 Score: 0.20570359981299677
[[55196  1671]
 [    7    88]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     56867
           1       0.05      0.93      0.09        95

    accuracy                           0.97     56962
   macro avg       0.52      0.95      0.54     56962
weighted avg       1.00      0.97      0.98     56962

