In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from imblearn.under_sampling import RandomUnderSampler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("preprocessed_df.csv")
X = df.drop(["Class"], axis=1)
y = df["Class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

Feature Scaling

In [3]:
pt = PowerTransformer(copy=False)

columns = X_train.columns
X_train[columns] = pt.fit_transform(X_train)
X_test[columns] = pt.transform(X_test)

Random Undersampling

In [4]:
rus = RandomUnderSampler()

X_train_us, y_train_us = rus.fit_resample(X_train, y_train)
print('Before sampling class distribution:-',Counter(y_train))
print('New class distribution:-',Counter(y_train_us))

Before sampling class distribution:- Counter({0: 227450, 1: 395})
New class distribution:- Counter({0: 395, 1: 395})


In [5]:
def evaluate_model(model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, train_res=True):
    if train_res:
        y_train_pred = model.predict(X_train)
        tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()
        print("False Positive Rate: {}".format((fp / (fp + tn))))
        print("False Negative Rate: {}".format((fn / (tp + fn))))
        print("True Negative Rate (Specificity): {}".format((tn / (tn + fp))))
        print("True Positive Rate (Sensitivity/Recall): {}".format(metrics.recall_score(y_train, y_train_pred)))
        print("ROC-AUC Score: {}".format(metrics.roc_auc_score(y_train, y_train_pred)))
        print("Accuracy: {}".format(metrics.accuracy_score(y_train, y_train_pred)))
        print("Log-loss: {}".format(metrics.log_loss(y_train, y_train_pred)))
        print("F1 Score: {}".format(f1_score(y_train, y_train_pred)))
        print("F2 Score: {}".format(metrics.fbeta_score(y_train, y_train_pred, beta=2)))
        print(metrics.confusion_matrix(y_train, y_train_pred))
        print(classification_report(y_train, y_train_pred))
        
        print("------------------------------------------------------------------------------------")
    # Evaluation on Test Set
    y_test_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
    print("False Positive Rate: {}".format((fp / (fp + tn))))
    print("False Negative Rate: {}".format((fn / (tp + fn))))
    print("True Negative Rate (Specificity): {}".format((tn / (tn + fp))))
    print("True Positive Rate (Sensitivity/Recall): {}".format(metrics.recall_score(y_test, y_test_pred)))
    print("ROC-AUC Score: {}".format(metrics.roc_auc_score(y_test, y_test_pred)))
    print("Accuracy: {}".format(metrics.accuracy_score(y_test, y_test_pred)))
    print("Log-loss: {}".format(metrics.log_loss(y_test, y_test_pred)))
    print("F1 Score: {}".format(f1_score(y_test, y_test_pred)))
    print("F2 Score: {}".format(metrics.fbeta_score(y_test, y_test_pred, beta=2)))
    print(metrics.confusion_matrix(y_test, y_test_pred))
    print(classification_report(y_test, y_test_pred))

    return None

In [6]:
dt = DecisionTreeClassifier().fit(X_train_us, y_train_us)
evaluate_model(dt, train_res=False)

False Positive Rate: 0.10939945484920426
False Negative Rate: 0.08247422680412371
True Negative Rate (Specificity): 0.8906005451507958
True Positive Rate (Sensitivity/Recall): 0.9175257731958762
ROC-AUC Score: 0.904063159173336
Accuracy: 0.8906463958428426
Log-loss: 3.777027008523435
F1 Score: 0.02778211331356329
F2 Score: 0.06643774260973424
[[50644  6221]
 [    8    89]]
              precision    recall  f1-score   support

           0       1.00      0.89      0.94     56865
           1       0.01      0.92      0.03        97

    accuracy                           0.89     56962
   macro avg       0.51      0.90      0.48     56962
weighted avg       1.00      0.89      0.94     56962



In [7]:
rf = RandomForestClassifier().fit(X_train_us, y_train_us)
evaluate_model(rf, train_res=False)

False Positive Rate: 0.02796096016882089
False Negative Rate: 0.09278350515463918
True Negative Rate (Specificity): 0.9720390398311791
True Positive Rate (Sensitivity/Recall): 0.9072164948453608
ROC-AUC Score: 0.9396277673382699
Accuracy: 0.971928654190513
Log-loss: 0.9695722554576075
F1 Score: 0.09915492957746479
F2 Score: 0.21297192642788
[[55275  1590]
 [    9    88]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     56865
           1       0.05      0.91      0.10        97

    accuracy                           0.97     56962
   macro avg       0.53      0.94      0.54     56962
weighted avg       1.00      0.97      0.98     56962



In [8]:
ada = AdaBoostClassifier().fit(X_train_us, y_train_us)
evaluate_model(ada, train_res=False)

False Positive Rate: 0.036894399015211465
False Negative Rate: 0.09278350515463918
True Negative Rate (Specificity): 0.9631056009847885
True Positive Rate (Sensitivity/Recall): 0.9072164948453608
ROC-AUC Score: 0.9351610479150746
Accuracy: 0.9630104280046347
Log-loss: 1.2776040065216354
F1 Score: 0.07709154621112571
F2 Score: 0.17094017094017094
[[54767  2098]
 [    9    88]]
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     56865
           1       0.04      0.91      0.08        97

    accuracy                           0.96     56962
   macro avg       0.52      0.94      0.53     56962
weighted avg       1.00      0.96      0.98     56962



In [9]:
 lr = LogisticRegression().fit(X_train_us, y_train_us)
 evaluate_model(lr, train_res=False)

False Positive Rate: 0.03253319264925701
False Negative Rate: 0.07216494845360824
True Negative Rate (Specificity): 0.967466807350743
True Positive Rate (Sensitivity/Recall): 0.9278350515463918
ROC-AUC Score: 0.9476509294485673
Accuracy: 0.9673993188441417
Log-loss: 1.12601360592316
F1 Score: 0.08836524300441827
F2 Score: 0.19329896907216496
[[55015  1850]
 [    7    90]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     56865
           1       0.05      0.93      0.09        97

    accuracy                           0.97     56962
   macro avg       0.52      0.95      0.54     56962
weighted avg       1.00      0.97      0.98     56962

