In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from imblearn.over_sampling import RandomOverSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("Data/preprocessed_df.csv")
X = df.drop("Class", axis=1)
y = df["Class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

Feature Scaling

In [3]:
pt = PowerTransformer(copy=False)

columns = X_train.columns
X_train[columns] = pt.fit_transform(X_train)
X_test[columns] = pt.transform(X_test)

Random Oversampling

In [4]:
rus = RandomOverSampler()

X_train_os, y_train_os = rus.fit_resample(X_train, y_train)
print('Before sampling class distribution:-',Counter(y_train))
print('New class distribution:-',Counter(y_train_os))

Before sampling class distribution:- Counter({0: 227447, 1: 398})
New class distribution:- Counter({0: 227447, 1: 227447})


In [5]:
def evaluate_model(model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, train_res=True):
    if train_res:
        y_train_pred = model.predict(X_train)
        tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()
        print("False Positive Rate: {}".format((fp / (fp + tn))))
        print("False Negative Rate: {}".format((fn / (tp + fn))))
        print("True Negative Rate (Specificity): {}".format((tn / (tn + fp))))
        print("True Positive Rate (Sensitivity/Recall): {}".format(metrics.recall_score(y_train, y_train_pred)))
        print("ROC-AUC Score: {}".format(metrics.roc_auc_score(y_train, y_train_pred)))
        print("Accuracy: {}".format(metrics.accuracy_score(y_train, y_train_pred)))
        print("Log-loss: {}".format(metrics.log_loss(y_train, y_train_pred)))
        print("F1 Score: {}".format(f1_score(y_train, y_train_pred)))
        print("F2 Score: {}".format(metrics.fbeta_score(y_train, y_train_pred, beta=2)))
        print(metrics.confusion_matrix(y_train, y_train_pred))
        print(classification_report(y_train, y_train_pred))
        
        print("------------------------------------------------------------------------------------")
    # Evaluation on Test Set
    y_test_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
    print("False Positive Rate: {}".format((fp / (fp + tn))))
    print("False Negative Rate: {}".format((fn / (tp + fn))))
    print("True Negative Rate (Specificity): {}".format((tn / (tn + fp))))
    print("True Positive Rate (Sensitivity/Recall): {}".format(metrics.recall_score(y_test, y_test_pred)))
    print("ROC-AUC Score: {}".format(metrics.roc_auc_score(y_test, y_test_pred)))
    print("Accuracy: {}".format(metrics.accuracy_score(y_test, y_test_pred)))
    print("Log-loss: {}".format(metrics.log_loss(y_test, y_test_pred)))
    print("F1 Score: {}".format(f1_score(y_test, y_test_pred)))
    print("F2 Score: {}".format(metrics.fbeta_score(y_test, y_test_pred, beta=2)))
    print(metrics.confusion_matrix(y_test, y_test_pred))
    print(classification_report(y_test, y_test_pred))

    return None

In [6]:
dt = DecisionTreeClassifier().fit(X_train_os, y_train_os)
evaluate_model(dt, train_res=False)

False Positive Rate: 0.00036927621861152144
False Negative Rate: 0.26595744680851063
True Negative Rate (Specificity): 0.9996307237813885
True Positive Rate (Sensitivity/Recall): 0.7340425531914894
ROC-AUC Score: 0.866836638486439
Accuracy: 0.9991924440855307
Log-loss: 0.02789228794129388
F1 Score: 0.75
F2 Score: 0.740343347639485
[[56847    21]
 [   25    69]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56868
           1       0.77      0.73      0.75        94

    accuracy                           1.00     56962
   macro avg       0.88      0.87      0.87     56962
weighted avg       1.00      1.00      1.00     56962



In [7]:
rf = RandomForestClassifier().fit(X_train_os, y_train_os)
evaluate_model(rf, train_res=False)

False Positive Rate: 0.00012309207287050715
False Negative Rate: 0.22340425531914893
True Negative Rate (Specificity): 0.9998769079271295
True Positive Rate (Sensitivity/Recall): 0.776595744680851
ROC-AUC Score: 0.8882363263039903
Accuracy: 0.9995084442259752
Log-loss: 0.01697783322635384
F1 Score: 0.8390804597701149
F2 Score: 0.8004385964912281
[[56861     7]
 [   21    73]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56868
           1       0.91      0.78      0.84        94

    accuracy                           1.00     56962
   macro avg       0.96      0.89      0.92     56962
weighted avg       1.00      1.00      1.00     56962



In [8]:
ada = AdaBoostClassifier().fit(X_train_os, y_train_os)
evaluate_model(ada, train_res=False)

False Positive Rate: 0.01596680030948864
False Negative Rate: 0.10638297872340426
True Negative Rate (Specificity): 0.9840331996905114
True Positive Rate (Sensitivity/Recall): 0.8936170212765957
ROC-AUC Score: 0.9388251104835535
Accuracy: 0.9838839928373302
Log-loss: 0.5566399137143125
F1 Score: 0.15469613259668505
F2 Score: 0.3070175438596491
[[55960   908]
 [   10    84]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     56868
           1       0.08      0.89      0.15        94

    accuracy                           0.98     56962
   macro avg       0.54      0.94      0.57     56962
weighted avg       1.00      0.98      0.99     56962



In [9]:
lr = LogisticRegression().fit(X_train_os, y_train_os)
evaluate_model(lr, train_res=False)

False Positive Rate: 0.0236864317366533
False Negative Rate: 0.09574468085106383
True Negative Rate (Specificity): 0.9763135682633467
True Positive Rate (Sensitivity/Recall): 0.9042553191489362
ROC-AUC Score: 0.9402844437061415
Accuracy: 0.9761946560865138
Log-loss: 0.8222263587872162
F1 Score: 0.11140235910878113
F2 Score: 0.23506637168141595
[[55521  1347]
 [    9    85]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     56868
           1       0.06      0.90      0.11        94

    accuracy                           0.98     56962
   macro avg       0.53      0.94      0.55     56962
weighted avg       1.00      0.98      0.99     56962

