In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from imblearn.over_sampling import ADASYN
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("Data/preprocessed_df.csv")
X = df.drop("Class", axis=1)
y = df["Class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

In [3]:
pt = PowerTransformer(copy=False)

columns = X_train.columns
X_train[columns] = pt.fit_transform(X_train)
X_test[columns] = pt.transform(X_test)

In [4]:
ad = ADASYN()

X_train_ad, y_train_ad = ad.fit_resample(X_train, y_train)
print('Before sampling class distribution:-',Counter(y_train))
print('New class distribution:-',Counter(y_train_ad))

Before sampling class distribution:- Counter({0: 227446, 1: 399})
New class distribution:- Counter({0: 227446, 1: 227439})


In [11]:
def evaluate_model(model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, train_res=True):
    if train_res:
        y_train_pred = model.predict(X_train)
        tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()

        print("True Negative Rate (Specificity): {}".format((tn / (tn + fp))))
        print("True Positive Rate (Sensitivity/Recall): {}".format(metrics.recall_score(y_train, y_train_pred)))
        print("ROC-AUC Score: {}".format(metrics.roc_auc_score(y_train, y_train_pred)))
        print("Average Precision: {}".format(metrics.average_precision_score(y_train, y_train_pred)))
        print("F1 Score: {}".format(f1_score(y_train, y_train_pred)))
        print("F2 Score: {}".format(metrics.fbeta_score(y_train, y_train_pred, beta=2)))
        print(metrics.confusion_matrix(y_train, y_train_pred))
        print(classification_report(y_train, y_train_pred))
        
        print("------------------------------------------------------------------------------------")
    # Evaluation on Test Set
    y_test_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()

    print("True Negative Rate (Specificity): {}".format((tn / (tn + fp))))
    print("True Positive Rate (Sensitivity/Recall): {}".format(metrics.recall_score(y_test, y_test_pred)))
    print("ROC-AUC Score: {}".format(metrics.roc_auc_score(y_test, y_test_pred)))
    print("Average Precision: {}".format(metrics.average_precision_score(y_test, y_test_pred)))
    print("F1 Score: {}".format(f1_score(y_test, y_test_pred)))
    print("F2 Score: {}".format(metrics.fbeta_score(y_test, y_test_pred, beta=2)))
    print(metrics.confusion_matrix(y_test, y_test_pred))
    print(classification_report(y_test, y_test_pred))

    return None

In [6]:
dt = DecisionTreeClassifier().fit(X_train_ad, y_train_ad)
evaluate_model(dt, train_res=False)

False Positive Rate: 0.0015650002637640893
False Negative Rate: 0.25806451612903225
True Negative Rate (Specificity): 0.998434999736236
True Positive Rate (Sensitivity/Recall): 0.7419354838709677
ROC-AUC Score: 0.8701852418036019
Accuracy: 0.9980162213405428
Average Precision: 0.32443113343892666
F1 Score: 0.5498007968127491
F2 Score: 0.6509433962264153
[[56780    89]
 [   24    69]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56869
           1       0.44      0.74      0.55        93

    accuracy                           1.00     56962
   macro avg       0.72      0.87      0.77     56962
weighted avg       1.00      1.00      1.00     56962



In [7]:
rf = RandomForestClassifier().fit(X_train_ad, y_train_ad)
evaluate_model(rf, train_res=False)

False Positive Rate: 0.00022859554414531643
False Negative Rate: 0.23655913978494625
True Negative Rate (Specificity): 0.9997714044558547
True Positive Rate (Sensitivity/Recall): 0.7634408602150538
ROC-AUC Score: 0.8816061323354542
Accuracy: 0.999385555282469
Average Precision: 0.6456755209089816
F1 Score: 0.8022598870056498
F2 Score: 0.7785087719298246
[[56856    13]
 [   22    71]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56869
           1       0.85      0.76      0.80        93

    accuracy                           1.00     56962
   macro avg       0.92      0.88      0.90     56962
weighted avg       1.00      1.00      1.00     56962



In [15]:
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [17]:
rfc=RandomForestClassifier(random_state=1)
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train_ad, y_train_ad)

In [None]:
CV_rfc.best_params_

In [9]:
ada = AdaBoostClassifier().fit(X_train_ad, y_train_ad)
evaluate_model(ada, train_res=False)

False Positive Rate: 0.06145703282983699
False Negative Rate: 0.0967741935483871
True Negative Rate (Specificity): 0.9385429671701631
True Positive Rate (Sensitivity/Recall): 0.9032258064516129
ROC-AUC Score: 0.920884386810888
Accuracy: 0.9384853059934694
Average Precision: 0.02135692930798014
F1 Score: 0.04575163398692811
F2 Score: 0.10630220197418375
[[53374  3495]
 [    9    84]]
              precision    recall  f1-score   support

           0       1.00      0.94      0.97     56869
           1       0.02      0.90      0.05        93

    accuracy                           0.94     56962
   macro avg       0.51      0.92      0.51     56962
weighted avg       1.00      0.94      0.97     56962



In [10]:
lr = LogisticRegression().fit(X_train_ad, y_train_ad)
evaluate_model(lr, train_res=False)

False Positive Rate: 0.09198333010955002
False Negative Rate: 0.07526881720430108
True Negative Rate (Specificity): 0.9080166698904499
True Positive Rate (Sensitivity/Recall): 0.9247311827956989
ROC-AUC Score: 0.9163739263430745
Accuracy: 0.9080439591306485
Average Precision: 0.015079985373904937
F1 Score: 0.03179297597042514
F2 Score: 0.0755844612409914
[[51638  5231]
 [    7    86]]
              precision    recall  f1-score   support

           0       1.00      0.91      0.95     56869
           1       0.02      0.92      0.03        93

    accuracy                           0.91     56962
   macro avg       0.51      0.92      0.49     56962
weighted avg       1.00      0.91      0.95     56962

