In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from imblearn.over_sampling import SVMSMOTE

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [27]:
df = pd.read_csv("Data/preprocessed_df.csv")
X = df.drop("Class", axis=1)
y = df["Class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

Feature Scaling

In [28]:
pt = PowerTransformer(copy=False)

columns = X_train.columns
X_train[columns] = pt.fit_transform(X_train)
X_test[columns] = pt.transform(X_test)

SVM-SMOTE

In [29]:
sm = SVMSMOTE()

X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
print('Before sampling class distribution:-',Counter(y_train))
print('New class distribution:-',Counter(y_train_sm))

Before sampling class distribution:- Counter({0: 227443, 1: 402})
New class distribution:- Counter({0: 227443, 1: 227443})


In [30]:
def evaluate_model(model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, train_res=True):
    if train_res:
        y_train_pred = model.predict(X_train)
        tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()
        print("False Positive Rate: {}".format((fp / (fp + tn))))
        print("False Negative Rate: {}".format((fn / (tp + fn))))
        print("True Negative Rate (Specificity): {}".format((tn / (tn + fp))))
        print("True Positive Rate (Sensitivity/Recall): {}".format(metrics.recall_score(y_train, y_train_pred)))
        print("ROC-AUC Score: {}".format(metrics.roc_auc_score(y_train, y_train_pred)))
        print("Accuracy: {}".format(metrics.accuracy_score(y_train, y_train_pred)))
        print("Log-loss: {}".format(metrics.log_loss(y_train, y_train_pred)))
        print("F1 Score: {}".format(f1_score(y_train, y_train_pred)))
        print("F2 Score: {}".format(metrics.fbeta_score(y_train, y_train_pred, beta=2)))
        print(metrics.confusion_matrix(y_train, y_train_pred))
        print(classification_report(y_train, y_train_pred))
        
        print("------------------------------------------------------------------------------------")
    # Evaluation on Test Set
    y_test_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
    print("False Positive Rate: {}".format((fp / (fp + tn))))
    print("False Negative Rate: {}".format((fn / (tp + fn))))
    print("True Negative Rate (Specificity): {}".format((tn / (tn + fp))))
    print("True Positive Rate (Sensitivity/Recall): {}".format(metrics.recall_score(y_test, y_test_pred)))
    print("ROC-AUC Score: {}".format(metrics.roc_auc_score(y_test, y_test_pred)))
    print("Accuracy: {}".format(metrics.accuracy_score(y_test, y_test_pred)))
    print("Log-loss: {}".format(metrics.log_loss(y_test, y_test_pred)))
    print("F1 Score: {}".format(f1_score(y_test, y_test_pred)))
    print("F2 Score: {}".format(metrics.fbeta_score(y_test, y_test_pred, beta=2)))
    print(metrics.confusion_matrix(y_test, y_test_pred))
    print(classification_report(y_test, y_test_pred))

    return None

In [31]:
dt = DecisionTreeClassifier().fit(X_train_sm, y_train_sm)
evaluate_model(dt, train_res=False)

False Positive Rate: 0.0007209171472780982
False Negative Rate: 0.18888888888888888
True Negative Rate (Specificity): 0.9992790828527219
True Positive Rate (Sensitivity/Recall): 0.8111111111111111
ROC-AUC Score: 0.9051950969819165
Accuracy: 0.9989817773252344
Log-loss: 0.03516874081667628
F1 Score: 0.7156862745098039
F2 Score: 0.770042194092827
[[56831    41]
 [   17    73]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56872
           1       0.64      0.81      0.72        90

    accuracy                           1.00     56962
   macro avg       0.82      0.91      0.86     56962
weighted avg       1.00      1.00      1.00     56962



In [32]:
rf = RandomForestClassifier().fit(X_train_sm, y_train_sm)
evaluate_model(rf, train_res=False)

KeyboardInterrupt: 

In [None]:
ada = AdaBoostClassifier().fit(X_train_sm, y_train_sm)
evaluate_model(ada, train_res=False)

False Positive Rate: 0.004853600633078344
False Negative Rate: 0.18556701030927836
True Negative Rate (Specificity): 0.9951463993669216
True Positive Rate (Sensitivity/Recall): 0.8144329896907216
ROC-AUC Score: 0.9047896945288216
Accuracy: 0.9948386643727397
Log-loss: 0.17827009144683348
F1 Score: 0.34955752212389385
F2 Score: 0.531628532974428
[[56589   276]
 [   18    79]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56865
           1       0.22      0.81      0.35        97

    accuracy                           0.99     56962
   macro avg       0.61      0.90      0.67     56962
weighted avg       1.00      0.99      1.00     56962



In [None]:
lr = LogisticRegression().fit(X_train_sm, y_train_sm)
evaluate_model(lr, train_res=False)

False Positive Rate: 0.007473841554559043
False Negative Rate: 0.17525773195876287
True Negative Rate (Specificity): 0.992526158445441
True Positive Rate (Sensitivity/Recall): 0.8247422680412371
ROC-AUC Score: 0.9086342132433392
Accuracy: 0.9922404409957516
Log-loss: 0.2680116392587762
F1 Score: 0.26578073089700993
F2 Score: 0.44792833146696537
[[56440   425]
 [   17    80]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     56865
           1       0.16      0.82      0.27        97

    accuracy                           0.99     56962
   macro avg       0.58      0.91      0.63     56962
weighted avg       1.00      0.99      0.99     56962



In [None]:
gbc = GradientBoostingClassifier().fit(X_train_sm, y_train_sm)
evaluate_model(gbc, train_res=False)

False Positive Rate: 0.0024971423546997274
False Negative Rate: 0.17525773195876287
True Negative Rate (Specificity): 0.9975028576453002
True Positive Rate (Sensitivity/Recall): 0.8247422680412371
ROC-AUC Score: 0.9111225628432686
Accuracy: 0.9972086654260736
Log-loss: 0.09641127400066613
F1 Score: 0.5015673981191222
F2 Score: 0.6557377049180326
[[56723   142]
 [   17    80]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56865
           1       0.36      0.82      0.50        97

    accuracy                           1.00     56962
   macro avg       0.68      0.91      0.75     56962
weighted avg       1.00      1.00      1.00     56962



In [None]:
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard').fit(X_train_sm, y_train_sm)

evaluate_model(eclf, train_res=False)

False Positive Rate: 0.0007913479293062516
False Negative Rate: 0.18556701030927836
True Negative Rate (Specificity): 0.9992086520706938
True Positive Rate (Sensitivity/Recall): 0.8144329896907216
ROC-AUC Score: 0.9068208208807077
Accuracy: 0.9988939995084443
Log-loss: 0.03820053535275777
F1 Score: 0.7149321266968326
F2 Score: 0.7714843749999999
[[56820    45]
 [   18    79]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56865
           1       0.64      0.81      0.71        97

    accuracy                           1.00     56962
   macro avg       0.82      0.91      0.86     56962
weighted avg       1.00      1.00      1.00     56962



In [None]:
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train_sm, y_train_sm)
evaluate_model(knn, train_res=False)

False Positive Rate: 0.00017585509540138925
False Negative Rate: 0.2268041237113402
True Negative Rate (Specificity): 0.9998241449045986
True Positive Rate (Sensitivity/Recall): 0.7731958762886598
ROC-AUC Score: 0.8865100105966291
Accuracy: 0.9994382219725431
Log-loss: 0.019403266047742364
F1 Score: 0.8241758241758242
F2 Score: 0.7928118393234673
[[56855    10]
 [   22    75]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56865
           1       0.88      0.77      0.82        97

    accuracy                           1.00     56962
   macro avg       0.94      0.89      0.91     56962
weighted avg       1.00      1.00      1.00     56962

