In [1]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN


# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix, fbeta_score, accuracy_score

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
from data_preprocessing import get_cleaned_data_final

# Load the modelling dataset through the data_preprocessing helper.
# NOTE(review): convert_categorical=True presumably encodes categorical columns
# numerically (every non-TARGET column is later fed to GaussianNB) — confirm
# against data_preprocessing.get_cleaned_data_final.
df = get_cleaned_data_final(convert_categorical=True)


In [3]:
# Separate the label column from the feature matrix.
y = df['TARGET']
X = df.drop(columns=['TARGET'])

In [4]:
def model_Evaluate(model, X_test_scaled, y_test, f2_average='macro'):
    """Score a fitted binary classifier and summarise its metrics.

    Prints the accuracy, sklearn's classification report and the F2 score,
    then returns the same figures as a one-row DataFrame.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test_scaled
        Feature matrix to score; it is passed to the model unchanged.
    y_test
        True labels aligned row-for-row with ``X_test_scaled``.
    f2_average : str, default 'macro'
        ``average`` argument forwarded to ``fbeta_score``. The default keeps
        the historical behaviour: F2 is macro-averaged over both classes,
        whereas Precision / Recall / F1 use sklearn's default binary average
        (positive class only). A macro F2 can therefore be well above zero
        even when positive-class recall is 0. Pass ``'binary'`` to get an F2
        that is directly comparable with the other columns.

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``Test_Accuracy``, ``Precision``,
        ``Recall``, ``AUC``, ``F1_Score``, ``F2_Score`` and ``Roc_Auc_score``
        (``Roc_Auc_score`` duplicates ``AUC`` and is kept for compatibility).
    """
    # Hard predictions for the threshold-based metrics.
    y_pred = model.predict(X_test_scaled)
    # Second predict_proba column: the score of the second class in
    # model.classes_ (the positive class when labels are 0/1).
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Accuracy of the model on the test data.
    acc_test = accuracy_score(y_test, y_pred)
    print('Accuracy of model on testing data : {} \n'.format(acc_test*100))

    # Positive-class precision / recall / F1 (sklearn's default binary average).
    pre_test = precision_score(y_test, y_pred)
    rec_test = recall_score(y_test, y_pred)
    f1_test = f1_score(y_test, y_pred)

    # F2 weights recall higher than precision; averaging is caller-selectable.
    f2_test = fbeta_score(y_test, y_pred, beta=2, average=f2_average)

    # ROC AUC is computed from the probabilities, not the hard predictions.
    auc_test = roc_auc_score(y_test, y_pred_proba)

    # Print the per-class report followed by the F2 score.
    print(classification_report(y_test, y_pred))
    print(f'f2 score: {f2_test}')

    # Every value is a one-element list so the frame has exactly one row.
    metrics = {
        'Test_Accuracy': [acc_test],
        'Precision': [pre_test],
        'Recall': [rec_test],
        'AUC': [auc_test],
        'F1_Score': [f1_test],
        'F2_Score': [f2_test],
        'Roc_Auc_score': [auc_test],
    }

    return pd.DataFrame(data=metrics)



In [5]:
# Before undersampling we take a fraction of the data to test the model
# X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
# rus = RandomUnderSampler(random_state = 0)
# X_rus, y_rus = rus.fit_resample(X_train_raw, y_train_raw)


In [5]:
bayes = GaussianNB()

bayes.fit(X, y)

In [8]:
model_Evaluate(bayes, X[100_000: 102_001], y[100_000: 102_001])

Accuracy of model on testing data : 91.25437281359319 

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      1826
           1       0.00      0.00      0.00       175

    accuracy                           0.91      2001
   macro avg       0.46      0.50      0.48      2001
weighted avg       0.83      0.91      0.87      2001

f2 score: 0.4905964535196131


Unnamed: 0,Test_Accuracy,Precision,Recall,AUC,F1_Score,F2_Score,Roc_Auc_score
0,0.912544,0.0,0.0,0.568515,0.0,0.490596,0.568515


In [12]:
# smote_enn = SMOTEENN(smote=SMOTE(sampling_strategy='minority'))

In [13]:
# X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
# X_resampled, y_resampled = smote_enn.fit_resample(X_train_raw, y_train_raw)

In [15]:
# scaler = RobustScaler().fit(X_resampled)
# X_train_scaled = scaler.transform(X_resampled)

In [16]:
# bays_smoteenn = BernoulliNB()  # NOTE: BernoulliNB is not imported above; needs `from sklearn.naive_bayes import BernoulliNB`

# bays_smoteenn.fit(X_train_scaled, y_resampled)

In [17]:
# testing_data_scaled = scaler.transform(X_test_raw)

In [18]:
# model_Evaluate(bays_smoteenn, testing_data_scaled, y_test_raw)

Accuracy of model on testing data : 63.41466100015414 

              precision    recall  f1-score   support

           0       0.93      0.65      0.77     83311
           1       0.11      0.46      0.17      7515

    accuracy                           0.63     90826
   macro avg       0.52      0.55      0.47     90826
weighted avg       0.86      0.63      0.72     90826

f2 score: 0.4828040040430286


Unnamed: 0,Test_Accuracy,Precision,Recall,AUC,F1_Score,F2_Score,Roc_Auc_score
0,0.634147,0.105299,0.45642,0.571212,0.171119,0.482804,0.571212
