In [13]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN
# xgb
from xgboost import XGBClassifier

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix, fbeta_score, accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [14]:
from data_preprocessing import get_cleaned_data_final

# read data normally to get the field docs
df = get_cleaned_data_final(convert_categorical=True)


In [15]:
X = df.drop(['TARGET'],axis = 1)
y = df['TARGET']

In [16]:
def model_Evaluate(model, X_test_scaled, y_test):
    # Predict values for Test dataset
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[::,1]

     # accuracy of model on test data
    acc_test = accuracy_score(y_test, y_pred)
    
    print('Accuracy of model on testing data : {} \n'.format(acc_test*100))
    
    
    # precision of model on test data
    pre_test = precision_score(y_test, y_pred)
    
    # recall of model on test data
    rec_test = recall_score(y_test, y_pred)
    
    # f1 of model on test data
    f1_test = f1_score(y_test, y_pred)
    
    # f2 of model on test data
    f2_test = fbeta_score(y_test, y_pred, beta=2, average='macro')
    
    # AUC of model on test data
    auc_test = roc_auc_score(y_test, y_pred_proba)
    
    
    # Print the evaluation metrics for the dataset.
    print(classification_report(y_test, y_pred))
    print(f'f2 score: {f2_test}')
    # Compute and plot the Confusion matrix
    cf_matrix = confusion_matrix(y_test, y_pred)

    group_names = ['True Neg','False Pos', 'False Neg','True Pos']
    group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten() / np.sum(cf_matrix)]

    labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)

    d = {'Test_Accuracy': [acc_test], 
         'Precision': [pre_test], 'Recall': [rec_test],
         'AUC': [auc_test], 'F1_Score': [f1_test], 'F2_Score': [f2_test], 
         'Roc_Auc_score': auc_test}
    
    return pd.DataFrame(data=d)



In [17]:
# Before undersampling we take a fraction of the data to test the model
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
rus = RandomUnderSampler(random_state = 0)
X_rus, y_rus = rus.fit_resample(X_train_raw, y_train_raw)

In [19]:
# X_train, X_test, y_train, y_test = train_test_split(X_rus, y_rus, test_size= 0.2, random_state = 2020)
# Take the entire data for training
X_train, y_train = X_rus, y_rus

In [20]:
print('Original dataset shape %s' % Counter(y))
print('Resampled dataset shape %s' % Counter(y_rus))

Original dataset shape Counter({0: 282686, 1: 24824})
Resampled dataset shape Counter({0: 17311, 1: 17311})


In [21]:
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [23]:
xgb_model_rus = XGBClassifier(n_estimators=300, max_depth=20, scale_pos_weight=(len(y_train) - sum(y_train)) / sum(y_train))
xgb_model_rus.fit(X_train_scaled, y_train)

In [None]:
# testing_data = df.drop(['TARGET'],axis = 1)
# testing_y = df['TARGET']



In [24]:
testing_data_scaled = scaler.transform(X_test_raw)
model_Evaluate(xgb_model_rus, testing_data_scaled, y_test_raw)

Accuracy of model on testing data : 63.66947416344184 

              precision    recall  f1-score   support

           0       0.95      0.64      0.76     84740
           1       0.13      0.63      0.22      7513

    accuracy                           0.64     92253
   macro avg       0.54      0.63      0.49     92253
weighted avg       0.88      0.64      0.72     92253

f2 score: 0.521156340503349


Unnamed: 0,Test_Accuracy,Precision,Recall,AUC,F1_Score,F2_Score,Roc_Auc_score
0,0.636695,0.133006,0.62718,0.679867,0.219469,0.521156,0.679867


In [25]:

smote_enn = SMOTEENN(smote=SMOTE(sampling_strategy='minority'))

In [26]:
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X, y, test_size=0.3, random_state=42)

In [27]:
X_resampled, y_resampled = smote_enn.fit_resample(X_train_raw, y_train_raw)

In [28]:
scaler = RobustScaler().fit(X_resampled)
X_train_scaled = scaler.transform(X_resampled)

In [33]:
xgb_model_smote = XGBClassifier(n_estimators = 300, criterion = 'entropy', max_depth=20, class_weight='balanced')

xgb_model_smote.fit(X_train_scaled, y_resampled)

In [34]:
testing_data_scaled = scaler.transform(X_test_raw)

In [35]:
model_Evaluate(xgb_model_smote, testing_data_scaled, y_test_raw)

Accuracy of model on testing data : 90.10221889803042 

              precision    recall  f1-score   support

           0       0.92      0.97      0.95     84740
           1       0.25      0.11      0.15      7513

    accuracy                           0.90     92253
   macro avg       0.59      0.54      0.55     92253
weighted avg       0.87      0.90      0.88     92253

f2 score: 0.5424651193112274


Unnamed: 0,Test_Accuracy,Precision,Recall,AUC,F1_Score,F2_Score,Roc_Auc_score
0,0.901022,0.251993,0.10941,0.676599,0.152575,0.542465,0.676599
