In [9]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix, fbeta_score, accuracy_score
from data_preprocessing import get_cleaned_data_final

# Import xgboost
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings("ignore")

In [10]:
def model_Evaluate(model, model_Name, model_sample_method, dataset, X_test_scaled, y_test):
    # Predict values for Test dataset
    y_pred = model.predict(X_test_scaled)

     # accuracy of model on test data
    acc_test = accuracy_score(y_test, y_pred)
    
    print('Accuracy of model on testing data : {} \n'.format(acc_test*100))
    
    
    # precision of model on test data
    pre_test = precision_score(y_test, y_pred)
    
    # recall of model on test data
    rec_test = recall_score(y_test, y_pred)
    
    # f1 of model on test data
    f1_test = f1_score(y_test, y_pred)
    
    # f2 of model on test data
    f2_test = fbeta_score(y_test, y_pred, beta=2, average='macro')
    
    
    # Print the evaluation metrics for the dataset.
    print(classification_report(y_test, y_pred))
    print(f'f2 score: {f2_test}')
    # Compute and plot the Confusion matrix
    cf_matrix = confusion_matrix(y_test, y_pred)

    group_names = ['True Neg','False Pos', 'False Neg','True Pos']
    group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten() / np.sum(cf_matrix)]

    labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)

    d = {'Name': [model_Name], 'Sampling_Method': [model_sample_method], 'Dataset': [dataset],
         'Accuracy': [acc_test], 'Precision': [pre_test], 'Recall': [rec_test],
         'F1_Score': [f1_test], 'F2_Score': [f2_test]}
    
    return pd.DataFrame(data=d)


In [11]:
def get_under_sampling_data(x_train, y_train):
    rus = RandomUnderSampler(random_state = 0)
    X_rus, y_rus = rus.fit_resample(x_train, y_train)
    scaler = RobustScaler().fit(X_rus)
    X_train_scaled = scaler.transform(X_rus)

    return X_train_scaled, y_rus, scaler

def get_over_sampling_data(x_train, y_train):
    smote_enn = SMOTEENN(smote=SMOTE(sampling_strategy='minority'))    
    
    X_resampled, y_resampled = smote_enn.fit_resample(x_train, y_train)
    scaler = RobustScaler().fit(X_resampled)
    X_train_scaled = scaler.transform(X_resampled)

    return X_train_scaled, y_resampled, scaler

def evaluate_models(models_sampled_definitions, x_train, y_train, x_test, y_test):
    # Get the under sampling data
    X_train_scaled_under_sampling, y_train_under_sampling, scaler_under_sampling = get_under_sampling_data(x_train, y_train)
    X_test_scaled_under_sampling = scaler_under_sampling.transform(x_test)

    # Get the over sampling data
    X_train_scaled_over_sampling, y_train_over_sampling, scaler_over_sampling = get_over_sampling_data(x_train, y_train)
    X_test_scaled_over_sampling = scaler_over_sampling.transform(x_test)

    # Evaluate the models
    evaluation = []
    under_sampling = 'Under Sampling'
    over_sampling = 'Over Sampling'
    train_dataset = 'Train'
    test_dataset = 'Test'
    for model_definition in models_sampled_definitions:
        under_sampling_model = model_definition[0].fit(X_train_scaled_under_sampling, y_train_under_sampling)
        model_name = model_definition[0].__class__.__name__


        print(f'\nUnder Sampling Model Name: {under_sampling_model.__class__.__name__}')
        print(f'Train Data Evaluation')
        evaluation_under_sampling_train = model_Evaluate(under_sampling_model, model_name, under_sampling, train_dataset, X_train_scaled_under_sampling, y_train_under_sampling)

        print('\nTest Data Evaluation')
        evaluation_under_sampling_test = model_Evaluate(under_sampling_model, model_name, under_sampling, test_dataset, X_test_scaled_under_sampling, y_test)

        over_sampling_model = model_definition[1].fit(X_train_scaled_over_sampling, y_train_over_sampling)
        print(f'\nOver Sampling Model Name: {over_sampling_model.__class__.__name__}')
        print(f'Train Data Evaluation')
        evaluation_over_sampling_train = model_Evaluate(over_sampling_model, model_name, over_sampling, train_dataset, X_train_scaled_over_sampling, y_train_over_sampling)
        print('\nTest Data Evaluation')
        evaluation_over_sampling_test = model_Evaluate(over_sampling_model, model_name, over_sampling, test_dataset, X_test_scaled_over_sampling, y_test)

        evaluation.append(evaluation_under_sampling_train)
        evaluation.append(evaluation_under_sampling_test)
        evaluation.append(evaluation_over_sampling_train)
        evaluation.append(evaluation_over_sampling_test)

    return evaluation



In [12]:
# read data normally to get the field docs
df = get_cleaned_data_final(convert_categorical=True)

X = df.drop(['TARGET'],axis = 1)
y = df['TARGET']

In [13]:
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
xgb_model_definition_under_sampling = xgb.XGBClassifier(n_estimators=50, max_depth=8)
xgb_model_definition_over_sampling = xgb.XGBClassifier(n_estimators=50, max_depth=8)

rf_model_definition_under_sampling = RandomForestClassifier(n_estimators = 300, criterion = 'entropy', max_depth=15, class_weight='balanced')
rf_model_definition_over_sampling = RandomForestClassifier(n_estimators = 300, criterion = 'entropy', max_depth=15, class_weight='balanced')

ada_model_definition_under_sampling = AdaBoostClassifier(
    base_estimator=None,
    n_estimators=300,
    algorithm='SAMME.R',
    random_state=42
)
ada_model_definition_over_sampling = AdaBoostClassifier(
    base_estimator=None,
    n_estimators=250,
    algorithm='SAMME.R',
    random_state=42
)


bayes_model_definition_under_sampling = GaussianNB()
bayes_model_definition_over_sampling = GaussianNB()


# Evaluate this model only
evaluation = evaluate_models([
        (xgb_model_definition_under_sampling, xgb_model_definition_over_sampling),
        (rf_model_definition_under_sampling, rf_model_definition_over_sampling),
        (ada_model_definition_under_sampling, ada_model_definition_over_sampling),
        (bayes_model_definition_under_sampling, bayes_model_definition_over_sampling)
    ], X_train_raw, y_train_raw, X_test_raw, y_test_raw)


Under Sampling Model Name: XGBClassifier
Train Data Evaluation
Accuracy of model on testing data : 84.59993899959332 

              precision    recall  f1-score   support

           0       0.84      0.85      0.85     19672
           1       0.85      0.84      0.85     19672

    accuracy                           0.85     39344
   macro avg       0.85      0.85      0.85     39344
weighted avg       0.85      0.85      0.85     39344

f2 score: 0.8459983763638652

Test Data Evaluation
Accuracy of model on testing data : 64.3787881290152 

              precision    recall  f1-score   support

           0       0.95      0.64      0.77     55592
           1       0.14      0.63      0.23      4959

    accuracy                           0.64     60551
   macro avg       0.54      0.64      0.50     60551
weighted avg       0.88      0.64      0.72     60551

f2 score: 0.5280243022694093

Over Sampling Model Name: XGBClassifier
Train Data Evaluation
Accuracy of model on testing

In [24]:
# Show all the evaluation metrics as one dataframe so the first column is the model name
# So row after row only (Using loop)
evaluation_df = pd.concat(evaluation)

# Find the best precision, Recall, F1, F2 for dataset = Test
best_precision = evaluation_df[evaluation_df['Dataset'] == 'Test'].sort_values('Precision', ascending=False)
best_recall = evaluation_df[evaluation_df['Dataset'] == 'Test'].sort_values('Recall', ascending=False)
best_f1 = evaluation_df[evaluation_df['Dataset'] == 'Test'].sort_values('F1_Score', ascending=False)
best_f2 = evaluation_df[evaluation_df['Dataset'] == 'Test'].sort_values('F2_Score', ascending=False)


print(f'Best Precision model Name: {best_precision.iloc[0]["Name"]}')
print(f'Best Recall model Name: {best_recall.iloc[0]["Name"]}')
print(f'Best F1 model Name: {best_f1.iloc[0]["Name"]}')
print(f'Best F2 model Name: {best_f2.iloc[0]["Name"]}')

evaluation_df

Best Precision model Name: XGBClassifier
Best Recall model Name: RandomForestClassifier
Best F1 model Name: RandomForestClassifier
Best F2 model Name: AdaBoostClassifier


Unnamed: 0,Name,Sampling_Method,Dataset,Accuracy,Precision,Recall,F1_Score,F2_Score
0,XGBClassifier,Under Sampling,Train,0.845999,0.847359,0.844042,0.845697,0.845998
0,XGBClassifier,Under Sampling,Test,0.643788,0.136956,0.631781,0.225112,0.528024
0,XGBClassifier,Over Sampling,Train,0.943922,0.985432,0.919353,0.951246,0.946468
0,XGBClassifier,Over Sampling,Test,0.891199,0.216893,0.125832,0.159265,0.544853
0,RandomForestClassifier,Under Sampling,Train,0.884785,0.874709,0.898231,0.886314,0.884739
0,RandomForestClassifier,Under Sampling,Test,0.659758,0.144299,0.639847,0.235491,0.542014
0,RandomForestClassifier,Over Sampling,Train,0.911527,0.948735,0.899952,0.9237,0.912008
0,RandomForestClassifier,Over Sampling,Test,0.8421,0.135572,0.172615,0.151867,0.534955
0,AdaBoostClassifier,Under Sampling,Train,0.660101,0.661687,0.655195,0.658425,0.660094
0,AdaBoostClassifier,Under Sampling,Test,0.653053,0.141555,0.63904,0.231771,0.536671
