In [17]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix, fbeta_score, accuracy_score

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
import warnings
warnings.filterwarnings("ignore")

In [18]:
# Project-local helper module; it is not part of this notebook.
from data_preprocessing import get_cleaned_data_final

# Load the cleaned dataset used by every cell below (later cells drop and
# select a `TARGET` column from it).
# NOTE(review): convert_categorical=True presumably returns the categorical
# columns in an encoded, model-ready form — confirm in data_preprocessing.
df = get_cleaned_data_final(convert_categorical=True)


In [19]:
# Feature matrix: every column except the label; target vector: the label.
X = df.drop(columns=['TARGET'])
y = df['TARGET']

In [20]:
def model_Evaluate(model, X_test_scaled, y_test, f2_average='macro'):
    """Evaluate a fitted binary classifier on held-out data.

    Prints the accuracy (as a percentage), the scikit-learn classification
    report and the F2 score, then returns the headline metrics as a one-row
    DataFrame so results from several models can be compared or concatenated.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test_scaled : array-like
        Test features, passed unchanged to the model (scale them beforehand
        with the same scaler that was fitted on the training data).
    y_test : array-like
        True labels for ``X_test_scaled``.
    f2_average : str, default 'macro'
        ``average`` argument forwarded to ``fbeta_score``. The default keeps
        the historical behaviour of this function: the F2 score is averaged
        over both classes, whereas precision, recall and F1 are computed with
        the scorers' own default averaging. Pass ``'binary'`` to make F2
        comparable with those columns.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``Test_Accuracy``, ``Precision``, ``Recall``,
        ``AUC``, ``F1_Score``, ``F2_Score`` and ``Roc_Auc_score``
        (``Roc_Auc_score`` duplicates ``AUC`` and is kept for compatibility).
    """
    y_pred = model.predict(X_test_scaled)
    # Second column of predict_proba: the score used as the ranking input
    # for ROC AUC.
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    acc_test = accuracy_score(y_test, y_pred)
    print('Accuracy of model on testing data : {} \n'.format(acc_test*100))

    pre_test = precision_score(y_test, y_pred)
    rec_test = recall_score(y_test, y_pred)
    f1_test = f1_score(y_test, y_pred)
    # beta=2 weights recall more heavily than precision.
    f2_test = fbeta_score(y_test, y_pred, beta=2, average=f2_average)
    auc_test = roc_auc_score(y_test, y_pred_proba)

    print(classification_report(y_test, y_pred))
    print(f'f2 score: {f2_test}')

    # The previous version also built a labelled confusion matrix here but
    # never displayed or returned it; that dead computation was removed.
    metrics = {
        'Test_Accuracy': [acc_test],
        'Precision': [pre_test],
        'Recall': [rec_test],
        'AUC': [auc_test],
        'F1_Score': [f1_test],
        'F2_Score': [f2_test],
        'Roc_Auc_score': [auc_test],
    }
    return pd.DataFrame(data=metrics)



In [21]:
# Hold out 30% of the untouched data for testing *before* any resampling, so
# the test set keeps the original class distribution.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [22]:
# Balance the training split by randomly undersampling it (seeded).
rus = RandomUnderSampler(random_state=0)
X_rus, y_rus = rus.fit_resample(
    X_train_raw,
    y_train_raw,
)

In [23]:
# The whole undersampled set is used for training; no further validation split
# is carved out of it (the held-out raw test split is used for evaluation).
X_train = X_rus
y_train = y_rus

In [24]:
# Class counts before and after resampling. Note: the "original" count is taken
# over the full label vector `y`, while the resampled count comes from the
# undersampled training split only.
print('Original dataset shape %s' % (Counter(y),))
print('Resampled dataset shape %s' % (Counter(y_rus),))

Original dataset shape Counter({0: 282686, 1: 24824})
Resampled dataset shape Counter({0: 17311, 1: 17311})


In [25]:
# Fit the scaler on the training data only and scale it in one step; the same
# fitted `scaler` is reused later to transform the test split.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [26]:
# AdaBoost trained on the undersampled, scaled training data.
# The weak learner and the boosting variant are left at the library defaults:
# the previous call spelled them out as `base_estimator=None` and
# `algorithm='SAMME.R'`, but `base_estimator` was renamed to `estimator` and
# then removed from AdaBoostClassifier, and 'SAMME.R' has been deprecated, so
# passing them fails on newer scikit-learn releases. Omitting them yields the
# same model on releases that accepted the old keywords.
ada = AdaBoostClassifier(
    n_estimators=200,
    learning_rate=1.0,
    random_state=42,
)

ada.fit(X_train_scaled, y_train)

In [27]:
# Scale the held-out test split with the scaler fitted on the training data,
# then report the metrics of the undersampling-based model.
testing_data_scaled = scaler.transform(X_test_raw)
model_Evaluate(
    model=ada,
    X_test_scaled=testing_data_scaled,
    y_test=y_test_raw,
)

Accuracy of model on testing data : 65.26291827908035 

              precision    recall  f1-score   support

           0       0.95      0.65      0.78     84740
           1       0.14      0.64      0.23      7513

    accuracy                           0.65     92253
   macro avg       0.55      0.65      0.50     92253
weighted avg       0.89      0.65      0.73     92253

f2 score: 0.5364852947627172


Unnamed: 0,Test_Accuracy,Precision,Recall,AUC,F1_Score,F2_Score,Roc_Auc_score
0,0.652629,0.141131,0.642087,0.703872,0.2314,0.536485,0.703872


In [28]:

# SMOTE oversampling of the minority class followed by Edited Nearest
# Neighbours cleaning. SMOTE generates synthetic samples at random, so it is
# seeded to make the resampled training set (and every metric derived from it)
# reproducible across runs. The seed is set on the SMOTE instance itself
# because a sampler passed in explicitly does not inherit the seed given to
# SMOTEENN.
smote_enn = SMOTEENN(
    smote=SMOTE(sampling_strategy='minority', random_state=42),
    random_state=42,
)

In [29]:
# Same split parameters (and seed) as the undersampling experiment, so both
# models are evaluated on an identical held-out test set.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [30]:
# Resample the training split only; the test split is left untouched.
X_resampled, y_resampled = smote_enn.fit_resample(
    X_train_raw,
    y_train_raw,
)

In [31]:
# Refit the scaler on the SMOTEENN-resampled training data (this replaces the
# scaler fitted for the undersampling experiment) and scale it in one step.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_resampled)

In [32]:
# AdaBoost trained on the SMOTEENN-resampled, scaled training data.
# The weak learner and boosting variant are left at the library defaults:
# the previous call passed `base_estimator=None` and `algorithm='SAMME.R'`,
# but `base_estimator` was renamed to `estimator` and then removed from
# AdaBoostClassifier, and 'SAMME.R' has been deprecated, so passing them fails
# on newer scikit-learn releases. Omitting them yields the same model on
# releases that accepted the old keywords.
ada_smoteenn = AdaBoostClassifier(
    n_estimators=500,         # Larger number of weak learners
    learning_rate=0.1,        # Smaller learning rate
    random_state=42,          # Random seed for reproducibility
)

ada_smoteenn.fit(X_train_scaled, y_resampled)

In [33]:
# Apply the scaler fitted on the SMOTEENN training data to the held-out split.
testing_data_scaled = scaler.transform(
    X_test_raw
)

In [34]:
# Metrics of the SMOTEENN-based model on the untouched test split.
model_Evaluate(
    model=ada_smoteenn,
    X_test_scaled=testing_data_scaled,
    y_test=y_test_raw,
)

Accuracy of model on testing data : 77.84787486585802 

              precision    recall  f1-score   support

           0       0.93      0.82      0.87     84740
           1       0.13      0.30      0.18      7513

    accuracy                           0.78     92253
   macro avg       0.53      0.56      0.53     92253
weighted avg       0.87      0.78      0.82     92253

f2 score: 0.5405302575281536


Unnamed: 0,Test_Accuracy,Precision,Recall,AUC,F1_Score,F2_Score,Roc_Auc_score
0,0.778479,0.130835,0.304805,0.620596,0.183083,0.54053,0.620596
