# Code for the MA981: Dissertation

## Data Pre-processing

### Importing libraries

In [None]:
import pandas as pd 
import numpy as np

from sklearn.preprocessing import RobustScaler 
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit, KFold, StratifiedKFold,TimeSeriesSplit
import matplotlib.gridspec as gridspec

from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer,accuracy_score,precision_score, recall_score, matthews_corrcoef, auc
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, KFold
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb

from hyperopt import tpe, hp, fmin, STATUS_OK,Trials, space_eval
from hyperopt.pyll.base import scope
from sklearn import ensemble
from functools import partial

from tabulate import tabulate

from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
import smote_variants as sv
import random

### Reading data

In [None]:
base_data = pd.read_excel('Training_dataset_Original.xlsx', engine='openpyxl')
base_data.head()

### Data types

In [None]:
# replacing nan with empty string using replace() function 
nomiss_data = base_data.replace('missing', np.nan)
nomiss_data=nomiss_data.replace('na', np.nan)
nomiss_data=nomiss_data.replace(' ', np.nan)
nomiss_data=nomiss_data.replace('', np.nan)
# checking the nan values
nomiss_data.info()

In [None]:
# checking for categorical variables
print(nomiss_data.nunique())

In [None]:
# Map ordinal values to numbers
density_map = {
    'C' : 0,
    'L' : 1
}
nomiss_data['mvar47'] = nomiss_data['mvar47'].map(density_map)
nomiss_data.head()

### Duplicates

In [None]:
#size of data
print("The data size is: {} ".format(nomiss_data.shape))

In [None]:
nomiss_data.drop_duplicates(inplace=True)
#size of data after deleting duplicates
print("The data size is: {} ".format(nomiss_data.shape))

### Some tidying

In [None]:
list = ['application_key']
tidy_data = nomiss_data.drop(list,axis = 1 )
tidy_data

### Feature selection

In [None]:
#correlation map
%matplotlib inline
f,ax = plt.subplots(figsize=(25,25))
plt.title('Correlation Heatmap of the Dataset')

corr_df =  tidy_data.corr(method='pearson') 
corr_bottom = corr_df.where(np.tril(np.ones(corr_df.shape), k=-1).astype(bool))

hmap=sns.heatmap(corr_bottom, annot=True, linewidths=.5, fmt= '.1f',ax=ax)
hmap.figure.savefig("Correlation_Heatmap_Lower_Triangle_with_Seaborn.png",
                    format='png',
                    dpi=150)

In [None]:
# Find index of feature columns with correlation greater than 0.9
to_drop_plus = [column for column in corr_bottom.columns if any(corr_bottom[column] > 0.9)]
print(to_drop_plus)

In [None]:
# Drop Marked Features
nocorr_data = tidy_data.drop(to_drop_plus, axis='columns')
print(nocorr_data)

### Dealing with missing values

In [None]:
# Missing values
print('Dataset columns with missing values:\n', nocorr_data.isnull().sum())

In [None]:
imputer = IterativeImputer(random_state=42)
cat_feat = nocorr_data[['mvar47', 'default_ind']]
int_feat = nocorr_data.drop(['mvar47', 'default_ind'], axis=1)
imputed = imputer.fit_transform(int_feat)
int_imputed = pd.DataFrame(imputed, columns=int_feat.columns)
imputted_data = pd.merge(int_imputed, cat_feat, left_index=True, right_index=True)
imputted_data.head()

In [None]:
# Checking null value
imputted_data.isnull().sum().sort_values(ascending=False)

In [None]:
# Checking na value
imputted_data.isna().any()

In [None]:
df_numerics_only = imputted_data.select_dtypes(include=np.number)
df_numerics_only.columns

### Fraud ratio

In [None]:
val_counts = imputted_data.default_ind.value_counts()
print(val_counts)

print('---------------------------------------')
print('No Frauds', round(imputted_data['default_ind'].value_counts()[0]/len(imputted_data) * 100,2), '% of the dataset')
print('Frauds', round(imputted_data['default_ind'].value_counts()[1]/len(imputted_data) * 100,2), '% of the dataset')

### Splitting the Data

In [None]:
X = imputted_data.drop('default_ind', axis=1)
y = imputted_data['default_ind']
 
original_Xtrain, original_Xtest, original_ytrain, original_ytest = train_test_split(
X, y, test_size=0.33, random_state=42, shuffle=True, stratify=y)  
    
    
# Turn into an array
original_Xtrain = original_Xtrain.values
original_Xtest = original_Xtest.values
original_ytrain = original_ytrain.values
original_ytest = original_ytest.values

# See if both the train and test label distribution are similarly distributed
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)
print('-' * 100)

print('Label Distributions: \n')
print(train_counts_label/ len(original_ytrain))
print(test_counts_label/ len(original_ytest))

In [None]:
# Creating the new dataframe to keep all the results
Comparingdata = pd.DataFrame(columns =['Classifier',
                                       'Accuracy',
                                       'Balanced_Accuracy',
                                       'Precision',
                                       'Recall',
                                       'MCC',
                                       'Execution_time',
                                       'Best_param'])
Comparingdata

# Classifiers tuned with GridSearchCV

In [None]:
RScal = RobustScaler()
sss = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
scoring = {'accuracy': 'accuracy',
           'balanced_accuracy': 'balanced_accuracy',
           'precision': 'precision',
           'recall': 'recall',
           'MCC': make_scorer(matthews_corrcoef)
          }

## Naive Bayes Classifier - NB

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', GaussianNB())])

param_grid = {
 'classifier__var_smoothing': [0.0001,0.001,0.01]}

search = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=3)


start_time = time.time()
scores = cross_validate (search, original_Xtrain, original_ytrain, scoring=scoring ,cv=sss, verbose=3)
execution_time = time.time()-start_time

print("Average Accuracy: {}".format(scores['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(scores['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(scores['test_precision'].mean()),
      "Average Recall: {}".format(scores['test_recall'].mean()),
      "Average MCC: {}".format(scores['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))
print('-------------------------------')


# Searching for the best parameters
search.fit(original_Xtrain, original_ytrain)
search.best_params_
print(search.best_params_)

# Adding information into the dataframe

Comparingdata = Comparingdata.append({'Classifier' : 'NB_GridSearch',
                                      'Accuracy': scores['test_accuracy'].mean(), 
                                      'Balanced_Accuracy': scores['test_balanced_accuracy'].mean(),
                                      'Precision': scores['test_precision'].mean(), 
                                      'Recall': scores['test_recall'].mean(),
                                      'MCC': scores['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': search.best_params_}, ignore_index = True)

## Decision Tree Classifier - DT

In [None]:
param_grid = {    'max_depth': [2,3,4,5,10,15.20,25,30,35,40,45],
                  'max_features': ['sqrt', 'log2'],
                  'min_samples_leaf': [2,4,6,8],
                  'min_samples_split': [2,3,5,7]
}

search = GridSearchCV(DecisionTreeClassifier(), param_grid, n_jobs=-1, verbose=3)


start_time = time.time()
scores = cross_validate(search, original_Xtrain, original_ytrain,scoring=scoring,cv=sss, verbose=3)
execution_time = time.time()-start_time

print("Average Accuracy: {}".format(scores['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(scores['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(scores['test_precision'].mean()),
      "Average Recall: {}".format(scores['test_recall'].mean()),
      "Average MCC: {}".format(scores['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))
print('-------------------------------')


# Searching for the best parameters
search.fit(original_Xtrain, original_ytrain)
search.best_params_
print(search.best_params_)

# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'DT_GridSearch',
                                      'Accuracy': scores['test_accuracy'].mean(), 
                                      'Balanced_Accuracy': scores['test_balanced_accuracy'].mean(),
                                      'Precision': scores['test_precision'].mean(), 
                                      'Recall': scores['test_recall'].mean(),
                                      'MCC': scores['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': search.best_params_}, ignore_index = True)

## Super vector classifier - SVC

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', SVC())])

param_grid = {
                  'classifier__C': [10,100,200],
                  'classifier__kernel': ['poly', 'rbf','sigmoid'],
                  'classifier__gamma': [0.01, 0.001]
                 }

search = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=3)


Sstart_time = time.time()
scores = cross_validate(search, original_Xtrain, original_ytrain, scoring=scoring,cv=sss, verbose = 3)
execution_time = time.time()-start_time

print("Average Accuracy: {}".format(scores['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(scores['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(scores['test_precision'].mean()),
      "Average Recall: {}".format(scores['test_recall'].mean()),
      "Average MCC: {}".format(scores['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))
print('-------------------------------')


# Searching for the best parameters
search.fit(original_Xtrain, original_ytrain)
search.best_params_
print(search.best_params_)

# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'SVC_GridSearch',
                                      'Accuracy': scores['test_accuracy'].mean(), 
                                      'Balanced_Accuracy': scores['test_balanced_accuracy'].mean(),
                                      'Precision': scores['test_precision'].mean(), 
                                      'Recall': scores['test_recall'].mean(),
                                      'MCC': scores['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': search.best_params_}, ignore_index = True)

## K-Nearest Neighbour - KNN

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', KNeighborsClassifier())])

param_grid = {    'classifier__metric': ['euclidean', 'manhattan', 'minkowski'],
                  'classifier__n_neighbors': [17,19,21,23]}

search = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=3)


start_time = time.time()
scores = cross_validate(search, original_Xtrain, original_ytrain, scoring=scoring,cv=sss, verbose = 3)
execution_time = time.time()-start_time

print("Average Accuracy: {}".format(scores['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(scores['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(scores['test_precision'].mean()),
      "Average Recall: {}".format(scores['test_recall'].mean()),
      "Average MCC: {}".format(scores['test_MCC'].mean()),
      "Execution time: {}".format(execution_time)) 
print('-------------------------------')


# Searching for the best parameters
search.fit(original_Xtrain, original_ytrain)
search.best_params_
print(search.best_params_)

# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'KNN_GridSearch',
                                      'Accuracy': scores['test_accuracy'].mean(), 
                                      'Balanced_Accuracy': scores['test_balanced_accuracy'].mean(),
                                      'Precision': scores['test_precision'].mean(), 
                                      'Recall': scores['test_recall'].mean(),
                                      'MCC': scores['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': search.best_params_}, ignore_index = True)

## Bagging on DT

In [None]:
param_grid = {      'n_estimators': [10, 30, 50],
                    'bootstrap': [True, False]
                    }

search = GridSearchCV(BaggingClassifier(), param_grid, n_jobs=-1, verbose=3)


start_time = time.time()
scores = cross_validate(search, original_Xtrain, original_ytrain,scoring=scoring,cv=sss, verbose=3)
execution_time = time.time()-start_time

print("Average Accuracy: {}".format(scores['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(scores['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(scores['test_precision'].mean()),
      "Average Recall: {}".format(scores['test_recall'].mean()),
      "Average MCC: {}".format(scores['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))   
print('-------------------------------')


# Searching for the best parameters
search.fit(original_Xtrain, original_ytrain)
search.best_params_
print(search.best_params_)

# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'Bagging_GridSearch',
                                      'Accuracy': scores['test_accuracy'].mean(), 
                                      'Balanced_Accuracy': scores['test_balanced_accuracy'].mean(),
                                      'Precision': scores['test_precision'].mean(), 
                                      'Recall': scores['test_recall'].mean(),
                                      'MCC': scores['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': search.best_params_}, ignore_index = True)

## Random Forest Classifier - RF

In [None]:
param_grid = {   'bootstrap': [True, False],
                 'max_depth': [10, 15, 20],
                 'max_features': [0.01,0.1,0.5],
                 'n_estimators': [10, 50, 100]}

search = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=-1, verbose=3)

start_time = time.time()
scores = cross_validate(search, original_Xtrain, original_ytrain,scoring=scoring,cv=sss, verbose=3)
execution_time = time.time()-start_time

print("Average Accuracy: {}".format(scores['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(scores['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(scores['test_precision'].mean()),
      "Average Recall: {}".format(scores['test_recall'].mean()),
      "Average MCC: {}".format(scores['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))    
print('-------------------------------')

# Searching for the best parameters
search.fit(original_Xtrain, original_ytrain)
search.best_params_
print(search.best_params_)

# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'RF_GridSearch',
                                      'Accuracy': scores['test_accuracy'].mean(), 
                                      'Balanced_Accuracy': scores['test_balanced_accuracy'].mean(),
                                      'Precision': scores['test_precision'].mean(), 
                                      'Recall': scores['test_recall'].mean(),
                                      'MCC': scores['test_MCC'].mean(), 
                                      'Execution_time': execution_time,
                                      'Best_param': search.best_params_}, ignore_index = True)

## XGBClassifier - XGBoost

In [None]:
param_grid = {      'n_estimators': [10, 30,50],
                    'colsample_bytree': [0.4, 0.5 , 0.7],
                    'max_depth': [10,20,30],
                    'learning_rate': [0.001,0.01]
                    }

search = GridSearchCV(xgb.XGBClassifier(), param_grid, n_jobs=-1, verbose=3)

start_time = time.time()
scores = cross_validate(search, original_Xtrain, original_ytrain,scoring=scoring,cv=sss, verbose=3)
execution_time = time.time()-start_time

print("Average Accuracy: {}".format(scores['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(scores['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(scores['test_precision'].mean()),
      "Average Recall: {}".format(scores['test_recall'].mean()),
      "Average MCC: {}".format(scores['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))    
print('-------------------------------')


# Searching for the best parameters
search.fit(original_Xtrain, original_ytrain)
search.best_params_
print(search.best_params_)

# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'XGBoost_GridSearch',
                                      'Accuracy': scores['test_accuracy'].mean(), 
                                      'Balanced_Accuracy': scores['test_balanced_accuracy'].mean(),
                                      'Precision': scores['test_precision'].mean(), 
                                      'Recall': scores['test_recall'].mean(),
                                      'MCC': scores['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': search.best_params_}, ignore_index = True)

## LightGBM 

In [None]:
param_grid = {      'n_estimators': [900,1000,1100],
                    'max_depth': [8,9,10,11,12],
                    'learning_rate': [0.001, 0.01,0.1],
                    'feature_fraction': [0.2,0.3,0.4],
                    'boosting_type': ['gbdt', 'dart', 'goss']
                    }

search = GridSearchCV(lgb.LGBMClassifier(), param_grid, n_jobs=-1, verbose=3)


start_time = time.time()
scores = cross_validate(search, original_Xtrain, original_ytrain,scoring=scoring,cv=sss, verbose=3)
execution_time = time.time()-start_time

print("Average Accuracy: {}".format(scores['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(scores['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(scores['test_precision'].mean()),
      "Average Recall: {}".format(scores['test_recall'].mean()),
      "Average MCC: {}".format(scores['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))
print('-------------------------------')


# Searching for the best parameters
search.fit(original_Xtrain, original_ytrain)
search.best_params_
print(search.best_params_)

# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'LightGBM_GridSearch',
                                      'Accuracy': scores['test_accuracy'].mean(),
                                      'Balanced_Accuracy': scores['test_balanced_accuracy'].mean(), 
                                      'Precision': scores['test_precision'].mean(), 
                                      'Recall': scores['test_recall'].mean(),
                                      'MCC': scores['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': search.best_params_}, ignore_index = True)

________________________________________________________________________________________________________________________________

# Classifiers tuned with HyperOpt

In [None]:
def objective(params, random_state=42, cv=sss, X=original_Xtrain, y=original_ytrain):
    score = cross_val_score(pipe, X, y,cv=sss, scoring = 'accuracy', n_jobs=-1).mean()
    return score

In [None]:
def metric(params, random_state=42, cv=sss, X=original_Xtrain, y=original_ytrain):

    score = cross_validate(pipe, X, y,cv=sss, scoring = scoring, n_jobs=-1)
    return score

## NB

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', GaussianNB())])

space = {
    'classifier__var_smoothing': hp.quniform('classifier__var_smoothing',0,1,0.0001)
}


# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=objective, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'NB_HyperOpt',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## DT

In [None]:
pipe= DecisionTreeClassifier()

space = {
                  'criterion': hp.choice('criterion', ['gini', 'entropy']),
                  'max_depth': hp.quniform('max_depth', 3,15,1),
                  'max_features': hp.choice('max_features', ['sqrt', 'log2']),
                  'min_samples_leaf': hp.uniform('min_samples_leaf',1,5),
                  'min_samples_split': hp.uniform('min_samples_split',1,8)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=objective, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


print('Best parameters: {}'.format(best_param))

# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'DT_HyperOpt',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## SVC

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', SVC())])

space = {
                 
            'classifier__C': hp.choice('classifier__C', [10, 100, 1000, 10000]),
            'classifier__gamma': hp.choice('classifier__gamma', [0.01, 0.001, 0.0001]),
            'classifier__kernel': hp.choice('classifier__kernel', ['poly', 'rbf','sigmoid'])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=objective, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'SVC_HyperOpt',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## KNN

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', KNeighborsClassifier())])

space = {
                'classifier__metric': hp.choice('classifier__metric', ['euclidean', 'manhattan','minkowski']),
                'classifier__n_neighbors': hp.uniform('classifier__n_neighbors', 10,20)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=objective, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperopt will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'KNN_HyperOpt',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## Bagging 

In [None]:
pipe= BaggingClassifier()

space = {
            'n_estimators': hp.choice('n_estimators', range(200,10000, 100)),
            'classifier__max_features': hp.quniform('classifier__max_features',10,50,1),
            'classifier__bootstrap': hp.choice('classifier__bootstrap', [True, False]),
            'classifier__bootstrap_features': hp.choice('classifier__bootstrap_features', [True, False])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=objective, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'Bagging_HyperOpt',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## RF

In [None]:
pipe= RandomForestClassifier()

space = {
            'n_estimators': hp.choice('n_estimators', range(200,10000, 100)),
            'max_depth': hp.quniform('max_depth', 1, 30, 1),
            "criterion": hp.choice("criterion", ["gini", "entropy"]),
            'max_features': hp.uniform('max_features', 0.0001,1),
            'min_samples_leaf':hp.uniform('min_samples_leaf',1,5),
            'min_samples_split':hp.uniform('min_samples_split',2,6)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=objective, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)


print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'RF_HyperOpt',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## XGBoost

In [None]:
pipe= xgb.XGBClassifier()

space = {
    'max_depth': hp.quniform('max_depth', 1, 30, 1),
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1),
    'gamma': hp.uniform('gamma', 0.01, .7),
    'num_leaves': hp.choice('num_leaves', range(20, 250, 10)),
    'min_child_samples': hp.choice('min_child_samples', range(100, 250, 10)),
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, 1),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, 1),
    'n_estimators': hp.choice('n_estimators', range(200,10000, 100))

}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=objective, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'XGBoost_HyperOpt',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## LightGBM 

In [None]:
pipe= lgb.LGBMClassifier()

space = {
        'num_leaves': hp.choice('num_leaves',range(1,1000,1)),
        'max_depth': hp.choice('max_depth', range(1,20,1)),
        'lambda_l1': hp.choice('lambda_l1', range(0, 100, 5)),
        'lambda_l2': hp.choice('lambda_l2', range(0, 100, 5)),
        'learning_rate': hp.quniform('learning_rate', 0, 0.5, 0.001),
        'min_data_in_leaf': hp.choice('min_data_in_leaf', range(200,10000, 100)),
        'max_bin': hp.uniform('max_bin', 200, 300),
        'min_gain_to_split': hp.uniform('min_gain_to_split', 0,15),
        'n_estimators': hp.choice('n_estimators', range(100,10000, 100)),
        'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart', 'goss', 'rf'])

}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=objective, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=100, # maximum number of iterations
          trials=trials, # logging
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time

metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'LightGBM_HyperOpt',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

# Sampling

## Random Undersampling

## Single Classifiers

In [None]:
def resamplingCV(params, X=original_Xtrain, y=original_ytrain, n_jobs=-1):
    
    skf = StratifiedKFold(shuffle=True)
    total =0
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        sampling=RandomUnderSampler()
        X_sampling, y_sampling = sampling.fit_resample(X_train, y_train)
        
        pipe.fit(X_sampling, y_sampling)
        s=pipe.score(X_test, y_test)
        total= total + s
    score = total/5            
    return score

In [None]:
from sklearn.metrics import balanced_accuracy_score
def score_resamplingCV(params, X=original_Xtrain, y=original_ytrain, n_jobs=-1):
    
    skf = StratifiedKFold(shuffle=True)
    total_accuracy =np.array([])
    total_balanced_accuracy =np.array([])
    total_precision =np.array([])
    total_recall =np.array([])
    total_MCC =np.array([])
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        sampling=RandomUnderSampler()
        X_sampling, y_sampling = sampling.fit_resample(X_train, y_train)
        
        pipe.fit(X_sampling, y_sampling)
        y_pred = pipe.predict(X_test)
        
        # Calculation
        test_accuracy=accuracy_score(y_test, y_pred)
        test_balanced_accuracy=balanced_accuracy_score(y_test, y_pred)
        test_precision=precision_score(y_test, y_pred)
        test_recall=recall_score(y_test, y_pred)
        test_MCC=matthews_corrcoef(y_test, y_pred)
        
        total_accuracy =np.append(total_accuracy ,np.array([test_accuracy]), axis=0)
        total_balanced_accuracy =np.append(total_balanced_accuracy ,np.array([test_balanced_accuracy]), axis=0)
        total_precision =np.append(total_precision ,np.array([test_precision]), axis=0)
        total_recall =np.append(total_recall ,np.array([test_recall]), axis=0)
        total_MCC =np.append(total_MCC ,np.array([test_MCC]), axis=0)
           
    metrics = {}
    metrics['test_accuracy']= total_accuracy
    metrics['test_balanced_accuracy']=total_balanced_accuracy
    metrics['test_precision']=total_precision
    metrics['test_recall']=total_recall
    metrics['test_MCC']=total_MCC
    return metrics

## NB

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', GaussianNB())])

space = {
'classifier__var_smoothing': hp.quniform('classifier__var_smoothing',0,1,0.0001)}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = score_resamplingCV(best_param, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'NB_RUS',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## DT

In [None]:
pipe= DecisionTreeClassifier()

space = {
                  'criterion': hp.choice('criterion', ['gini', 'entropy']),
                  'max_depth': hp.quniform('max_depth', 3,15,1),
                  'max_features': hp.choice('max_features', ['sqrt', 'log2']),
                  'min_samples_leaf': hp.uniform('min_samples_leaf',1,5),
                  'min_samples_split': hp.uniform('min_samples_split',1,8)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


print('Best parameters: {}'.format(best_param))

# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'DT_RUS',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## SVC

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', SVC())])

space = {
                 
            'classifier__C': hp.choice('classifier__C', [10, 100, 1000, 10000]),
            'classifier__gamma': hp.choice('classifier__gamma', [0.01, 0.001, 0.0001]),
            'classifier__kernel': hp.choice('classifier__kernel', ['poly', 'rbf','sigmoid'])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'SVC_RUS',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## KNN

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', KNeighborsClassifier())])

space = {
                'classifier__metric': hp.choice('classifier__metric', ['euclidean', 'manhattan','minkowski']),
                'classifier__n_neighbors': hp.uniform('classifier__n_neighbors', 10,20)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperopt will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'KNN_RUS',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## Bagging

In [None]:
pipe= BaggingClassifier()

space = {
            'n_estimators': hp.choice('n_estimators', range(200,10000, 100)),
            'classifier__max_features': hp.quniform('classifier__max_features',10,50,1),
            'classifier__bootstrap': hp.choice('classifier__bootstrap', [True, False]),
            'classifier__bootstrap_features': hp.choice('classifier__bootstrap_features', [True, False])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'Bagging_RUS',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## RF

In [None]:
pipe= RandomForestClassifier()

space = {
            'n_estimators': hp.choice('n_estimators', range(200,10000, 100)),
            'max_depth': hp.quniform('max_depth', 1, 30, 1),
            "criterion": hp.choice("criterion", ["gini", "entropy"]),
            'max_features': hp.uniform('max_features', 0.0001,1),
            'min_samples_leaf':hp.uniform('min_samples_leaf',1,5),
            'min_samples_split':hp.uniform('min_samples_split',2,6)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)


print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'RF_RUS',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## XGBoost

In [None]:
pipe= xgb.XGBClassifier()

space = {
    'max_depth': hp.quniform('max_depth', 1, 30, 1),
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1),
    'gamma': hp.uniform('gamma', 0.01, .7),
    'num_leaves': hp.choice('num_leaves', range(20, 250, 10)),
    'min_child_samples': hp.choice('min_child_samples', range(100, 250, 10)),
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, 1),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, 1),
    'n_estimators': hp.choice('n_estimators', range(200,10000, 100))

}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'XGBoost_RUS',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## LightGBM

In [None]:
pipe= lgb.LGBMClassifier()

space = {
        'num_leaves': hp.choice('num_leaves',range(20,3000,20)),
        'max_depth': hp.choice('max_depth', range(1,15,1)),
        'bagging_fraction': hp.uniform('bagging_fraction', 0, 1),
        'feature_fraction':  hp.uniform('feature_fraction', 0, 1),
        'min_sum_hessian_in_leaf': hp.uniform('min_sum_hessian_in_leaf', 0.0005, 0.010),
        'lambda_l1': hp.choice('lambda_l1', range(0, 100, 5)),
        'lambda_l2': hp.choice('lambda_l2', range(0, 100, 5)),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
        'min_data_in_leaf': hp.choice('min_data_in_leaf', range(200,10000, 100)),
        'max_bin': hp.uniform('max_bin', 200, 300),
        'min_gain_to_split': hp.uniform('min_gain_to_split', 0,15),
        'n_estimators': hp.choice('n_estimators', range(200,10000, 100)),
        'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart', 'goss', 'rf'])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = score_resamplingCV(best_param, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'LightGBM_RUS',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## SMOTE & TomekLink

In [None]:
def resamplingCV(params, X=original_Xtrain, y=original_ytrain, n_jobs=-1):
    
    skf = StratifiedKFold(shuffle=True)
    total =0
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        sampling=SMOTETomek(tomek=TomekLinks())
        X_sampling, y_sampling = sampling.fit_resample(X_train, y_train)
        
        pipe.fit(X_sampling, y_sampling)
        s=pipe.score(X_test, y_test)
        total= total + s
    score = total/5            
    return score

In [None]:
from sklearn.metrics import balanced_accuracy_score
def score_resamplingCV(params, X=original_Xtrain, y=original_ytrain, n_jobs=-1):
    
    skf = StratifiedKFold(shuffle=True)
    total_accuracy =np.array([])
    total_balanced_accuracy =np.array([])
    total_precision =np.array([])
    total_recall =np.array([])
    total_MCC =np.array([])
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        sampling=SMOTETomek(tomek=TomekLinks())
        X_sampling, y_sampling = sampling.fit_resample(X_train, y_train)
        
        pipe.fit(X_sampling, y_sampling)
        y_pred = pipe.predict(X_test)
        
        # Calculation
        test_accuracy=accuracy_score(y_test, y_pred)
        test_balanced_accuracy=balanced_accuracy_score(y_test, y_pred)
        test_precision=precision_score(y_test, y_pred)
        test_recall=recall_score(y_test, y_pred)
        test_MCC=matthews_corrcoef(y_test, y_pred)
        
        total_accuracy =np.append(total_accuracy ,np.array([test_accuracy]), axis=0)
        total_balanced_accuracy =np.append(total_balanced_accuracy ,np.array([test_balanced_accuracy]), axis=0)
        total_precision =np.append(total_precision ,np.array([test_precision]), axis=0)
        total_recall =np.append(total_recall ,np.array([test_recall]), axis=0)
        total_MCC =np.append(total_MCC ,np.array([test_MCC]), axis=0)
           
    metrics = {}
    metrics['test_accuracy']= total_accuracy
    metrics['test_balanced_accuracy']=total_balanced_accuracy
    metrics['test_precision']=total_precision
    metrics['test_recall']=total_recall
    metrics['test_MCC']=total_MCC
    return metrics

## NB

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', GaussianNB())])

space = {
        'classifier__var_smoothing': hp.quniform('classifier__var_smoothing',0,1,0.0001)}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = score_resamplingCV(best_param, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'NB_SmoteTomek',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## DT

In [None]:
pipe= DecisionTreeClassifier()

space = {
                  'criterion': hp.choice('criterion', ['gini', 'entropy']),
                  'max_depth': hp.quniform('max_depth', 3,15,1),
                  'max_features': hp.choice('max_features', ['sqrt', 'log2']),
                  'min_samples_leaf': hp.uniform('min_samples_leaf',1,5),
                  'min_samples_split': hp.uniform('min_samples_split',1,8)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


print('Best parameters: {}'.format(best_param))

# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'DT_SmoteTomek',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## SVC

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', SVC())])

space = {
                 
            'classifier__C': hp.choice('classifier__C', [10, 100, 1000, 10000]),
            'classifier__gamma': hp.choice('classifier__gamma', [0.01, 0.001, 0.0001]),
            'classifier__kernel': hp.choice('classifier__kernel', ['poly', 'rbf','sigmoid'])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'SVC_SmoteTomek',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## KNN

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', KNeighborsClassifier())])

space = {
                'classifier__metric': hp.choice('classifier__metric', ['euclidean', 'manhattan','minkowski']),
                'classifier__n_neighbors': hp.uniform('classifier__n_neighbors', 10,20)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperopt will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'KNN_SmoteTomek',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## Bagging

In [None]:
pipe= BaggingClassifier()

space = {
            'n_estimators': hp.choice('n_estimators', range(200,10000, 100)),
            'classifier__max_features': hp.quniform('classifier__max_features',10,50,1),
            'classifier__bootstrap': hp.choice('classifier__bootstrap', [True, False]),
            'classifier__bootstrap_features': hp.choice('classifier__bootstrap_features', [True, False])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'Bagging_SmoteTomek',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## RF

In [None]:
pipe= RandomForestClassifier()

space = {
            'n_estimators': hp.choice('n_estimators', range(200,10000, 100)),
            'max_depth': hp.quniform('max_depth', 1, 30, 1),
            "criterion": hp.choice("criterion", ["gini", "entropy"]),
            'max_features': hp.uniform('max_features', 0.0001,1),
            'min_samples_leaf':hp.uniform('min_samples_leaf',1,5),
            'min_samples_split':hp.uniform('min_samples_split',2,6)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)


print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'RF_SmoteTomek',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## XGBoost

In [None]:
pipe= xgb.XGBClassifier()

space = {
    'max_depth': hp.quniform('max_depth', 1, 30, 1),
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1),
    'gamma': hp.uniform('gamma', 0.01, .7),
    'num_leaves': hp.choice('num_leaves', range(20, 250, 10)),
    'min_child_samples': hp.choice('min_child_samples', range(100, 250, 10)),
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, 1),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, 1),
    'n_estimators': hp.choice('n_estimators', range(200,10000, 100))

}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'XGBoost_SmoteTomek',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## LightGBM

In [None]:
pipe= lgb.LGBMClassifier()

space = {
        'num_leaves': hp.choice('num_leaves',range(20,3000,20)),
        'max_depth': hp.choice('max_depth', range(1,15,1)),
        'bagging_fraction': hp.uniform('bagging_fraction', 0, 1),
        'feature_fraction':  hp.uniform('feature_fraction', 0, 1),
        'min_sum_hessian_in_leaf': hp.uniform('min_sum_hessian_in_leaf', 0.0005, 0.010),
        'lambda_l1': hp.choice('lambda_l1', range(0, 100, 5)),
        'lambda_l2': hp.choice('lambda_l2', range(0, 100, 5)),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
        'min_data_in_leaf': hp.choice('min_data_in_leaf', range(200,10000, 100)),
        'max_bin': hp.uniform('max_bin', 200, 300),
        'min_gain_to_split': hp.uniform('min_gain_to_split', 0,15),
        'n_estimators': hp.choice('n_estimators', range(200,10000, 100)),
        'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart', 'goss', 'rf'])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = score_resamplingCV(best_param, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'LightGBM_SmoteTomek',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## IW-SMOTE

In [None]:
# the classfication and regression tree
def CART(X=original_Xtrain, y=original_ytrain, XX=original_Xtest):
    model = DecisionTreeClassifier()
    model.fit(X, y)
    predicted = model.predict(XX)
    return predicted


class IW_SMOTE():
    def __init__(self, data, target, balance=1):
        self.balance = balance;  # Sampling rate
        self.data = original_Xtrain # The training set
        self.target = original_ytrain # The labels of training set

    """
            :param lamda: lamda*imbalance ratio = the number of cart
            :param thres: The threshold of filtering noise
            :param k_neighbor: k nearest neighbor
            :param divide_times: The ratio of under sampling minority samples
            :param gen_times: The ratio of generated minority class to majority class samples
            :return: The synthetic samples, attributes and labels
    """
    def IW_SMOTE(self, lamda=100, thres=0.5, divide_times=2, gen_times=1, k_neighbor=5):
        # x_x:Temporary variable, save the training set
        x_x = pd.DataFrame(self.data)
        x_x[len(x_x.columns)] = self.target
        m, n = len(x_x), len(x_x.columns) #m: the number of trainning set; n: the number of columns of the trainning set
        z = x_x[x_x[n - 1] == 1]  # acquire the minority set
        p = x_x[x_x[n - 1] == -1]  # acquire the majority set
        m1, n1 = len(z), len(z.columns) # m1: the number of minority samples; n1: the number of columns of the minority samples
        m2, n2 = len(p), len(p.columns) # m2: the number of majority samples; n2: the number of columns of the majority samples
        IR = m2 / m1  # imbalance ratio
        predict_min_labelset = pd.DataFrame(columns = range(int(IR * lamda))) # the predicted labels of the minority samples
        predict_maj_labelset = pd.DataFrame(columns = range(int(IR * lamda))) # the predicted labels of the majority samples
        # train under-bagging CART
        for i_1 in range(int(IR * lamda)):
            train_subset = z.sample(int(m1 / divide_times)) # train_subset: the subset of training set
            train_subset = train_subset.append(p.sample(int(m1 / divide_times), replace=True))
            predict_maj_labelset[i_1] = CART(np.array(train_subset.iloc[:, 0:n1 - 1]), np.array(train_subset[n1 - 1]),
                                             np.array(p.iloc[:, 0:n2 - 1]))
            predict_min_labelset[i_1] = CART(np.array(train_subset.iloc[:, 0:n1 - 1]), np.array(train_subset[n1 - 1]),
                                             np.array(z.iloc[:, 0:n1 - 1]))
        # filterring noise
        err_rate_min = []  # record the error rate of the reserved minority instance
        reserve_min = []  # record the reserved minority instances
        num_reserve_min = 0  # the number of minority samples after denoising
        z1 = np.array(z)
        predict_min_labelset = np.array(predict_min_labelset)
        for i_2 in range(m1):
            num_right = 0  # record the number of instance which is predicted accurately
            for j in range(int(IR * lamda)):
                if predict_min_labelset[i_2][j] == z1[i_2][n1 - 1]:
                    num_right = num_right + 1
            if ((int(IR * lamda) - num_right) / int(IR * lamda) < thres):
                num_reserve_min += 1
                reserve_min.append(z1[i_2])
                if (int(IR * lamda) - num_right) / int(IR * lamda) < 1 / int(IR * lamda):
                    err_rate_min.append(1 / int(IR * lamda))
                else:
                    err_rate_min.append((int(IR * lamda) - num_right) / int(IR * lamda))
        reserve_min = pd.DataFrame(reserve_min)
        err_rate_min = pd.DataFrame(err_rate_min)
        err_rate_maj = []  # record the error rate of the reserved minority instance
        reserve_maj = []  # record the reserved minority instances
        num_reserve_maj = 0  # the number of majority samples after denoising
        p1 = np.array(p)
        predict_maj_labelset = np.array(predict_maj_labelset)
        for i_3 in range(m2):
            num_right = 0  # record the number of instance which is predicted accurately
            for j in range(int(IR * lamda)):
                if predict_maj_labelset[i_3][j] == p1[i_3][n2 - 1]:
                    num_right = num_right + 1
            if ((int(IR * lamda) - num_right) / int(IR * lamda) < thres):
                num_reserve_maj += 1
                reserve_maj.append(p1[i_3])
                if (int(IR * lamda) - num_right) / int(IR * lamda) < 1 / int(IR * lamda):
                    err_rate_maj.append(1 / int(IR * lamda))
                else:
                    err_rate_maj.append((int(IR * lamda) - num_right) / int(IR * lamda))
        reserve_maj = pd.DataFrame(reserve_maj)

        # generate the synthetic minority instances
        weight = err_rate_min[0] / sum(err_rate_min[0])  # Record the importance of each sample
        num_need_generate = gen_times * num_reserve_maj - num_reserve_min  # The number of minority samples that need to be synthesized
        if num_need_generate == num_reserve_maj:
            return np.array(reserve_maj.iloc[:, 0:len(reserve_maj.columns) - 1]), np.array(
                reserve_maj[len(reserve_maj.columns) - 1])
        else:
            num_generate = 0 # the number of be generated
            new_set = pd.DataFrame(columns=range(n1))
            for i_4 in range(num_reserve_min):
                reserve_min_1 = reserve_min
                nums = pd.DataFrame(weight * (gen_times * num_reserve_maj - num_reserve_min)).iloc[i_4, 0]
                reserve_min_1 = np.array(reserve_min_1)
                dis = [0] * num_reserve_min # distance matrix of per sample
                for m in range(num_reserve_min):
                    dis[m] = np.linalg.norm(reserve_min_1[i_4] - reserve_min_1[m])
                b = sorted(enumerate(dis), key=lambda xxx: xxx[1]) #Sorted dis
                b = b[1:k_neighbor + 1] # choice knn
                for j in range(int(nums)):
                    num_generate = num_generate + 1
                    s_b = random.choice(b)
                    select_ins = reserve_min.iloc[s_b[0], :]
                    new_ins = (reserve_min.iloc[i_4, :] - pd.DataFrame(select_ins).T) * random.random() + pd.DataFrame(
                        select_ins).T  # generate funtion
                    new_set = new_set.append(pd.DataFrame(new_ins))  # add the new instance into a temporary set
            new_z = reserve_min.append(new_set) # The minority synthetic samples
            new_original_data = reserve_maj.append(new_z) # The synthetic samples
            new_original_data.index = range(len(new_original_data))
            # Returns an oversampled dataset
            return np.array(new_original_data.iloc[:, 0:len(new_original_data.columns) - 1]), np.array(new_original_data[len(new_original_data.columns) - 1])

In [None]:
def score_resamplingCV(params, X=original_Xtrain, y=original_ytrain, n_jobs=-1):
    
    skf = StratifiedKFold(shuffle=True)
    total_accuracy =np.array([])
    total_balanced_accuracy =np.array([])
    total_precision =np.array([])
    total_recall =np.array([])
    total_MCC =np.array([])
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        X_sampling =IW_SMOTE(X_train, y_train).data
        y_sampling =IW_SMOTE(X_train, y_train).target 
        
        pipe.fit(X_sampling, y_sampling)
        y_pred = pipe.predict(X_test)
        
        # Calculation
        test_accuracy=accuracy_score(y_test, y_pred)
        test_balanced_accuracy=balanced_accuracy_score(y_test, y_pred)
        test_precision=precision_score(y_test, y_pred)
        test_recall=recall_score(y_test, y_pred)
        test_MCC=matthews_corrcoef(y_test, y_pred)
        
        total_accuracy =np.append(total_accuracy ,np.array([test_accuracy]), axis=0)
        total_balanced_accuracy =np.append(total_balanced_accuracy ,np.array([test_balanced_accuracy]), axis=0)
        total_precision =np.append(total_precision ,np.array([test_precision]), axis=0)
        total_recall =np.append(total_recall ,np.array([test_recall]), axis=0)
        total_MCC =np.append(total_MCC ,np.array([test_MCC]), axis=0)
           
    metrics = {}
    metrics['test_accuracy']= total_accuracy
    metrics['test_balanced_accuracy']=total_balanced_accuracy
    metrics['test_precision']=total_precision
    metrics['test_recall']=total_recall
    metrics['test_MCC']=total_MCC
    return metrics

## NB

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', GaussianNB())])

space = {
'classifier__var_smoothing': hp.quniform('classifier__var_smoothing',0,1,0.0001)}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = score_resamplingCV(best_param, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "/n Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "/n Average Precision: {}".format(metrics['test_precision'].mean()),
      "/n Average Recall: {}".format(metrics['test_recall'].mean()),
      "/n Average MCC: {}".format(metrics['test_MCC'].mean()),
      "/n Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'NB_IWSmote',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## DT

In [None]:
pipe= DecisionTreeClassifier()

space = {
                  'criterion': hp.choice('criterion', ['gini', 'entropy']),
                  'max_depth': hp.quniform('max_depth', 3,15,1),
                  'max_features': hp.choice('max_features', ['sqrt', 'log2']),
                  'min_samples_leaf': hp.uniform('min_samples_leaf',1,5),
                  'min_samples_split': hp.uniform('min_samples_split',1,8)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


print('Best parameters: {}'.format(best_param))

# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'DT_IWSmote',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## SVC

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', SVC())])

space = {
                 
            'classifier__C': hp.choice('classifier__C', [10, 100, 1000, 10000]),
            'classifier__gamma': hp.choice('classifier__gamma', [0.01, 0.001, 0.0001]),
            'classifier__kernel': hp.choice('classifier__kernel', ['poly', 'rbf','sigmoid'])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'SVC_IWSmote',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## KNN

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', KNeighborsClassifier())])

space = {
                'classifier__metric': hp.choice('classifier__metric', ['euclidean', 'manhattan','minkowski']),
                'classifier__n_neighbors': hp.uniform('classifier__n_neighbors', 10,20)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperopt will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'KNN_IWSmote',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## Bagging

In [None]:
pipe= BaggingClassifier()

space = {
            'n_estimators': hp.choice('n_estimators', range(200,10000, 100)),
            'classifier__max_features': hp.quniform('classifier__max_features',10,50,1),
            'classifier__bootstrap': hp.choice('classifier__bootstrap', [True, False]),
            'classifier__bootstrap_features': hp.choice('classifier__bootstrap_features', [True, False])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'Bagging_IWSmote',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## RF

In [None]:
pipe= RandomForestClassifier()

space = {
            'n_estimators': hp.choice('n_estimators', range(200,10000, 100)),
            'max_depth': hp.quniform('max_depth', 1, 30, 1),
            "criterion": hp.choice("criterion", ["gini", "entropy"]),
            'max_features': hp.uniform('max_features', 0.0001,1),
            'min_samples_leaf':hp.uniform('min_samples_leaf',1,5),
            'min_samples_split':hp.uniform('min_samples_split',2,6)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)


print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'RF_IWSmote',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## XGBoost

In [None]:
pipe= xgb.XGBClassifier()

space = {
    'max_depth': hp.quniform('max_depth', 1, 30, 1),
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1),
    'gamma': hp.uniform('gamma', 0.01, .7),
    'num_leaves': hp.choice('num_leaves', range(20, 250, 10)),
    'min_child_samples': hp.choice('min_child_samples', range(100, 250, 10)),
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, 1),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, 1),
    'n_estimators': hp.choice('n_estimators', range(200,10000, 100))

}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'XGBoost_IWSmote',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## LightGBM 

In [None]:
pipe= lgb.LGBMClassifier()

space = {
        'num_leaves': hp.choice('num_leaves',range(20,3000,20)),
        'max_depth': hp.choice('max_depth', range(1,15,1)),
        'bagging_fraction': hp.uniform('bagging_fraction', 0, 1),
        'feature_fraction':  hp.uniform('feature_fraction', 0, 1),
        'min_sum_hessian_in_leaf': hp.uniform('min_sum_hessian_in_leaf', 0.0005, 0.010),
        'lambda_l1': hp.choice('lambda_l1', range(0, 100, 5)),
        'lambda_l2': hp.choice('lambda_l2', range(0, 100, 5)),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
        'min_data_in_leaf': hp.choice('min_data_in_leaf', range(200,10000, 100)),
        'max_bin': hp.uniform('max_bin', 200, 300),
        'min_gain_to_split': hp.uniform('min_gain_to_split', 0,15),
        'n_estimators': hp.choice('n_estimators', range(200,10000, 100)),
        'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart', 'goss', 'rf'])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=50, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = score_resamplingCV(best_param, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata = Comparingdata.append({'Classifier' : 'LightGBM_IWSmote',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## Table

In [None]:
Table = tabulate(Comparingdata, headers = 'keys', tablefmt = 'html')
Table

## Visualisations 

In [None]:
Visualisation = Comparingdata.set_index('Classifier')
plt.rcParams['figure.figsize']=[20,30]

In [None]:
MCC_plot = Visualisation.sort_values('MCC', ascending=False).MCC.plot.bar()
MCC_plot
MCC_plot.figure.savefig("MCC_plot.png",
                    format='png',
                    dpi=150)

In [None]:
Execution_time_plot = Visualisation.sort_values('Execution_time', ascending=False).Execution_time.plot.barh()
Execution_time_plot
Execution_time_plot.figure.savefig("Execution_time_plot.png",
                    format='png',
                    dpi=150)

In [None]:
Recall_plot = Visualisation.sort_values('Recall', ascending=False).Recall.plot.barh()
Recall_plot
Recall_plot.figure.savefig("Recall_plot.png",
                    format='png',
                    dpi=150)

In [None]:
Precision_plot = Visualisation.sort_values('Precision', ascending=False).Precision.plot.barh()
Precision_plot
Precision_plot.figure.savefig("Precision_plot.png",
                    format='png',
                    dpi=150)

In [None]:
Balanced_Accuracy_plot = Visualisation.sort_values('Balanced_Accuracy', ascending=False).Balanced_Accuracy.plot.barh()
Balanced_Accuracy_plot
Balanced_Accuracy_plot.figure.savefig("Balanced_Accuracy_plot.png",
                    format='png',
                    dpi=150)

________________________________________________________________________________________________________________________________

### Tables for Latex

In [None]:
ExecutionTimes = [['NB', 49.3], ['NB+RUS', 40.1], ['NB+STL', 19099.7], 
                  ['NB+IW', 19076.6], 
                  ['KNN', 1686.6], ['KNN+RUS', 1728.5], ['KNN+STL', 23635.2], 
                  ['KNN+IW', 24752.6],
                  ['SVC', 14199.5], ['SVC+RUS', 14450.8], ['SVC+STL', 140446], 
                  ['SVC+IW', 151178],
                  ['DT', 254.1], ['DT+RUS', 323.8], ['DT+STL', 19970.5], 
                  ['DT+IW', 20253.8],
                  ['RF', 1817.5], ['RF+RUS', 3092.62], ['RF+STL', 30567.9], 
                  ['RF+IW', 30588.3],
                  ['BEC', 1361.9], ['BEC+RUS', 2088.47], ['BEC+STL', 27055.7], 
                  ['BEC+IW', 27205.1],
                  ['XGB', 2386.8], ['XGB+RUS', 913.6], ['XGB+STL', 22018.2], 
                  ['XGB+IW', 21795.5],
                  ['LGBM', 200.3], ['LGBM+RUS', 127.5], ['LGBM+STL', 19322.2], 
                  ['LGBM+IW', 19285.2]]

In [None]:
ExecutionTimesTable = pd.DataFrame(
    ExecutionTimes, columns=['Model', 'Execution time in seconds'])

In [None]:
ExecutionTimesTable = ExecutionTimesTable.set_index('Model')

In [None]:
plt.rcParams['figure.figsize']=[11,2]
ExecutionTimesViz = ExecutionTimesTable.sort_values('Execution time in seconds', ascending=False).plot.bar(width=0.9, color='green')
plt.title('Model Execution Time Plot in Descending Order')
plt.xlabel(None)
for label in (ExecutionTimesViz.get_xticklabels() + ExecutionTimesViz.get_yticklabels()):
    label.set_fontsize(9)
ExecutionTimesViz.figure.savefig('Model_Execution_Time_Plot.png',
                                 format='png',
                                 bbox_inches='tight',
                                 dpi=300)

-----------------

In [None]:
Accuracy =       [['NB', 0.694], ['NB+RUS', 0.678], ['NB+STL', 0.641], 
                  ['NB+IW', 0.969], 
                  ['KNN', 0.773], ['KNN+RUS', 0.773], ['KNN+STL', 0.773], 
                  ['KNN+IW', 0.773],
                  ['SVC', 0.780], ['SVC+RUS', 0.780], ['SVC+STL', 0.780], 
                  ['SVC+IW', 0.780],
                  ['DT', 0.728], ['DT+RUS', 0.726], ['DT+STL', 0.727], 
                  ['DT+IW', 0.727],
                  ['RF', 0.806], ['RF+RUS', 0.805], ['RF+STL', 0.806], 
                  ['RF+IW', 0.805],
                  ['BEC', 0.787], ['BEC+RUS', 0.786], ['BEC+STL', 0.789], 
                  ['BEC+IW', 0.786],
                  ['XGB', 0.800], ['XGB+RUS', 0.800], ['XGB+STL', 0.800], 
                  ['XGB+IW', 0.800],
                  ['LGBM', 0.808], ['LGBM+RUS', 0.738], ['LGBM+STL', 0.805], 
                  ['LGBM+IW', 0.835]]

In [None]:
AccuracyTable = pd.DataFrame(
    Accuracy, columns=['Model', 'Values'])
AccuracyTable = AccuracyTable.set_index('Model')

In [None]:
plt.rcParams['figure.figsize']=[10,2]
AccuracyViz = AccuracyTable.sort_values(
    'Values', ascending=False).plot.bar(
    width=0.8, color='green')
plt.title('Accuracy Values in Descending Order')
plt.xlabel(None)
for label in (AccuracyViz.get_xticklabels() + AccuracyViz.get_yticklabels()):
    label.set_fontsize(9)
AccuracyViz.figure.savefig('Accuracy_Plot.png',
                                 format='png',
                                 bbox_inches='tight',
                                 dpi=300)

-----------------

In [None]:
Balanced_Accuracy = [['NB', 0.730], ['NB+RUS', 0.725], ['NB+STL', 0.710], 
                  ['NB+IW', 0.730], 
                  ['KNN', 0.654], ['KNN+RUS', 0.654], ['KNN+STL', 0.654], 
                  ['KNN+IW', 0.654],
                  ['SVC', 0.577], ['SVC+RUS', 0.577], ['SVC+STL', 0.577], 
                  ['SVC+IW', 0.577],
                  ['DT', 0.640], ['DT+RUS', 0.638], ['DT+STL', 0.638], 
                  ['DT+IW', 0.641],
                  ['RF', 0.672], ['RF+RUS', 0.673], ['RF+STL', 0.673], 
                  ['RF+IW', 0.672],
                  ['BEC', 0.650], ['BEC+RUS', 0.648], ['BEC+STL', 0.651], 
                  ['BEC+IW', 0.648],
                  ['XGB', 0.683], ['XGB+RUS', 0.683], ['XGB+STL', 0.683], 
                  ['XGB+IW', 0.683],
                  ['LGBM', 0.688], ['LGBM+RUS', 0.750], ['LGBM+STL', 0.697], 
                  ['LGBM+IW', 0.725]]

In [None]:
Balanced_AccuracyTable = pd.DataFrame(
    Balanced_Accuracy, columns=['Model', 'Values'])
Balanced_AccuracyTable = Balanced_AccuracyTable.set_index('Model')

In [None]:
plt.rcParams['figure.figsize']=[10,2]
Balanced_AccuracyViz = Balanced_AccuracyTable.sort_values(
    'Values', ascending=False).plot.bar(
    width=0.8, color='green')
plt.title('Balanced_Accuracy Values in Descending Order')
plt.xlabel(None)
for label in (Balanced_AccuracyViz.get_xticklabels() + Balanced_AccuracyViz.get_yticklabels()):
    label.set_fontsize(9)
Balanced_AccuracyViz.figure.savefig('Balanced__Plot.png',
                                 format='png',
                                 bbox_inches='tight',
                                 dpi=300)

---------------------------------------

In [None]:
Precision = [['NB', 0.434], ['NB+RUS', 0.421], ['NB+STL', 0.394], 
                  ['NB+IW', 0.436], 
                  ['KNN', 0.551], ['KNN+RUS', 0.551], ['KNN+STL', 0.551], 
                  ['KNN+IW', 0.551],
                  ['SVC', 0.717], ['SVC+RUS', 0.717], ['SVC+STL', 0.717], 
                  ['SVC+IW', 0.717],
                  ['DT', 0.449], ['DT+RUS', 0.447], ['DT+STL', 0.447], 
                  ['DT+IW', 0.449],
                  ['RF', 0.674], ['RF+RUS', 0.670], ['RF+STL', 0.676], 
                  ['RF+IW', 0.672],
                  ['BEC', 0.608], ['BEC+RUS', 0.607], ['BEC+STL', 0.615], 
                  ['BEC+IW', 0.607],
                  ['XGB', 0.634], ['XGB+RUS', 0.634], ['XGB+STL', 0.634], 
                  ['XGB+IW', 0.634],
                  ['LGBM', 0.660], ['LGBM+RUS', 0.480], ['LGBM+STL', 0.636], 
                  ['LGBM+IW', 0.734]]

In [None]:
PrecisionTable = pd.DataFrame(
    Precision, columns=['Model', 'Values'])
PrecisionTable = PrecisionTable.set_index('Model')

In [None]:
plt.rcParams['figure.figsize']=[10,2]
PrecisionViz = PrecisionTable.sort_values(
    'Values', ascending=False).plot.bar(
    width=0.8, color='green')
plt.title('Precision Values in Descending Order')
plt.xlabel(None)
for label in (PrecisionViz.get_xticklabels() + PrecisionViz.get_yticklabels()):
    label.set_fontsize(9)
PrecisionViz.figure.savefig('Precision_Plot.png',
                                 format='png',
                                 bbox_inches='tight',
                                 dpi=300)

---------------------------------------

In [None]:
Recall = [['NB', 0.799], ['NB+RUS', 0.818], ['NB+STL', 0.848], 
                  ['NB+IW', 0.798], 
                  ['KNN', 0.419], ['KNN+RUS', 0.419], ['KNN+STL', 0.419], 
                  ['KNN+IW', 0.419],
                  ['SVC', 0.176], ['SVC+RUS', 0.176], ['SVC+STL', 0.176], 
                  ['SVC+IW', 0.176],
                  ['DT', 0.467], ['DT+RUS', 0.465], ['DT+STL', 0.464], 
                  ['DT+IW', 0.470],
                  ['RF', 0.409], ['RF+RUS', 0.412], ['RF+STL', 0.411], 
                  ['RF+IW', 0.409],
                  ['BEC', 0.380], ['BEC+RUS', 0.376], ['BEC+STL', 0.380], 
                  ['BEC+IW', 0.375],
                  ['XGB', 0.450], ['XGB+RUS', 0.450], ['XGB+STL', 0.450], 
                  ['XGB+IW', 0.450],
                  ['LGBM', 0.453], ['LGBM+RUS', 0.775], ['LGBM+STL', 0.484], 
                  ['LGBM+IW', 0.510]]

In [None]:
RecallTable = pd.DataFrame(
    Recall, columns=['Model', 'Values'])
RecallTable = RecallTable.set_index('Model')

In [None]:
plt.rcParams['figure.figsize']=[10,2]
RecallViz = RecallTable.sort_values(
    'Values', ascending=False).plot.bar(
    width=0.8, color='green')
plt.title('Recall Values in Descending Order')
plt.xlabel(None)
for label in (RecallViz.get_xticklabels() + RecallViz.get_yticklabels()):
    label.set_fontsize(9)
RecallViz.figure.savefig('Recall_Plot.png',
                                 format='png',
                                 bbox_inches='tight',
                                 dpi=300)

---------------------------

In [None]:
MCC = [['NB', 0.397], ['NB+RUS', 0.388], ['NB+STL', 0.363], 
                  ['NB+IW', 0.399], 
                  ['KNN', 0.340], ['KNN+RUS', 0.339], ['KNN+STL', 0.340], 
                  ['KNN+IW', 0.339],
                  ['SVC', 0.277], ['SVC+RUS', 0.277], ['SVC+STL', 0.277], 
                  ['SVC+IW', 0.277],
                  ['DT', 0.276], ['DT+RUS', 0.273], ['DT+STL', 0.273], 
                  ['DT+IW', 0.277],
                  ['RF', 0.416], ['RF+RUS', 0.415], ['RF+STL', 0.418], 
                  ['RF+IW', 0.415],
                  ['BEC', 0.358], ['BEC+RUS', 0.355], ['BEC+STL', 0.362], 
                  ['BEC+IW', 0.355],
                  ['XGB', 0.414], ['XGB+RUS', 0.414], ['XGB+STL', 0.414], 
                  ['XGB+IW', 0.414],
                  ['LGBM', 0.433], ['LGBM+RUS', 0.441], ['LGBM+STL', 0.434], 
                  ['LGBM+IW', 0.517]]

In [None]:
MCCTable = pd.DataFrame(
    MCC, columns=['Model', 'Values'])
MCCTable = MCCTable.set_index('Model')

In [None]:
plt.rcParams['figure.figsize']=[10,2]
MCCViz = MCCTable.sort_values(
    'Values', ascending=False).plot.bar(
    width=0.8, color='green')
plt.title('MCC Values in Descending Order')
plt.xlabel(None)
for label in (MCCViz.get_xticklabels() + MCCViz.get_yticklabels()):
    label.set_fontsize(9)
MCCViz.figure.savefig('MCC_Plot.png',
                                 format='png',
                                 bbox_inches='tight',
                                 dpi=300)

________________________________________________________________________________________________________________________________

________________________________________________________________________________________________________________________________

# Highly imbalanced data

In [None]:
highly_imbalanced_df = imputted_data.drop(imputted_data.query('default_ind == 1').sample(n=19087).index)
highly_imbalanced_df.head()

In [None]:
val_counts_imbalanced = highly_imbalanced_df.default_ind.value_counts()
print(val_counts_imbalanced)

print('---------------------------------------')
print('No Frauds', round(highly_imbalanced_df['default_ind'].value_counts()[0]/len(highly_imbalanced_df) * 100,2), '% of the dataset')
print('Frauds', round(highly_imbalanced_df['default_ind'].value_counts()[1]/len(highly_imbalanced_df) * 100,2), '% of the dataset')

In [None]:
# Splitting data
X = highly_imbalanced_df.drop('default_ind', axis=1)
y = highly_imbalanced_df['default_ind']

original_Xtrain, original_Xtest, original_ytrain, original_ytest = train_test_split(
X, y, test_size=0.33, random_state=42, shuffle=True, stratify=y)  
    
# Turn into an array
original_Xtrain = original_Xtrain.values
original_Xtest = original_Xtest.values
original_ytrain = original_ytrain.values
original_ytest = original_ytest.values

# See if both the train and test label distribution are similarly distributed
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)
print('-' * 100)

print('Label Distributions: \n')
print(train_counts_label/ len(original_ytrain))
print(test_counts_label/ len(original_ytest))

# HyperOpt

## NB

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', GaussianNB())])

space = {
    'classifier__var_smoothing': hp.quniform('classifier__var_smoothing',0,1,0.0001)
}


# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=objective, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'NB_HyperOpt_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## DT

In [None]:
pipe= DecisionTreeClassifier()

space = {
                  'criterion': hp.choice('criterion', ['gini', 'entropy']),
                  'max_depth': hp.quniform('max_depth', 3,15,1),
                  'max_features': hp.choice('max_features', ['sqrt', 'log2']),
                  'min_samples_leaf': hp.uniform('min_samples_leaf',1,5),
                  'min_samples_split': hp.uniform('min_samples_split',1,8)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=objective, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


print('Best parameters: {}'.format(best_param))

# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'DT_HyperOpt_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## SVC

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', SVC())])

space = {
                 
            'classifier__C': hp.choice('classifier__C', [10, 100, 1000, 10000]),
            'classifier__gamma': hp.choice('classifier__gamma', [0.01, 0.001, 0.0001]),
            'classifier__kernel': hp.choice('classifier__kernel', ['poly', 'rbf','sigmoid'])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=objective, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'SVC_HyperOpt_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## KNN

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', KNeighborsClassifier())])

space = {
                'classifier__metric': hp.choice('classifier__metric', ['euclidean', 'manhattan','minkowski']),
                'classifier__n_neighbors': hp.uniform('classifier__n_neighbors', 10,20)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=objective, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperopt will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'KNN_HyperOpt_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## Bagging

In [None]:
pipe= BaggingClassifier()

space = {
            'n_estimators': hp.choice('n_estimators', range(200,10000, 100)),
            'classifier__max_features': hp.quniform('classifier__max_features',10,50,1),
            'classifier__bootstrap': hp.choice('classifier__bootstrap', [True, False]),
            'classifier__bootstrap_features': hp.choice('classifier__bootstrap_features', [True, False])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=objective, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI= Comparingdata_HI.append({'Classifier' : 'Bagging_HyperOpt_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## RF

In [None]:
pipe= RandomForestClassifier()

space = {
            'n_estimators': hp.choice('n_estimators', range(200,10000, 100)),
            'max_depth': hp.quniform('max_depth', 1, 45, 1),
            "criterion": hp.choice("criterion", ["gini", "entropy"]),
            'max_features': hp.uniform('max_features', 0.0001,1),
            'min_samples_leaf':hp.uniform('min_samples_leaf',1,5),
            'min_samples_split':hp.uniform('min_samples_split',2,6)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=objective, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)


print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'RF_HyperOpt_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## XGBoost

In [None]:
pipe= xgb.XGBClassifier()

space = {
    'max_depth': hp.quniform('max_depth', 1, 45, 1),
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1),
    'gamma': hp.uniform('gamma', 0.01, .7),
    'num_leaves': hp.choice('num_leaves', range(20, 250, 10)),
    'min_child_samples': hp.choice('min_child_samples', range(100, 250, 10)),
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, 1),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, 1),
    'n_estimators': hp.choice('n_estimators', range(200,10000, 100))

}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=objective, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'XGBoost_HyperOpt_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## LightGBM

In [None]:
pipe= lgb.LGBMClassifier()

space = {
        'num_leaves': hp.choice('num_leaves',range(1,1000,1)),
        'max_depth': hp.choice('max_depth', range(1,45,1)),
        'lambda_l1': hp.choice('lambda_l1', range(0, 100, 5)),
        'lambda_l2': hp.choice('lambda_l2', range(0, 100, 5)),
        'learning_rate': hp.quniform('learning_rate', 0, 0.5, 0.001),
        'min_data_in_leaf': hp.choice('min_data_in_leaf', range(200,10000, 100)),
        'max_bin': hp.uniform('max_bin', 200, 300),
        'min_gain_to_split': hp.uniform('min_gain_to_split', 0,15),
        'n_estimators': hp.choice('n_estimators', range(100,10000, 100)),
        'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart', 'goss', 'rf'])

}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=objective, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=100, # maximum number of iterations
          trials=trials, # logging
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time

metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'LightGBM_HyperOpt_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

________________________________________________________________________________________________________________________________

# Sampling

## Random Undersampling

In [None]:
def resamplingCV(params, X=original_Xtrain, y=original_ytrain, n_jobs=-1):
    
    skf = StratifiedKFold(shuffle=True)
    total =0
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        sampling=RandomUnderSampler()
        X_sampling, y_sampling = sampling.fit_resample(X_train, y_train)
        
        pipe.fit(X_sampling, y_sampling)
        s=pipe.score(X_test, y_test)
        total= total + s
    score = total/5            
    return score

In [None]:
from sklearn.metrics import balanced_accuracy_score
def score_resamplingCV(params, X=original_Xtrain, y=original_ytrain, n_jobs=-1):
    
    skf = StratifiedKFold(shuffle=True)
    total_accuracy =np.array([])
    total_balanced_accuracy =np.array([])
    total_precision =np.array([])
    total_recall =np.array([])
    total_MCC =np.array([])
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        sampling=RandomUnderSampler()
        X_sampling, y_sampling = sampling.fit_resample(X_train, y_train)
        
        pipe.fit(X_sampling, y_sampling)
        y_pred = pipe.predict(X_test)
        
        # Calculation
        test_accuracy=accuracy_score(y_test, y_pred)
        test_balanced_accuracy=balanced_accuracy_score(y_test, y_pred)
        test_precision=precision_score(y_test, y_pred)
        test_recall=recall_score(y_test, y_pred)
        test_MCC=matthews_corrcoef(y_test, y_pred)
        
        total_accuracy =np.append(total_accuracy ,np.array([test_accuracy]), axis=0)
        total_balanced_accuracy =np.append(total_balanced_accuracy ,np.array([test_balanced_accuracy]), axis=0)
        total_precision =np.append(total_precision ,np.array([test_precision]), axis=0)
        total_recall =np.append(total_recall ,np.array([test_recall]), axis=0)
        total_MCC =np.append(total_MCC ,np.array([test_MCC]), axis=0)
           
    metrics = {}
    metrics['test_accuracy']= total_accuracy
    metrics['test_balanced_accuracy']=total_balanced_accuracy
    metrics['test_precision']=total_precision
    metrics['test_recall']=total_recall
    metrics['test_MCC']=total_MCC
    return metrics

## NB

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', GaussianNB())])

space = {
'classifier__var_smoothing': hp.quniform('classifier__var_smoothing',0,1,0.0001)}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = score_resamplingCV(best_param, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'NB_RUS_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## DT

In [None]:
pipe= DecisionTreeClassifier()

space = {
                  'criterion': hp.choice('criterion', ['gini', 'entropy']),
                  'max_depth': hp.quniform('max_depth', 1,30,2),
                  'max_features': hp.choice('max_features', ['sqrt', 'log2']),
                  'min_samples_leaf': hp.uniform('min_samples_leaf',1,5),
                  'min_samples_split': hp.uniform('min_samples_split',1,8)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


print('Best parameters: {}'.format(best_param))

# Adding information into the dataframe
Comparingdata_HI= Comparingdata_HI.append({'Classifier' : 'DT_RUS_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## SVC

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', SVC())])

space = {
                 
            'classifier__C': hp.choice('classifier__C', [10, 100, 1000, 10000]),
            'classifier__gamma': hp.choice('classifier__gamma', [0.01, 0.001, 0.0001]),
            'classifier__kernel': hp.choice('classifier__kernel', ['poly', 'rbf','sigmoid'])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'SVC_RUS_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## KNN

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', KNeighborsClassifier())])

space = {
                'classifier__metric': hp.choice('classifier__metric', ['euclidean', 'manhattan','minkowski']),
                'classifier__n_neighbors': hp.uniform('classifier__n_neighbors', 10,20)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperopt will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'KNN_RUS_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## Bagging

In [None]:
pipe= BaggingClassifier()

space = {
            'n_estimators': hp.choice('n_estimators', range(200,10000, 200)),
            'classifier__max_features': hp.quniform('classifier__max_features',10,50,2),
            'classifier__bootstrap': hp.choice('classifier__bootstrap', [True, False]),
            'classifier__bootstrap_features': hp.choice('classifier__bootstrap_features', [True, False])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'Bagging_RUS_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## RF

In [None]:
pipe= RandomForestClassifier()

space = {
            'n_estimators': hp.choice('n_estimators', range(200,10000, 200)),
            'max_depth': hp.quniform('max_depth', 1, 30, 2),
            "criterion": hp.choice("criterion", ["gini", "entropy"]),
            'max_features': hp.uniform('max_features', 0.0001,1),
            'min_samples_leaf':hp.uniform('min_samples_leaf',1,5),
            'min_samples_split':hp.uniform('min_samples_split',2,6)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)


print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'RF_RUS_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## XGBoost

In [None]:
pipe= xgb.XGBClassifier()

space = {
    'max_depth': hp.quniform('max_depth', 1, 30, 2),
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1),
    'gamma': hp.uniform('gamma', 0.01, .7),
    'num_leaves': hp.choice('num_leaves', range(20, 250, 20)),
    'min_child_samples': hp.choice('min_child_samples', range(100, 250, 20)),
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, 1),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, 1),
    'n_estimators': hp.choice('n_estimators', range(200,10000, 200))

}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'XGBoost_RUS_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## LightGBM

In [None]:
pipe= lgb.LGBMClassifier()

space = {
        'num_leaves': hp.choice('num_leaves',range(20,3000,20)),
        'max_depth': hp.choice('max_depth', range(1,30,2)),
        'bagging_fraction': hp.uniform('bagging_fraction', 0, 1),
        'feature_fraction':  hp.uniform('feature_fraction', 0, 1),
        'min_sum_hessian_in_leaf': hp.uniform('min_sum_hessian_in_leaf', 0.0005, 0.010),
        'lambda_l1': hp.choice('lambda_l1', range(0, 100, 5)),
        'lambda_l2': hp.choice('lambda_l2', range(0, 100, 5)),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
        'min_data_in_leaf': hp.choice('min_data_in_leaf', range(200,10000, 200)),
        'max_bin': hp.uniform('max_bin', 200, 300),
        'min_gain_to_split': hp.uniform('min_gain_to_split', 0,15),
        'n_estimators': hp.choice('n_estimators', range(200,10000, 200)),
        'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart', 'goss', 'rf'])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = score_resamplingCV(best_param, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI= Comparingdata_HI.append({'Classifier' : 'LightGBM_RUS_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## SMOTE & TomekLink

In [None]:
def resamplingCV(params, X=original_Xtrain, y=original_ytrain, n_jobs=-1):
    
    skf = StratifiedKFold(shuffle=True)
    total =0
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        sampling=SMOTETomek(tomek=TomekLinks())
        X_sampling, y_sampling = sampling.fit_resample(X_train, y_train)
        
        pipe.fit(X_sampling, y_sampling)
        s=pipe.score(X_test, y_test)
        total= total + s
    score = total/5            
    return score

In [None]:
from sklearn.metrics import balanced_accuracy_score
def score_resamplingCV(params, X=original_Xtrain, y=original_ytrain, n_jobs=-1):
    
    skf = StratifiedKFold(shuffle=True)
    total_accuracy =np.array([])
    total_balanced_accuracy =np.array([])
    total_precision =np.array([])
    total_recall =np.array([])
    total_MCC =np.array([])
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        sampling=SMOTETomek(tomek=TomekLinks())
        X_sampling, y_sampling = sampling.fit_resample(X_train, y_train)
        
        pipe.fit(X_sampling, y_sampling)
        y_pred = pipe.predict(X_test)
        
        # Calculation
        test_accuracy=accuracy_score(y_test, y_pred)
        test_balanced_accuracy=balanced_accuracy_score(y_test, y_pred)
        test_precision=precision_score(y_test, y_pred)
        test_recall=recall_score(y_test, y_pred)
        test_MCC=matthews_corrcoef(y_test, y_pred)
        
        total_accuracy =np.append(total_accuracy ,np.array([test_accuracy]), axis=0)
        total_balanced_accuracy =np.append(total_balanced_accuracy ,np.array([test_balanced_accuracy]), axis=0)
        total_precision =np.append(total_precision ,np.array([test_precision]), axis=0)
        total_recall =np.append(total_recall ,np.array([test_recall]), axis=0)
        total_MCC =np.append(total_MCC ,np.array([test_MCC]), axis=0)
           
    metrics = {}
    metrics['test_accuracy']= total_accuracy
    metrics['test_balanced_accuracy']=total_balanced_accuracy
    metrics['test_precision']=total_precision
    metrics['test_recall']=total_recall
    metrics['test_MCC']=total_MCC
    return metrics

## NB

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', GaussianNB())])

space = {
        'classifier__var_smoothing': hp.quniform('classifier__var_smoothing',0,1,0.0001)}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = score_resamplingCV(best_param, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'NB_SmoteTomek_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## DT

In [None]:
pipe= DecisionTreeClassifier()

space = {
                  'criterion': hp.choice('criterion', ['gini', 'entropy']),
                  'max_depth': hp.quniform('max_depth', 1,30,2),
                  'max_features': hp.choice('max_features', ['sqrt', 'log2']),
                  'min_samples_leaf': hp.uniform('min_samples_leaf',1,5),
                  'min_samples_split': hp.uniform('min_samples_split',1,8)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


print('Best parameters: {}'.format(best_param))

# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'DT_SmoteTomek_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## SVC

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', SVC())])

space = {
                 
            'classifier__C': hp.choice('classifier__C', [10, 100, 1000, 10000]),
            'classifier__gamma': hp.choice('classifier__gamma', [0.01, 0.001, 0.0001]),
            'classifier__kernel': hp.choice('classifier__kernel', ['poly', 'rbf','sigmoid'])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'SVC_SmoteTomek_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## KNN

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', KNeighborsClassifier())])

space = {
                'classifier__metric': hp.choice('classifier__metric', ['euclidean', 'manhattan','minkowski']),
                'classifier__n_neighbors': hp.uniform('classifier__n_neighbors', 10,20)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperopt will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'KNN_SmoteTomek_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## Bagging

In [None]:
pipe= BaggingClassifier()

space = {
            'n_estimators': hp.choice('n_estimators', range(200,10000, 200)),
            'classifier__max_features': hp.quniform('classifier__max_features',10,50,2),
            'classifier__bootstrap': hp.choice('classifier__bootstrap', [True, False]),
            'classifier__bootstrap_features': hp.choice('classifier__bootstrap_features', [True, False])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'Bagging_SmoteTomek_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## RF

In [None]:
pipe= RandomForestClassifier()

space = {
            'n_estimators': hp.choice('n_estimators', range(200,10000, 200)),
            'max_depth': hp.quniform('max_depth', 1, 30, 2),
            "criterion": hp.choice("criterion", ["gini", "entropy"]),
            'max_features': hp.uniform('max_features', 0.0001,1),
            'min_samples_leaf':hp.uniform('min_samples_leaf',1,5),
            'min_samples_split':hp.uniform('min_samples_split',2,6)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)


print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'RF_SmoteTomek_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## XGBoost

In [None]:
pipe= xgb.XGBClassifier()

space = {
    'max_depth': hp.quniform('max_depth', 1, 30, 2),
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1),
    'gamma': hp.uniform('gamma', 0.01, .7),
    'num_leaves': hp.choice('num_leaves', range(20, 250, 20)),
    'min_child_samples': hp.choice('min_child_samples', range(100, 250, 20)),
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, 1),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, 1),
    'n_estimators': hp.choice('n_estimators', range(200,10000, 200))

}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'XGBoost_SmoteTomek_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## LightGBM

In [None]:
pipe= lgb.LGBMClassifier()

space = {
        'num_leaves': hp.choice('num_leaves',range(20,3000,20)),
        'max_depth': hp.choice('max_depth', range(1,30,1)),
        'bagging_fraction': hp.uniform('bagging_fraction', 0, 1),
        'feature_fraction':  hp.uniform('feature_fraction', 0, 1),
        'min_sum_hessian_in_leaf': hp.uniform('min_sum_hessian_in_leaf', 0.0005, 0.010),
        'lambda_l1': hp.choice('lambda_l1', range(0, 100, 5)),
        'lambda_l2': hp.choice('lambda_l2', range(0, 100, 5)),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
        'min_data_in_leaf': hp.choice('min_data_in_leaf', range(200,10000, 100)),
        'max_bin': hp.uniform('max_bin', 200, 300),
        'min_gain_to_split': hp.uniform('min_gain_to_split', 0,15),
        'n_estimators': hp.choice('n_estimators', range(200,10000, 100)),
        'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart', 'goss', 'rf'])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = score_resamplingCV(best_param, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'LightGBM_SmoteTomek_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## IW-SMOTE

In [None]:
# the classfication and regression tree
def CART(X=original_Xtrain, y=original_ytrain, XX=original_Xtest):
    model = DecisionTreeClassifier()
    model.fit(X, y)
    predicted = model.predict(XX)
    return predicted


class IW_SMOTE():
    def __init__(self, data, target, balance=1):
        self.balance = balance;  # Sampling rate
        self.data = original_Xtrain # The training set
        self.target = original_ytrain # The labels of training set

    """
            :param lamda: lamda*imbalance ratio = the number of cart
            :param thres: The threshold of filtering noise
            :param k_neighbor: k nearest neighbor
            :param divide_times: The ratio of under sampling minority samples
            :param gen_times: The ratio of generated minority class to majority class samples
            :return: The synthetic samples, attributes and labels
    """
    def IW_SMOTE(self, lamda=100, thres=0.5, divide_times=2, gen_times=1, k_neighbor=5):
        # x_x:Temporary variable, save the training set
        x_x = pd.DataFrame(self.data)
        x_x[len(x_x.columns)] = self.target
        m, n = len(x_x), len(x_x.columns) #m: the number of trainning set; n: the number of columns of the trainning set
        z = x_x[x_x[n - 1] == 1]  # acquire the minority set
        p = x_x[x_x[n - 1] == -1]  # acquire the majority set
        m1, n1 = len(z), len(z.columns) # m1: the number of minority samples; n1: the number of columns of the minority samples
        m2, n2 = len(p), len(p.columns) # m2: the number of majority samples; n2: the number of columns of the majority samples
        IR = m2 / m1  # imbalance ratio
        predict_min_labelset = pd.DataFrame(columns = range(int(IR * lamda))) # the predicted labels of the minority samples
        predict_maj_labelset = pd.DataFrame(columns = range(int(IR * lamda))) # the predicted labels of the majority samples
        # train under-bagging CART
        for i_1 in range(int(IR * lamda)):
            train_subset = z.sample(int(m1 / divide_times)) # train_subset: the subset of training set
            train_subset = train_subset.append(p.sample(int(m1 / divide_times), replace=True))
            predict_maj_labelset[i_1] = CART(np.array(train_subset.iloc[:, 0:n1 - 1]), np.array(train_subset[n1 - 1]),
                                             np.array(p.iloc[:, 0:n2 - 1]))
            predict_min_labelset[i_1] = CART(np.array(train_subset.iloc[:, 0:n1 - 1]), np.array(train_subset[n1 - 1]),
                                             np.array(z.iloc[:, 0:n1 - 1]))
        # filterring noise
        err_rate_min = []  # record the error rate of the reserved minority instance
        reserve_min = []  # record the reserved minority instances
        num_reserve_min = 0  # the number of minority samples after denoising
        z1 = np.array(z)
        predict_min_labelset = np.array(predict_min_labelset)
        for i_2 in range(m1):
            num_right = 0  # record the number of instance which is predicted accurately
            for j in range(int(IR * lamda)):
                if predict_min_labelset[i_2][j] == z1[i_2][n1 - 1]:
                    num_right = num_right + 1
            if ((int(IR * lamda) - num_right) / int(IR * lamda) < thres):
                num_reserve_min += 1
                reserve_min.append(z1[i_2])
                if (int(IR * lamda) - num_right) / int(IR * lamda) < 1 / int(IR * lamda):
                    err_rate_min.append(1 / int(IR * lamda))
                else:
                    err_rate_min.append((int(IR * lamda) - num_right) / int(IR * lamda))
        reserve_min = pd.DataFrame(reserve_min)
        err_rate_min = pd.DataFrame(err_rate_min)
        err_rate_maj = []  # record the error rate of the reserved minority instance
        reserve_maj = []  # record the reserved minority instances
        num_reserve_maj = 0  # the number of majority samples after denoising
        p1 = np.array(p)
        predict_maj_labelset = np.array(predict_maj_labelset)
        for i_3 in range(m2):
            num_right = 0  # record the number of instance which is predicted accurately
            for j in range(int(IR * lamda)):
                if predict_maj_labelset[i_3][j] == p1[i_3][n2 - 1]:
                    num_right = num_right + 1
            if ((int(IR * lamda) - num_right) / int(IR * lamda) < thres):
                num_reserve_maj += 1
                reserve_maj.append(p1[i_3])
                if (int(IR * lamda) - num_right) / int(IR * lamda) < 1 / int(IR * lamda):
                    err_rate_maj.append(1 / int(IR * lamda))
                else:
                    err_rate_maj.append((int(IR * lamda) - num_right) / int(IR * lamda))
        reserve_maj = pd.DataFrame(reserve_maj)

        # generate the synthetic minority instances
        weight = err_rate_min[0] / sum(err_rate_min[0])  # Record the importance of each sample
        num_need_generate = gen_times * num_reserve_maj - num_reserve_min  # The number of minority samples that need to be synthesized
        if num_need_generate == num_reserve_maj:
            return np.array(reserve_maj.iloc[:, 0:len(reserve_maj.columns) - 1]), np.array(
                reserve_maj[len(reserve_maj.columns) - 1])
        else:
            num_generate = 0 # the number of be generated
            new_set = pd.DataFrame(columns=range(n1))
            for i_4 in range(num_reserve_min):
                reserve_min_1 = reserve_min
                nums = pd.DataFrame(weight * (gen_times * num_reserve_maj - num_reserve_min)).iloc[i_4, 0]
                reserve_min_1 = np.array(reserve_min_1)
                dis = [0] * num_reserve_min # distance matrix of per sample
                for m in range(num_reserve_min):
                    dis[m] = np.linalg.norm(reserve_min_1[i_4] - reserve_min_1[m])
                b = sorted(enumerate(dis), key=lambda xxx: xxx[1]) #Sorted dis
                b = b[1:k_neighbor + 1] # choice knn
                for j in range(int(nums)):
                    num_generate = num_generate + 1
                    s_b = random.choice(b)
                    select_ins = reserve_min.iloc[s_b[0], :]
                    new_ins = (reserve_min.iloc[i_4, :] - pd.DataFrame(select_ins).T) * random.random() + pd.DataFrame(
                        select_ins).T  # generate funtion
                    new_set = new_set.append(pd.DataFrame(new_ins))  # add the new instance into a temporary set
            new_z = reserve_min.append(new_set) # The minority synthetic samples
            new_original_data = reserve_maj.append(new_z) # The synthetic samples
            new_original_data.index = range(len(new_original_data))
            # Returns an oversampled dataset
            return np.array(new_original_data.iloc[:, 0:len(new_original_data.columns) - 1]), np.array(new_original_data[len(new_original_data.columns) - 1])

In [None]:
def score_resamplingCV(params, X=original_Xtrain, y=original_ytrain, n_jobs=-1):
    
    skf = StratifiedKFold(shuffle=True)
    total_accuracy =np.array([])
    total_balanced_accuracy =np.array([])
    total_precision =np.array([])
    total_recall =np.array([])
    total_MCC =np.array([])
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        X_sampling =IW_SMOTE(X_train, y_train).data
        y_sampling =IW_SMOTE(X_train, y_train).target 
        
        pipe.fit(X_sampling, y_sampling)
        y_pred = pipe.predict(X_test)
        
        # Calculation
        test_accuracy=accuracy_score(y_test, y_pred)
        test_balanced_accuracy=balanced_accuracy_score(y_test, y_pred)
        test_precision=precision_score(y_test, y_pred)
        test_recall=recall_score(y_test, y_pred)
        test_MCC=matthews_corrcoef(y_test, y_pred)
        
        total_accuracy =np.append(total_accuracy ,np.array([test_accuracy]), axis=0)
        total_balanced_accuracy =np.append(total_balanced_accuracy ,np.array([test_balanced_accuracy]), axis=0)
        total_precision =np.append(total_precision ,np.array([test_precision]), axis=0)
        total_recall =np.append(total_recall ,np.array([test_recall]), axis=0)
        total_MCC =np.append(total_MCC ,np.array([test_MCC]), axis=0)
           
    metrics = {}
    metrics['test_accuracy']= total_accuracy
    metrics['test_balanced_accuracy']=total_balanced_accuracy
    metrics['test_precision']=total_precision
    metrics['test_recall']=total_recall
    metrics['test_MCC']=total_MCC
    return metrics

## NB

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', GaussianNB())])

space = {
'classifier__var_smoothing': hp.quniform('classifier__var_smoothing',0,1,0.0001)}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = score_resamplingCV(best_param, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "/n Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "/n Average Precision: {}".format(metrics['test_precision'].mean()),
      "/n Average Recall: {}".format(metrics['test_recall'].mean()),
      "/n Average MCC: {}".format(metrics['test_MCC'].mean()),
      "/n Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'NB_IWSmote_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## DT

In [None]:
pipe= DecisionTreeClassifier()

space = {
                  'criterion': hp.choice('criterion', ['gini', 'entropy']),
                  'max_depth': hp.quniform('max_depth', 1,30,1),
                  'max_features': hp.choice('max_features', ['sqrt', 'log2']),
                  'min_samples_leaf': hp.uniform('min_samples_leaf',1,5),
                  'min_samples_split': hp.uniform('min_samples_split',1,8)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


print('Best parameters: {}'.format(best_param))

# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'DT_IWSmote_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## SVC

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', SVC())])

space = {
                 
            'classifier__C': hp.choice('classifier__C', [10, 100, 1000, 10000]),
            'classifier__gamma': hp.choice('classifier__gamma', [0.01, 0.001, 0.0001]),
            'classifier__kernel': hp.choice('classifier__kernel', ['poly', 'rbf','sigmoid'])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'SVC_IWSmote_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## KNN

In [None]:
pipe= Pipeline(steps=[('scaler', RScal),('classifier', KNeighborsClassifier())])

space = {
                'classifier__metric': hp.choice('classifier__metric', ['euclidean', 'manhattan','minkowski']),
                'classifier__n_neighbors': hp.uniform('classifier__n_neighbors', 10,20)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperopt will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'KNN_IWSmote_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## Bagging

In [None]:
pipe= BaggingClassifier()

space = {
            'n_estimators': hp.choice('n_estimators', range(200,10000, 200)),
            'classifier__max_features': hp.quniform('classifier__max_features',10,50,2),
            'classifier__bootstrap': hp.choice('classifier__bootstrap', [True, False]),
            'classifier__bootstrap_features': hp.choice('classifier__bootstrap_features', [True, False])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'Bagging_IWSmote_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## RF

In [None]:
pipe= RandomForestClassifier()

space = {
            'n_estimators': hp.choice('n_estimators', range(200,10000, 200)),
            'max_depth': hp.quniform('max_depth', 1, 30, 2),
            "criterion": hp.choice("criterion", ["gini", "entropy"]),
            'max_features': hp.uniform('max_features', 0.0001,1),
            'min_samples_leaf':hp.uniform('min_samples_leaf',1,5),
            'min_samples_split':hp.uniform('min_samples_split',2,6)
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)


print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'RF_IWSmote_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## XGBoost

In [None]:
pipe= xgb.XGBClassifier()

space = {
    'max_depth': hp.quniform('max_depth', 1, 30, 2),
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, 0.4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1),
    'gamma': hp.uniform('gamma', 0.01, .7),
    'num_leaves': hp.choice('num_leaves', range(20, 250, 20)),
    'min_child_samples': hp.choice('min_child_samples', range(100, 260, 20)),
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, 1),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, 1),
    'n_estimators': hp.choice('n_estimators', range(200,10000, 200))

}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = metric(best_param, cv=sss, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))


# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'XGBoost_IWSmote_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

## LightGBM

In [None]:
pipe= lgb.LGBMClassifier()

space = {
        'num_leaves': hp.choice('num_leaves',range(20,3000,20)),
        'max_depth': hp.choice('max_depth', range(1,30,1)),
        'bagging_fraction': hp.uniform('bagging_fraction', 0, 1),
        'feature_fraction':  hp.uniform('feature_fraction', 0, 1),
        'min_sum_hessian_in_leaf': hp.uniform('min_sum_hessian_in_leaf', 0.0005, 0.010),
        'lambda_l1': hp.choice('lambda_l1', range(0, 100, 5)),
        'lambda_l2': hp.choice('lambda_l2', range(0, 100, 5)),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
        'min_data_in_leaf': hp.choice('min_data_in_leaf', range(200,10000, 100)),
        'max_bin': hp.uniform('max_bin', 200, 300),
        'min_gain_to_split': hp.uniform('min_gain_to_split', 0,15),
        'n_estimators': hp.choice('n_estimators', range(200,10000, 100)),
        'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart', 'goss', 'rf'])
}

# Initialize trials object
trials = Trials()

start_time = time.time()
best_param=fmin(fn=resamplingCV, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          trials=trials, # logging
          max_evals=100, # maximum number of iterations
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
         )
execution_time = time.time()-start_time


metrics = score_resamplingCV(best_param, X=original_Xtrain, y=original_ytrain)

print('-------------------------------')
print("Average Accuracy: {}".format(metrics['test_accuracy'].mean()),
      "Average Balanced_Accuracy: {}".format(metrics['test_balanced_accuracy'].mean()),
      "Average Precision: {}".format(metrics['test_precision'].mean()),
      "Average Recall: {}".format(metrics['test_recall'].mean()),
      "Average MCC: {}".format(metrics['test_MCC'].mean()),
      "Execution time: {}".format(execution_time))



# Adding information into the dataframe
Comparingdata_HI = Comparingdata_HI.append({'Classifier' : 'LightGBM_IWSmote_HI',
                                      'Accuracy': metrics['test_accuracy'].mean(),
                                      'Balanced_Accuracy': metrics['test_balanced_accuracy'].mean(), 
                                      'Precision': metrics['test_precision'].mean(), 
                                      'Recall': metrics['test_recall'].mean(),
                                      'MCC': metrics['test_MCC'].mean(),
                                      'Execution_time': execution_time,
                                      'Best_param': best_param}, ignore_index = True)

# Table

In [None]:
Table_IH = tabulate(Comparingdata_HI, headers = 'keys', tablefmt = 'html')
Table_IH

# Visualisations 

In [None]:
Visualisation_HI = Comparingdata_HI.set_index('Classifier')
plt.rcParams['figure.figsize']=[20,30]

In [None]:
MCC_plot_HI = Visualisation_HI.sort_values('MCC', ascending=False).MCC.plot.barh()
MCC_plot_HI
MCC_plot_HI.figure.savefig("MCC_plot_Highly_Imbalanced_Data.png",
                    format='png',
                    dpi=150)

In [None]:
Execution_time_plot_HI = Visualisation_HI.sort_values('Execution_time', ascending=False).Execution_time.plot.barh()
Execution_time_plot_HI
Execution_time_plot_HI.figure.savefig("Execution_time_plot_Imbalanced_Data.png",
                    format='png',
                    dpi=150)

In [None]:
Recall_plot_HI = Visualisation_HI.sort_values('Recall', ascending=False).Recall.plot.barh()
Recall_plot_HI
Recall_plot_HI.figure.savefig("Recall_plot_Imbalanced_Data.png",
                    format='png',
                    dpi=150)

In [None]:
Precision_plot_HI = Visualisation_HI.sort_values('Precision', ascending=False).Precision.plot.barh()
Precision_plot_HI
Precision_plot_HI.figure.savefig("Precision_plot_Imbalanced_Data.png",
                    format='png',
                    dpi=150)

In [None]:
Balanced_Accuracy_plot_HI = Visualisation_HI.sort_values('Balanced_Accuracy', ascending=False).Balanced_Accuracy.plot.barh()
Balanced_Accuracy_plot_HI
Balanced_Accuracy_plot_HI.figure.savefig("Balanced_Accuracy_plot_Imbalanced_Data.png",
                    format='png',
                    dpi=150)

________________________________________________________________________________________________________________________________

### Tables for Latex

In [None]:
ExecutionTime = [['NB', 51.9], ['NB+RUS', 3], ['NB+STL', 4647.9], 
                  ['NB+IW', 4519.5], 
                  ['KNN', 1678.1], ['KNN+RUS', 30.3], ['KNN+STL', 6036.1], 
                  ['KNN+IW', 4967.5],
                  ['SVC', 2736.7], ['SVC+RUS', 52.3], ['SVC+STL', 25742.6], 
                  ['SVC+IW', 22800.1],
                  ['DT', 229.7], ['DT+RUS', 2.957], ['DT+STL', 5004.8], 
                  ['DT+IW', 4859.9],
                  ['RF', 1796.8], ['RF+RUS', 32.2], ['RF+STL', 6804.2], 
                  ['RF+IW', 6805.8],
                  ['BEC', 1351.1], ['BEC+RUS', 13.8], ['BEC+STL', 6662.0], 
                  ['BEC+IW', 6648.0],
                  ['XGB', 2348.4], ['XGB+RUS', 12.3], ['XGB+STL', 4794.7], 
                  ['XGB+IW', 4936.2],
                  ['LGBM', 195.9], ['LGBM+RUS', 11.54], ['LGBM+STL', 4558.7], 
                  ['LGBM+IW', 4566.2]]

In [None]:
ExecutionTimesTable = pd.DataFrame(
    ExecutionTime, columns=['Model', 'Execution time in seconds'])
ExecutionTimesTable = ExecutionTimesTable.set_index('Model')

In [None]:
plt.rcParams['figure.figsize']=[11,2]
ExecutionTimesViz = ExecutionTimesTable.sort_values('Execution time in seconds', ascending=False).plot.bar(width=0.9, color='green')
plt.title('Model Execution Time Plot in Descending Order 1:99')
plt.xlabel(None)
for label in (ExecutionTimesViz.get_xticklabels() + ExecutionTimesViz.get_yticklabels()):
    label.set_fontsize(9)
ExecutionTimesViz.figure.savefig('Model_Execution_Time_Plot_1_99.png',
                                 format='png',
                                 bbox_inches='tight',
                                 dpi=300)

In [None]:
Accuracy = [['NB', 0.623], ['NB+RUS', 0.614], ['NB+STL', 0.521], 
                  ['NB+IW', 0.623], 
                  ['KNN', 0.990], ['KNN+RUS', 0.990], ['KNN+STL', 0.990], 
                  ['KNN+IW', 0.990],
                  ['SVC', 0.990], ['SVC+RUS', 0.990], ['SVC+STL', 0.990], 
                  ['SVC+IW', 0.990],
                  ['DT', 0.977], ['DT+RUS', 0.977], ['DT+STL', 0.977], 
                  ['DT+IW', 0.977],
                  ['RF', 0.990], ['RF+RUS', 0.990], ['RF+STL', 0.990], 
                  ['RF+IW', 0.990],
                  ['BEC', 0.990], ['BEC+RUS', 0.990], ['BEC+STL', 0.990], 
                  ['BEC+IW', 0.990],
                  ['XGB', 0.990], ['XGB+RUS', 0.990], ['XGB+STL', 0.990], 
                  ['XGB+IW', 0.990],
                  ['LGBM', 0.989], ['LGBM+RUS', 0.705], ['LGBM+STL', 0.988], 
                  ['LGBM+IW', 0.998]]

In [None]:
AccuracyTable = pd.DataFrame(
    Accuracy, columns=['Model', 'Values'])
AccuracyTable = AccuracyTable.set_index('Model')

In [None]:
plt.rcParams['figure.figsize']=[10,2]
AccuracyViz = AccuracyTable.sort_values(
    'Values', ascending=False).plot.bar(
    width=0.8, color='green')
plt.title('Accuracy Values in Descending Order 1:99')
plt.xlabel(None)
for label in (AccuracyViz.get_xticklabels() + AccuracyViz.get_yticklabels()):
    label.set_fontsize(9)
AccuracyViz.figure.savefig('Accuracy_Plot_1_99.png',
                                 format='png',
                                 bbox_inches='tight',
                                 dpi=300)

-------------------

In [None]:
BalancedAccuracy = [['NB', 0.732], ['NB+RUS', 0.723], ['NB+STL', 0.696], 
                  ['NB+IW', 0.738], 
                  ['KNN', 0.500], ['KNN+RUS', 0.500], ['KNN+STL', 0.500], 
                  ['KNN+IW', 0.500],
                  ['SVC', 0.500], ['SVC+RUS', 0.500], ['SVC+STL', 0.500], 
                  ['SVC+IW', 0.500],
                  ['DT', 0.523], ['DT+RUS', 0.521], ['DT+STL', 0.518], 
                  ['DT+IW', 0.520],
                  ['RF', 0.500], ['RF+RUS', 0.500], ['RF+STL', 0.521], 
                  ['RF+IW', 0.500],
                  ['BEC', 0.502], ['BEC+RUS', 0.501], ['BEC+STL', 0.500], 
                  ['BEC+IW', 0.501],
                  ['XGB', 0.500], ['XGB+RUS', 0.500], ['XGB+STL', 0.500], 
                  ['XGB+IW', 0.500],
                  ['LGBM', 0.502], ['LGBM+RUS', 0.719], ['LGBM+STL', 0.509], 
                  ['LGBM+IW', 0.912]]

In [None]:
Balanced_AccuracyTable = pd.DataFrame(
    BalancedAccuracy, columns=['Model', 'Values'])
Balanced_AccuracyTable = Balanced_AccuracyTable.set_index('Model')

In [None]:
plt.rcParams['figure.figsize']=[10,2]
Balanced_AccuracyViz = Balanced_AccuracyTable.sort_values(
    'Values', ascending=False).plot.bar(
    width=0.8, color='green')
plt.title('Balanced_Accuracy Values in Descending Order 1:99')
plt.xlabel(None)
for label in (Balanced_AccuracyViz.get_xticklabels() + Balanced_AccuracyViz.get_yticklabels()):
    label.set_fontsize(9)
Balanced_AccuracyViz.figure.savefig('Balanced__Plot_1_99.png',
                                 format='png',
                                 bbox_inches='tight',
                                 dpi=300)

-----------------------

In [None]:
Precision = [['NB', 0.022], ['NB+RUS', 0.022], ['NB+STL', 0.018], 
                  ['NB+IW', 0.022], 
                  ['KNN', 0], ['KNN+RUS', 0], ['KNN+STL', 0], 
                  ['KNN+IW', 0],
                  ['SVC', 0], ['SVC+RUS', 0], ['SVC+STL', 0], 
                  ['SVC+IW', 0],
                  ['DT', 0.042], ['DT+RUS', 0.038], ['DT+STL', 0.036], 
                  ['DT+IW', 0.041],
                  ['RF', 0], ['RF+RUS', 0], ['RF+STL', 0], 
                  ['RF+IW', 0],
                  ['BEC', 0.14], ['BEC+RUS', 0.067], ['BEC+STL', 0.1], 
                  ['BEC+IW', 0.05],
                  ['XGB', 0], ['XGB+RUS', 0], ['XGB+STL', 0], 
                  ['XGB+IW', 0],
                  ['LGBM', 0.044], ['LGBM+RUS', 0.025], ['LGBM+STL', 0.122], 
                  ['LGBM+IW', 1.0]]

In [None]:
PrecisionTable = pd.DataFrame(
    Precision, columns=['Model', 'Values'])
PrecisionTable = PrecisionTable.set_index('Model')

In [None]:
plt.rcParams['figure.figsize']=[10,2]
PrecisionViz = PrecisionTable.sort_values(
    'Values', ascending=False).plot.bar(
    width=0.8, color='green')
plt.title('Precision Values in Descending Order 1:99')
plt.xlabel(None)
for label in (PrecisionViz.get_xticklabels() + PrecisionViz.get_yticklabels()):
    label.set_fontsize(9)
PrecisionViz.figure.savefig('Precision_Plot_1_99.png',
                                 format='png',
                                 bbox_inches='tight',
                                 dpi=300)

------------------

In [None]:
Recall = [['NB', 0.844], ['NB+RUS', 0.834], ['NB+STL', 0.875], 
                  ['NB+IW', 0.853], 
                  ['KNN', 0], ['KNN+RUS', 0], ['KNN+STL', 0], 
                  ['KNN+IW', 0],
                  ['SVC', 0], ['SVC+RUS', 0], ['SVC+STL', 0], 
                  ['SVC+IW', 0],
                  ['DT', 0.059], ['DT+RUS', 0.056], ['DT+STL', 0.049], 
                  ['DT+IW', 0.054],
                  ['RF', 0], ['RF+RUS', 0], ['RF+STL', 0], 
                  ['RF+IW', 0],
                  ['BEC', 0.005], ['BEC+RUS', 0.002], ['BEC+STL', 0.002], 
                  ['BEC+IW', 0.002],
                  ['XGB', 0], ['XGB+RUS', 0], ['XGB+STL', 0], 
                  ['XGB+IW', 0],
                  ['LGBM', 0.005], ['LGBM+RUS', 0.734], ['LGBM+STL', 0.020], 
                  ['LGBM+IW', 0.834]]

In [None]:
RecallTable = pd.DataFrame(
    Recall, columns=['Model', 'Values'])
RecallTable = RecallTable.set_index('Model')

In [None]:
plt.rcParams['figure.figsize']=[10,2]
RecallViz = RecallTable.sort_values(
    'Values', ascending=False).plot.bar(
    width=0.8, color='green')
plt.title('Recall Values in Descending Order 1:99')
plt.xlabel(None)
for label in (RecallViz.get_xticklabels() + RecallViz.get_yticklabels()):
    label.set_fontsize(9)
RecallViz.figure.savefig('Recall_Plot_1_99.png',
                                 format='png',
                                 bbox_inches='tight',
                                 dpi=300)

---------------

In [None]:
MCC = [['NB', 0.095], ['NB+RUS', 0.092], ['NB+STL', 0.078], 
                  ['NB+IW', 0.097], 
                  ['KNN', -0.0002], ['KNN+RUS', -0.0002], ['KNN+STL', -0.0002], 
                  ['KNN+IW', -0.0002],
                  ['SVC', 0], ['SVC+RUS', 0], ['SVC+STL', 0], 
                  ['SVC+IW', 0],
                  ['DT', 0.038], ['DT+RUS', 0.035], ['DT+STL', 0.031], 
                  ['DT+IW', 0.036],
                  ['RF', 0], ['RF+RUS', 0], ['RF+STL', 0], 
                  ['RF+IW', 0],
                  ['BEC', 0.024], ['BEC+RUS', 0.011], ['BEC+STL', 0.014], 
                  ['BEC+IW', 0.010],
                  ['XGB', -0.0004], ['XGB+RUS', -0.0004], ['XGB+STL', -0.0004], 
                  ['XGB+IW', -0.0004],
                  ['LGBM', 0.012], ['LGBM+RUS', 0.095], ['LGBM+STL', 0.044], 
                  ['LGBM+IW', 0.912]]

In [None]:
MCCTable = pd.DataFrame(
    MCC, columns=['Model', 'Values'])
MCCTable = MCCTable.set_index('Model')

In [None]:
plt.rcParams['figure.figsize']=[10,2]
MCCViz = MCCTable.sort_values(
    'Values', ascending=False).plot.bar(
    width=0.8, color='green')
plt.title('MCC Values in Descending Order 1:99')
plt.xlabel(None)
for label in (MCCViz.get_xticklabels() + MCCViz.get_yticklabels()):
    label.set_fontsize(9)
MCCViz.figure.savefig('MCC_Plot_1_99.png',
                                 format='png',
                                 bbox_inches='tight',
                                 dpi=300)

---------------

---------------

# Feature Importance

In [None]:
#Extracting best parameters of the LGBM+IW
Best_param_feature =Comparingdata.loc[2,'Best_param']

In [None]:
#Rerunning the space of LGBM+IW
space = {
        'num_leaves': hp.choice('num_leaves',range(20,3000,20)),
        'max_depth': hp.choice('max_depth', range(1,15,1)),
        'bagging_fraction': hp.uniform('bagging_fraction', 0, 1),
        'feature_fraction':  hp.uniform('feature_fraction', 0, 1),
        'min_sum_hessian_in_leaf': hp.uniform('min_sum_hessian_in_leaf', 0.0005, 0.010),
        'lambda_l1': hp.choice('lambda_l1', range(0, 100, 5)),
        'lambda_l2': hp.choice('lambda_l2', range(0, 100, 5)),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
        'min_data_in_leaf': hp.choice('min_data_in_leaf', range(200,10000, 100)),
        'max_bin': hp.uniform('max_bin', 200, 300),
        'min_gain_to_split': hp.uniform('min_gain_to_split', 0,15),
        'n_estimators': hp.choice('n_estimators', range(200,10000, 100)),
        'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart', 'goss', 'rf'])
}

In [None]:
#Setting the parameters
model =lgb.LGBMClassifier()
test =space_eval(space,Best_param_feature)
test['max_bin']=243
model.set_params(**test)

In [None]:
#Fitting the model
model.fit(original_Xtrain, original_ytrain)

In [None]:
#Creating the dataframe and plot to show feature importance
coefs = pd.DataFrame(
   model.feature_importances_,
   columns=['Coefficients'], index=imputted_data.loc[:, imputted_data.columns != 'default_ind'].columns
)

plot_coefs = coefs.sort_values('Coefficients', ascending=True).plot(kind='barh', figsize=(20, 20), color='green', width=0.9)
plt.title('Feature Importance Plot for LightGBM Model')
plt.axvline(x=0, color='.5')
plt.subplots_adjust(left=.3)

plot_coefs.figure.savefig("Feature Importance_good4.png",
                     format = 'png',
                     dpi=100)