## import libraries

In [2]:
# import necessary libraries

import pandas as pd
import numpy as np
import missingno as msno 
import seaborn as sns
import matplotlib.pyplot as plt 

#sklearn

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, fbeta_score
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, make_scorer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, balanced_accuracy_score

#imbalanced learning
from imblearn.ensemble import EasyEnsembleClassifier

# import custom functions
from custom_functions import get_data_summary, our_metrics, eval_metrics, evaluate_model


import warnings
warnings.filterwarnings('ignore')

In [3]:
# Fixed random seed for reproducibility; passed as `random_state` to the
# classifier, the cross-validation splitter and the randomized search below.
RSEED = 42

## Make scorer Fbeta

In [4]:
# Build the F-beta scorers needed for the hyperparameter search, one per beta
# value (1.5, 2 and 3); a beta above 1 weights recall more than precision.
f15_scorer, ftwo_scorer, fthree_scorer = (
    make_scorer(fbeta_score, beta=beta_value) for beta_value in (1.5, 2, 3)
)

## Import test and train data, target

In [5]:
# # Don't use storemagic for large DataFrames/arrays, it will keep crashing!
# # %store shows all stored variables. 
# # %store -z deletes all stored variables from memory
# # retrieve the preprocessed, cleaned and scaled features and target
# %store -r X_train_minmax
# %store -r X_test_minmax 
# %store -r X_train_std
# %store -r X_test_std
# %store -r y_test
# %store -r y_train

In [6]:
# load the training features from the comma-separated file into a DataFrame
X_train_tree = pd.read_csv('data/X_train_tree.csv', sep=',')

In [7]:
# load the test features from the comma-separated file into a DataFrame
X_test_tree = pd.read_csv('data/X_test_tree.csv', sep=',')

In [8]:
# load the test target from the comma-separated file (still a DataFrame here;
# it is flattened to a 1-D array in a later cell)
y_test_tree = pd.read_csv('data/y_test_tree.csv', sep=',')

In [9]:
# load the training target from the comma-separated file (still a DataFrame
# here; it is flattened to a 1-D array in a later cell)
y_train_tree = pd.read_csv('data/y_train_tree.csv', sep=',')

In [10]:
# flatten the loaded test target into a 1-D NumPy array
# NOTE(review): assumes the CSV holds a single target column - confirm
y_test_tree = np.asanyarray(y_test_tree).ravel()

In [11]:
# flatten the loaded training target into a 1-D NumPy array
# NOTE(review): assumes the CSV holds a single target column - confirm
y_train_tree = np.asanyarray(y_train_tree).ravel()

In [12]:
# display the flattened test target as a quick sanity check
y_test_tree

array([0, 0, 0, ..., 0, 0, 0])

## EasyEnsembleClassifier

### What is the EasyEnsembleClassifier?

* Machine Learning Model for imbalanced data
* Bag of balanced boosted learners.
* Ensemble of AdaBoost learners trained on different balanced bootstrap samples. The balancing is achieved by random under-sampling.

### AdaBoostClassifier

[scikit-learn documentation](https://scikit-learn.org/stable/modules/ensemble.html#adaboost)
The core principle of AdaBoost is to fit a sequence of weak learners (i.e., models that are only slightly better than random guessing, such as small decision trees) on repeatedly modified versions of the data. The predictions from all of them are then combined through a weighted majority vote (or sum) to produce the final prediction. The data modifications at each so-called boosting iteration consist of applying weights $w_1$, $w_2$, …, $w_N$ to each of the training samples. Initially, those weights are all set to $w_i = 1/N$, so that the first step simply trains a weak learner on the original data. For each successive iteration, the sample weights are individually modified and the learning algorithm is reapplied to the reweighted data. At a given step, those training examples that were incorrectly predicted by the boosted model induced at the previous step have their weights increased, whereas the weights are decreased for those that were predicted correctly. As iterations proceed, examples that are difficult to predict receive ever-increasing influence. Each subsequent weak learner is thereby forced to concentrate on the examples that are missed by the previous ones in the sequence.

 By default, weak learners are decision stumps. Different weak learners can be specified through the estimator parameter. The main parameters to tune to obtain good results are n_estimators and the complexity of the base estimators (e.g., its depth max_depth or minimum required number of samples to consider a split min_samples_split).

The individual learners can be weak, but as long as the performance of each one is slightly better than random guessing, the final model can be proven to converge to a strong learner.

In [13]:
# # initiate EasyEnsembleClassifier
# eec = EasyEnsembleClassifier(random_state=RSEED)

In [None]:
# # fit eec to train data
# eec.fit(X_train_tree, y_train_tree)
# # make prediction on test data
# y_pred_eec = eec.predict(X_test_tree)

In [None]:
# # get performance metrics
# our_metrics(y_test_tree, y_pred_eec)
# # print confusion matrix
# # print(confusion_matrix(y_test, y_pred))

In [15]:
# instantiate the EasyEnsembleClassifier with default hyperparameters and a fixed seed;
# this unfitted estimator is the one handed to the randomized search below
eec_rs = EasyEnsembleClassifier(random_state=RSEED)

In [16]:
# show the estimator's current hyperparameters as a reference for the search space
eec_rs.get_params()

{'base_estimator': 'deprecated',
 'estimator': None,
 'n_estimators': 10,
 'n_jobs': None,
 'random_state': 42,
 'replacement': False,
 'sampling_strategy': 'auto',
 'verbose': 0,
 'warm_start': False}

### Hyperparameters of the EasyEnsembleClassifier

* 'base_estimator': 'deprecated', now 'estimator'
* 'estimator': None, default=AdaBoostClassifier()
* 'n_estimators': default=10, Number of AdaBoost learners in the ensemble.
* 'n_jobs': default=None, Number of CPU cores used during the cross-validation loop. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors.
* 'random_state': default=None, If None, the random number generator is the RandomState instance used by np.random.
* 'replacement': default=False, Whether or not to sample randomly with replacement.
* 'sampling_strategy': default='auto' (equivalent to 'not minority'). Sampling information used to resample the data set; accepts float, str, dict or callable. When float, it corresponds to the desired ratio of the number of samples in the minority class over the number of samples in the majority class after resampling, and it must lie in the interval (0, 1]. The ratio is expressed as $N_m/N_{rM}$,
 where $N_m$ is the number of samples in the minority class and $N_{rM}$ is the number of samples in the majority class after resampling.
* 'verbose': default=0, Controls the verbosity of the building process.
* 'warm_start': default=False. When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new ensemble.

In [17]:
# define search space
# NOTE: a float `sampling_strategy` is the minority/majority ratio after
# under-sampling and imbalanced-learn only accepts values in (0, 1]. The former
# candidates 1.2 and 1.5 made every fit of those candidates fail, so they were
# removed from the search space.
# NOTE(review): for a binary target 'auto' and 1 appear to produce the same
# balanced resampling - consider dropping one of them.
param_grid = {
    "sampling_strategy": ['auto', 0.5, 0.8, 1],
    "replacement": [False, True],
    "n_estimators": [6, 8, 10, 12, 14],
}

In [18]:
# define evaluation: stratified 10-fold cross-validation repeated 3 times
# (30 train/validation splits per candidate), seeded with RSEED
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=RSEED)

In [19]:
# Set up the randomized hyperparameter search over `param_grid` for the
# EasyEnsembleClassifier, ranking candidates by their cross-validated F2 score.
# NOTE(review): n_iter is only an upper bound - when the search space consists
# solely of lists with fewer combinations than n_iter, scikit-learn is expected
# to cap the number of candidates at the grid size; confirm 500 is intended.
Random_search_f2 = RandomizedSearchCV(
    estimator=eec_rs,
    param_distributions=param_grid,
    n_iter=500,
    scoring=ftwo_scorer,
    n_jobs=1,
    cv=cv,
    random_state=RSEED,
)

In [20]:
# execute Random search: evaluates the sampled candidates with the CV scheme on the training data
# (the search was created with n_jobs=1, so this cell runs sequentially and may take a long time)
Random_search_f2.fit(X_train_tree, y_train_tree)

In [None]:
# predict with the fitted search object on the training and the test features
# NOTE(review): y_pred_train_f2 is not used in the visible part of the notebook - confirm it is needed
y_pred_train_f2 = Random_search_f2.predict(X_train_tree)
y_pred_RS_f2 = Random_search_f2.predict(X_test_tree)

# report the selected hyperparameters, then the project's metrics for the test-set predictions
print("Tuned hyperparameters :(best parameters) ",Random_search_f2.best_params_)
print("Decision Metrics:")
our_metrics(y_test_tree, y_pred_RS_f2)