In [1]:
%matplotlib inline


# Comparison of ensembling classifiers internally using sampling


Ensemble classifiers have been shown to improve classification performance compared
to a single learner. However, they are still affected by class imbalance. This
example shows the benefit of balancing the training set before fitting the
learners. We compare the balanced ensembles with their non-balanced counterparts.

We make the comparison using the balanced accuracy and the geometric mean, which
are metrics widely used in the literature to evaluate models learned on imbalanced
datasets.


In [2]:
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT

import itertools
import os

import matplotlib.pyplot as plt
import numpy as np

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from imblearn.datasets import fetch_datasets
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import RUSBoostClassifier
from imblearn.metrics import geometric_mean_score

# F1 Score warning
When true positive + false positive == 0, precision is undefined; When true positive + false negative == 0, recall is undefined. In such cases, by default the metric will be set to 0, as will f-score, and UndefinedMetricWarning will be raised. This behavior can be modified with zero_division.

In [6]:
def plot_confusion_matrix(cm, classes, ax,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map on ``ax``.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix. Rows are labelled "True label" and columns
        "Predicted label" on the plot.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    ax : matplotlib.axes.Axes
        Axes to draw on. It is also made the current pyplot axes.
    normalize : bool, default=False
        If True, every row is divided by its sum before printing and
        plotting. Rows that sum to zero are left as zeros.
    title : str, default='Confusion matrix'
        Title of the axes.
    cmap : matplotlib colormap, default=plt.cm.Blues
        Colormap passed to ``ax.imshow``.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero so empty classes do not
        # produce NaN cells.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
    print(cm)
    print('')

    ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.set_title(title)

    # Set the ticks through the Axes object itself. The previous plt.xticks
    # call ran before plt.sca(ax) and therefore labelled whichever axes
    # happened to be current (wrong panel in multi-axes figures).
    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(classes, rotation=45)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(classes)
    plt.sca(ax)  # kept from the original: leaves ``ax`` as the current axes

    # Integer counts are printed as-is; anything else (normalized or float
    # input) is printed with two decimals instead of failing on the 'd' format.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # White text on dark cells, black text on light cells.
        ax.text(j, i, format(cm[i, j], fmt),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")

    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')

In [7]:
# importing our favorite libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score

Load an imbalanced dataset
##############################################################################
 We will load the UCI SatImage dataset which has an imbalanced ratio of 9.3:1
 (number of majority sample for a minority sample). The data are then split
 into training and testing.



In [8]:
# Location of the crash-data workbook. Set the DMV_CRASH_DATA environment
# variable to point somewhere else; the fallback is the original local path.
DATA_PATH = os.environ.get(
    "DMV_CRASH_DATA",
    r'C:\Users\vince\Desktop\Jupiter\DMV_Crash_Data_Bool.xlsx')

# The first row is used as the header by default (header=0); `skip_header`
# is not a pandas.read_excel argument, so it has been removed.
df_p = pd.read_excel(DATA_PATH)
print(df_p.head())
print(df_p.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\vince\\Desktop\\Jupiter\\DMV_Crash_Data_Bool.xlsx'

In [None]:
# Columns kept for the analysis (the model inputs plus "InjuriesBool").
selected_columns = [
    "vehicle type", "Number for Road Type", "Intersection",
    "Intersection Geometry", "Parking provision", "Mode",
    "Vehicle1 Status", "Fault", "NO. for collision type",
    "InjuriesBool", "Vehicle1 Damage", "Vehicle2 Damage", "signal",
]

# Select the columns directly. Building a DataFrame from a tuple of Series
# and transposing it gives the same layout, but stacks the columns as rows
# first, which can upcast them to a common dtype.
filtered_df = df_p[selected_columns].copy()
df_p = filtered_df
print(type(df_p))
print(df_p.head())

In [None]:
# Remove missing values: axis="index" is the named form of axis=0, i.e. rows
# (not columns) that contain missing values are dropped.
df_R = df_p.dropna(axis="index")
print(df_R.shape)
print(df_R.head())

In [None]:
# Independent variables: one Series per selected input column.
X_df = tuple(
    df_R[column]
    for column in (
        "vehicle type", "Number for Road Type", "Intersection",
        "Intersection Geometry", "Parking provision", "Mode",
        "Vehicle1 Status", "Fault", "NO. for collision type",
        "Vehicle1 Damage", "Vehicle2 Damage", "signal",
    )
)
print(type(X_df))
print(len(X_df))

# Stacking the tuple gives one row per input column.
X_np = np.asarray(X_df)
print(type(X_np))
print(X_np.shape)

# Transpose so that each row is one record and each column one input.
X_np = np.transpose(X_df)
print(type(X_np))
print(X_np.shape)

In [None]:
# Dependent variable: the "InjuriesBool" column.
Y_df = df_R["InjuriesBool"]
for y_summary in (type(Y_df), len(Y_df)):
    print(y_summary)

Y_np = np.asarray(Y_df)
for y_summary in (type(Y_np), Y_np.shape):
    print(y_summary)

# NOTE(review): np.transpose on a pandas Series appears to dispatch to
# Series.transpose and return a Series, so Y_np is presumably a Series (not an
# ndarray) from here on - confirm. The class-weight cell further down calls
# y_train.value_counts(), which depends on that.
Y_np = np.transpose(Y_df)
for y_summary in (type(Y_np), Y_np.shape):
    print(y_summary)

In [None]:
# Hold out 20% of the records, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_np, Y_np,
    stratify=Y_np,
    random_state=42,
    test_size=0.20,
)

In [None]:
# Tick labels passed as `classes=my_class` to plot_confusion_matrix below.
# NOTE(review): the class_weight dictionaries used further down are keyed by
# 0 and 1, not -1 and 1 - confirm these labels match the target's values.
my_class = [-1,  1]

In [None]:
# Class weight: count stored under key 0 divided by the count stored under
# key 1 in the training target, truncated to an integer.
label_counts = y_train.value_counts()
class_weight = int(label_counts[0] / label_counts[1])
class_weight

# NOTE(review): if this block is executed it rebinds X_train / X_test /
# y_train / y_test, replacing the split built from X_np / Y_np above.
satimage = fetch_datasets()['satimage']
X = satimage.data
y = satimage.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, stratify=y)

Classification using a single decision tree
##############################################################################
 We train a decision tree classifier which will be used as a baseline for the
 rest of this example.



The results are reported in terms of balanced accuracy and geometric mean,
which are metrics widely used in the literature to validate models trained on
imbalanced datasets.



In [None]:
# Baseline: a single decision tree with default settings.
# random_state is fixed so the reported scores are reproducible across runs.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)

print('Decision tree classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_tree),
              geometric_mean_score(y_test, y_pred_tree)))
print("F1 Score: " + str(f1_score(y_test, y_pred_tree)))

cm_tree = confusion_matrix(y_test, y_pred_tree)
fig, ax = plt.subplots()
plot_confusion_matrix(cm_tree, classes=my_class, ax=ax,
                      title='Decision tree')

In [None]:
# 5-fold grid search for the best decision-tree parameters.
# The seed is fixed so candidates differ only by their parameters.
gridsearch_forest = DecisionTreeClassifier(random_state=0)

params = {
    # "auto" was removed from the grid: for classifiers it is an alias of
    # "sqrt", and recent scikit-learn releases reject it.
    "max_features": ["sqrt", "log2"],
}

clf = GridSearchCV(gridsearch_forest, param_grid=params, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)

In [None]:
# Decision tree refitted with the parameter chosen from the grid search above.
# INSERT THE BEST PARAMETER here; random_state is fixed for reproducibility.
tree = DecisionTreeClassifier(max_features="sqrt", random_state=0)
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)

print('Decision tree classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_tree),
              geometric_mean_score(y_test, y_pred_tree)))
print("F1 Score: " + str(f1_score(y_test, y_pred_tree)))

cm_tree = confusion_matrix(y_test, y_pred_tree)
fig, ax = plt.subplots()
plot_confusion_matrix(cm_tree, classes=my_class, ax=ax,
                      title='Decision tree')

# Decision Tree Classifier with Weights

In [None]:
# Decision tree with class 1 weighted by the class_weight ratio computed
# earlier; random_state is fixed so the scores are reproducible.
tree = DecisionTreeClassifier(class_weight={0: 1, 1: class_weight},
                              random_state=0)
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)

print('Decision tree classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_tree),
              geometric_mean_score(y_test, y_pred_tree)))
print("F1 Score: " + str(f1_score(y_test, y_pred_tree)))

cm_tree = confusion_matrix(y_test, y_pred_tree)
fig, ax = plt.subplots()
plot_confusion_matrix(cm_tree, classes=my_class, ax=ax,
                      title='Decision tree with Weights')

In [None]:
# 5-fold grid search over class weights and max_features for a decision tree.
# The seed is fixed so candidates differ only by their parameters.
gridsearch_forest = DecisionTreeClassifier(random_state=0)

params = {
    "class_weight": [{0: 1, 1: class_weight}, {0: 1, 1: 8}, {0: 1, 1: 10}],
    # "auto" was removed from the grid: for classifiers it is an alias of
    # "sqrt", and recent scikit-learn releases reject it.
    "max_features": ["sqrt", "log2"],
}

clf = GridSearchCV(gridsearch_forest, param_grid=params, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)

In [None]:
# Weighted decision tree refitted with the parameters picked from the grid
# search above (change the class weight and include the best parameter).
# random_state is fixed so the scores are reproducible.
tree = DecisionTreeClassifier(class_weight={0: 1, 1: 8},
                              max_features="log2",
                              random_state=0)
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)

print('Decision tree classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_tree),
              geometric_mean_score(y_test, y_pred_tree)))
print("F1 Score: " + str(f1_score(y_test, y_pred_tree)))

cm_tree = confusion_matrix(y_test, y_pred_tree)
fig, ax = plt.subplots()
plot_confusion_matrix(cm_tree, classes=my_class, ax=ax,
                      title='Decision tree with Weights')

# Bagging 
(https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html)

Classification using bagging classifier with and without sampling
##############################################################################
 Instead of using a single tree, we will check whether an ensemble of decision trees
 can actually alleviate the issue induced by class imbalance. First, we
 will use a bagging classifier and its counterpart, which internally uses
 random under-sampling to balance each bootstrap sample.



In [None]:
# Bagging classifier without resampling: 50 estimators, fixed seed.
bagging = BaggingClassifier(random_state=0, n_estimators=50)
y_pred_bc = bagging.fit(X_train, y_train).predict(X_test)

print('Bagging classifier performance:')
bagging_scores = (balanced_accuracy_score(y_test, y_pred_bc),
                  geometric_mean_score(y_test, y_pred_bc))
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(*bagging_scores))
print("F1 Score: " + str(f1_score(y_test, y_pred_bc)))

# Confusion matrix on a single-panel figure.
cm_bagging = confusion_matrix(y_test, y_pred_bc)
fig, ax = plt.subplots(ncols=1)
plot_confusion_matrix(cm_bagging, ax=ax, classes=my_class, title='Bagging')

Balancing each bootstrap sample significantly increases the balanced
accuracy and the geometric mean.



# Side-by-side report: plain bagging on ax[0], balanced bagging on ax[1].
# NOTE(review): this block has no `In [...]` prompt, so it may not be an
# executed code cell - confirm. Read top to bottom, `y_pred_bbc` is first
# assigned in the "Balanced Bagging" section further down, so running this
# block here on a fresh kernel would raise NameError.
print('Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_bc),
              geometric_mean_score(y_test, y_pred_bc)))
cm_bagging = confusion_matrix(y_test, y_pred_bc)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_bagging, classes=my_class, ax=ax[0],
                      title='Bagging')

print('Balanced Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_bbc),
              geometric_mean_score(y_test, y_pred_bbc)))
cm_balanced_bagging = confusion_matrix(y_test, y_pred_bbc)
plot_confusion_matrix(cm_balanced_bagging, classes=my_class,
                      ax=ax[1], title='Balanced bagging')

In [None]:
# 5-fold grid search over the bagging ensemble size.
# The seed is fixed so candidates differ only by their parameters.
gridsearch_forest = BaggingClassifier(random_state=0)

params = {
    "n_estimators": list(range(5, 101, 5)),  # 5, 10, ..., 100
    # oob_score takes real booleans. The strings 'False' and 'True' are both
    # truthy (and rejected by recent scikit-learn parameter validation).
    # NOTE(review): oob_score presumably only adds an out-of-bag estimate and
    # does not change predictions, so this axis may be droppable - confirm.
    "oob_score": [False, True],
}

clf = GridSearchCV(gridsearch_forest, param_grid=params, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)


In [None]:
# Bagging refitted with the chosen number of estimators.
# Choose the parameter and keep random_state=0 (its default is None).
bagging = BaggingClassifier(random_state=0, n_estimators=30)
y_pred_bc = bagging.fit(X_train, y_train).predict(X_test)

print('Bagging classifier performance:')
bagging_scores = (balanced_accuracy_score(y_test, y_pred_bc),
                  geometric_mean_score(y_test, y_pred_bc))
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(*bagging_scores))
print("F1 Score: " + str(f1_score(y_test, y_pred_bc)))

# Confusion matrix on a single-panel figure.
cm_bagging = confusion_matrix(y_test, y_pred_bc)
fig, ax = plt.subplots(ncols=1)
plot_confusion_matrix(cm_bagging, ax=ax, classes=my_class, title='Bagging')

# Balanced Bagging
(https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.ensemble.BalancedBaggingClassifier.html)

In [None]:
# Balanced bagging: 50 estimators, fixed seed.
balanced_bagging = BalancedBaggingClassifier(random_state=0, n_estimators=50)
balanced_bagging.fit(X_train, y_train)
y_pred_bbc = balanced_bagging.predict(X_test)

print('Balanced Bagging classifier performance:')
balanced_bagging_scores = (balanced_accuracy_score(y_test, y_pred_bbc),
                           geometric_mean_score(y_test, y_pred_bbc))
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(*balanced_bagging_scores))
print("F1 Score: " + str(f1_score(y_test, y_pred_bbc)))

# Confusion matrix on a single-panel figure.
cm_balanced_bagging = confusion_matrix(y_test, y_pred_bbc)
fig, ax = plt.subplots(ncols=1)
plot_confusion_matrix(cm_balanced_bagging, ax=ax, classes=my_class,
                      title='Balanced bagging')

In [None]:
# 5-fold grid search over the balanced-bagging ensemble size.
# The seed is fixed so candidates differ only by their parameters.
gridsearch_forest = BalancedBaggingClassifier(random_state=0)

params = {
    "n_estimators": list(range(5, 101, 5)),  # 5, 10, ..., 100
    # oob_score takes real booleans. The strings 'False' and 'True' are both
    # truthy (and rejected by recent scikit-learn parameter validation).
    # NOTE(review): oob_score presumably only adds an out-of-bag estimate and
    # does not change predictions, so this axis may be droppable - confirm.
    "oob_score": [False, True],
}

clf = GridSearchCV(gridsearch_forest, param_grid=params, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)


In [None]:
# Balanced bagging refitted with the chosen number of estimators.
# Choose the parameter and keep random_state=0.
balanced_bagging = BalancedBaggingClassifier(random_state=0, n_estimators=55)
balanced_bagging.fit(X_train, y_train)
y_pred_bbc = balanced_bagging.predict(X_test)

print('Balanced Bagging classifier performance:')
balanced_bagging_scores = (balanced_accuracy_score(y_test, y_pred_bbc),
                           geometric_mean_score(y_test, y_pred_bbc))
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(*balanced_bagging_scores))
print("F1 Score: " + str(f1_score(y_test, y_pred_bbc)))

# Confusion matrix on a single-panel figure.
cm_balanced_bagging = confusion_matrix(y_test, y_pred_bbc)
fig, ax = plt.subplots(ncols=1)
plot_confusion_matrix(cm_balanced_bagging, ax=ax, classes=my_class,
                      title='Balanced bagging')

# Random Forest
(https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

Classification using random forest classifier with and without sampling
##############################################################################
 Random forest is another popular ensemble method and it is usually
 outperforming bagging. Here, we used a vanilla random forest and its balanced
 counterpart in which each bootstrap sample is balanced.



In [None]:
# Random forest without class weights: 50 trees, fixed seed.
rf = RandomForestClassifier(random_state=0, n_estimators=50)
y_pred_rf = rf.fit(X_train, y_train).predict(X_test)

print('Random Forest classifier performance:')
rf_scores = (balanced_accuracy_score(y_test, y_pred_rf),
             geometric_mean_score(y_test, y_pred_rf))
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(*rf_scores))
print("F1 Score: " + str(f1_score(y_test, y_pred_rf)))

# Confusion matrix on a single-panel figure.
cm_rf = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots(ncols=1)
plot_confusion_matrix(cm_rf, ax=ax, classes=my_class, title='Random forest')

In [None]:
# 5-fold grid search over class weights, forest size and max_features.
# The seed is fixed so candidates differ only by their parameters.
gridsearch_forest = RandomForestClassifier(random_state=0)

params = {
    "class_weight": [{0: 1, 1: class_weight}, {0: 1, 1: 8}, {0: 1, 1: 10}],
    "n_estimators": list(range(5, 101, 5)),  # 5, 10, ..., 100
    # "auto" was removed from the grid: for classifiers it is an alias of
    # "sqrt", and recent scikit-learn releases reject it.
    "max_features": ["sqrt", "log2"],
    # oob_score takes real booleans. The strings 'False' and 'True' are both
    # truthy (and rejected by recent scikit-learn parameter validation).
    # NOTE(review): oob_score presumably only adds an out-of-bag estimate and
    # does not change predictions, so this axis may be droppable - confirm.
    "oob_score": [False, True],
}

clf = GridSearchCV(gridsearch_forest, param_grid=params, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)



In [None]:
# Random forest refitted with the chosen parameters; keep random_state=0.
# (n_estimators defaults to 100 when not given.)
# max_features="sqrt" replaces "auto": for classifiers the two are the same
# setting, but "auto" is rejected by recent scikit-learn releases.
rf = RandomForestClassifier(n_estimators=20, max_features="sqrt",
                            class_weight={0: 1, 1: 5}, random_state=0)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print('Random Forest classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_rf),
              geometric_mean_score(y_test, y_pred_rf)))
print("F1 Score: " + str(f1_score(y_test, y_pred_rf)))

cm_rf = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots(ncols=1)
plot_confusion_matrix(cm_rf, classes=my_class, ax=ax,
                      title='Random forest')

# Balanced Random Forest
(https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.ensemble.BalancedRandomForestClassifier.html)

In [None]:
# Balanced random forest: 50 trees, fixed seed, class 1 weighted by the
# class_weight ratio computed earlier.
brf = BalancedRandomForestClassifier(
    class_weight={0: 1, 1: class_weight},
    random_state=0,
    n_estimators=50,
)
brf.fit(X_train, y_train)
y_pred_brf = brf.predict(X_test)

print('Balanced Random Forest classifier performance:')
brf_scores = (balanced_accuracy_score(y_test, y_pred_brf),
              geometric_mean_score(y_test, y_pred_brf))
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(*brf_scores))
print("F1 Score: " + str(f1_score(y_test, y_pred_brf)))

# Confusion matrix on a single-panel figure.
cm_brf = confusion_matrix(y_test, y_pred_brf)
fig, ax = plt.subplots(ncols=1)
plot_confusion_matrix(cm_brf, ax=ax, classes=my_class,
                      title='Balanced random forest')


In [None]:
# 5-fold grid search over class weights, forest size and max_features for
# the balanced random forest.
# The seed is fixed so candidates differ only by their parameters.
gridsearch_forest = BalancedRandomForestClassifier(random_state=0)

params = {
    "class_weight": [{0: 1, 1: class_weight}, {0: 1, 1: 8}, {0: 1, 1: 10}],
    "n_estimators": list(range(5, 101, 5)),  # 5, 10, ..., 100
    # "auto" was removed from the grid: for classifiers it is an alias of
    # "sqrt", and recent scikit-learn releases reject it.
    "max_features": ["sqrt", "log2"],
    # oob_score takes real booleans. The strings 'False' and 'True' are both
    # truthy (and rejected by recent scikit-learn parameter validation).
    # NOTE(review): oob_score presumably only adds an out-of-bag estimate and
    # does not change predictions, so this axis may be droppable - confirm.
    "oob_score": [False, True],
}

clf = GridSearchCV(gridsearch_forest, param_grid=params, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)



In [None]:
# Balanced random forest refitted with the chosen parameters.
# Use the best parameter and keep random_state=0.
brf = BalancedRandomForestClassifier(
    class_weight={0: 1, 1: 5},
    random_state=0,
    max_features="sqrt",
    n_estimators=65,
)
brf.fit(X_train, y_train)
y_pred_brf = brf.predict(X_test)

print('Balanced Random Forest classifier performance:')
brf_scores = (balanced_accuracy_score(y_test, y_pred_brf),
              geometric_mean_score(y_test, y_pred_brf))
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(*brf_scores))
print("F1 Score: " + str(f1_score(y_test, y_pred_brf)))

# Confusion matrix on a single-panel figure.
cm_brf = confusion_matrix(y_test, y_pred_brf)
fig, ax = plt.subplots(ncols=1)
plot_confusion_matrix(cm_brf, ax=ax, classes=my_class,
                      title='Balanced random forest')


# NOTE(review): everything from here to the "Easy ensemble" section has no
# `In [...]` prompt, so it may not be executed code cells - confirm.
# First part: prints the balanced-random-forest report again for the current
# y_pred_brf (same as the report just above, without the F1 line).
print('Balanced Random Forest classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_brf),
              geometric_mean_score(y_test, y_pred_brf)))
cm_brf = confusion_matrix(y_test, y_pred_brf)
fig, ax = plt.subplots(ncols=1)
plot_confusion_matrix(cm_brf, classes=my_class, ax=ax,
                      title='Balanced random forest')

# Refit both forests with class 1 weighted by class_weight. This rebinds
# rf, brf, y_pred_rf and y_pred_brf.
rf = RandomForestClassifier(n_estimators=50, random_state=0, class_weight= {0:1,1:class_weight})
brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0, class_weight= {0:1,1:class_weight})

rf.fit(X_train, y_train)
brf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_pred_brf = brf.predict(X_test)

#Similarly to the previous experiment, the balanced classifier outperform the
#classifier which learn from imbalanced bootstrap samples. In addition, random
#forest outsperforms the bagging classifier.


# Random Forest Classifier with given weights



# Report for the class-weighted random forest fitted above.
print('Random Forest classifier performance: with weigths')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_rf),
              geometric_mean_score(y_test, y_pred_rf)))
cm_rf = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots(ncols=1)
plot_confusion_matrix(cm_rf, classes=my_class, ax=ax,
                      title='Random forest with Weigths')



# Report for the class-weighted balanced random forest fitted above.
print('Balanced Random Forest classifier performance: weigths')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_brf),
              geometric_mean_score(y_test, y_pred_brf)))
cm_brf = confusion_matrix(y_test, y_pred_brf)
fig, ax = plt.subplots(ncols=1)
plot_confusion_matrix(cm_brf, classes=my_class, ax=ax,
                      title='Balanced random forest with Weigths')

# Easy ensemble classifier 
(https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.ensemble.EasyEnsembleClassifier.html#imblearn.ensemble.EasyEnsembleClassifier)

Boosting classifier
##############################################################################
 In the same manner, the easy ensemble classifier is a bag of balanced AdaBoost
 classifiers. However, it will be slower to train than random forest and will
 achieve worse performance.



In [None]:
# Easy ensemble: 10 AdaBoost learners (10 boosting rounds each).
# random_state is fixed on the ensemble so the scores are reproducible.
base_estimator = AdaBoostClassifier(n_estimators=10)
eec = EasyEnsembleClassifier(n_estimators=10,
                             base_estimator=base_estimator,
                             random_state=0)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)

print('Easy ensemble classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_eec),
              geometric_mean_score(y_test, y_pred_eec)))
print("F1 Score: " + str(f1_score(y_test, y_pred_eec)))

cm_eec = confusion_matrix(y_test, y_pred_eec)
fig, ax = plt.subplots(ncols=1)
plot_confusion_matrix(cm_eec, classes=my_class, ax=ax,
                      title='Easy ensemble classifier')



In [None]:
# 5-fold grid search over the easy-ensemble size.
# The seed is fixed so candidates differ only by their parameters.
# NOTE(review): this searches EasyEnsembleClassifier with its default base
# learner, while the evaluation cells pass an explicit
# AdaBoostClassifier(n_estimators=10) - confirm that is intended.
gridsearch_forest = EasyEnsembleClassifier(random_state=0)

params = {
    "n_estimators": list(range(5, 101, 5)),  # 5, 10, ..., 100
}

clf = GridSearchCV(gridsearch_forest, param_grid=params, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)



In [None]:
# Easy ensemble refitted after the grid search.
# Change the ensemble's n_estimators to the best value found above.
# random_state is fixed on the ensemble so the scores are reproducible.
base_estimator = AdaBoostClassifier(n_estimators=10)
eec = EasyEnsembleClassifier(n_estimators=10,
                             base_estimator=base_estimator,
                             random_state=0)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)

print('Easy ensemble classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_eec),
              geometric_mean_score(y_test, y_pred_eec)))
print("F1 Score: " + str(f1_score(y_test, y_pred_eec)))

cm_eec = confusion_matrix(y_test, y_pred_eec)
fig, ax = plt.subplots(ncols=1)
plot_confusion_matrix(cm_eec, classes=my_class, ax=ax,
                      title='Easy ensemble classifier')



# RUSClassifier 
https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.ensemble.RUSBoostClassifier.html

In [None]:
# RUSBoost with 10 estimators. `base_estimator` is the AdaBoost learner
# defined in the easy-ensemble cell above.
# random_state is fixed so the scores are reproducible.
rusboost = RUSBoostClassifier(n_estimators=10,
                              base_estimator=base_estimator,
                              random_state=0)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)

print('RUSBoost classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_rusboost),
              geometric_mean_score(y_test, y_pred_rusboost)))
print("F1 Score: " + str(f1_score(y_test, y_pred_rusboost)))

cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
fig, ax = plt.subplots(ncols=1)
plot_confusion_matrix(cm_rusboost, classes=my_class,
                      ax=ax, title='RUSBoost classifier')

#plt.show()

In [None]:
# 5-fold grid search over the number of RUSBoost estimators.
# The seed is fixed so candidates differ only by their parameters.
# NOTE(review): this searches RUSBoostClassifier with its default base
# learner, while the evaluation cells pass `base_estimator` explicitly -
# confirm that is intended.
gridsearch_forest = RUSBoostClassifier(random_state=0)

params = {
    "n_estimators": list(range(5, 101, 5)),  # 5, 10, ..., 100
}

clf = GridSearchCV(gridsearch_forest, param_grid=params, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)



In [None]:
# RUSBoost refitted after the grid search.
# Change n_estimators to the best value found above.
# random_state is fixed so the scores are reproducible.
rusboost = RUSBoostClassifier(n_estimators=70,
                              base_estimator=base_estimator,
                              random_state=0)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)

print('RUSBoost classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_rusboost),
              geometric_mean_score(y_test, y_pred_rusboost)))
print("F1 Score: " + str(f1_score(y_test, y_pred_rusboost)))

cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
fig, ax = plt.subplots(ncols=1)
plot_confusion_matrix(cm_rusboost, classes=my_class,
                      ax=ax, title='RUSBoost classifier')

#plt.show()

# Side-by-side report: easy ensemble on ax[0], RUSBoost on ax[1].
# NOTE(review): this block has no `In [...]` prompt, so it may not be an
# executed code cell - confirm. If it runs, it rebinds base_estimator, eec,
# rusboost and their predictions with 10-estimator models.
base_estimator = AdaBoostClassifier(n_estimators=10) #class_weight= {0:1,1:class_weight}) doesn't work on base_estimator
eec = EasyEnsembleClassifier(n_estimators=10,
                             base_estimator=base_estimator) #class_weight= {0:1,1:class_weight} doesn't work on eec
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
print('Easy ensemble classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_eec),
              geometric_mean_score(y_test, y_pred_eec)))
cm_eec = confusion_matrix(y_test, y_pred_eec)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_eec, classes=my_class, ax=ax[0],
                      title='Easy ensemble classifier')

rusboost = RUSBoostClassifier(n_estimators=10,
                              base_estimator=base_estimator) #class_weight= {0:1,1:class_weight} doesn't work rusboost
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)
print('RUSBoost classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_rusboost),
              geometric_mean_score(y_test, y_pred_rusboost)))
cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
plot_confusion_matrix(cm_rusboost, classes=my_class,
                      ax=ax[1], title='RUSBoost classifier')

plt.show()

# Just playing around with color. Don't bother :P

def plot_confusion_matrix(cm, classes, ax,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.YlGnBu):
    """Print a confusion matrix and draw it as an annotated heat map on ``ax``.

    This is a second definition of ``plot_confusion_matrix``; it differs from
    the one near the top of the notebook only in its default colormap and,
    once executed, replaces it.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix. Rows are labelled "True label" and columns
        "Predicted label" on the plot.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    ax : matplotlib.axes.Axes
        Axes to draw on. It is also made the current pyplot axes.
    normalize : bool, default=False
        If True, every row is divided by its sum before printing and
        plotting. Rows that sum to zero are left as zeros.
    title : str, default='Confusion matrix'
        Title of the axes.
    cmap : matplotlib colormap, default=plt.cm.YlGnBu
        Colormap passed to ``ax.imshow``.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero so empty classes do not
        # produce NaN cells.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
    print(cm)
    print('')

    ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.set_title(title)

    # Set the ticks through the Axes object itself. The previous plt.xticks
    # call ran before plt.sca(ax) and therefore labelled whichever axes
    # happened to be current (wrong panel in multi-axes figures).
    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(classes, rotation=45)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(classes)
    plt.sca(ax)  # kept from the original: leaves ``ax`` as the current axes

    # Integer counts are printed as-is; anything else (normalized or float
    # input) is printed with two decimals instead of failing on the 'd' format.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # White text on dark cells, black text on light cells.
        ax.text(j, i, format(cm[i, j], fmt),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")

    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')

#delete this later
# Same easy-ensemble / RUSBoost side-by-side report as the block before the
# second plot_confusion_matrix definition, re-run so the figures use that
# definition's default colormap.
# NOTE(review): no `In [...]` prompt here, so this may not be an executed
# code cell - confirm.
base_estimator = AdaBoostClassifier(n_estimators=10)
eec = EasyEnsembleClassifier(n_estimators=10,
                             base_estimator=base_estimator)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
print('Easy ensemble classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_eec),
              geometric_mean_score(y_test, y_pred_eec)))
cm_eec = confusion_matrix(y_test, y_pred_eec)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_eec, classes=my_class, ax=ax[0],
                      title='Easy ensemble classifier')

rusboost = RUSBoostClassifier(n_estimators=10,
                              base_estimator=base_estimator)
rusboost.fit(X_train, y_train)
y_pred_rusboost = rusboost.predict(X_test)
print('RUSBoost classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_rusboost),
              geometric_mean_score(y_test, y_pred_rusboost)))
cm_rusboost = confusion_matrix(y_test, y_pred_rusboost)
plot_confusion_matrix(cm_rusboost, classes=my_class,
                      ax=ax[1], title='RUSBoost classifier')

plt.show()

# ExtraTreesClassifier

In [None]:
# Extra-trees classifier: 50 estimators, fixed seed.
# ExtraTreesClassifier is imported with the other estimators at the top of
# the notebook.
# NOTE(review): this cell reuses the names rf / y_pred_rf / cm_rf, so after
# it runs they no longer refer to the random-forest model - rename them if
# the random-forest results are needed afterwards.
rf = ExtraTreesClassifier(n_estimators=50, random_state=0)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print('Extra Trees classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_rf),
              geometric_mean_score(y_test, y_pred_rf)))
print("F1 Score: " + str(f1_score(y_test, y_pred_rf)))

cm_rf = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots(ncols=1)
plot_confusion_matrix(cm_rf, classes=my_class, ax=ax,
                      title='Extra Trees')