In [1]:
# some imports
from IPython.core.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))

# Python ≥3.5 is required
import sys

assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn

assert sklearn.__version__ >= "0.20"

# TensorFlow ≥2.0 is required
import tensorflow as tf

assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
plt.rc('font', size=12)
plt.rc('figure', figsize=(12, 5))

# Settings for the visualizations
#import seaborn as sns
#sns.set_style("whitegrid")
#sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 2,'font.family': [u'times']})

import pandas as pd

pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 50)

# Ignore useless warnings (see SciPy issue #5998)
import warnings

warnings.filterwarnings(action="ignore", message="^internal gelsd")

from tensorflow import keras

keras.__version__


'2.8.0'

In [2]:
#import
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
# data
from sklearn.datasets import fetch_openml
from sklearn import metrics
from sklearn.metrics import confusion_matrix as CM
from sklearn.model_selection import train_test_split

# Classifier
import tqdm
from sklearn import base
import numpy as np

# MNIST

In [3]:
# Download (or load from the local scikit-learn cache) the OpenML "mnist_784" dataset.
mnist = fetch_openml('mnist_784', version=1)
# The former bare `mnist.keys()` line was dropped: it sat in the middle of the
# cell, so its value was discarded and never displayed.
X, y = mnist["data"], mnist["target"]

In [4]:

# Work on the first 10,000 samples only: 80% training and 20% test
X_train, X_test, y_train, y_test = train_test_split(X[:10000], y[:10000], test_size=0.2)
# Convert the label columns to float arrays. `np.float` was only a deprecated alias
# of the builtin float (removed in newer NumPy releases); np.float64 is the same dtype.
y_test = y_test.to_numpy().astype(np.float64)
y_train = y_train.to_numpy().astype(np.float64)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_test = y_test.to_numpy().astype(np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_train = y_train.to_numpy().astype(np.float)


In [5]:
# Sanity check: (number of training samples, number of features)
print(np.shape(X_train))

(8000, 784)


## Boosting Classifier
Based on the SAMME algorithm (Stagewise Additive Modeling using a Multi-class Exponential loss function). <br />
Zhu, J., Zou, H., Rosset, S., & Hastie, T. (2009). Multi-class AdaBoost. *Statistics and Its Interface*, 2(3), 349–360. doi:10.4310/SII.2009.v2.n3.a8

<img src="./SAMME-alg.png"
     alt="SAMME algorithm"
     style="float: left; margin-right: 10px;" />

In [6]:
class Boosting:
    '''
    Multi-class AdaBoost classifier based on the SAMME algorithm
    (Stagewise Additive Modeling using a Multi-class Exponential loss function).

    Zhu, J., Zou, H., Rosset, S., & Hastie, T. (2009). Multi-class AdaBoost.
    Statistics and Its Interface, 2(3), 349-360. 10.4310/SII.2009.v2.n3.a8.

    Attributes populated by fit():
        models            list with the bound ``predict`` method of every fitted weak learner
        alpha_m           list with the vote weight of every weak learner
        estimator_errors  list with the weighted training error of every weak learner
        labels            dict: class label -> column index
        labels2           dict: column index -> class label
        classes_no        number of distinct classes found in ``y``
    '''

    def __init__(self, base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=50):
        '''
        :param base_estimator: prototype of the weak learner; fit() only trains clones of it,
                               so the default instance shared between objects is never modified
        :param n_estimators: maximum number of boosting rounds
        '''
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.estimator_errors = None
        self.models = None
        self.alpha_m = None
        self.labels = {}
        self.labels2 = {}
        self.classes_no = 0

    def fit(self, X, y):
        '''
        Train up to ``n_estimators`` weak learners on successively re-weighted samples.

        :param X: training samples, handed unchanged to the weak learner
        :param y: training labels, one per sample
        :return: self, so that calls can be chained
        :raises ValueError: if X is empty, X and y differ in length,
                            or y holds fewer than two classes
        '''
        y = np.asarray(y)
        n_samples = len(X)  # No of samples
        if n_samples == 0:
            raise ValueError("cannot fit on an empty training set")
        if len(y) != n_samples:
            raise ValueError("X and y must contain the same number of samples")

        classes = np.unique(y)
        k = len(classes)  # no of classes
        if k < 2:
            raise ValueError("boosting needs at least two classes in y")

        # Rebuild the label maps on every call so that a refit never keeps stale classes.
        self.classes_no = k
        self.labels = {cls: i for i, cls in enumerate(classes)}
        self.labels2 = {i: cls for i, cls in enumerate(classes)}

        self.models = []
        self.estimator_errors = []
        self.alpha_m = []

        weight = np.ones(n_samples, dtype=np.float64)  # sample weights - start uniform
        weight /= weight.sum()
        eps = np.finfo(np.float64).eps

        for _ in tqdm.tqdm(range(self.n_estimators)):
            # Only the bound predict method of the fitted clone is kept.
            model = base.clone(self.base_estimator).fit(X, y, sample_weight=weight).predict
            misclassified = model(X) != y

            # weighted error of the weak estimator
            err_m = np.average(misclassified, weights=weight, axis=0)

            # Vote weight of the current classifier. The error is kept strictly inside
            # (0, 1) so that a perfect (or totally wrong) learner yields a finite alpha
            # instead of inf / nan; ordinary error values are left untouched.
            bounded_err = min(max(err_m, eps), 1.0 - eps)
            alpha_m = np.log((1 - bounded_err) / bounded_err) + np.log(k - 1)

            # save data
            self.estimator_errors.append(err_m)
            self.models.append(model)
            self.alpha_m.append(alpha_m)

            if err_m <= 0:
                # Nothing is misclassified, so the weights would not change and every
                # further round would train on exactly the same weighted data.
                break

            # Increase the weight of the misclassified samples, then re-normalise.
            # Without the normalisation the weights grow geometrically with the
            # number of rounds and can eventually overflow.
            weight = weight * np.exp(alpha_m * misclassified)
            weight /= weight.sum()

        return self

    def predict(self, X):
        '''
        output is calculated by weighted vote
        :param X: predict set
        :return: list with the predicted label for each entry in the predict set
        :raises RuntimeError: if called before fit() has trained at least one weak learner
        '''
        if not self.models:
            raise RuntimeError("Boosting.predict() called before fit()")

        # Vote coding: 1 for the class a learner predicts, -1/(K-1) for every other class.
        other_vote = -1 / (self.classes_no - 1)
        votes = None
        for alpha_m, model in tqdm.tqdm(zip(self.alpha_m, self.models), total=len(self.models)):
            predicted = model(X)
            n_rows = len(predicted)
            # column index of the predicted class for every sample
            columns = np.fromiter((self.labels[label] for label in predicted),
                                  dtype=np.intp, count=n_rows)
            # One (samples x classes) ballot per learner, filled in a single
            # vectorised step instead of one small array per sample.
            ballot = np.full((n_rows, self.classes_no), other_vote, dtype=np.float64)
            ballot[np.arange(n_rows), columns] = 1
            ballot *= alpha_m
            if votes is None:
                votes = ballot
            else:
                votes += ballot
        return [self.labels2[i] for i in np.argmax(votes, axis=1)]

In [7]:
# Train the hand-written SAMME booster (500 rounds, default depth-1 trees) on the
# MNIST training split, then label the held-out samples.
clf = Boosting(n_estimators=500)
clf.fit(X=X_train, y=y_train)
y_pred = clf.predict(X=X_test)

100%|██████████| 500/500 [00:47<00:00, 10.55it/s]
500it [00:06, 78.23it/s]


In [8]:
# Scores of the hand-written booster on the MNIST test split.
# NOTE(review): on single-label multi-class data the micro-averaged F1 is expected to
# coincide with accuracy — confirm whether another averaging mode was intended.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
print("F1 score:", metrics.f1_score(y_true=y_test, y_pred=y_pred, average='micro'))

# Confusion matrix built from the true and predicted labels
print("\nConfusion Matrix:\n", CM(y_true=y_test, y_pred=y_pred))

Accuracy: 0.79
F1 score: 0.79

Confusion Matrix:
 [[146   0   2   1   0  54   1   0   3   0]
 [  0 198  12   0   1   1   0   0   4   0]
 [  2   3 142   6   4   6  22   1  18   0]
 [  2   3   4 146   0  17   2   4  11   3]
 [  2   0   2   1 183   3   1   2   3  14]
 [  3   5   1  15   3 132   3   1   9   4]
 [  4   0   6   0  16   8 182   0   4   0]
 [  0   3   6   0   3   2   0 163   2  37]
 [  0   4   3   5   0   7   0   1 139   7]
 [  1   3   1   7  18   0   0  10   3 149]]


## Adaboost classifier


In [9]:
# scikit-learn reference: AdaBoost over depth-1 trees
ab_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    learning_rate=0.5,
    algorithm="SAMME.R",
)
# Fit on the MNIST training split
ab_model = ab_clf.fit(X=X_train, y=y_train)
# Label the held-out MNIST samples
y_pred_ab = ab_model.predict(X=X_test)

In [10]:
# Share of correctly labelled MNIST test samples, plus micro-averaged F1, for AdaBoost
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred_ab))
print("F1 score:", metrics.f1_score(y_true=y_test, y_pred=y_pred_ab, average='micro'))

Accuracy: 0.6455
F1 score: 0.6455


## Bagging Classifier

In [11]:
# Bagging of 500 decision trees trained on bootstrap samples, using all CPU cores
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
)
bag_clf.fit(X=X_train, y=y_train)
y_pred_bag = bag_clf.predict(X=X_test)

In [12]:
# Scores of the bagging ensemble on the MNIST test split
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred_bag))
print("F1 score:", metrics.f1_score(y_true=y_test, y_pred=y_pred_bag, average='micro'))

Accuracy: 0.9385
F1 score: 0.9385


## Random Forest Classifier

In [13]:
# Random forest of 500 trees with at least 10 samples per leaf; seeded for repeatable runs
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    min_samples_leaf=10,
    random_state=42,
)
rnd_clf.fit(X=X_train, y=y_train)
y_pred_rf = rnd_clf.predict(X=X_test)

In [14]:
# Scores of the random forest on the MNIST test split
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred_rf))
print("F1 score:", metrics.f1_score(y_true=y_test, y_pred=y_pred_rf, average='micro'))

Accuracy: 0.9375
F1 score: 0.9375


# MNIST Fashion

## Import data

In [15]:
# keras is already imported in the first cell of the notebook; the import is
# repeated here at the start of the Fashion-MNIST section.
from tensorflow import keras

# Last expression of the cell: shows the installed Keras version
keras.__version__


'2.8.0'

In [16]:
# Fashion-MNIST comes from keras.datasets; load_data() is unpacked into a
# (samples, labels) pair for training and another one for testing.
fashion_mnist = keras.datasets.fashion_mnist
(
    (X_train_full_fashion, y_train_full_fashion),
    (X_test_fashion, y_test_fashion),
) = fashion_mnist.load_data()

In [17]:
def flatten_vec(vectors):
    '''
    Flatten every array in ``vectors`` to one dimension and stack the results.

    :param vectors: iterable of NumPy arrays (for example a batch of 2-D images)
    :return: NumPy array built from the flattened copies, one entry per input array
    '''
    return np.asarray([np.ndarray.flatten(item) for item in vectors])

In [18]:
# Flatten every image of both splits into a single feature row
X_train_full_fashion, X_test_fashion = map(
    flatten_vec, (X_train_full_fashion, X_test_fashion)
)

## Boosting Classifier

In [19]:
# Train the hand-written SAMME booster (500 rounds, default depth-1 trees) on the
# full Fashion-MNIST training set, then label the test images.
clf = Boosting(n_estimators=500)
clf.fit(X=X_train_full_fashion, y=y_train_full_fashion)
y_pred_boosting = clf.predict(X=X_test_fashion)

100%|██████████| 500/500 [14:51<00:00,  1.78s/it]
500it [00:22, 22.25it/s]


In [20]:
# Scores of the hand-written booster on the Fashion-MNIST test set.
# NOTE(review): on single-label multi-class data the micro-averaged F1 is expected to
# coincide with accuracy — confirm whether another averaging mode was intended.
print("Accuracy:", metrics.accuracy_score(y_true=y_test_fashion, y_pred=y_pred_boosting))
print("F1 score:", metrics.f1_score(y_true=y_test_fashion, y_pred=y_pred_boosting, average='micro'))

# Confusion matrix built from the true and predicted labels
print("\nConfusion Matrix:\n", CM(y_true=y_test_fashion, y_pred=y_pred_boosting))


Accuracy: 0.6055
F1 score: 0.6055

Confusion Matrix:
 [[340   9  64 329   4   1 236   0  17   0]
 [  3 703   7 267   7   0  12   0   1   0]
 [  4   0 738  25 146   0  79   0   8   0]
 [ 10  12  41 894  11   0  32   0   0   0]
 [  0   1 349 130 460   0  56   0   4   0]
 [  0   0   0   0   0 938   1  17  26  18]
 [ 56   6 277 177 197   0 252   0  35   0]
 [  0   0   0   0   0 734   0 237   0  29]
 [  0   1  22   6   1   8  25   4 933   0]
 [  0   0   0   0   0 406   0  26   8 560]]


## Adaboost classifier


In [21]:
# scikit-learn reference: AdaBoost over depth-1 trees
ab_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    learning_rate=0.5,
    algorithm="SAMME.R",
)
# Fit on the full Fashion-MNIST training set
ab_model = ab_clf.fit(X=X_train_full_fashion, y=y_train_full_fashion)
# Label the Fashion-MNIST test images
y_pred_ab_fashion = ab_model.predict(X=X_test_fashion)

In [22]:
# Share of correctly labelled Fashion-MNIST test images, plus micro-averaged F1, for AdaBoost
print("Accuracy:", metrics.accuracy_score(y_true=y_test_fashion, y_pred=y_pred_ab_fashion))
print("F1 score:", metrics.f1_score(y_true=y_test_fashion, y_pred=y_pred_ab_fashion, average='micro'))

Accuracy: 0.5814
F1 score: 0.5814


## Bagging Classifier

In [23]:
# Bagging of 500 decision trees trained on bootstrap samples, using all CPU cores
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
)
bag_clf.fit(X=X_train_full_fashion, y=y_train_full_fashion)
y_pred_bag_fashion = bag_clf.predict(X=X_test_fashion)

In [24]:
# Scores of the bagging ensemble on the Fashion-MNIST test set
print("Accuracy:", metrics.accuracy_score(y_true=y_test_fashion, y_pred=y_pred_bag_fashion))
print("F1 score:", metrics.f1_score(y_true=y_test_fashion, y_pred=y_pred_bag_fashion, average='micro'))

Accuracy: 0.872
F1 score: 0.872


## Random Forest Classifier

In [25]:
# Random forest of 500 trees with at least 10 samples per leaf; seeded for repeatable runs
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    min_samples_leaf=10,
    random_state=42,
)
rnd_clf.fit(X=X_train_full_fashion, y=y_train_full_fashion)
y_pred_rf_fashion = rnd_clf.predict(X=X_test_fashion)

In [26]:
# Scores of the random forest on the Fashion-MNIST test set
print("Accuracy:", metrics.accuracy_score(y_true=y_test_fashion, y_pred=y_pred_rf_fashion))
print("F1 score:", metrics.f1_score(y_true=y_test_fashion, y_pred=y_pred_rf_fashion, average='micro'))




Accuracy: 0.868
F1 score: 0.868
