<a href="https://colab.research.google.com/github/AlexeyTri/MLScratch/blob/main/Adaboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. week learner (decision stump)
```
if x < treshold = class -1
else class +1
```
2. 𝛆 - ERROR

$𝛆 = \frac{missclassification}{sample} = \frac{missclassification}{N}$ in the first iteration


$𝛆_t = \sum_{miss}weights$ if 𝛆 > error flip decision and the error = 1 - error

3. WEIGHTS

$w_0 = \frac{1}{N}$

$w = \frac{w \cdot \exp(- α \cdot y \cdot h(X)}{\sum w})$, where h(X) - prediction of t

4. PERFORMANCE

$α = 0.5 \log(\frac{1-𝛆_t}{𝛆})$

5. PREDICTION

$y = sign(\sum_t^T α_t \cdot h(X))$

In [10]:
import numpy as np
import sklearn 
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

In [21]:
# class DecisionStump:
#     def __init__(self):
#         self.polarity = 1
#         self.treshold = None
#         self.feature_idx = None
#         self.alpha = None

#     def predict(self, X):
#         n_samples = X.shape[0]
#         predictions = np.ones(n_samples)
#         X_columns = X[:, self.feature_idx]

#         if self.polarity == 1:
#             predictions[X_columns < self.treshold] = -1
#         else:
#             predictions[X_columns > self.treshold] = -1

#         return predictions

class DecisionStump:
    def __init__(self):
        self.polarity = 1
        self.feature_idx = None
        self.threshold = None
        self.alpha = None

    def predict(self, X):
        n_samples = X.shape[0]
        X_column = X[:, self.feature_idx]
        predictions = np.ones(n_samples)
        if self.polarity == 1:
            predictions[X_column < self.threshold] = -1
        else:
            predictions[X_column > self.threshold] = -1

        return predictions

In [24]:
class AdaBoost:
    def __init__(self, n_clf=5):
        self.n_clf = n_clf
        self.clfs = []

    def fit(self, X, y):
        n_samples, n_features = X.shape

        w = np.full(n_samples, (1/n_samples))

        self.clsf = []

        for _ in range(self.n_clf):
            clf = DecisionStump()
            min_error = float("inf")

            # greedy search to find best threshold and feature
            for feature_i in range(n_features):
                X_column = X[:, feature_i]
                thresholds = np.unique(X_column)

                for threshold in thresholds:
                    # predict with polarity 1
                    p = 1
                    predictions = np.ones(n_samples)
                    predictions[X_column < threshold] = -1

                    # Error = sum of weights of misclassified samples
                    misclassified = w[y != predictions]
                    error = sum(misclassified)

                    if error > 0.5:
                        error = 1 - error
                        p = -1

                    # store the best configuration
                    if error < min_error:
                        clf.polarity = p
                        clf.threshold = threshold
                        clf.feature_idx = feature_i
                        min_error = error
        # calculate alpha
        EPS = 1e-10
        clf.alpha = 0.5 * np.log((1.0 - min_error + EPS) / (min_error + EPS))

        # calculate predictions and update weights
        predictions = clf.predict(X)

        w *= np.exp(-clf.alpha * y * predictions)
        # Normalize to one
        w /= np.sum(w)

        # Save classifier
        self.clfs.append(clf)

    def predict(self, X):
        clf_preds = [clf.alpha * clf.predict(X) for clf in self.clfs]
        y_pred = np.sum(clf_preds, axis=0)
        y_pred = np.sign(y_pred)

        return y_pred

In [25]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

data = datasets.load_breast_cancer()
X, y = data.data, data.target

y[y == 0] = -1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

    # Adaboost classification with 5 weak classifiers
clf = AdaBoost(n_clf=5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc = accuracy(y_test, y_pred)
print("Accuracy:", acc)

Accuracy: 0.9122807017543859
