In [1]:
import numpy as np
import copy
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [2]:
# Scorer name passed to cross_val_score as `scoring`.
metric = 'accuracy'

# Cross-validation splitter: 10 stratified folds, shuffled with a fixed seed
# so the fold assignment is the same on every run.
kf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

In [3]:
# Load the breast-cancer dataset bundled with scikit-learn (sklearn.datasets).
cancer = load_breast_cancer()

In [4]:
# Features are used unchanged.
X_train = cancer.data
# Shift and scale the labels so that a 0 becomes -1.0 and a 1 becomes +1.0;
# MyAdaBoost.predict takes the sign of a weighted vote and needs this encoding.
y_train = 2 * (cancer.target - 0.5)

In [5]:
# Sanity check: display the dimensions of the feature matrix and label vector.
X_train.shape, y_train.shape

((569, 30), (569,))

In [6]:
def performance(model):
    """Return the mean cross-validated score of ``model``.

    The model is evaluated with ``cross_val_score`` on the notebook-level
    ``X_train`` / ``y_train`` arrays, using the ``kf`` splitter, the
    ``metric`` scorer and four parallel jobs.

    Parameters
    ----------
    model : estimator
        Object accepted by ``cross_val_score``.

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    fold_scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=kf,
        scoring=metric,
        n_jobs=4,
    )
    return fold_scores.mean()

In [7]:
class MyAdaBoost(BaseEstimator):
    """Binary AdaBoost that re-weights the training set by resampling.

    Each boosting round fits a fresh copy of ``base_estimator`` on the current
    sample, measures its error on that same sample, derives its "amount of
    say" and then draws a new, equally sized sample in which misclassified
    rows are more likely to appear.  Because the emphasis is carried by the
    resampled data, the per-row weights start uniform again in every round.

    ``predict`` returns -1.0 / +1.0, so the training labels are expected to
    use that encoding.

    Parameters
    ----------
    base_estimator : estimator, default=DecisionTreeClassifier(max_depth=3)
        Template for the weak learners.  It is never fitted itself; every
        round works on a deep copy.
    n_estimators : int, default=10
        Number of boosting rounds.
    learning_rate : float, default=1.0
        Shrinks the amount of say used when re-weighting the rows (and scales
        the votes in ``predict``).  With the default of 1.0 the update is the
        plain AdaBoost one.
    random_state : None, int or numpy.random.RandomState, default=None
        Source of randomness for the resampling.  ``None`` uses NumPy's
        global ``np.random`` state.

    Attributes
    ----------
    estimators : list
        Fitted weak learners, one per round (rebuilt on every ``fit``).
    amount_of_says : list of float
        Unscaled amount of say of each weak learner, aligned with
        ``estimators``.
    """

    # Training errors are clipped to [_EPS, 1 - _EPS] so the log-odds below
    # stay finite even for a perfect (or perfectly wrong) weak learner.
    _EPS = 1e-10

    def __init__(self, base_estimator=DecisionTreeClassifier(max_depth=3), n_estimators=10, learning_rate=1.0,
                 random_state=None):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.random_state = random_state
        self.estimators = []
        self.amount_of_says = []

    def _make_rng(self):
        """Return the object whose ``rand`` method supplies the random draws."""
        seed = self.random_state
        if seed is None:
            return np.random
        if isinstance(seed, np.random.RandomState):
            return seed
        return np.random.RandomState(seed)

    def generate_weighted_data(self, data, target, weights, rng=None):
        """Draw ``len(data)`` rows with replacement, row ``i`` with probability ``weights[i]``.

        Parameters
        ----------
        data : array-like of shape (n_samples, ...)
        target : array-like of shape (n_samples,)
        weights : array-like of shape (n_samples,)
            Sampling probabilities; expected to sum to 1.
        rng : object with a ``rand(n)`` method, optional
            Defaults to the global ``np.random`` module.

        Returns
        -------
        (ndarray, ndarray)
            The resampled rows and their labels.
        """
        rng = np.random if rng is None else rng
        data = np.asarray(data)
        target = np.asarray(target)
        dist = np.cumsum(weights)
        draws = rng.rand(data.shape[0])
        # First index whose cumulative weight reaches the draw (inverse-CDF sampling).
        picks = np.searchsorted(dist, draws, side='left')
        # Rounding can leave dist[-1] slightly below 1; send such draws to the
        # last row instead of silently dropping them.
        picks = np.minimum(picks, len(dist) - 1)
        return data[picks], target[picks]

    def fit(self, X, y):
        """Fit ``n_estimators`` weak learners on successively resampled data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Labels; expected to be -1 / +1 (see ``predict``).

        Returns
        -------
        self
        """
        data = np.asarray(X)
        target = np.asarray(y)
        n_samples = data.shape[0]
        rng = self._make_rng()

        # Start from scratch so that calling fit twice does not pile up rounds.
        self.estimators = []
        self.amount_of_says = []

        for _ in range(self.n_estimators):
            weights = np.full(n_samples, 1.0 / n_samples)

            # Work on a copy so the template (possibly the shared default
            # argument) is never mutated.
            estimator = copy.deepcopy(self.base_estimator)
            estimator.fit(data, target)

            # Errors must be judged against the labels of the rows the learner
            # was actually evaluated on, i.e. the current resample.
            missed = estimator.predict(data) != target
            error_rate = np.clip(weights[missed].sum(), self._EPS, 1.0 - self._EPS)
            say = 0.5 * np.log((1.0 - error_rate) / error_rate)

            # Up-weight mistakes, down-weight hits, then renormalise.
            step = self.learning_rate * say
            weights = weights * np.exp(np.where(missed, step, -step))
            weights = weights / weights.sum()

            self.estimators.append(estimator)
            self.amount_of_says.append(say)
            data, target = self.generate_weighted_data(data, target, weights, rng)
        return self

    def predict(self, X):
        """Return the sign of the say-weighted vote of all weak learners.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            +1.0 where the weighted vote is non-negative, -1.0 otherwise
            (a tied vote is resolved to +1.0 so the result is always a label).
        """
        X = np.asarray(X)
        y_scores = np.zeros(X.shape[0])
        for say, estimator in zip(self.amount_of_says, self.estimators):
            y_scores += self.learning_rate * say * estimator.predict(X)
        return np.where(y_scores >= 0, 1.0, -1.0)

In [8]:
# Cross-validated accuracy of MyAdaBoost with its default settings.
# NOTE(review): MyAdaBoost resamples with NumPy's global RNG, which this
# notebook never seeds, so the value shown below may differ between runs.
performance(MyAdaBoost())

0.9243647912885662