In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [41]:
class DecisionStump:
    """One-feature threshold classifier used as the weak learner for boosting.

    Attributes (filled in by the training code):
        feature_idx: column of X the stump looks at.
        threshold: split value on that column.
        alpha: weight ("say") of this stump in the ensemble vote.
        polarity: 1 labels values strictly below the threshold as -1 and the
            rest as +1; any other value flips every one of those labels.
    """

    def __init__(self):
        self.feature_idx = None
        self.threshold = None
        self.alpha = None # importance of this classifier
        self.polarity = 1

    def predict(self, X):
        """Return a float array with one +1/-1 label per row of X."""
        X_column = X[:, self.feature_idx]

        # Base rule (polarity 1): -1 strictly below the threshold, +1 otherwise.
        predictions = np.where(X_column < self.threshold, -1.0, 1.0)

        # Any other polarity is the exact negation of the base rule, so a
        # sample sitting exactly on the threshold is flipped as well. The
        # training search scores a flipped stump as `1 - error`, which only
        # holds when the flip is exact.
        if self.polarity != 1:
            predictions = -predictions

        return predictions


class AdaBoost:
    """AdaBoost ensemble of decision stumps for labels encoded as -1/+1.

    n_clf is the number of boosting rounds, i.e. how many stumps fit() trains.
    """

    def __init__(self, n_clf=5):
        self.n_clf = n_clf
        # Defined up front so predict() on an unfitted model does not fail
        # with AttributeError.
        self.clfs = []

    def fit(self, X, y):
        """Train n_clf stumps on X (2-D array) and y (one label per row).

        The weight update multiplies by y * predictions, so y is expected to
        hold -1/+1 values. Any previously fitted stumps are discarded.
        """
        n_samples, n_features = X.shape

        # start from uniform sample weights
        w = np.full(n_samples, 1 / n_samples)
        self.clfs = []

        for _ in range(self.n_clf):
            # for each classifier, find the feature and threshold with the
            # lowest weighted error
            clf = DecisionStump()
            min_total_error = float('inf')

            # greedy search for best feature and threshold
            for feature_idx in range(n_features):
                X_column = X[:, feature_idx]

                # candidate thresholds are the unique values of the column
                thresholds = np.unique(X_column)

                # iterate over the thresholds and find the best one
                for threshold in thresholds:
                    p = 1
                    predictions = np.ones(n_samples)
                    predictions[X_column < threshold] = -1

                    misclassified = w[y != predictions]

                    error = np.sum(misclassified)
                    # a rule that is wrong more than half the time is better
                    # used flipped: negating every label turns error e into 1 - e
                    if error > 0.5:
                        error = 1 - error
                        p = -1

                    if error < min_total_error:
                        min_total_error = error
                        clf.polarity = p
                        clf.threshold = threshold
                        clf.feature_idx = feature_idx

            # calculate the say of each decision stump; the small constant
            # keeps the ratio finite when the best error is exactly 0
            clf.alpha = 0.5 * np.log((1 - min_total_error) / (min_total_error + 1e-10))

            # calculate predictions
            predictions = clf.predict(X)

            # If a prediction and its label differ, y * predictions is
            # negative, so multiplying by -clf.alpha gives a positive exponent
            # and increases the weight of that misclassified sample.
            w = w * np.exp(-clf.alpha * y * predictions)
            w /= np.sum(w) # normalize the weights

            self.clfs.append(clf)

    def predict(self, X):
        """Return the sign of the alpha-weighted vote for every row of X.

        Values are +1 or -1, or 0 where the weighted vote is exactly tied.
        A model with no stumps (unfitted, or fitted with n_clf=0) returns an
        array of zeros with one entry per row.
        """
        if not self.clfs:
            # With nothing to sum, the vote would collapse to a scalar 0.0;
            # return one entry per sample instead.
            return np.zeros(X.shape[0])

        preds = [clf.alpha * clf.predict(X) for clf in self.clfs]
        y_pred = np.sign(np.sum(preds, axis=0))

        return y_pred

In [42]:
# Load the breast-cancer dataset and recode the class labels from 0/1 to -1/+1,
# the encoding the AdaBoost weight update expects.
data = datasets.load_breast_cancer()
X = data.data
# Build a new label array instead of overwriting data.target in place, so the
# loaded dataset object still lines up with data.target_names.
y = np.where(data.target == 0, -1, data.target)

In [43]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=24,
)

In [49]:
# Try ensembles of 1 to 9 stumps and remember the size with the highest test
# accuracy. The sweep starts at 1: an ensemble with no stumps cannot label
# anything, so fitting and scoring n_clf=0 is wasted work.
# NOTE(review): the sweep is scored on the test split, so best_accuracy is an
# optimistic estimate; a separate validation split would avoid that.
best_accuracy = 0
best_n_clf = None
for n_clf in range(1, 10):

    clf = AdaBoost(n_clf=n_clf)
    clf.fit(X_train, y_train)

    preds = clf.predict(X_test)

    # percentage of test samples labelled correctly
    accuracy = (np.sum(y_test == preds) / len(y_test)) * 100

    # strict '>' keeps the smallest ensemble when several sizes tie
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_n_clf = n_clf

In [50]:
# Report the best test accuracy from the sweep and the ensemble size that reached it.
summary = f'Accuracy {best_accuracy}, n_clf {best_n_clf}'
print(summary)

Accuracy 97.36842105263158, n_clf 7


In [56]:
# Print the proper divisors of 315300 (every divisor except the number
# itself), smallest first.
number = 315300
for i in range(1, number):
    if number % i:
        # non-zero remainder: i does not divide the number
        continue
    print(i)

1
2
3
4
5
6
10
12
15
20
25
30
50
60
75
100
150
300
1051
2102
3153
4204
5255
6306
10510
12612
15765
21020
26275
31530
52550
63060
78825
105100
157650
