Importing libraries

In [97]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

Defining class for a weak classifier, which simply returns predictions of 1 or -1 if the sample is above or below the threshold, for a given feature

In [98]:
class WeakClassifier:
    def __init__(self):
        self.check_below_threshold = True
        self.feature_num = None
        self.threshold = None
        self.alpha = None

    def predict(self, X):
        num_samples = X.shape[0]
        X_column = X[:, self.feature_num]

        predictions = np.ones(num_samples)
        if self.check_below_threshold:
            predictions[X_column < self.threshold] = -1
        else:
            predictions[X_column > self.threshold] = -1
        return predictions

In [99]:
class MyAdaBoost:
    def __init__(self, num_classifiers=5):
        self.num_classifiers = num_classifiers
        self.classifier_list = []

    def fit(self, X, y):
        num_rows, num_features = X.shape

        # Initialize the weights
        weights = np.full(num_rows, (1 / num_rows))

        for _ in range(self.num_classifiers):
            clf = WeakClassifier()

            min_error = float("inf")
            for feature_num in range(num_features):
                X_column = X[:, feature_num]
                for threshold in np.unique(X_column):
                    check_below = True
                    predictions = np.ones(num_rows)
                    predictions[X_column < threshold] = -1
                    
                    misclassified_preds = weights[y != predictions]
                    error = sum(misclassified_preds)

                    if error > 0.5:
                        error = 1 - error
                        check_below = False

                    if error < min_error:
                        min_error = error
                        clf.check_below_threshold = check_below
                        clf.threshold = threshold
                        clf.feature_num = feature_num

            EPSILON = 1e-10
            clf.alpha = 0.5 * np.log((1 - error) / (error + EPSILON))

            predictions = clf.predict(X)
            weights *= np.exp(-clf.alpha * y * predictions)
            weights /= np.sum(weights)

            self.classifier_list.append(clf)

    def predict(self, X):
        clf_predictions = [clf.alpha * clf.predict(X) for clf in self.classifier_list]
        y_pred = np.sum(clf_predictions, axis=0)
        return np.sign(y_pred)


In [100]:
def get_accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)

In [101]:
data = datasets.load_breast_cancer()
X = data.data
y = data.target
y[y == 0] = -1

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [103]:
clf = MyAdaBoost(num_classifiers=5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc = get_accuracy(y_test, y_pred)
print(acc)

0.956140350877193


In [104]:
data = np.genfromtxt('heart.csv', delimiter=',')
X = data[:, :-1]
y = data[:, -1]
y[y == 0] = -1

In [105]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [106]:
clf = MyAdaBoost(num_classifiers=15)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc = get_accuracy(y_test, y_pred)
print(acc)

0.7658536585365854
