In [22]:
import numpy as np
from scipy.stats import mode

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.datasets import make_classification, make_regression

## Random forest (bagging) for both classification and regression

In [23]:
class RandomForest:
    """Bagged ensemble of decision trees with per-tree feature subsampling.

    Every tree is trained on a bootstrap sample of the rows and on a random
    subset of the columns. Per-tree predictions are combined by majority vote
    for classification and by averaging for regression.

    Parameters
    ----------
    n_ensembles : int, default=100
        Number of trees to train.
    learner_type : {'classification', 'regression'}, default='classification'
        Selects the tree class and the way per-tree predictions are combined.

    Raises
    ------
    ValueError
        If ``learner_type`` is not one of the two supported values.
    """

    def __init__(self, n_ensembles=100, learner_type='classification'):
        self.n_ensembles = n_ensembles
        self.weak_learner = None
        self.learner_type = learner_type
        self.feature_size = 0
        # List of (fitted tree, column indices the tree was trained on).
        self.models = []
        self.set_weak_learner()

    def set_weak_learner(self):
        """Bind ``self.weak_learner`` to the tree class matching ``learner_type``."""
        if self.learner_type == 'classification':
            self.weak_learner = DecisionTreeClassifier
        elif self.learner_type == 'regression':
            self.weak_learner = DecisionTreeRegressor
        else:
            raise ValueError('Invalid learner type, use "classification" or "regression"')

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Train ``n_ensembles`` trees on bootstrap samples of ``(X, y)``.

        Parameters
        ----------
        X : array-like, indexed as (n_samples, n_features)
        y : array-like with one target per row of ``X``

        Any trees left over from a previous call are discarded first.
        """
        X, y = np.asarray(X), np.asarray(y)
        n_samples, n_features = X.shape
        # Columns per tree: sqrt(n_features) for classification,
        # n_features / 3 for regression, and never fewer than one.
        if self.learner_type == 'classification':
            self.feature_size = max(1, int(np.sqrt(n_features)))
        elif self.learner_type == 'regression':
            self.feature_size = max(1, n_features // 3)

        # Refitting must not keep trees from an earlier fit: they would be
        # mixed into the vote and may reference columns that no longer exist.
        self.models = []

        # NOTE: the trees are independent, so this loop could be parallelized.
        for _ in range(self.n_ensembles):
            random_features = np.random.choice(n_features, size=self.feature_size, replace=False)
            bootstrapped_indices = np.random.choice(n_samples, size=n_samples, replace=True)
            X_data = X[np.ix_(bootstrapped_indices, random_features)]
            y_data = y[bootstrapped_indices]

            model = self.weak_learner()
            model.fit(X_data, y_data)
            self.models.append((model, random_features))

    def predict(self, X: np.ndarray):
        """Return one combined prediction per row of ``X``.

        Returns
        -------
        numpy.ndarray with one entry per row of ``X``: the most frequent label
        for classification (ties resolve to the smallest label), or the mean
        of the tree outputs for regression.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if not self.models:
            raise RuntimeError('RandomForest is not fitted yet, call fit() first')

        X = np.asarray(X)
        # One row per tree, one column per sample.
        stacked = np.array([model.predict(X[:, features]) for model, features in self.models])
        if self.learner_type == 'classification':
            return self._majority_vote(stacked)
        return np.mean(stacked, axis=0)

    @staticmethod
    def _majority_vote(stacked):
        """Column-wise most frequent value of a (n_models, n_samples) array.

        Implemented with NumPy so the result always has one entry per sample,
        regardless of how the installed SciPy shapes ``scipy.stats.mode``
        output, and so that non-numeric labels are supported.
        """
        labels, encoded = np.unique(stacked, return_inverse=True)
        # The inverse is flat on older NumPy and input-shaped on newer ones.
        encoded = encoded.reshape(stacked.shape)
        n_samples = stacked.shape[1]
        counts = np.zeros((labels.size, n_samples), dtype=np.intp)
        # counts[label, sample] += 1 for every (tree, sample) vote.
        np.add.at(counts, (encoded, np.arange(n_samples)), 1)
        return labels[np.argmax(counts, axis=0)]

# Boosting

## AdaBoost

In [None]:
class AdaBoost:
    """Discrete AdaBoost for binary classification.

    Learners are trained one after another on re-weighted samples: samples the
    current learner gets wrong gain weight, samples it gets right lose weight.
    The final prediction is an ``alpha``-weighted vote of all learners.

    Any two distinct label values are accepted (e.g. {0, 1} or {-1, +1}); they
    are mapped to -1/+1 internally and mapped back in ``predict``.

    Parameters
    ----------
    model_type : {'tree'}, default='tree'
        Kind of base learner.
    n_estimators : int, default=100
        Maximum number of boosting rounds.

    Raises
    ------
    ValueError
        If ``model_type`` is not supported.
    """

    def __init__(self, model_type='tree', n_estimators=100):
        self.model_type = model_type
        self.model = None
        self.n_estimators = n_estimators
        self.weights = None
        # Sorted pair of labels seen in fit; classes_[1] is the "+1" class.
        self.classes_ = None
        self.models = []
        self.alphas = []
        self.set_model()

    def set_model(self):
        """Bind ``self.model`` to the base-learner class for ``model_type``."""
        if self.model_type == 'tree':
            # NOTE(review): an unconstrained tree usually fits the training set
            # perfectly, in which case boosting ends after one round. Consider
            # a shallow tree (e.g. max_depth=1) as the weak learner.
            self.model = DecisionTreeClassifier
        else:
            raise ValueError('Invalid model type, use "tree"')

    def _to_signed(self, labels):
        """Map labels to +1.0 (equal to ``classes_[1]``) or -1.0 (otherwise)."""
        return np.where(np.asarray(labels) == self.classes_[1], 1.0, -1.0)

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Run up to ``n_estimators`` boosting rounds on ``(X, y)``.

        Parameters
        ----------
        X : array-like whose first axis indexes samples
        y : array-like with exactly two distinct label values

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly two distinct labels.

        State from a previous call is discarded. Boosting stops early once a
        learner classifies every training sample correctly.
        """
        X, y = np.asarray(X), np.asarray(y)
        classes = np.unique(y)
        if classes.size != 2:
            raise ValueError('AdaBoost supports binary classification only, '
                             'got %d distinct labels' % classes.size)
        self.classes_ = classes
        self.models, self.alphas = [], []

        n_samples = X.shape[0]
        self.weights = np.ones(n_samples) / n_samples
        y_signed = self._to_signed(y)
        eps = 1e-10

        for _ in range(self.n_estimators):
            model = self.model()
            model.fit(X, y, sample_weight=self.weights)

            y_hat = model.predict(X)

            # Weighted training error; the weights always sum to one.
            error_m = np.sum(self.weights * (y_hat != y))

            # Clip on both sides so neither error 0 nor error 1 produces an
            # infinite alpha (and NaN weights after normalization).
            clipped = min(max(error_m, eps), 1 - eps)
            alpha = 0.5 * np.log((1 - clipped) / clipped)

            self.models.append(model)
            self.alphas.append(alpha)

            if error_m <= 0:
                # A perfect learner leaves the normalized weights unchanged,
                # so further rounds would only repeat the same fit.
                break

            # exp(-alpha) for correctly classified samples, exp(+alpha) for
            # misclassified ones; valid because both factors are in {-1, +1}.
            self.weights = self.weights * np.exp(-alpha * self._to_signed(y_hat) * y_signed)
            self.weights /= np.sum(self.weights)

    def predict(self, X: np.ndarray):
        """Return the alpha-weighted majority label for each row of ``X``.

        Returns
        -------
        numpy.ndarray of labels drawn from the two values seen in ``fit``.
        An exactly tied vote resolves to the smaller label.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if not self.models:
            raise RuntimeError('AdaBoost is not fitted yet, call fit() first')

        X = np.asarray(X)
        signed = np.array([self._to_signed(model.predict(X)) for model in self.models])
        scores = np.array(self.alphas) @ signed
        return np.where(scores > 0, self.classes_[1], self.classes_[0])