In [43]:
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.model_selection
from sklearn.ensemble import RandomForestRegressor

from decision_tree.DecisionTreeRegressor import CustomDecisionTreeRegressor
from util.errors import coefficient_of_correlation

# Load the California housing data and hold out 20% of the rows for evaluation.
housing = sklearn.datasets.fetch_california_housing()
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    housing.data,
    housing.target,
    test_size=0.2,
    random_state=42,
)

# Baseline: scikit-learn's random forest with the same tree count and feature
# budget that the custom forest is given further down.
model = RandomForestRegressor(
    n_estimators=10,
    max_features=3,
    random_state=42,  # seed the forest too, so the baseline score is reproducible
)
model.fit(x_train, y_train)
model.score(x_test, y_test)


0.798858878112327

In [49]:
%load_ext autoreload
%autoreload 2

class CustomRandomForestRegressor:
    """Bagged ensemble of ``CustomDecisionTreeRegressor`` trees.

    Every tree is trained on a bootstrap resample (rows drawn with
    replacement) of the training data; the forest's prediction for a sample
    is the mean of the individual tree predictions.

    Parameters
    ----------
    n_estimators : int, default=10
        Number of trees to train.
    max_features : int, default=2
        Forwarded unchanged to every ``CustomDecisionTreeRegressor``.
    random_state : int or None, default=None
        Seed for the bootstrap row sampling. ``None`` keeps drawing from
        NumPy's global random state, as before this parameter existed.
        NOTE(review): this only seeds the bootstrap draw made in this class;
        any randomness inside ``CustomDecisionTreeRegressor`` (e.g. feature
        sub-sampling) is not controlled from here - confirm against that class.

    Attributes
    ----------
    trees : list or None
        Fitted trees, ``None`` until ``fit`` has been called.
    indices_for_trees : list or None
        For each tree, the row indices of its bootstrap sample.
    """

    def __init__(self,
        n_estimators=10,
        max_features=2,
        random_state=None
    ):
        self.trees = None
        self.indices_for_trees = None
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.random_state = random_state
        self._rng = self._make_rng()

    def fit(self, x, y):
        """Train ``n_estimators`` trees, each on its own bootstrap sample.

        Prints one out-of-bag (OOB) score line per tree as a training
        diagnostic. Any previously fitted trees are discarded.

        Parameters
        ----------
        x : array-like
            Training features, one row per sample.
        y : array-like
            Training targets, aligned with the rows of ``x``.
        """
        # Integer-array indexing below needs real ndarrays; lists or
        # DataFrames would otherwise fail or index columns instead of rows.
        x = np.asarray(x)
        y = np.asarray(y)

        # Re-create the generator so refitting with the same seed repeats
        # the same bootstrap draws.
        self._rng = self._make_rng()
        self.indices_for_trees = []
        self.trees = []

        for i in range(self.n_estimators):
            x_bootstrap, y_bootstrap, indices = self._bootstrap_dataset(x, y)
            self.indices_for_trees.append(indices)
            tree = CustomDecisionTreeRegressor(
                min_samples_split=2,
                min_samples_leaf=1,
                max_features=self.max_features,
            )
            tree.fit(x_bootstrap, y_bootstrap, verbose=False)
            self.trees.append(tree)

            x_oob, y_oob = self._get_oob(x, y, indices)
            if len(x_oob) == 0:
                # On tiny datasets a bootstrap sample can cover every row,
                # leaving nothing to score; do not let a diagnostic abort fit.
                print(f"OOB score for tree {i}: n/a (no out-of-bag samples)")
            else:
                print(f"OOB score for tree {i}: {tree.score(x_oob, y_oob)}")

    def predict(self, x):
        """Predict by averaging the predictions of all fitted trees.

        Parameters
        ----------
        x : array-like
            Either a single sample (1-D) or a batch of samples (2-D, one
            sample per row).

        Returns
        -------
        A scalar mean prediction for a single sample, or a 1-D array holding
        one mean prediction per row for a batch.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.trees is None:
            raise RuntimeError(
                "CustomRandomForestRegressor is not fitted yet; call fit() first."
            )

        x = np.asarray(x)
        if x.ndim > 1:
            # Average per row. A plain np.mean over all tree outputs would
            # collapse the whole batch into one number.
            return np.array([self.predict(row) for row in x])

        return np.mean([tree.predict(x) for tree in self.trees])

    def score(self, x_test, y_test):
        """Score the forest on held-out data.

        Predicts every row of ``x_test`` individually and returns the result
        of ``coefficient_of_correlation(y_test, y_pred)``.
        """
        y_pred = [self.predict(sample) for sample in np.asarray(x_test)]
        return coefficient_of_correlation(y_test, y_pred)

    def _get_oob(self, x_original, y_original, indices):
        """Return the rows of ``x_original``/``y_original`` that are not in ``indices``."""
        unique_picked = np.unique(indices)
        all_idx = np.arange(len(x_original))
        oob_idx = np.setdiff1d(all_idx, unique_picked)

        return x_original[oob_idx], y_original[oob_idx]

    def _bootstrap_dataset(self, x, y):
        """Draw ``len(x)`` rows with replacement.

        Returns the resampled ``x``, the resampled ``y`` and the row indices
        that were drawn.
        """
        n_rows = x.shape[0]
        indices = self._rng.choice(n_rows, n_rows, replace=True)
        x_bootstrap = x[indices]
        y_bootstrap = y[indices]
        return x_bootstrap, y_bootstrap, indices

    def _make_rng(self):
        """Return the source of randomness used for bootstrap sampling."""
        if self.random_state is None:
            # The numpy.random module itself exposes choice() backed by the
            # global state, which is what the unseeded implementation used.
            return np.random
        return np.random.RandomState(self.random_state)


# Train the hand-rolled forest with the same tree count and feature budget as
# the scikit-learn baseline, then report its score on the held-out split.
rfr = CustomRandomForestRegressor(
    max_features=3,
    n_estimators=10,
)
rfr.fit(x_train, y_train)
rfr.score(x_test, y_test)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
OOB score for tree 0: 0.6189976225276278
OOB score for tree 1: 0.6228937188245025
OOB score for tree 2: 0.5745445567536531
OOB score for tree 3: 0.6151780658715775
OOB score for tree 4: 0.6478959756612954
OOB score for tree 5: 0.6664070385179643
OOB score for tree 6: 0.6558705462027525
OOB score for tree 7: 0.6041126028984901
OOB score for tree 8: 0.6072984172230245
OOB score for tree 9: 0.5454630197957213


np.float64(0.8011017227662364)