In [44]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from DecisionTree import DecisionTree
from collections import Counter

In [45]:
class RandomForest:
    """Bagging ensemble of ``DecisionTree`` classifiers combined by majority vote.

    Parameters
    ----------
    n_trees : int, default=1
        Number of trees trained by ``fit``.
    min_samples_split : int, default=2
        Forwarded unchanged to every ``DecisionTree``.
    max_depth : int, default=100
        Forwarded unchanged to every ``DecisionTree``.
    random_state : int or None, default=None
        Seed for the bootstrap sampling. With ``None`` the indices are drawn
        from NumPy's global random state, so ``np.random.seed`` still controls
        them exactly as before this parameter existed.
    """

    def __init__(self, n_trees=1, min_samples_split=2, max_depth=100, random_state=None):
        self.n_trees = n_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.random_state = random_state
        self.trees = []
        self._rng = self._make_rng()

    def _make_rng(self):
        """Return the object whose ``choice`` method draws the bootstrap indices."""
        if self.random_state is None:
            # The numpy.random module itself exposes the global-state ``choice``.
            return np.random
        return np.random.RandomState(self.random_state)

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on its own bootstrap sample of ``(X, y)``.

        Any trees left over from a previous ``fit`` are discarded, so the
        ensemble always holds exactly ``n_trees`` trees afterwards.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of samples.
        """
        # Fancy indexing in _bootstrap needs arrays; this also accepts lists.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}."
            )

        # Restart the generator so refitting with the same seed repeats the samples.
        self._rng = self._make_rng()

        trees = []
        for _ in range(self.n_trees):
            tree = DecisionTree(
                min_samples_split=self.min_samples_split,
                max_depth=self.max_depth,
            )
            X_sampled, y_sampled = self._bootstrap(X, y)
            tree.fit(X_sampled, y_sampled)
            trees.append(tree)

        # Replace rather than extend: the original appended, so a second fit
        # doubled the forest and let stale trees keep voting.
        self.trees = trees

    def _bootstrap(self, X, y):
        """Return ``(X, y)`` resampled with replacement to the original row count."""
        n_samples = X.shape[0]
        # Fall back to the global state for instances built without __init__.
        rng = getattr(self, "_rng", np.random)
        sample_idx = rng.choice(n_samples, n_samples, replace=True)
        return X[sample_idx], y[sample_idx]

    def predict(self, X):
        """Return a list with the majority-vote label for every row of ``X``.

        Ties go to the label that ``Counter.most_common`` lists first, i.e. the
        one voted for by the earliest tree in ``self.trees``.

        Raises
        ------
        ValueError
            If the forest holds no trees (``fit`` has not been called).
        """
        if not self.trees:
            raise ValueError("RandomForest has no trained trees; call fit() before predict().")

        # Each tree is expected to return one label per row of X.
        per_tree = [tree.predict(X) for tree in self.trees]

        # number_of_trees x samples  ->  samples x number_of_trees
        votes = np.array(per_tree).swapaxes(0, 1)
        return [Counter(row).most_common(1)[0][0] for row in votes]
        

In [46]:
# Breast-cancer dataset shipped with scikit-learn: X takes its `data`, y its `target`.
data = datasets.load_breast_cancer()
X = data.data
y = data.target

In [47]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24, test_size=0.2)

In [48]:
# RandomForest draws its bootstrap samples from NumPy's global generator, so
# seed it here; otherwise the accuracy below changes on every run.
np.random.seed(24)

clf = RandomForest(n_trees=3, max_depth=16)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

# Fraction of held-out rows whose majority-vote label matches the true label.
accuracy = np.mean(np.asarray(predictions) == y_test)
accuracy

0.956140350877193