In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor

In [2]:
boston_dataset = load_boston()

In [3]:
X = pd.DataFrame(boston_dataset.data, columns = boston_dataset.feature_names)

In [4]:
X, Y = load_boston(return_X_y=True)

In [5]:
X = pd.DataFrame(X)

In [6]:
Y = pd.Series(Y)

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.25, random_state = 0)

In [8]:
def rss(y_left, y_right):
    return np.sum((y_left - np.mean(y_left))**2) + np.sum((y_right - np.mean(y_right))**2)

In [9]:
class DecisionTree():
    def __init__(self, X, y, depth, max_depth):
        self.X = X
        self.y = y
        self.depth = depth
        self.max_depth = max_depth
        self.min_rss = np.inf
        self.prediction = np.mean(self.y)
        self.split()

    def find_best_rule(self):
        self.best_feature, self.best_threshold= None, None
        for feature in self.X.columns:
            thresholds = self.X[feature].unique().tolist()
            thresholds.sort()
            thresholds = thresholds[1:]
            for t in thresholds:
                y_left_ix = self.X[feature] < t
                y_left, y_right = self.y[y_left_ix], self.y[~y_left_ix]
                t_rss = rss(y_left, y_right)
                if t_rss < self.min_rss:
                    self.min_rss = t_rss
                    self.best_threshold = t
                    self.best_feature = feature
    

    def split(self):
        if self.depth == self.max_depth or len(self.X) < 2:
            return 

        self.find_best_rule()
        left_ix = self.X[self.best_feature] < self.best_threshold
        self.lhs = DecisionTree(self.X[left_ix], self.y[left_ix], self.depth + 1, self.max_depth)
        self.rhs = DecisionTree(self.X[~left_ix], self.y[~left_ix], self.depth + 1, self.max_depth)
    
  
    def predict_row(self, sample):
        if self.depth == self.max_depth or len(self.X) < 2:
            return self.prediction
        t = self.lhs if sample[self.best_feature] < self.best_threshold  else self.rhs
        return t.predict_row(sample)

    def predict(self, x):
        preds = np.array([self.predict_row(xi) for xi in x.values])
        return preds

    
    

In [10]:
class MyRandomForest():
    def __init__(self, x, y, n_trees, max_depth, sample_size):
        self.x = x
        self.y = y
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.sample_size = sample_size
        np.random.seed(42)
        self.trees = [self.create_tree() for i in range(self.n_trees)]

    def create_tree(self):
        indeces = np.random.permutation(len(self.y))[:self.sample_size]
        return DecisionTree(self.x.iloc[indeces], self.y.iloc[indeces], 0, self.max_depth)

    def predict(self, x):
        return np.mean([t.predict(x) for t in self.trees], axis=0)

In [11]:
MRF = MyRandomForest(X_train, y_train, 50, 5 ,100)

In [12]:
pred_mrf = MRF.predict(X_test)

In [13]:
r2_score(pred_mrf, y_test)

0.5931180249572485

In [14]:
DecTree = DecisionTree(X_train, y_train, 0, 5)

In [15]:
preds_dt = DecTree.predict(X_test)
r2_score(preds_dt, y_test)

0.5738992575503652

In [16]:
regressor = RandomForestRegressor(n_estimators = 50, random_state = 0, 
                                  max_depth = 5, bootstrap=True, max_samples = 100)
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

In [17]:
r2_score(y_pred, y_test)

0.5630576167731276