### Random forest from scratch

In [None]:
import sys

sys.path.insert(0, "..")

In [None]:
import pandas as pd
import numpy as np
import math

import helper as hlp

from pandas.api.types import is_categorical_dtype

In [None]:
# read the wine reviews CSV (low_memory disabled) and discard the
# "Unnamed: 0" and "description" columns in one chained expression
raw = (
    pd.read_csv("./data/wine_reviews.csv", low_memory=False)
    .drop(columns=["Unnamed: 0", "description"])
)

# preview the first three rows
raw.head(3)

In [None]:
# transform non-numerical data to categorical
# the return value is discarded, so the helper presumably mutates `raw` in place - verify in helper
# NOTE(review): "description" was already dropped from `raw` in the loading cell - confirm this label is still intended
hlp.trans_categorical(raw, labels = ["description"])

In [None]:
# transform/normalize numerical data
# the helper returns a (features, targets) pair; "points" presumably names the target column - confirm in helper
# NOTE(review): the keyword `suffle_data_frame` is spelled as the call passes it; renaming it requires changing the helper too
features, targets = hlp.trans_numerical(raw, "points", suffle_data_frame = True)

In [None]:
# preview the first three rows of the feature frame
features.head(3)

In [None]:
# smaller subset of training and validation data
# training_set is later unpacked as the (x, y) arguments of DecisionNode.split
# presumably `threshold` is the validation fraction and `subset` a row cap - confirm in helper
training_set, validation_set = hlp.split_data(features, targets, threshold = 1 / 8, subset = 2000)

In [None]:
class DecisionNode():
    """Node of a regression tree that grows its own children recursively.

    A node searches every feature column for the threshold that minimises the
    mean of the standard deviations of the two resulting target subsets, then
    fits a left child (rows with feature <= threshold) and a right child on
    the partitions. A node for which no admissible split exists stays a leaf
    and predicts the mean of the targets it was fitted on.

    Parameters:
        criterion: stored on the node; the split search in this class does
            not consult it and always uses the standard-deviation score.
        min_samples_leaf: minimum number of samples each side of a split
            must contain (values below 1 are treated as 1).
    """

    def __init__(self, criterion="mse", min_samples_leaf=3):

        self.criterion = criterion
        self.min_samples_leaf = min_samples_leaf

        # [left, right] child nodes; None while the node is a leaf
        self.nodes = None

        # best (lowest) split score found so far
        self.split_score = np.inf

        # feature (column, value) to be split on
        self.feature_column, self.feature_value = None, None

        # mean of the targets seen by fit; None until the node is fitted
        self.value = None

    def fit(self, x, y):
        """Fit this node, and recursively its children, to the dataset.

        Parameters:
            x: pandas DataFrame of features (only `x.columns`, column access
                and boolean row selection are used).
            y: 1D targets, a pandas Series or array-like; rows are matched to
                `x` by position, never by index label.

        Raises:
            ValueError: if `y` is empty.
        """

        targets = np.asarray(y, dtype=float)

        if targets.size == 0:
            raise ValueError("cannot fit a DecisionNode on an empty dataset")

        # start from a clean state so that a node can be refitted
        self.nodes = None
        self.split_score = np.inf
        self.feature_column, self.feature_value = None, None

        # leaf prediction, also used as fallback for inner nodes
        self.value = float(targets.mean())

        for column in x.columns:

            # find best split for a column (updates the node state in place)
            self.split(x, targets, column)

        # no admissible split found: the node stays a leaf
        if self.split_score == np.inf:
            return

        # the threshold row itself belongs to the left side, exactly as it
        # was counted when the split was scored
        goes_left = np.asarray(x[self.feature_column] <= self.feature_value)

        # splitting features and targets by position
        partitions = (
            (x[goes_left], targets[goes_left]),
            (x[~goes_left], targets[~goes_left]),
        )

        # creating left/right nodes with the same hyper-parameters
        self.nodes = [
            DecisionNode(self.criterion, self.min_samples_leaf)
            for _ in partitions
        ]

        for node, (subset_x, subset_y) in zip(self.nodes, partitions):

            # recursively fit two models with smaller datasets
            node.fit(subset_x, subset_y)

    def predict(self, x):
        """Predict a target for every row of `x`.

        Parameters:
            x: pandas DataFrame with the columns the node was fitted on.

        Returns:
            1D numpy array of floats with one prediction per row.

        Raises:
            ValueError: if the node has not been fitted.
        """

        if self.value is None:
            raise ValueError("DecisionNode must be fitted before calling predict")

        predictions = np.full(x.shape[0], self.value, dtype=float)

        # leaf: every row receives the stored mean
        if self.nodes is None:
            return predictions

        goes_left = np.asarray(x[self.feature_column] <= self.feature_value)

        for node, mask in zip(self.nodes, (goes_left, ~goes_left)):

            # only descend when some rows are actually routed to that child
            if mask.any():
                predictions[mask] = node.predict(x[mask])

        return predictions

    def std(self, sum_acc, sum_power_acc, count):
        """Compute std(x) = sqrt(E(x ^ 2) - E(x) ^ 2) from running sums.

        Parameters:
            sum_acc: sum of the values.
            sum_power_acc: sum of the squared values.
            count: number of values.

        Returns:
            The standard deviation; a variance that rounding error pushed
            slightly below zero is clamped to zero instead of yielding NaN.
        """

        variance = sum_power_acc / count - (sum_acc / count) ** 2

        return np.sqrt(np.maximum(variance, 0.0))

    def split(self, x, y, column):
        """Search `column` for a better split and record it on the node.

        Rows are visited in ascending order of the feature value while
        running sums of the targets are moved from the right side to the left
        side. A candidate is scored with the mean of the standard deviations
        of both sides; when it beats `self.split_score`, the score, column
        and threshold value are stored on the node.

        Parameters:
            x: pandas DataFrame of features.
            y: 1D targets aligned with `x` by position.
            column: name of the column to evaluate.
        """

        values = np.asarray(x[column])
        targets = np.asarray(y, dtype=float)

        # visit rows in ascending feature order; both arrays are positional,
        # so the index labels of x and y never need to agree
        order = np.argsort(values, kind="stable")
        sorted_values, sorted_targets = values[order], targets[order]

        total = targets.size
        min_leaf = max(1, self.min_samples_leaf)

        # sum accumulators
        left_acc, right_acc = 0.0, sorted_targets.sum()

        # sum of power to 2 accumulators
        left_power_acc, right_power_acc = 0.0, np.sum(sorted_targets ** 2)

        # stop early enough for the right side to keep min_leaf samples
        for index in range(total - min_leaf):

            target = sorted_targets[index]

            # move the current sample from the right side to the left side
            left_acc += target
            right_acc -= target
            left_power_acc += target ** 2
            right_power_acc -= target ** 2

            # left side still smaller than the minimum leaf size
            if index < min_leaf - 1:
                continue

            # a threshold cannot separate equal feature values
            if sorted_values[index] == sorted_values[index + 1]:
                continue

            left_count = index + 1

            # mean of standard deviation of 2 subsets
            score = (self.std(left_acc, left_power_acc, left_count) +
                     self.std(right_acc, right_power_acc, total - left_count)) / 2

            if score < self.split_score:

                # update current best score
                self.split_score = score

                # update current best column and value for the split
                self.feature_column, self.feature_value = column, sorted_values[index]
            

class DecisionTree():
    """Regression tree that owns a single root DecisionNode.

    Parameters:
        criterion: forwarded to the root node when fitting.
        max_features: stored on the tree; nothing in this class reads it.
        min_samples_leaf: forwarded to the root node when fitting.
    """

    def __init__(self, criterion="mse", max_features=0.6, min_samples_leaf=4):

        self.criterion = criterion
        self.max_features = max_features
        self.min_samples_leaf = min_samples_leaf

        # root decision node; None until fit is called
        self.root = None

    def fit(self, x, y):
        """Create a fresh root node and fit it to features `x` and targets `y`."""

        # initialize decision root node
        self.root = DecisionNode(self.criterion, self.min_samples_leaf)

        # fit decision tree to our dataset
        self.root.fit(x, y)

    def predict(self, x):
        """Return the root node's prediction for `x`, or None when unfitted."""

        # identity check: a custom __eq__/__ne__ on the root must not be
        # able to make a fitted tree look unfitted
        if self.root is None:
            return None

        # make actual prediction through full pass
        return self.root.predict(x)

In [None]:
class RandomForestRegressor():
    """Ensemble regressor that averages the predictions of DecisionTree models.

    Parameters:
        estimators: number of trees in the ensemble.
        criterion: forwarded to every tree.
        max_features: forwarded to every tree.
        min_samples_leaf: forwarded to every tree.
        shuffle: when True, rows are randomly permuted once before fitting.
    """

    def __init__(self, estimators=30, criterion="mse", max_features=0.6, min_samples_leaf=4, shuffle=True):

        self.estimators = estimators
        self.criterion = criterion
        self.max_features = max_features
        self.min_samples_leaf = min_samples_leaf
        self.shuffle = shuffle

        # instantiate trees with the forest's hyper-parameters rather than
        # the DecisionTree defaults
        self.trees = [
            DecisionTree(
                criterion=criterion,
                max_features=max_features,
                min_samples_leaf=min_samples_leaf,
            )
            for _ in range(estimators)
        ]

    def fit(self, x, y):
        """Fit every tree of the ensemble.

        Parameters:
            x: features; must expose `shape` and, when shuffling, either
                `.iloc` or numpy-style positional indexing.
            y: 1D targets with as many entries as `x` has rows.

        Raises:
            AssertionError: if the sizes differ or `y` is not 1D.
        """

        # assert that both features and targets are same size
        assert x.shape[0] == y.size, "features and targets must have the same number of rows"

        # assert targets 1D vector of continuous data
        assert y.ndim == 1, "targets must be a 1D vector"

        self.x, self.y = x, y

        if self.shuffle:

            # create a random permutation of indices
            self.indices = np.random.permutation(y.size)

            # shuffle features and targets with the same permutation so that
            # rows stay aligned
            self.x = self._take(self.x, self.indices)
            self.y = self._take(self.y, self.indices)

        # NOTE(review): every tree is fitted on the same rows; confirm
        # whether per-tree resampling was intended
        for tree in self.trees:

            # fit every decision tree regressor to our dataset
            tree.fit(self.x, self.y)

    def predict(self, x):
        """Return the element-wise mean of the predictions of all trees."""

        # stack the per-tree predictions and average across trees
        return np.mean([tree.predict(x) for tree in self.trees], axis=0)

    @staticmethod
    def _take(data, positions):
        """Select rows by position from a pandas object or an array-like."""

        if hasattr(data, "iloc"):
            return data.iloc[positions]

        return np.asarray(data)[positions]

In [None]:
# forest instance built with explicit hyper-parameters (not used again in this cell)
r = RandomForestRegressor(estimators = 15, criterion = "mse", max_features = 0.6, min_samples_leaf = 4, shuffle = False)

# standalone node used to benchmark the split search
node = DecisionNode()

# time the split search on the "price" column; training_set is unpacked into the (x, y) arguments
# the same node is reused for every timing run, and split only overwrites the stored best on a lower score
%timeit node.split(* training_set, "price")

In [None]:
# report the best split recorded on `node`: its score, the threshold value and the column it applies to
print(f"Split value: {node.split_score}, feature value: {node.feature_value} on {node.feature_column}")