In [None]:
import scipy as sp
import numpy as np
import pandas as pd

%run DecisionTree.ipynb

# Random Forest

In [4]:
from collections import namedtuple
from collections import Counter

class RandomForest(BasicModel):
    """Bagging ensemble of ``DecisionTree`` models trained on feature subsets.

    Each tree is fitted on its own sample of the training data: a random
    subset of the columns (drawn without replacement) and, when
    ``bootstrap`` is true, a random sample of the rows drawn with
    replacement. Predictions of the individual trees are combined by
    majority vote for classification criteria and by averaging for
    regression criteria (``'mse'`` / ``'mae'``).

    Hyper-parameters are validated through the ``BasicModel`` helpers
    ``check_value_type_and_set`` / ``check_value_and_set``, which (as used
    in this class) store the value on the instance under the given name.

    Parameters
    ----------
    n_estimators : int
        Number of trees in the ensemble.
    criterion : {'gini', 'entropy', 'gain_ratio', 'mse', 'mae'}
        Split criterion forwarded to every tree; it also selects the task.
    max_depth : int or None
        Maximum tree depth; ``None`` is stored as ``float('inf')``.
    min_samples_split : int or float
        Forwarded unchanged to every tree.
    max_features : int, float, {'auto', 'sqrt', 'log2'} or None
        Number of features sampled for each tree. ``None`` uses all
        features, a float is a fraction of the feature count, a string
        names the numpy function applied to the feature count, and
        ``'auto'`` is an alias for ``'sqrt'``. The resulting count is
        floored and clamped to ``[1, n_features]`` at fit time.
    random_state : int or numpy.random.RandomState
        Seed or generator used for all sampling; also passed to the trees.
    bootstrap : bool
        If true, every tree sees rows resampled with replacement;
        otherwise every tree sees all rows.
    """

    # Pairs a fitted tree with the original column indexes it was trained
    # on, so that tree-local feature numbers can be mapped back at
    # prediction time.
    DecisionTreeAndSelectedFeaturesTupleType = namedtuple(
        'DecisionTreeAndSelectedFeaturesTupleType',
        ['decision_tree', 'selected_features']
    )

    def __init__(
        self,
        n_estimators=100,
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        max_features=None,
        random_state=42,
        bootstrap=True
    ):
        super().check_value_type_and_set(
            'n_estimators',
            n_estimators,
            int
        )

        super().check_value_and_set(
            'criterion',
            criterion,
            ['gini', 'entropy', 'gain_ratio', 'mse', 'mae']
        )

        # Lookup tables are expected to be provided by BasicModel.
        self.__cmp_criterion_values = \
            self.criterion_name_to_cmp[criterion]
        self.__calc_criterion_value = \
            self.criterion_name_to_calculator_method_name[criterion]

        # The criterion decides how tree answers are aggregated in predict().
        if criterion in ('mse', 'mae'):
            self.__task = 'regression'
        else:
            self.__task = 'classification'

        super().check_value_type_and_set(
            'max_depth',
            max_depth,
            (int, type(None))
        )
        if self.max_depth is None:
            self.max_depth = float('inf')

        super().check_value_type_and_set(
            'min_samples_split',
            min_samples_split,
            (int, float)
        )

        super().check_value_type_and_set(
            'max_features',
            max_features,
            (int, float, str, type(None))
        )

        if isinstance(max_features, str):
            super().check_value_and_set(
                'max_features',
                max_features,
                ['auto', 'sqrt', 'log2']
            )
        # The alias has to be written to the instance attribute: fit() reads
        # self.max_features and there is no numpy function named 'auto'.
        if self.max_features == 'auto':
            self.max_features = 'sqrt'

        super().check_value_type_and_set(
            'random_state',
            random_state,
            (np.random.RandomState, int)
        )
        if isinstance(random_state, int):
            self.random_state = np.random.RandomState(random_state)

        super().check_value_type_and_set(
            'bootstrap',
            bootstrap,
            bool
        )

        self.ensemble = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on its own sample of (X, y).

        Any previously fitted ensemble is replaced. ``self.max_features``
        is left as configured; the concrete per-tree feature count is
        resolved against the number of columns of ``X`` on every call.
        """
        X = super().check_and_transform_X(X)
        # NOTE(review): the target is passed through check_and_transform_X;
        # this looks like it was meant to be a y-specific helper of
        # BasicModel - confirm against BasicModel before changing.
        y = super().check_and_transform_X(X, y)

        self.X, self.y = X, y
        self.__n_selected_features = self.__resolve_max_features(X.shape[1])

        # Build into a local list so that refitting does not grow the old
        # ensemble and a failure midway does not leave a partial one behind.
        ensemble = []

        for _ in range(self.n_estimators):
            bootstrap_sample, selected_features = \
                self.__generate_bootstrap_sample()

            dt = DecisionTree(
                criterion=self.criterion,
                splitter='best',
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                random_state=self.random_state
            )

            dt.fit(*bootstrap_sample)

            # Drop the tree's references to its training sample; only the
            # fitted structure is needed for prediction.
            del dt.X
            del dt.y

            ensemble.append(
                self.DecisionTreeAndSelectedFeaturesTupleType(
                    dt,
                    selected_features
                )
            )

        self.ensemble = ensemble

    def __resolve_max_features(self, n_features):
        """Return the integer number of features to sample for each tree."""
        max_features = self.max_features
        if max_features == 'auto':
            max_features = 'sqrt'

        if max_features is None:
            n_selected = n_features
        elif isinstance(max_features, str):
            # 'sqrt' / 'log2' name the numpy function to apply.
            n_selected = int(np.floor(getattr(np, max_features)(n_features)))
        elif isinstance(max_features, float):
            n_selected = int(np.floor(n_features * max_features))
        else:
            n_selected = int(max_features)

        # Sampling without replacement cannot draw more than n_features,
        # and a tree needs at least one feature to split on.
        return max(1, min(n_selected, n_features))

    def __generate_bootstrap_sample(self):
        """Draw the training sample for one tree.

        Returns ``((X_sample, y_sample), selected_features)`` where
        ``selected_features`` holds the original column indexes of
        ``X_sample``'s columns.
        """
        n_observations, n_features = self.X.shape

        selected_features = self.random_state.choice(
            n_features,
            self.__n_selected_features,
            replace=False
        )

        if self.bootstrap:
            selected_observation_indexes = self.random_state.choice(
                n_observations,
                n_observations,
                replace=True
            )
        else:
            selected_observation_indexes = np.arange(n_observations)

        # np.ix_ selects the rows x columns sub-matrix; indexing with the two
        # arrays directly would pair them element-wise instead.
        X = self.X[np.ix_(selected_observation_indexes, selected_features)]
        y = self.y[selected_observation_indexes]

        return (X, y), selected_features

    @staticmethod
    def __predict_observation(
        decision_tree,
        selected_features,
        el
    ):
        """Walk one tree from the root to a leaf for observation ``el``."""
        node = decision_tree.root

        while not node.is_leaf():
            # node.feature indexes the tree's own (sub-sampled) columns;
            # selected_features maps it back to the column of the full row.
            if el[selected_features[node.feature]] <= node.split_value:
                node = node.left
            else:
                node = node.right

        return node.answer

    def predict(self, X):
        """Predict every row of ``X`` by aggregating all trees' answers.

        Returns an array of shape ``(n_rows, 1)``. Raises ``AssertionError``
        if the model has not been fitted.
        """
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped when Python runs with -O.
        if not self.ensemble:
            raise AssertionError("Not fitted")

        X = super().check_and_transform_X(X)

        is_classification = self.__task == 'classification'
        final_predictions = []

        for el in X:
            el_predictions = [
                self.__predict_observation(*model, el)
                for model in self.ensemble
            ]

            if is_classification:
                # Majority vote across the trees.
                final_prediction = \
                    Counter(el_predictions).most_common(1)[0][0]
            else:
                final_prediction = np.mean(el_predictions)

            final_predictions.append(final_prediction)

        return np.array(final_predictions).reshape((-1, 1))