# In [None]:
import numpy as np
import pandas as pd
import random
from sklearn.datasets import make_classification
def calculate_metric(y, y_pred, metric):
    """Compute a regression quality metric between targets and predictions.

    Parameters
    ----------
    y : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, in the same order and of the same length as ``y``.
    metric : str
        One of ``'mae'``, ``'mse'``, ``'rmse'``, ``'mape'`` or ``'r2'``.

    Returns
    -------
    float
        The value of the requested metric. ``'mape'`` is expressed in percent.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    # Work on positional float arrays: subtracting two pandas objects aligns
    # them on index labels (yielding NaN for non-matching labels), and plain
    # Python lists do not support ``-`` at all.
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    delta = y - y_pred

    if metric == 'mae':
        return np.mean(np.abs(delta))
    if metric == 'mse':
        return np.mean(delta ** 2)
    if metric == 'rmse':
        return np.sqrt(np.mean(delta ** 2))
    if metric == 'mape':
        # NOTE: targets equal to zero produce inf/nan here, as with any MAPE.
        return 100 * np.mean(np.abs(delta / y))
    if metric == 'r2':
        return 1 - np.mean(delta ** 2) / np.mean((y - np.average(y)) ** 2)

    # Falling through used to return None silently, hiding typos in `metric`.
    raise ValueError(
        "Unknown metric {!r}; expected one of 'mae', 'mse', 'rmse', 'mape', 'r2'".format(metric)
    )

def mse(data):
    """Return the mean squared deviation of ``data`` from its own average.

    An empty input yields ``0`` rather than dividing by zero.
    """
    size = len(data)
    if size == 0:
        return 0
    deviations = data - np.average(data)
    return 1 / size * np.sum(deviations ** 2)
def information_gain(data, mask):
    """Return the reduction in ``mse`` obtained by splitting ``data`` with ``mask``.

    Elements where ``mask`` is true form the left part, the remaining ones the
    right part. The result is the parent's ``mse`` minus the size-weighted
    ``mse`` of both parts.
    """
    left_part = data[mask]
    right_part = data[~mask]
    total = len(data)
    left_share = len(left_part) / total
    right_share = len(right_part) / total
    return mse(data) - left_share * mse(left_part) - right_share * mse(right_part)
class MyTreeReg():
    """Regression decision tree grown greedily by maximising ``information_gain``.

    Internal nodes are dicts with the keys ``col``, ``split``, ``left`` and
    ``right``; rows with ``row[col] <= split`` go left, all others go right.
    Leaves are dicts ``{'leaf': True, 'value': <mean target of the node>}``.

    Parameters
    ----------
    max_depth : int
        Depth at which a node is forced to become a leaf.
    min_samples_split : int
        Nodes holding fewer rows than this are not split.
    max_leafs : int
        Leaf budget consulted by the stopping rule in ``_build_tree``.
    bins : int or None
        When truthy, candidate thresholds per feature are computed once per
        ``fit`` by ``_build_histograms``; otherwise every unique value of the
        feature inside the current node is tried.
    """

    def __init__(self, max_depth=5, min_samples_split=2, max_leafs=20, bins=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        # Running leaf counter maintained by _build_tree.
        self.leafs_cnt = 0
        # Root node of the fitted tree (empty dict until fit() is called).
        self.tree = {}
        self.bins = bins
        # feature name -> array of candidate thresholds (only used with bins).
        self.feature_bins = {}
        # feature name -> importance accumulated during fit().
        self.fi = {}

    def __str__(self):
        """Return the class label followed by every instance attribute."""
        return "MyTreeReg class: " + ", ".join(
            '{}={}'.format(name, value) for name, value in self.__dict__.items()
        )

    def __repr__(self):
        """Same text as ``__str__``."""
        return self.__str__()

    def get_best_split(self, X, y):
        """Find the feature/threshold pair with the highest information gain.

        Returns
        -------
        tuple
            ``(col_name, split_value, max_IG)``. When there is no candidate
            threshold at all the initial ``('', 0, -inf)`` is returned.
        """
        col_name, split_value, max_IG = '', 0, -np.inf
        for feature in X.columns:
            if self.bins:
                thresholds = self.feature_bins[feature]
            else:
                thresholds = np.unique(X[feature])

            for threshold in thresholds:
                mask = X[feature] <= threshold
                cur_IG = information_gain(y, mask)
                # Strict comparison: on ties the first candidate seen wins.
                if cur_IG > max_IG:
                    col_name, split_value, max_IG = feature, threshold, cur_IG
        return col_name, split_value, max_IG

    def mean_squared_error(self, y_true, y_pred):
        """Return the mean of the squared differences ``y_true - y_pred``."""
        return np.mean((y_true - y_pred) ** 2)

    def fit(self, X, y):
        """Grow the tree on ``X`` / ``y`` and compute feature importances.

        ``X`` is used as a DataFrame (``X.columns``, boolean-mask row
        selection) and ``y`` is indexed with the same boolean masks, so both
        must share the same index.
        """
        self.leafs_cnt = 0
        if self.bins:
            # Drop thresholds left over from a previous fit on other columns.
            self.feature_bins = {}
            self._build_histograms(X)
        # Fresh dict so features from an earlier fit do not linger.
        self.fi = {feature: 0 for feature in X.columns}
        self.tree = self._build_tree(X, y)
        n_rows = len(X)
        for feature in X.columns:
            self.fi[feature] /= n_rows
            # Normalise a float zero back to the integer 0.
            if self.fi[feature] == 0.0:
                self.fi[feature] = 0

    def _make_leaf(self, y):
        """Count one more leaf and return a leaf node holding the mean of ``y``."""
        self.leafs_cnt += 1
        return {'leaf': True, 'value': y.mean()}

    def _build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the rows in ``X`` / ``y``."""
        n_samples = len(y)
        leafs_left = self.max_leafs - self.leafs_cnt
        must_stop = (
            len(np.unique(y)) == 1
            or n_samples == 1
            or depth == self.max_depth
            or n_samples < self.min_samples_split
            or (leafs_left == 1 and self.max_leafs != 1)
            or (leafs_left == 0 and self.max_leafs == 1)
        )
        if must_stop:
            return self._make_leaf(y)

        best_col, best_split, IG = self.get_best_split(X, y)
        # No candidate threshold at all: there is nothing to split on.
        if IG == -np.inf:
            return self._make_leaf(y)

        mask = X[best_col] <= best_split
        # A split that sends every row to one side would create an empty child
        # whose leaf value is the mean of nothing (NaN); stop here instead.
        if mask.all() or not mask.any():
            return self._make_leaf(y)

        self.fi[best_col] += len(X) * IG
        # Temporarily count the not-yet-built right branch while the left
        # subtree grows, then release it before building the right subtree.
        self.leafs_cnt += 1
        left_branch = self._build_tree(X[mask], y[mask], depth + 1)
        self.leafs_cnt -= 1
        right_branch = self._build_tree(X[~mask], y[~mask], depth + 1)

        return {
            'col': best_col,
            'split': best_split,
            'left': left_branch,
            'right': right_branch,
        }

    def _predict_proba_single(self, row, node=None):
        """Return the leaf value reached by ``row``, starting at ``node``.

        ``node`` defaults to the root of the fitted tree.
        """
        if node is None:
            node = self.tree
        while 'value' not in node:
            if row[node['col']] <= node['split']:
                node = node['left']
            else:
                node = node['right']
        return node['value']

    def predict(self, X):
        """Return a list with one prediction per row of the DataFrame ``X``."""
        return [self._predict_proba_single(row) for _, row in X.iterrows()]

    def print_tree(self, node=None, depth=0):
        """Print the tree, indenting each level by one extra space."""
        if node is None:
            node = self.tree

        if 'value' in node:
            print(f"{' ' * depth}Leaf: {node['value']}")
        else:
            print(f"{' ' * depth}{node['col']} > {node['split']}")
            self.print_tree(node['left'], depth + 1)
            self.print_tree(node['right'], depth + 1)

    def _build_histograms(self, X):
        """Fill ``feature_bins`` with candidate thresholds for every column.

        Features with at most ``bins - 1`` unique values keep those values;
        otherwise the inner edges of an ``np.histogram`` with ``bins`` bins
        are used.
        """
        for feature in X.columns:
            unique_values = np.unique(X[feature])
            if len(unique_values) <= self.bins - 1:
                self.feature_bins[feature] = unique_values
            else:
                _, bin_edges = np.histogram(X[feature], bins=self.bins)
                self.feature_bins[feature] = bin_edges[1:-1]

class MyForestReg():
    """Ensemble of ``MyTreeReg`` trees, each fitted on a random sub-sample.

    Every tree is trained on ``round(n_rows * max_samples)`` rows and
    ``round(n_cols * max_features)`` columns, both drawn without replacement
    with the ``random`` module. Predictions are the plain average over trees.

    Parameters
    ----------
    n_estimators : int
        Number of trees to grow.
    max_features : float
        Fraction of columns given to each tree.
    max_samples : float
        Fraction of rows given to each tree.
    random_state : int
        Seed passed to ``random.seed`` at the start of ``fit``.
    max_depth, min_samples_split, max_leafs, bins
        Forwarded unchanged to every ``MyTreeReg``.
    oob_score : str or None
        Metric name forwarded to ``calculate_metric`` for the out-of-bag
        score; when falsy no score is computed.
    """

    def __init__(self, n_estimators=10, max_features=0.5, max_samples=0.5, random_state=42, max_depth=5, min_samples_split=2, max_leafs=20, bins=16, oob_score=None):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_samples = max_samples
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.bins = bins
        self.random_state = random_state
        # Fitted trees, in the order they were grown.
        self.trees = []
        # Sum of leafs_cnt over all fitted trees.
        self.leafs_cnt = 0
        # feature name -> importance summed over trees.
        self.fi = {}
        self.oob_score = oob_score
        self.oob_score_ = 0

    def __str__(self):
        """Return the class label followed by every instance attribute."""
        return "MyForestReg class: " + ", ".join(
            '{}={}'.format(name, value) for name, value in self.__dict__.items()
        )

    def __repr__(self):
        """Same text as ``__str__``."""
        return self.__str__()

    def fit(self, X, y):
        """Grow ``n_estimators`` trees on random row/column subsets of ``X``.

        ``X`` is used as a DataFrame and ``y`` as a Series of the same length
        and row order. Rows are addressed by position, so any index works.
        """
        random.seed(self.random_state)

        # Start from a clean state so repeated fit() calls do not accumulate
        # trees, leaf counts or importances from earlier runs.
        self.trees = []
        self.leafs_cnt = 0
        self.fi = {feature: 0 for feature in X.columns}

        init_cols = list(X.columns.values)
        init_rows_cnt = len(X)
        cols_smpl_cnt = round(len(init_cols) * self.max_features)
        rows_smpl_cnt = round(init_rows_cnt * self.max_samples)
        all_rows = set(range(init_rows_cnt))

        # One list of out-of-bag predictions per row position.
        oob_predictions = [[] for _ in range(init_rows_cnt)]

        for _ in range(self.n_estimators):
            # Keep this draw order (columns first, then rows) so results for a
            # given seed stay reproducible.
            cols_idx = random.sample(init_cols, cols_smpl_cnt)
            rows_idx = random.sample(range(init_rows_cnt), rows_smpl_cnt)

            # rows_idx holds positions, so select with iloc; the indexes are
            # reset so the boolean masks built inside the tree always align.
            X_cols = X[cols_idx]
            X_sample = X_cols.iloc[rows_idx].reset_index(drop=True)
            y_sample = y.iloc[rows_idx].reset_index(drop=True)

            tree = MyTreeReg(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                max_leafs=self.max_leafs,
                bins=self.bins,
            )
            tree.fit(X_sample, y_sample)

            # The tree divides its importances by its own sample size;
            # rescale them to the size of the full data set.
            for feature in cols_idx:
                self.fi[feature] += tree.fi[feature] / init_rows_cnt * rows_smpl_cnt

            self.trees.append(tree)
            self.leafs_cnt += tree.leafs_cnt

            unused_rows_idx = sorted(all_rows - set(rows_idx))
            oob_pred = tree.predict(X_cols.iloc[unused_rows_idx])
            for idx, pred in zip(unused_rows_idx, oob_pred):
                oob_predictions[idx].append(pred)

        if not self.oob_score:
            self.oob_score_ = None
            return

        # Average the out-of-bag predictions of every row that was left out
        # by at least one tree and score them against the true targets.
        indexes = []
        final_y_pred = []
        for idx, preds in enumerate(oob_predictions):
            if preds:
                indexes.append(idx)
                final_y_pred.append(np.average(preds))
        self.oob_score_ = calculate_metric(y.iloc[indexes], final_y_pred, self.oob_score)

    def predict(self, X):
        """Return the per-row average of all trees' predictions for ``X``.

        Raises
        ------
        ValueError
            If the forest holds no fitted trees.
        """
        if not self.trees:
            raise ValueError("MyForestReg has no fitted trees; call fit() before predict()")
        predictions = [tree.predict(X) for tree in self.trees]
        return np.average(predictions, axis=0)







# Demo: build a synthetic data set and fit the forest on it.
X, y = make_classification(
    n_samples=1500,
    n_features=50,
    n_informative=30,
    random_state=42,
)
y = pd.Series(y)
X = pd.DataFrame(X).round(2)
# The frame starts with positional integer column labels 0..n-1.
X.columns = [f'col_{position}' for position in range(X.shape[1])]
test = X.sample(20, random_state=42)

tree = MyForestReg(max_depth=1500, min_samples_split=100, max_leafs=120)
tree.fit(X, y)


