In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

# Diabetes data

In [2]:
from sklearn.datasets import load_diabetes
# Load the diabetes dataset with pandas containers (as_frame=True) and
# separate the feature table from the regression target.
data = load_diabetes(as_frame=True)
X = data.data
y = data.target

# Solution: Владислав Щетинини

In [3]:
class Node:
    """One node of a binary regression tree.

    A node is either an internal split (``feature`` and ``value_split`` are
    set and ``left``/``right`` point at the children) or a leaf
    (``value_leaf`` and ``side`` are set while ``feature`` stays ``None``).
    """

    def __init__(self):
        self.feature = None      # column the node splits on; None marks a leaf
        self.value_split = None  # threshold: rows with value <= threshold go left
        self.value_leaf = None   # prediction stored in a leaf (mean of its targets)
        self.side = None         # 'root', 'left' or 'right'; filled in for leaves only
        self.left = None
        self.right = None


class MyTreeReg:
    """Regression tree grown greedily by maximising the reduction in MSE.

    Parameters
    ----------
    max_depth : int
        A node at this depth (root is depth 0) is always turned into a leaf.
    min_samples_split : int
        A node holding fewer samples than this is turned into a leaf.
    max_leafs : int
        Once ``leafs_cnt`` exceeds 1 and reaches this value, no further
        splits are made.
    bins : int or None
        If given, and a column has at least ``bins + 1`` distinct values, the
        candidate thresholds for that column are the ``np.histogram`` bin
        edges instead of the midpoints between consecutive distinct values.
    """

    def __init__(self, max_depth=5, min_samples_split=2, max_leafs=20, bins=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.leafs_cnt = 1
        self.bins = bins
        # Per-column cache of candidate thresholds; rebuilt by every fit().
        self.split_values = {}

    def fit(self, X, y):
        """Grow the tree on ``X`` (pandas DataFrame) and ``y`` (pandas Series).

        All state left over from a previous ``fit`` call is discarded first,
        so the same instance can be refitted safely.
        """
        self.tree = None
        self.split_values = {}
        # Reset the counter: without this a second fit() would continue
        # counting from the previous tree and hit max_leafs too early.
        self.leafs_cnt = 1
        self.tree = self._create_tree(X, y)

    def _make_leaf(self, y_node, side):
        """Return a leaf node predicting the mean of ``y_node``."""
        leaf = Node()
        leaf.side = side
        leaf.value_leaf = y_node.mean()
        return leaf

    def _create_tree(self, X_node, y_node, side='root', depth=0):
        """Recursively build the subtree for the samples ``X_node``/``y_node``."""
        n_unique = len(y_node.unique())
        if (
            n_unique <= 1
            or depth >= self.max_depth
            or len(y_node) < self.min_samples_split
            or (self.leafs_cnt > 1 and self.leafs_cnt >= self.max_leafs)
        ):
            return self._make_leaf(y_node, side)

        col_name, split_value, _ = self.get_best_split(X_node, y_node)
        if col_name is None:
            # No candidate threshold separates the samples of this node
            # (e.g. every feature is constant here), so it cannot be split.
            return self._make_leaf(y_node, side)

        # Use the same mask / complement as get_best_split and predict so that
        # every sample of the node ends up in exactly one child.
        goes_left = X_node[col_name] <= split_value
        X_left, y_left = X_node[goes_left], y_node[goes_left]
        X_right, y_right = X_node[~goes_left], y_node[~goes_left]

        if len(X_left) == 0 or len(X_right) == 0:
            return self._make_leaf(y_node, side)

        node = Node()
        node.feature = col_name
        node.value_split = split_value
        # A split replaces one leaf by two, i.e. adds exactly one leaf.
        self.leafs_cnt += 1

        node.left = self._create_tree(X_left, y_left, 'left', depth + 1)
        node.right = self._create_tree(X_right, y_right, 'right', depth + 1)
        return node

    def predict(self, X):
        """Return a numpy array with one prediction per row of DataFrame ``X``."""
        y_pred = []
        for _, row in X.iterrows():
            node = self.tree
            # Compare against None explicitly: a column label such as 0 or ''
            # is falsy but still denotes a valid split node.
            while node.feature is not None:
                if row[node.feature] <= node.value_split:
                    node = node.left
                else:
                    node = node.right
            y_pred.append(node.value_leaf)
        return np.array(y_pred)

    def print_tree(self, node=None, depth=0):
        """Print the subtree rooted at ``node`` (the whole tree by default).

        Split nodes are printed as ``feature > threshold`` and leaves as
        ``side = value``, indented by one space per depth level.  The ``>``
        is only the display format: rows with ``value <= threshold`` go to
        the left child, which is printed first.
        """
        if node is None:
            node = self.tree
        if node.feature is not None:
            print(f"{' ' * depth}{node.feature} > {node.value_split}")
            if node.left is not None:
                self.print_tree(node.left, depth + 1)
            if node.right is not None:
                self.print_tree(node.right, depth + 1)
        else:
            print(f"{' ' * depth}{node.side} = {node.value_leaf}")

    def get_best_split(self, X, y):
        """Find the (column, threshold) pair with the largest MSE reduction.

        Returns
        -------
        tuple
            ``(col_name, split_value, gain)``.  ``col_name`` and
            ``split_value`` are ``None`` (and ``gain`` is ``-inf``) when no
            candidate threshold leaves samples on both sides.
        """
        mse_0 = self.mse(y)

        col_name = None
        split_value = None
        gain = -float('inf')

        for col in X.columns:
            # Thresholds are computed once per column, from the first data
            # passed in after the cache was reset, and reused afterwards.
            if col not in self.split_values:
                x_unique_values = np.unique(X[col])
                if self.bins is None or len(x_unique_values) - 1 < self.bins:
                    self.split_values[col] = np.array([
                        (x_unique_values[i - 1] + x_unique_values[i]) / 2
                        for i in range(1, len(x_unique_values))
                    ])
                else:
                    _, self.split_values[col] = np.histogram(X[col], bins=self.bins)

            for split_value_i in self.split_values[col]:
                mask = X[col] <= split_value_i
                left_split, right_split = y[mask], y[~mask]

                mse_left = self.mse(left_split)
                mse_right = self.mse(right_split)

                weight_left = len(left_split) / len(y)
                weight_right = len(right_split) / len(y)

                mse_i = weight_left * mse_left + weight_right * mse_right

                # An empty side yields a NaN MSE and therefore a NaN gain;
                # the comparison below is False for NaN, so such thresholds
                # are never selected.
                gain_i = mse_0 - mse_i
                if gain < gain_i:
                    col_name = col
                    split_value = split_value_i
                    gain = gain_i

        return col_name, split_value, gain

    def mse(self, t):
        """Return the mean squared deviation of ``t`` from its own mean."""
        t_mean = np.mean(t)
        return ((t - t_mean) ** 2).mean()

    def __str__(self):
        return f"MyTreeReg class: max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}"

# Testing tree #3 - (max_depth=5, min_samples_split=100, max_leafs=10, bins=4)

In [4]:
# Build and fit the tree for test case #3; hyper-parameters are spelled out by name.
test = MyTreeReg(max_depth=5, min_samples_split=100, max_leafs=10, bins=4)
test.fit(X, y)

In [5]:
# leafs_cnt starts at 1 and is incremented once per split made during fit().
print('Number of leaves is', test.leafs_cnt)

Number of leaves is 9


In [7]:
# Global accumulator that get_leafs_sum fills with the leaf predictions.
sum_leafs = []


def get_leafs_sum(node, depth):
    """Append the value of every leaf below ``node`` to ``sum_leafs``.

    Leaves are visited left to right.  ``depth`` is accepted for
    compatibility but does not affect the result.  Passing ``None`` as
    ``node`` starts from ``test.tree``.  Returns the ``sum_leafs`` list.
    """
    start = test.tree if node is None else node
    pending = [start]
    while pending:
        current = pending.pop()
        if current.feature is None:
            sum_leafs.append(current.value_leaf)
            continue
        # Push the right child first so that the left child is popped first.
        for child in (current.right, current.left):
            if child is not None:
                pending.append(child)
    return sum_leafs


get_leafs_sum(test.tree, 0)
print('Sum of the leafves values is', np.round(np.sum(sum_leafs), 6))

Sum of the leafves values is 1536.517848


In [8]:
# Dump the fitted tree: split nodes print as "feature > threshold", leaves as "side = value".
test.print_tree()

s5 > 0.003750797226299507
 bmi > 0.040139965041070744
  bmi > -0.02506766542872388
   s4 > 0.05441996975509651
    s2 > 0.04158746183894749
     left = 95.47826086956522
     right = 133.25
    right = 253.0
   right = 121.66666666666667
  right = 176.82758620689654
 bmi > 0.040139965041070744
  bmi > -0.02506766542872388
   left = 140.7037037037037
   sex > 0.0030192411164148523
    left = 205.9090909090909
    right = 177.14285714285714
  right = 232.53968253968253


# Testing tree #5 - (max_depth=10, min_samples_split=40, max_leafs=21, bins=10)

In [9]:
# Build and fit the tree for test case #5; hyper-parameters are spelled out by name.
test = MyTreeReg(max_depth=10, min_samples_split=40, max_leafs=21, bins=10)
test.fit(X, y)

In [10]:
# leafs_cnt starts at 1 and is incremented once per split made during fit().
print('Number of leaves is', test.leafs_cnt)

Number of leaves is 21


In [12]:
# Global accumulator that get_leafs_sum fills with the leaf predictions.
sum_leafs = []


def get_leafs_sum(node, depth):
    """Append the value of every leaf below ``node`` to ``sum_leafs``.

    Leaves are visited left to right.  ``depth`` is accepted for
    compatibility but does not affect the result.  Passing ``None`` as
    ``node`` starts from ``test.tree``.  Returns the ``sum_leafs`` list.
    """
    start = test.tree if node is None else node
    pending = [start]
    while pending:
        current = pending.pop()
        if current.feature is None:
            sum_leafs.append(current.value_leaf)
            continue
        # Push the right child first so that the left child is popped first.
        for child in (current.right, current.left):
            if child is not None:
                pending.append(child)
    return sum_leafs


get_leafs_sum(test.tree, 0)
print('Sum of the leafves values is', np.round(np.sum(sum_leafs), 6))

Sum of the leafves values is 3487.228104


In [13]:
# Dump the fitted tree: split nodes print as "feature > threshold", leaves as "side = value".
test.print_tree()

bmi > 0.0140569128531529
 s5 > 0.003750797226299507
  age > 0.0889314447476977
   s5 > -0.0481884758883839
    s6 > -0.001077697500466518
     bp > -0.03906645628417579
      left = 67.0909090909091
      right = 86.77777777777777
     right = 109.4
    s3 > -0.017261217243008803
     left = 133.0
     bp > 0.10759983526898859
      s4 > 0.028257075054077013
       s1 > -0.04257235499460499
        left = 118.45454545454545
        age > 0.023545752629345787
         s2 > -0.021292749288390714
          left = 69.46153846153847
          right = 94.18604651162791
         right = 109.18181818181819
       right = 160.0
      right = 216.0
   right = 277.0
  s4 > 0.10674575915713551
   s6 > -0.001077697500466518
    left = 180.75
    age > 0.0017505219232284985
     left = 114.58823529411765
     right = 165.6315789473684
   right = 216.14285714285714
 s6 > 0.026260208137442592
  bp > 0.03426668949240641
   bmi > 0.1183891216048243
    age > -0.0854304009012407
     left = 274.0
     bp

# Testing tree #6 - (max_depth=15, min_samples_split=35, max_leafs=30, bins=6)

In [14]:
# Build and fit the tree for test case #6; hyper-parameters are spelled out by name.
test = MyTreeReg(max_depth=15, min_samples_split=35, max_leafs=30, bins=6)
test.fit(X, y)

In [15]:
# leafs_cnt starts at 1 and is incremented once per split made during fit().
print('Number of leaves is', test.leafs_cnt)

Number of leaves is 27


In [17]:
# Global accumulator that get_leafs_sum fills with the leaf predictions.
sum_leafs = []


def get_leafs_sum(node, depth):
    """Append the value of every leaf below ``node`` to ``sum_leafs``.

    Leaves are visited left to right.  ``depth`` is accepted for
    compatibility but does not affect the result.  Passing ``None`` as
    ``node`` starts from ``test.tree``.  Returns the ``sum_leafs`` list.
    """
    start = test.tree if node is None else node
    pending = [start]
    while pending:
        current = pending.pop()
        if current.feature is None:
            sum_leafs.append(current.value_leaf)
            continue
        # Push the right child first so that the left child is popped first.
        for child in (current.right, current.left):
            if child is not None:
                pending.append(child)
    return sum_leafs


get_leafs_sum(test.tree, 0)
print('Sum of the leafves values is', np.round(np.sum(sum_leafs), 6))

Sum of the leafves values is 4232.007259


In [18]:
# Dump the fitted tree: split nodes print as "feature > threshold", leaves as "side = value".
test.print_tree()

s5 > 0.003750797226299507
 bmi > -0.0033317886054590046
  s5 > -0.039531930369269996
   age > 0.07440129094361951
    s6 > -0.001077697500466518
     bmi > -0.04680354225198875
      left = 89.17241379310344
      right = 70.46428571428571
     right = 105.11764705882354
    right = 199.0
   s2 > 0.09398763777839597
    bp > 0.09130358065197033
     s6 > -0.04664087356364835
      left = 119.96
      s4 > -0.076394503750001
       left = 72.33333333333333
       s2 > -0.06321289003994951
        left = 157.0
        age > 0.0017505219232284985
         left = 93.60714285714286
         right = 112.70588235294117
     right = 159.5
    right = 189.0
  bmi > 0.08361171868760049
   s5 > -0.039531930369269996
    left = 112.26315789473684
    bp > 0.00982230756687899
     left = 141.1904761904762
     right = 176.8695652173913
   right = 253.85714285714286
 bmi > 0.040139965041070744
  bmi > -0.0033317886054590046
   s6 > 0.044485478562715314
    s6 > -0.001077697500466518
     left = 172.