In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

# Diabetes data

In [2]:
from sklearn.datasets import load_diabetes
# Load the diabetes dataset as pandas objects and separate features from the target.
data = load_diabetes(as_frame=True)
X = data['data']
y = data['target']

# Solution: Тимур Юнусов

In [3]:
class MyTreeReg:
    """Regression decision tree that chooses splits by reduction in MSE.

    After ``fit`` the tree is stored in ``self.tree`` as nested dicts with the
    keys ``'feature'``, ``'split_value'``, ``'left'`` and ``'right'``; a leaf is
    the mean target of the samples that reached it.  Samples with
    ``feature <= split_value`` go to ``'left'``, all others to ``'right'``.

    Candidate thresholds are computed once per ``fit`` from the full training
    set (see ``_get_dividers``) and reused at every node.

    Parameters
    ----------
    max_depth : int or None
        A node at this depth or deeper becomes a leaf; ``None`` disables the check.
    min_samples_split : int
        A node with fewer samples than this becomes a leaf.
    max_leafs : int or None
        Leaf budget, compared with ``leafs_cnt + potential_leafs`` before each
        split; ``None`` disables the check.
    bins : int or None
        When set, a feature with more than ``bins - 1`` candidate thresholds
        uses the inner edges of an ``np.histogram`` with ``bins`` bins instead.
    """

    def __init__(self, max_depth = 5, min_samples_split = 2, max_leafs = 20, bins = None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.leafs_cnt = 0  # leaves created so far; reset by fit()
        self.bins = bins
        # Not read anywhere in this class; kept so existing attribute access keeps working.
        self._X_check_left = None
        self._X_check_right = None
        self._div_val_check = None
        self._div_feature_check = None

    def __str__(self):
        return ('MyTreeReg class: max_depth={}, min_samples_split={}, max_leafs={}'.format(self.max_depth, self.min_samples_split, self.max_leafs))

    @staticmethod
    def _mse(values):
        """Return the mean squared deviation of a 1-D numpy array from its mean."""
        return np.mean((values - np.mean(values)) ** 2)

    @staticmethod
    def _feature_exhausted(values, thresholds):
        """Tell whether ``values`` lie entirely on one side of the threshold range.

        True when there are no thresholds at all, when no value is strictly
        above the smallest threshold, or when no value is strictly below the
        largest one.
        """
        if len(thresholds) == 0:
            # A constant feature yields no thresholds; min()/max() would raise.
            return True
        above_lowest = (values > np.min(thresholds)).any()
        below_highest = (values < np.max(thresholds)).any()
        return not above_lowest or not below_highest

    def _get_dividers(self, X, y):
        """Compute candidate split thresholds per feature into ``self.dividers``.

        Thresholds are midpoints between neighbouring unique values (kept in
        descending order).  With ``bins`` set and more than ``bins - 1``
        midpoints, the inner histogram edges are used instead.  ``y`` is
        accepted for interface compatibility and is not used.
        """
        dividers = dict()
        for feature_name in X.columns:
            unique_values = X[feature_name].sort_values(ascending = False).unique()
            possible_dividers = (unique_values[:-1] + unique_values[1:]) / 2
            if self.bins is not None and len(possible_dividers) > self.bins - 1:
                _, bins_values = np.histogram(X[feature_name], bins = self.bins)
                dividers[feature_name] = bins_values[1:-1]
            else:
                dividers[feature_name] = possible_dividers
        self.dividers = dividers

    def _get_best_split(self, X, y):
        """Find the (feature, threshold) pair with the largest positive MSE gain.

        Returns
        -------
        tuple
            ``(col_name, split_value, gain)``; ``(None, None, 0)`` when no
            threshold separates the samples with a strictly positive gain.
        """
        target = np.asarray(y, dtype = float)
        n_total = len(target)
        parent_mse = self._mse(target)
        best_col, best_val, best_gain = None, None, 0
        for feature_name, thresholds in self.dividers.items():
            values = X[feature_name].to_numpy()
            for val in thresholds:
                goes_left = values <= val
                n_left = int(np.count_nonzero(goes_left))
                n_right = n_total - n_left
                if n_left == 0 or n_right == 0:
                    continue  # threshold does not separate the samples
                mse_left = self._mse(target[goes_left])
                mse_right = self._mse(target[~goes_left])
                gain = parent_mse - (mse_left * n_left / n_total + mse_right * n_right / n_total)
                if gain > best_gain:  # strict: the first of equal gains wins
                    best_col, best_val, best_gain = feature_name, val, gain
        return best_col, best_val, best_gain

    def _split_dataset(self, data, col_name, split_value):
        """Split ``data`` into rows with ``col_name <= split_value`` and the rest.

        A boolean mask is used instead of ``DataFrame.query`` so that column
        names that are not valid Python identifiers work as well.
        """
        goes_left = data[col_name] <= split_value
        return data[goes_left], data[~goes_left]

    def _make_leaf(self, y, release_slot = True):
        """Count one more leaf and return its value (the mean of ``y``).

        ``release_slot`` controls whether ``potential_leafs`` is decremented;
        the ``max_leafs`` stop is the only exit that leaves it untouched.
        """
        self.leafs_cnt += 1
        if release_slot:
            self.potential_leafs -= 1
        return y.mean()

    def _build_tree(self, data, depth):
        """Recursively build the subtree for ``data`` (features plus 'target').

        Returns a float leaf value or a node dict.  The order of the stopping
        checks and the ``potential_leafs`` bookkeeping determine how the
        ``max_leafs`` budget is spent, so they are kept in this exact order.
        """
        X, y = data.drop(columns = 'target'), data['target']
        if len(y.unique()) == 1:
            return self._make_leaf(y)
        if len(y) < self.min_samples_split:
            return self._make_leaf(y)
        if self.max_depth is not None and depth >= self.max_depth:
            return self._make_leaf(y)
        if self.max_leafs is not None and (self.leafs_cnt + self.potential_leafs) >= self.max_leafs:
            return self._make_leaf(y, release_slot = False)
        # Stop when every feature's values sit on one side of its threshold range.
        if all(self._feature_exhausted(X[feature], div) for feature, div in self.dividers.items()):
            return self._make_leaf(y)
        col_name, split_value, ig = self._get_best_split(X, y)
        if col_name is None:
            # No threshold improves the MSE; splitting on None would fail.
            return self._make_leaf(y)
        # The root reserves two pending leaves, every deeper split one more.
        if depth == 0:
            self.potential_leafs += 2
        else:
            self.potential_leafs += 1
        left, right = self._split_dataset(data, col_name, split_value)
        return {'feature': col_name,
                'split_value': split_value,
                'left': self._build_tree(left, depth + 1),
                'right': self._build_tree(right, depth + 1)}

    def fit(self, X, y):
        """Build the tree from DataFrame ``X`` and target Series ``y``.

        ``y`` is renamed on a copy, so the caller's Series keeps its name, and
        the leaf counters are reset so the model can be refitted.
        """
        y = y.rename('target')
        self.leafs_cnt = 0
        self.potential_leafs = 0
        self._get_dividers(X, y)
        data = pd.concat([X, y], axis = 1)
        self.tree = self._build_tree(data, depth = 0)

    def _predict_row(self, sample, node = None):
        """Return the leaf value reached by ``sample``, starting at ``node``.

        ``node=None`` starts from the root of the fitted tree.
        """
        if node is None:
            node = self.tree
        while isinstance(node, dict):
            if sample[node['feature']] <= node['split_value']:
                node = node['left']
            else:
                node = node['right']
        return node

    def predict(self, X_test):
        """Return a list with one prediction per row of ``X_test``.

        Rows are iterated explicitly so an empty frame yields ``[]``.
        """
        return [self._predict_row(row) for _, row in X_test.iterrows()]

# Testing tree #3 - MyTreeReg(max_depth=5, min_samples_split=100, max_leafs=10, bins=4)

In [19]:
# Build and fit the tree for this test case; hyperparameters spelled out by name.
test = MyTreeReg(max_depth=5, min_samples_split=100, max_leafs=10, bins=4)
test.fit(X, y)

Yes
Yes


In [20]:
# leafs_cnt is the counter the tree increments each time it creates a leaf during fit.
print('Number of leaves is', test.leafs_cnt)

Number of leaves is 9


In [22]:
# Module-level accumulator that get_leafs_sum appends leaf values to.
sum_leafs = []
def get_leafs_sum(node, depth):
    """Append every leaf value under ``node`` to the global ``sum_leafs``.

    Leaves are visited left subtree first and stored as ``float``.  Passing
    ``node=None`` starts from ``test.tree``.  ``depth`` is accepted but does
    not influence the result.  Returns the ``sum_leafs`` list itself.
    """
    # Explicit stack instead of recursion; the right child is pushed first
    # so that the left child is popped (and therefore visited) first.
    pending = [test.tree if node is None else node]
    while pending:
        current = pending.pop()
        if type(current) is dict:
            for side in ('right', 'left'):
                child = current[side]
                if child is not None:
                    pending.append(child)
        else:
            sum_leafs.append(float(current))
    return sum_leafs
# Walk the fitted tree to fill the global sum_leafs list, then report the
# total of the collected leaf values rounded to 6 decimals.
get_leafs_sum(test.tree,0)
print('Sum of the leafves values is', np.round(np.sum(sum_leafs),6))

Sum of the leafves values is 1536.517848


In [23]:
def print_tree(node=None, depth=0):
    """Print the tree rooted at ``node``, one line per node, indented by depth.

    Split nodes are printed as ``<feature> > <split_value>`` followed by their
    left and then right subtree; leaves are printed as ``Leaf -><value>``.
    ``node=None`` starts from ``test.tree``.
    """
    if node is None:
        node = test.tree
    # Guard clause: anything that is not a dict is a leaf value.
    if type(node) is not dict:
        print(f"{' ' * depth}Leaf ->{node}")
        return
    print(f"{' ' * depth}{node['feature']} > {node['split_value']}")
    for side in ('left', 'right'):
        child = node[side]
        if child is not None:
            print_tree(child, depth + 1)
# node=None makes print_tree start from test.tree.
print_tree(node=None, depth=0)

s5 > 0.003750797226299507
 bmi > 0.040139965041070744
  bmi > -0.02506766542872388
   s4 > 0.05441996975509651
    s2 > 0.04158746183894749
     Leaf ->95.47826086956522
     Leaf ->133.25
    Leaf ->253.0
   Leaf ->121.66666666666667
  Leaf ->176.82758620689654
 bmi > 0.040139965041070744
  bmi > -0.02506766542872388
   Leaf ->140.7037037037037
   sex > 0.0030192411164148523
    Leaf ->205.9090909090909
    Leaf ->177.14285714285714
  Leaf ->232.53968253968253


# Testing tree #5 - MyTreeReg(max_depth=10, min_samples_split=40, max_leafs=21, bins=10)

In [24]:
# Build and fit the tree for this test case; hyperparameters spelled out by name.
test = MyTreeReg(max_depth=10, min_samples_split=40, max_leafs=21, bins=10)
test.fit(X, y)

In [25]:
# leafs_cnt is the counter the tree increments each time it creates a leaf during fit.
print('Number of leaves is', test.leafs_cnt)

Number of leaves is 21


In [26]:
# Module-level accumulator that get_leafs_sum appends leaf values to.
sum_leafs = []
def get_leafs_sum(node, depth):
    """Collect the leaf values below ``node`` into the global ``sum_leafs``.

    The left subtree is walked before the right one and each leaf is stored
    as ``float``.  ``node=None`` means "start at ``test.tree``".  ``depth`` is
    only threaded through the recursion.  Returns ``sum_leafs``.
    """
    root = test.tree if node is None else node
    # Guard clause: a non-dict node is a leaf value.
    if type(root) is not dict:
        sum_leafs.append(float(root))
        return sum_leafs
    for side in ('left', 'right'):
        child = root[side]
        if child is not None:
            get_leafs_sum(child, depth + 1)
    return sum_leafs
# Walk the fitted tree to fill the global sum_leafs list, then report the
# total of the collected leaf values rounded to 6 decimals.
get_leafs_sum(test.tree,0)
print('Sum of the leafves values is', np.round(np.sum(sum_leafs),6))

Sum of the leafves values is 3487.228104


In [27]:
def print_tree(node=None, depth=0):
    """Print the tree under ``node`` in pre-order, indenting one space per level.

    A split node prints ``<feature> > <split_value>``; a leaf prints
    ``Leaf -><value>``.  The left child is printed before the right child.
    ``node=None`` starts from ``test.tree``.
    """
    if node is None:
        node = test.tree
    # Iterative pre-order walk; each stack entry carries its own depth.
    pending = [(node, depth)]
    while pending:
        node, depth = pending.pop()
        if type(node) is dict:
            print(f"{' ' * depth}{node['feature']} > {node['split_value']}")
            # Push right first so the left child is printed first.
            for side in ('right', 'left'):
                child = node[side]
                if child is not None:
                    pending.append((child, depth + 1))
        else:
            print(f"{' ' * depth}Leaf ->{node}")
# node=None makes print_tree start from test.tree.
print_tree(node=None, depth=0)

bmi > 0.0140569128531529
 s5 > 0.003750797226299507
  age > 0.0889314447476977
   s5 > -0.0481884758883839
    s6 > -0.001077697500466518
     bp > -0.03906645628417579
      Leaf ->67.0909090909091
      Leaf ->86.77777777777777
     Leaf ->109.4
    s3 > -0.017261217243008803
     Leaf ->133.0
     bp > 0.10759983526898859
      s4 > 0.028257075054077013
       s1 > -0.04257235499460499
        Leaf ->118.45454545454545
        age > 0.023545752629345787
         s2 > -0.021292749288390714
          Leaf ->69.46153846153847
          Leaf ->94.18604651162791
         Leaf ->109.18181818181819
       Leaf ->160.0
      Leaf ->216.0
   Leaf ->277.0
  s4 > 0.10674575915713551
   s6 > -0.001077697500466518
    Leaf ->180.75
    age > 0.0017505219232284985
     Leaf ->114.58823529411765
     Leaf ->165.6315789473684
   Leaf ->216.14285714285714
 s6 > 0.026260208137442592
  bp > 0.03426668949240641
   bmi > 0.1183891216048243
    age > -0.0854304009012407
     Leaf ->274.0
     bp > -0.039

# Testing tree #6 - MyTreeReg(max_depth=15, min_samples_split=35, max_leafs=30, bins=6)

In [28]:
# Build and fit the tree for this test case; hyperparameters spelled out by name.
test = MyTreeReg(max_depth=15, min_samples_split=35, max_leafs=30, bins=6)
test.fit(X, y)

In [29]:
# leafs_cnt is the counter the tree increments each time it creates a leaf during fit.
print('Number of leaves is', test.leafs_cnt)

Number of leaves is 26


In [30]:
# Module-level accumulator that get_leafs_sum appends leaf values to.
sum_leafs = []
def get_leafs_sum(node, depth):
    """Gather all leaf values reachable from ``node`` into the global ``sum_leafs``.

    Traversal is depth-first with the left branch first; values are converted
    with ``float``.  ``node=None`` falls back to ``test.tree``.  ``depth`` has
    no effect on the output.  Returns the global list.
    """
    start = node if node is not None else test.tree
    stack = [start]
    while len(stack) > 0:
        item = stack.pop()
        if type(item) is not dict:
            sum_leafs.append(float(item))
            continue
        # Reverse order on the stack keeps left-before-right visiting order.
        right_child, left_child = item['right'], item['left']
        if right_child is not None:
            stack.append(right_child)
        if left_child is not None:
            stack.append(left_child)
    return sum_leafs
# Walk the fitted tree to fill the global sum_leafs list, then report the
# total of the collected leaf values rounded to 6 decimals.
get_leafs_sum(test.tree,0)
print('Sum of the leafves values is', np.round(np.sum(sum_leafs),6))

Sum of the leafves values is 4120.649305


In [31]:
def print_tree(node=None, depth=0):
    """Print the tree rooted at ``node``, one line per node, indented by depth.

    Split nodes appear as ``<feature> > <split_value>`` with the left subtree
    listed before the right one; leaves appear as ``Leaf -><value>``.
    ``node=None`` starts from ``test.tree``.
    """
    if node is None:
        node = test.tree
    is_split = type(node) is dict
    if not is_split:
        print(f"{' ' * depth}Leaf ->{node}")
        return None
    print(f"{' ' * depth}{node['feature']} > {node['split_value']}")
    next_depth = depth + 1
    left_child = node['left']
    if left_child is not None:
        print_tree(left_child, next_depth)
    right_child = node['right']
    if right_child is not None:
        print_tree(right_child, next_depth)
# node=None makes print_tree start from test.tree.
print_tree(node=None, depth=0)

s5 > 0.003750797226299507
 bmi > -0.0033317886054590046
  s5 > -0.039531930369269996
   age > 0.07440129094361951
    s6 > -0.001077697500466518
     bmi > -0.04680354225198875
      Leaf ->89.17241379310344
      Leaf ->70.46428571428571
     Leaf ->105.11764705882354
    Leaf ->199.0
   s2 > 0.09398763777839597
    bp > 0.09130358065197033
     s6 > -0.04664087356364835
      Leaf ->119.96
      age > 0.0017505219232284985
       Leaf ->93.40625
       s6 > -0.001077697500466518
        Leaf ->126.36363636363636
        Leaf ->104.51851851851852
     Leaf ->159.5
    Leaf ->189.0
  bmi > 0.08361171868760049
   s5 > -0.039531930369269996
    Leaf ->112.26315789473684
    bp > 0.00982230756687899
     Leaf ->141.1904761904762
     Leaf ->176.8695652173913
   Leaf ->253.85714285714286
 bmi > 0.040139965041070744
  bmi > -0.0033317886054590046
   s6 > 0.044485478562715314
    s6 > -0.001077697500466518
     Leaf ->172.84615384615384
     Leaf ->128.0
    Leaf ->191.3846153846154
   s2 > 