In [360]:
import numpy as np
import pandas as pd

In [361]:
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=1000, n_features=14, n_informative=10, noise=15, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]

In [362]:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=14, n_informative=10, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]
df = pd.read_csv('./data/data_banknote_authentication.txt', header=None)
df.columns = ['variance', 'skewness', 'curtosis', 'entropy', 'target']
X, y = df.iloc[:,:4], df['target']

In [363]:
class Node:
    def __init__(self):
        self.feature = None
        self.value_split = None
        self.value_leaf = None
        self.side = None
        self.left = None
        self.right = None

class MyTreeClf:
    def __init__(self, max_depth=5, min_samples_split=2, max_leafs=20, bins=None, criterion='entropy'):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.leafs_cnt = 1
        self.bins = bins
        self.__sum_tree_values = 0
        self.split_values = {}
        self.criterion = criterion
        self.fi = {}

    def fit(self, X, y):
        self.tree = None
        self.fi = { col: 0 for col in X.columns }
        
        def create_tree(root, X_root, y_root, side='root', depth=0):
            if root is None:
                root = Node()
            col_name, split_value, ig = self.get_best_split(X_root, y_root)

            proportion_ones = len(y_root[y_root == 1]) / len(y_root) if len(y_root) else 0

            if proportion_ones == 0 or proportion_ones == 1 or depth >= self.max_depth or \
              len(y_root) < self.min_samples_split or \
              (self.leafs_cnt > 1 and self.leafs_cnt >= self.max_leafs):
                root.side = side
                root.value_leaf = proportion_ones
                self.__sum_tree_values += root.value_leaf
                return root

            self.fi[col_name] += len(y_root) / len(y) * ig

            X_left = X_root.loc[X_root[col_name] <= split_value]
            y_left = y_root.loc[X_root[col_name] <= split_value]

            X_right = X_root.loc[X_root[col_name] > split_value]
            y_right = y_root.loc[X_root[col_name] > split_value]

            if len(X_left) == 0 or len(X_right) == 0:
                root.side = side
                root.value_leaf = proportion_ones
                self.__sum_tree_values += root.value_leaf
                return root

            root.feature = col_name
            root.value_split = split_value
            self.leafs_cnt += 1

            root.left = create_tree(root.left, X_left, y_left, 'left', depth + 1)
            root.right = create_tree(root.right, X_right, y_right, 'right', depth + 1)

            return root

        self.tree = create_tree(self.tree, X, y)

    def predict_proba(self, X):
        for _, row in X.iterrows():
            node = self.tree
            while node.feature is not None:
                if row[node.feature] <= node.value_split:
                    node = node.left
                else:
                    node = node.right
            yield node.value_leaf

    def predict(self, X):
        y_pred = np.array(list(self.predict_proba(X)))
        return (y_pred >= 0.5).astype(int)

    def print_tree(self, node=None, depth=0):
        if node is None:
            node = self.tree
        if node.feature is not None:
            print(f"{' ' * depth}{node.feature} > {node.value_split}")
            if node.left is not None:
                self.print_tree(node.left, depth + 1)
            if node.right is not None:
                self.print_tree(node.right, depth + 1)
        else:
            print(f"{' ' * depth}{node.side} = {node.value_leaf}")

    def get_best_split(self, X, y):
        count_labels = y.value_counts()
        p_zero = count_labels / count_labels.sum()
        s_zero = self.__node_rule(p_zero)

        X = X.copy()
        X.loc[:, 'y'] = y
        split_values = {}
        col_name = None
        split_value = None
        s_cur_min = float('inf')

        for col in X.columns[:-1]:
            if not (col in self.split_values.keys()):
                x_unique_values = np.unique(X[col])
                if self.bins is not None and len(x_unique_values) - 1 >= self.bins:
                    _, self.split_values[col] = np.histogram(X[col], bins=self.bins)
                else:
                    self.split_values[col] = np.array([(x_unique_values[i - 1] + x_unique_values[i]) / 2 for i in range(1, len(x_unique_values))])

            for split_value_cur in self.split_values[col]:
                left_split = X['y'][X[col] <= split_value_cur]
                right_split = X['y'][X[col] > split_value_cur]

                left_count_labels = left_split.value_counts()
                p_left = left_count_labels / left_count_labels.sum()
                s_left = self.__node_rule(p_left, left_split)

                right_count_labels = right_split.value_counts()
                p_right = right_count_labels / right_count_labels.sum()
                s_right = self.__node_rule(p_right, right_split)

                weight_left = len(left_split) / len(y)
                weight_right = len(right_split) / len(y)

                s_cur = weight_left * s_left + weight_right * s_right
                if s_cur_min > s_cur:
                    s_cur_min = s_cur
                    col_name = col
                    split_value = split_value_cur

        ig = s_zero - s_cur_min
        return col_name, split_value, ig

    def __node_rule(self, p, split=pd.Series()):
        if self.criterion == 'entropy':
            return -np.sum(p * np.log2(p)) if not split.empty else 0
        elif self.criterion == 'gini':
            return 1 - np.sum(p ** 2)

    def __str__(self):
        return f"MyTreeClf class: max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}, bins={self.bins}"
    
    def sum_leafs(self):
        return self.__sum_tree_values


In [364]:
df = pd.read_csv('./data/data_banknote_authentication.txt', header=None)
df.columns = ['variance', 'skewness', 'curtosis', 'entropy', 'target']
X, y = df.iloc[:,:4], df['target']

In [365]:
my_tree = MyTreeClf(15, 20, 30, 6, 'gini')
my_tree.fit(X, y)
print(my_tree.leafs_cnt)
print(round(my_tree.sum_leafs(), 6))
my_tree.print_tree()
print(my_tree.fi)

27
12.412269
variance > -0.10864999999999903
 skewness > 8.497483333333335
  skewness > 4.043366666666667
   curtosis > 6.32065
    curtosis > 2.4517333333333333
     left = 1.0
     entropy > -1.2163999999999993
      left = 0.0
      skewness > -0.41075000000000017
       left = 1.0
       right = 0.8333333333333334
    skewness > -4.864866666666666
     entropy > -1.2163999999999993
      variance > -2.4197999999999995
       left = 1.0
       right = 0.8
      right = 1.0
     skewness > -0.41075000000000017
      left = 0.3333333333333333
      right = 0.0
   variance > -2.4197999999999995
    left = 1.0
    right = 0.0
  variance > -4.73095
   left = 1.0
   right = 0.0
 variance > 2.2025000000000006
  curtosis > -1.4171833333333335
   skewness > 8.497483333333335
    skewness > 4.043366666666667
     left = 1.0
     entropy > -4.882299999999999
      left = 1.0
      right = 0.47058823529411764
    right = 0.0
   entropy > 0.6165500000000002
    skewness > 4.043366666666667
     

In [366]:
class Node:
    def __init__(self):
        self.feature = None
        self.value_split = None
        self.value_leaf = None
        self.side = None
        self.left = None
        self.right = None

class MyTreeReg:
    def __init__(self, max_depth=5, min_samples_split=2, max_leafs=20, bins=None, criterion='entropy'):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.leafs_cnt = 1
        self.bins = bins
        self.__sum_tree_values = 0
        self.split_values = {}
        self.criterion = criterion
        self.fi = {}

    def fit(self, X, y):
        self.tree = None
        self.fi = { col: 0 for col in X.columns }
        
        def create_tree(root, X_root, y_root, side='root', depth=0):
            if root is None:
                root = Node()
            col_name, split_value, ig = self.get_best_split(X_root, y_root)

            mean_value = y_root.mean()

            if depth >= self.max_depth or \
              len(y_root) < self.min_samples_split or \
              (self.leafs_cnt > 1 and self.leafs_cnt >= self.max_leafs):
                root.side = side
                root.value_leaf = mean_value
                self.__sum_tree_values += root.value_leaf
                return root

            self.fi[col_name] += len(y_root) / len(y) * ig

            X_left = X_root.loc[X_root[col_name] <= split_value]
            y_left = y_root.loc[X_root[col_name] <= split_value]

            X_right = X_root.loc[X_root[col_name] > split_value]
            y_right = y_root.loc[X_root[col_name] > split_value]

            if len(X_left) == 0 or len(X_right) == 0:
                root.side = side
                root.value_leaf = mean_value
                self.__sum_tree_values += root.value_leaf
                return root

            root.feature = col_name
            root.value_split = split_value
            self.leafs_cnt += 1

            root.left = create_tree(root.left, X_left, y_left, 'left', depth + 1)
            root.right = create_tree(root.right, X_right, y_right, 'right', depth + 1)

            return root

        self.tree = create_tree(self.tree, X, y)

    def predict(self, X):
        y_pred = []
        for _, row in X.iterrows():
            node = self.tree
            while node.feature is not None:
                if row[node.feature] <= node.value_split:
                    node = node.left
                else:
                    node = node.right
            y_pred.append(node.value_leaf)
        return np.array(y_pred)
    
    def print_tree(self, node=None, depth=0):
        if node is None:
            node = self.tree
        if node.feature is not None:
            print(f"{' ' * depth}{node.feature} > {node.value_split}")
            if node.left is not None:
                self.print_tree(node.left, depth + 1)
            if node.right is not None:
                self.print_tree(node.right, depth + 1)
        else:
            print(f"{' ' * depth}{node.side} = {node.value_leaf}")

    def get_best_split(self, X, y):
        mse_0 = self.mse(y)

        col_name = None
        split_value = None
        gain = -float('inf')

        for col in X.columns:
            if not (col in self.split_values.keys()):
                x_unique_values = np.unique(X[col])
                if self.bins is None or len(x_unique_values) - 1 < self.bins:
                    self.split_values[col] = np.array([(x_unique_values[i - 1] + \
                    x_unique_values[i]) / 2 for i in range(1, len(x_unique_values))])
                else:
                    _, self.split_values[col] = np.histogram(X[col], bins=self.bins)

            for split_value_i in self.split_values[col]:
                mask = X[col] <= split_value_i
                left_split, right_split = y[mask], y[~mask]

                mse_left = self.mse(left_split)
                mse_right = self.mse(right_split)

                weight_left = len(left_split) / len(y)
                weight_right = len(right_split) / len(y)

                mse_i = weight_left * mse_left + weight_right * mse_right

                gain_i = mse_0 - mse_i
                if gain < gain_i:
                    col_name = col
                    split_value = split_value_i
                    gain = gain_i

        return col_name, split_value, gain
            
    def mse(self, t):
        t_mean = np.mean(t)
        return np.sum((t - t_mean) ** 2) / (len(t)+1e-15)
    
    def __node_rule(self, p, split=pd.Series()):
        if self.criterion == 'entropy':
            return -np.sum(p * np.log2(p)) if not split.empty else 0
        elif self.criterion == 'gini':
            return 1 - np.sum(p ** 2)

    def __str__(self):
        return f"MyTreeClf class: max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}, bins={self.bins}"
    
    def sum_leafs(self):
        return self.__sum_tree_values


In [367]:
from sklearn.datasets import load_diabetes

data = load_diabetes(as_frame=True)
X, y = data['data'], data['target']

In [368]:
my_tree_reg = MyTreeReg(15, 35, 30, 6)
my_tree_reg.fit(X, y)
print(my_tree_reg.leafs_cnt)
print(my_tree_reg.sum_leafs())
print(my_tree.fi)
my_tree_reg.print_tree()


30
4389.213406654218
{'variance': 0.29104459429524376, 'skewness': 0.1053945528010072, 'curtosis': 0.058254028287495284, 'entropy': 0.008363882866077312}
s5 > 0.0037500805443044516
 bmi > -0.003331788605458283
  s5 > -0.0395323199148986
   age > 0.07440129094361725
    s1 > -0.033215875558837024
     age > 0.03807590643342304
      bp > 0.009822407098564287
       s6 > -0.0010776975004659628
        sex > 0.003019241116414738
         left = 95.28
         right = 77.83333333333333
        right = 135.5
       right = 43.5
      right = 139.33333333333334
     right = 75.13333333333334
    right = 199.0
   s2 > 0.09398763777839642
    bp > 0.09130321352699682
     age > -0.07090024709715959
      left = 129.0
      bp > -0.07165839932986823
       left = 80.33333333333333
       s6 > -0.04664087356364498
        left = 120.44444444444444
        age > -0.03457486258696538
         left = 77.5
         s2 > -0.010812714100501275
          left = 117.26086956521739
          right = 101.