In [8]:
import pandas as pd
import numpy as np

In [9]:
df = pd.read_csv('data/BankNote_Authentication.xls')
y = df['class']
del df['class']
df
s = df.loc[0]

In [5]:
class Node():

    def __init__(self, value=None):
        self.df = None
        self.parent = None
        self.value = value
        self.left = None
        self.right = None
        self.is_leaf = False

class Tree():

    def __init__(self):
        self.root = None

    def print_tree(self):
        """Печатает дерево в консоли, начиная с корня"""
        if not self.root:
            print("(пустое дерево)")
            return
        
        # Вспомогательная функция с рекурсивным обходом
        def _print(node, prefix="", is_left=True):
            if not node:
                return
            
            # Определяем отображение узла
            if node.is_leaf:
                label = f"{node.value[0]} = {node.value[1]}" if is_left else f"{node.value[0]} = {node.value[1]}"
            else:
                label = f"{node.value[0]} > {node.value[1]}"
            
            print(prefix + ("└── " if not prefix else "├── ") + label)
            
            # Новый префикс для следующих уровней
            new_prefix = prefix + ("    " if not prefix else "│   ")
            
            # Рекурсивно печатаем потомков
            _print(node.left, new_prefix, True)
            _print(node.right, new_prefix, False)
        
        # Запускаем обход от корня
        _print(self.root, "", False)
        
    def find_proba(self, s):
        def tree_traversal(node, s):
            if node.is_leaf:
                return node.value[1]
            else:
                feature, split_val = node.value[0], node.value[1]
                if s[feature] < split_val:
                    return tree_traversal(node.left, s)
                else:
                    return tree_traversal(node.right, s)
        return tree_traversal(self.root, s)
        
        

In [None]:
class MyTreeClf():
    
    def __init__(self, max_depth=5, min_samples_split=2, max_leafs=20):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.leafs_cnt = 0
        self.depth = 0
        self.leafs_sum = 0
        self.tree = Tree()
        self.curr = None


        
    def __str__(self):
        attrs = vars(self)
        return f'MyTreeClf class: ' + ', '.join(f'{key}={value}' for key, value in attrs.items())
    
    @staticmethod
    def entropy(N, count_nonzero):
        return -(1 / N) * (count_nonzero * np.log2(count_nonzero / N) +
                                (N - count_nonzero) * np.log2((N - count_nonzero) / N))

    @staticmethod
    def custom_log(value):
        if value == 0.0:
            return 0
        return np.log2(value)
    
    def get_best_split(self, X, y):
        names = X.columns.values.tolist()
        N = len(y)
        count_nonzero = np.count_nonzero(y)
        d = pd.DataFrame(columns=['col_name', 'split_value', 'ig'])
        S0 = MyTreeClf.entropy(N, count_nonzero)
        X_numpy = X.to_numpy()
        for i in range(len(X.columns)):
            feature = pd.DataFrame({i: X_numpy[:,i], 'y': y})
            unique = feature[i].unique()
            unique = np.sort(unique)
            feature_unique_mean = (unique[:-1] + unique[1:]) / 2
            for k in range(len(feature_unique_mean)):
                left_df = feature.loc[feature[i] <= feature_unique_mean[k]]
                right_df = feature.loc[feature[i] > feature_unique_mean[k]]
                left = left_df['y']
                right = right_df['y']
                N1 = len(left)
                N2 = len(right)
                c_n_l = int(np.count_nonzero(left))
                c_n_r = int(np.count_nonzero(right))
                S1 = -(c_n_l / N1) * MyTreeClf.custom_log(c_n_l / N1) - ((N1 - c_n_l) / N1) * MyTreeClf.custom_log((N1 - c_n_l) / N1)
                S2 = -(c_n_r / N2) * MyTreeClf.custom_log(c_n_r / N2) - ((N2 - c_n_r) / N2) * MyTreeClf.custom_log((N2 - c_n_r) / N2)
                ig = S0 - (N1 / N) * S1 - (N2 / N) * S2
                d.loc[len(d.index)] = [names[i], feature_unique_mean[k], ig]
        d = d.sort_values(by=['ig', 'col_name'], ascending=[False, True])
        col_name, split_value, ig = d.iloc[0, 0], d.iloc[0, 1], d.iloc[0, 2]
        return col_name, split_value


    def build_tree(self, X):
        # self.tree.print_tree()
        # print('')
        if self.leafs_cnt >= self.max_leafs or self.depth > self.max_depth:
            return
        if not self.tree.root:
            y = self.y[X.index]
            col_name, split_value = self.get_best_split(X, y)
            self.tree.root = Node((col_name, split_value))
            self.tree.root.df = X
            self.curr = self.tree.root
            X_left = X.loc[X[col_name] <= split_value]
            self.depth += 1
            self.build_tree(X_left)
        if not self.curr.left:
            y = self.y[X.index]
            if not self.is_leaf(y): # узел
                col_name, split_value = self.get_best_split(X, y)
                self.curr.left = Node((col_name, split_value))
                self.curr.left.parent = self.curr
                self.curr.left.df = X
                self.curr = self.curr.left
                X_left = X.loc[X[col_name] <= split_value]
                self.depth += 1
                self.build_tree(X_left)
            else: #  лист
                # print('Зашел в левый лист')
                self.curr.left = Node(('leaf_left', np.count_nonzero(y == 1) / len(y)))
                self.leafs_sum += self.curr.left.value[1]
                self.curr.left.is_leaf = True
                self.curr.left.parent = self.curr
                self.leafs_cnt += 1
                self.build_tree(self.curr.df)
        elif self.curr.left and not self.curr.right:
            X_right = self.curr.df.loc[self.curr.df[self.curr.value[0]] > self.curr.value[1]]
            y_r = self.y[X_right.index]
            if not self.is_leaf(y_r):
                col_name, split_value = self.get_best_split(X_right, y_r)
                self.curr.right = Node((col_name, split_value))
                self.curr.right.df = X_right
                self.curr.right.parent = self.curr
                self.curr = self.curr.right
                self.depth += 1
                X_left = X_right.loc[X_right[col_name] <= split_value]
                self.build_tree(X_left)
            else:
                self.curr.right = Node(('leaf_right', np.count_nonzero(y_r == 1) / len(y_r)))
                self.leafs_sum += self.curr.right.value[1]
                self.curr.right.is_leaf = True
                self.curr.right.parent = self.curr
                if self.tree.root != self.curr:
                    self.curr = self.curr.parent
                self.depth -= 1
                self.leafs_cnt += 1
                self.build_tree(self.curr.df)
        elif self.curr == self.tree.root and self.curr.left and self.curr.right:
            return
        elif self.curr.left and self.curr.right:
            self.curr = self.curr.parent
            self.depth -= 1
            self.build_tree(self.curr.df)


    def fit(self, X, y):
        self.y = y
        self.build_tree(X)

    def is_leaf(self, y):
        '''True - лист, False - узел'''
        if len(y) == 1 or np.count_nonzero(y == 1) == 0 or np.count_nonzero(y == 0) == 0 or \
                self.max_depth == self.depth or self.max_leafs <= 2 or self.min_samples_split > len(y) or \
                self.max_leafs - self.leafs_cnt == 1:
            return True
        return False

    def predict_proba(self, X):
        proba = []
        for ind in X.index.values:
            s = X.loc[ind]
            proba.append(self.tree.find_proba(s))
        return proba
    
    def predict(self, X):
        proba = []
        for ind in X.index.values:
            s = X.loc[ind]
            proba.append(self.tree.find_proba(s))
        pred = [1 if val > 0.5 else 0 for val in proba]
        return pred
        

In [29]:
lst = [(1,1,2)]
for max_depth, min_samples_split, max_leafs in lst:
    model = MyTreeClf(max_depth=max_depth, min_samples_split=min_samples_split, max_leafs=max_leafs)
    model.fit(df, y)
    print(round(model.leafs_sum, 6))
    print(model.leafs_cnt)

0.918956
2


In [30]:
model.tree.print_tree()

└── variance > 0.320165
    ├── leaf_left = 0.8112633181126332
    ├── leaf_right = 0.1076923076923077


In [26]:
def custom_log(value):
    if value == 0.0:
        return 0
    return np.log(value)
custom_log(0)

0

In [None]:
class MyTreeClf():
    def __init__(self, max_depth=5, min_samples_split=2,max_leafs=2,bins=None):         
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.leafs_cnt = 0
        self.tree_dict = {}
        self.depths_numb={}
        self.node_count = 0
        self.bins = bins
        # self.splits = {}
    def __str__(self):
        attributes = ', '.join(f"{key}={value}" for key, value in vars(self).items())
        return f"MyTreeClf class: {attributes}"
    def __repr__(self):
        attributes = ', '.join(f"{key}={value}" for key, value in vars(self).items())
        return f"MyTreeClf class: {attributes}"
    @staticmethod
    def entr(y):
      n = y.shape[0]
      one_count = np.count_nonzero(y == 1)
      zero_count =  np.count_nonzero(y == 0)
      if one_count == 0 or zero_count==0:
        return 0
      s=-((zero_count/n)*np.log2(zero_count/n)+(one_count/n)*np.log2(one_count/n))
      return s
 
    def get_best_split(self,X,y):
      n_cols = X.shape[1]
      s0 = self.entr(y)
      ig_max=0
      col_name=""
      split_value=np.float()
      for i in range(n_cols):
          feature_values = np.sort(X.iloc[:,i].unique())
          splits_feature = [(feature_values[idx]+feature_values[idx+1])/2 for idx in range(len(feature_values)-1)]
          if self.bins is not None and len(splits_feature) > self.bins-1:
              splits_feature = np.histogram(feature_values, bins=self.bins)[1][1:-1]
          ig = 0
          for split_val in splits_feature:
              idx_right = X[np.all([X.iloc[:,i] > split_val], axis=0)].index
              idx_left = X[np.all([X.iloc[:,i] <= split_val], axis=0)].index
              ig = s0 - ((len(idx_left)/len(y))*self.entr(y.loc[idx_left])) - ((len(idx_right)/len(y))*self.entr(y.loc[idx_right]))
              if ig > ig_max:
                  ig_max=ig
                  col_name=X.iloc[:,i].name
                  split_value = split_val
      return col_name, split_value, ig_max
 
    def node_or_list(self, y,curr_depth):
      if y.shape[0] == 1:
        # print('1 семпл')
        return False
      elif np.count_nonzero(y == 1) == 0: 
        # print('Класса 1 нету')
        return False
      elif np.count_nonzero(y == 0)==0:
        # print('Класса 0 нету')
        return False
      elif self.max_depth ==curr_depth:
        # print('Макс глубина')
        return False
      elif self.max_leafs <=2:
        # print('Меньше 1 листа быть не может')
        return False      
      elif self.min_samples_split >y.shape[0]:
        # print('Минимальное возможное количетсво элементов для разбиения')
        return False
      else:
        if self.node_count!=0:   
          return True
 
    def add_leaf(self,curr_depth,leaf_name,leaf):
          if curr_depth in self.tree_dict.keys():
            one = np.count_nonzero(y.loc[leaf] == 1)
            chance = one/len(leaf)
            self.tree_dict[curr_depth].append([chance])
          else:
            one = np.count_nonzero(y.loc[leaf] == 1)
            chance = one/len(leaf)
            self.tree_dict[curr_depth] = []
            self.tree_dict[curr_depth].append([chance])
          self.leafs_cnt += 1
 
    def iter_for_feature(self,X,y,curr_depth=0):
        col_name, split_val, ig = self.get_best_split(X,y)
        leaf_left = X[np.all([X.loc[:,col_name] <= split_val], axis=0)].index
        leaf_right = X[np.all([X.loc[:,col_name] > split_val], axis=0)].index
        if curr_depth in self.tree_dict.keys():
          if curr_depth in self.depths_numb.keys():
            self.tree_dict[curr_depth].append([col_name, split_val,self.depths_numb[curr_depth],self.depths_numb[curr_depth]+1])
            self.depths_numb[curr_depth] +=2
          else:
            self.depths_numb[curr_depth] = 0
            self.tree_dict[curr_depth].append([col_name, split_val,self.depths_numb[curr_depth],self.depths_numb[curr_depth]+1])
            self.depths_numb[curr_depth] +=2
        else:
          self.depths_numb[curr_depth] = 0
          self.tree_dict[curr_depth] = []
          self.tree_dict[curr_depth].append([col_name, split_val,self.depths_numb[curr_depth],self.depths_numb[curr_depth]+1])
          self.depths_numb[curr_depth] +=2
        curr_depth +=1    
        if self.node_or_list(y.loc[leaf_left],curr_depth):
          self.node_count-=1
          self.iter_for_feature(X.loc[leaf_left],y.loc[leaf_left],curr_depth)
        else:
          self.add_leaf(curr_depth,"leaf_left",leaf_left)
        if self.node_or_list(y.loc[leaf_right],curr_depth):
          self.node_count-=1
          self.iter_for_feature(X.loc[leaf_right],y.loc[leaf_right],curr_depth)
        else:
          self.add_leaf(curr_depth,"leaf_right",leaf_right)
 
    def fit(self, X_fit: pd.DataFrame, y_fit: pd.Series):
      X = X_fit.copy()
      y = y_fit.copy()
      # self.splits = {col:[] for col in X.columns}
      self.node_count = self.max_leafs-2
      self.iter_for_feature(X,y)
      # return self.tree_dict
 
    def pred(self,x,i,c):
      if len(self.tree_dict[i][c]) != 1:
        uslovie = self.tree_dict[i][c][1]
        column = self.tree_dict[i][c][0] 
        idx_left_leaf=self.tree_dict[i][c][2]
        idx_right_leaf= self.tree_dict[i][c][3]
        if x[column] <= uslovie:
          i+=1            
          return self.pred(x,i,idx_left_leaf)
        else:
          i+=1
          return self.pred(x,i,idx_right_leaf)
      else:
        return self.tree_dict[i][c][-1]
 
    def predict(self, X_test: pd.DataFrame):
        X = X_test.copy()
        y_pred_vec = X.apply(lambda x: self.pred(x,0,0),axis=1)
        return np.sum(y_pred_vec)
 
instance = MyTreeClf(max_depth=5, min_samples_split=2,max_leafs=10,bins=20)
tree_dict = instance.fit(X,y)
instance.predict(X)
 
import graphviz
 
class TreeNode:
    def __init__(self, value, left=None, right=None):
        self.is_leaf = len(value) == 1  # Лист содержит только один элемент
        self.value = value
        self.left = left
        self.right = right
 
def build_tree(tree_dict):
    nodes = {0: [TreeNode(tree_dict[0][0])]}
 

6


In [None]:
class Test:
    __val = True

    @classmethod
    def func(cls):
        print(cls__val)
        cls.__val = False

a = Test()
a.func()

NameError: name '_Test__val' is not defined