In [19]:
import pandas as pd
import numpy as np
import random

- max_depth – максимальная глубина.
По-умолчанию: 5
- min_samples_split – кол-во объектов в листе, чтобы его можно было разбить и превратить в узел.
По-умолчанию: 2
- max_leafs – максимальное количество листьев разрешенное для дерева.
По-умолчанию: 20

In [37]:
class MyTreeClf:
    def __init__(self, max_depth = 5, min_samples_split = 2, max_leafs  = 20):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.leafs_count = 0
        self.pred_sum = 0
       
    def _entropy(self, y):
        epsilon = 1e-12
        p0 = np.sum(y == 0) / len(y)
        p1 = np.sum(y == 1) / len(y)
        entropy = - (p0 * np.log2(p0 + epsilon) + p1 * np.log2(p1 + epsilon))
        return entropy
    def __repr__(self):
        return f"MyTreeClf class: max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}"
        
    def _get_best_split(self, X, y):
        if isinstance(X, pd.DataFrame):
            X_array = X.values
            col_names = X.columns
        else:
             X_array = np.asarray(X)
            
        if isinstance(y, pd.DataFrame):
            y_array= y.values.flatten()
        else:
            y_array = np.asarray(y)
        n_samples, n_features = X_array.shape
        S_0 = self._entropy(y_array)
        max_IG = 0
        best_split = None
        best_feature_index = None
        for j in range(n_features):
            f = np.sort(np.unique(X_array[:,j]))
            splits = []
            for i in range(len(f)-1):
                split = (f[i]+f[i+1])/2
                mask_left = X_array[:, j] <= split
                mask_right = X_array[:, j] > split
                X_r, y_r = X_array[mask_right], y_array[mask_right]
                X_l, y_l = X_array[mask_left], y_array[mask_left]
                S_r = self._entropy(y_r)
                S_l = self._entropy(y_l)
                N_r = len(y_r)
                N_l = len(y_l)
                IG = S_0 - N_r / n_samples * S_r - N_l/n_samples * S_l
                if IG > max_IG:
                    max_IG = IG
                    best_split = split
                    best_feature_index = j
        return best_feature_index, max_IG, best_split,
        
    def _build_tree(self, X_train, y_train, feature_names, depth = 0):
   
        if (
            depth >= self.max_depth or 
            len(np.unique(y_train)) == 1 or 
            len(y_train) == 1 or 
            len(y_train) < self.min_samples_split or
            self.leafs_count>= self.max_leafs
        ):
            
            pred = np.mean(y_train)
            self.pred_sum += pred
            return {
                "type" : "leaf",
                "prediction" : pred,
                "n_samples" : len(y_train),
                "depth" : depth
            }
        
        best_feature, ig, best_split = self._get_best_split(X_train, y_train)
        self.leafs_count += 1
        if ig <= 0:
    
            pred = np.mean(y_train)
            self.pred_sum += pred
            return {
                "type" : "leaf",
                "prediction" : pred,
                "n_samples" : len(y_train)
            }
        mask_left = X_train[:, best_feature] <= best_split
        mask_right = X_train[:, best_feature] > best_split
        X_r, y_r = X_train[mask_right], y_train[mask_right]
        X_l, y_l = X_train[mask_left], y_train[mask_left]
        
        right_subtree = self._build_tree(X_r, y_r, feature_names, depth +1)
        left_subtree = self._build_tree(X_l, y_l, feature_names,depth +1)
        return {
            "type" : "node",
            'feature' : best_feature,
            'split' : best_split,
            'feature_name' : feature_names[best_feature],
            'depth' : depth,
            'leaf_right' : right_subtree,
            'leaf_left' : left_subtree,
        
        }
        
    def fit(self, X, y):
        self.leafs_count = 0
        feature_names = X.columns.to_list()
        X_train = X.to_numpy()
        y_train = y.to_numpy()
        self.tree_ = self._build_tree(X_train, y_train, feature_names, depth = 0)

    def print_tree(self, node = None, path = "1", side = None):
        if node is None:
            node = self.tree_
        if node["type"] == "leaf":
            if side is not None:
                print(' '*node['depth'], f"{path}.{side} - {node["prediction"]}")
            else:
                print(f"{path} - {node["prediction"]}")
            return
        feature = node["feature_name"]
        split = node["split"]
        depth = node['depth']
        print(' '*depth, f"{path} - {feature} > {split}")
        self.print_tree(node["leaf_left"], path + ".1", side = "left") 
        self.print_tree(node["leaf_right"], path + ".2", side = "right")
                

In [38]:
features = ['variance', 'skewness', 'curtosis', 'entropy', 'class']
data = pd.read_csv("data_banknote_authentication.txt", names = features)

In [39]:
X = data.drop("class", axis = 1)
y = data["class"].copy()

In [43]:
tree = MyTreeClf( max_depth = 5, min_samples_split = 200, max_leafs  = 10)
tree.fit(X, y)

In [44]:
print(tree.leafs_count, tree.pred_sum)

8 4.796617280208611


1 - variance > 0.320165

1.1 - skewness > 5.86535

1.1.1 - curtosis > 6.21865

1.1.1.1 - variance > -0.36205

1.1.1.1.left - 1.0

1.1.1.1.right - 0.9649122807017544

1.1.1.right - 0.8397435897435898

1.1.right - 0.2867647058823529

1.2 - variance > 1.7907000000000002

1.2.1 - curtosis > -2.2721999999999998

1.2.1.left - 0.9473684210526315

1.2.1.right - 0.10227272727272728

1.2.2 - curtosis > -4.802

1.2.2.2 - variance > 2.03655

1.2.2.2.left - 0.05555555555555555

1.2.2.2.right - 0.0

1.2.2.left - 0.6

In [45]:
tree.print_tree()

 1 - variance > 0.320165
  1.1 - skewness > 5.86535
   1.1.1 - curtosis > 6.21865
    1.1.1.1 - variance > -0.36205
     1.1.1.1.1.left - 1.0
     1.1.1.1.2.right - 0.9649122807017544
    1.1.1.2.right - 0.8397435897435898
   1.1.2.right - 0.2867647058823529
  1.2 - variance > 1.7907000000000002
   1.2.1 - curtosis > -2.2721999999999998
    1.2.1.1.left - 0.9473684210526315
    1.2.1.2.right - 0.10227272727272728
   1.2.2 - curtosis > -4.802
    1.2.2.1.left - 0.6
    1.2.2.2 - variance > 2.03655
     1.2.2.2.1.left - 0.05555555555555555
     1.2.2.2.2.right - 0.0


In [220]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, roc_auc_score
# Фиксируем сид для воспроизводимости
np.random.seed(42)

# Синтетические данные
X, y = make_classification(
    n_samples=200,
    n_features=10,
    n_informative=10,
    n_redundant=0,
    n_classes=2
)

# Train / test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [31]:
def entropy(y):
    epsilon = 1e-12
    p0 = np.sum(y == 0) / len(y)
    p1 = np.sum(y == 1) / len(y)
    entropy = - (p0 * np.log2(p0 + epsilon) + p1 * np.log2(p1 + epsilon))
    return entropy
    

In [17]:
def get_best_split(X, y):
    N, n_features = X.shape
    S_0 = entropy(y)
    max_IG = 0
    best_split = 0
    feature_index = 0
    for j in range(n_features):
        f = np.sort(np.unique(X[:,j]))
        splits = []
        for i in range(len(f)-1):
            split = (f[i]+f[i+1])/2
            mask_left = X[:, j] <= split
            mask_right = X[:, j] > split
            X_r, y_r = X[mask_right], y[mask_right]
            X_l, y_l = X[mask_left], y[mask_left]
            S_r = entropy(y_r)
            S_l = entropy(y_l)
            N_r = len(y_r)
            N_l = len(y_l)
            IG = S_0 - N_r / N * S_r - N_l/N * S_l
            if IG > max_IG:
                max_IG = IG
                best_split = split
                feature_index = j
    return feature_index, max_IG, best_split


In [77]:
def calculate_prediction(y):
    unique, counts = np.unique(y, return_counts = True)
    max_freq = max(counts)
    modes = unique[np.where(counts == max_freq)]
    if len(modes) > 1:
        return 1
    else:
        return modes[0]

In [116]:
leaf_count = 0
pred_sum = 0
def build_tree(X_train, y_train, depth = 0, max_depth = 10, min_samples_split = 10, max_leafs = 20):
    global leaf_count
    global pred_sum
    if (
        depth >= max_depth or 
        len(np.unique(y_train)) == 1 or 
        len(y_train) == 1 or 
        len(y_train) < min_samples_split or
        leaf_count >= max_leafs
    ):
        leaf_count += 1
        pred = np.mean(y_train)
        return {
            "type" : "leaf",
            "prediction" : pred,
            "n_samples" : len(y_train)
        }
    
    best_feature, ig, best_split = get_best_split(X_train, y_train)
    
    if ig <= 0:
        leaf_count += 1
        pred = np.mean(y_train)
        return pred
        
    mask_left = X_train[:, best_feature] <= best_split
    mask_right = X_train[:, best_feature] > best_split
    X_r, y_r = X_train[mask_right], y_train[mask_right]
    X_l,y_l = X_train[mask_left], y_train[mask_left]
    right_subtree = build_tree(X_r, y_r, depth +1)
    left_subtree = build_tree(X_l, y_l, depth +1)
    node = {
        "type" : "node",
        'feature' : best_feature,
        'split' : best_split,
        'leaf_right' : right_subtree,
        'leaf_left' : left_subtree,
    }

    return node

In [93]:
node = build_tree(X_train, y_train)