In [1]:
import numpy as np
import pandas as pd

import random

from sklearn.datasets import make_classification

In [2]:
#X, y = make_classification(n_samples=500, n_features=20, n_informative=2, random_state=42)
#X = pd.DataFrame(X)
#y = pd.Series(y)
#X.columns = [f'col_{col}' for col in X.columns]

In [3]:
# Read the banknote-authentication table. `header=None` tells pandas the file
# has no header row, so the column names are assigned by hand afterwards.
# NOTE(review): the absolute Windows path ties this cell to a single machine.
df = pd.read_csv(r'D:\Pythonizm\DS_Datasets\banknote+authentication.zip', header=None)
df.columns = ['variance', 'skewness', 'curtosis', 'entropy', 'target']

# The first four columns form the feature matrix; `target` is the label.
X = df.iloc[:, :4]
y = df['target']

In [4]:
# Show the first rows of the feature matrix, then print its (rows, columns) shape.
display(X.head())
print(X.shape)

Unnamed: 0,variance,skewness,curtosis,entropy
0,3.6216,8.6661,-2.8073,-0.44699
1,4.5459,8.1674,-2.4586,-1.4621
2,3.866,-2.6383,1.9242,0.10645
3,3.4566,9.5228,-4.0112,-3.5944
4,0.32924,-4.4552,4.5718,-0.9888


(1372, 4)


In [5]:
# Show the first target values, then print the shape of the target vector.
display(y.head())
print(y.shape)

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

(1372,)


In [None]:
class MyTreeClf():
    """Binary decision-tree classifier grown greedily by information gain.

    The fitted tree is stored level by level in ``self.model``:
    ``model[level]`` is a list of ``2**(level-1)`` slots laid out like a
    complete binary tree (the children of slot ``i`` are slots ``2*i`` and
    ``2*i + 1`` one level down). A slot holds ``[feature, threshold]`` for a
    split node, a number (the mean of the targets that reached it) for a
    leaf, or the ``[0, 0]`` placeholder when it was never assigned.
    ``self.rest[level]`` counts the slots of a level that are still
    unassigned and is used to locate the next free slot during growth.

    Parameters
    ----------
    max_depth : int
        Maximum number of split levels; leaves may sit one level deeper.
    min_samples_split : int
        A node holding fewer samples than this is not split.
    max_leafs : int
        Upper bound on the number of leaves. The root is always split when
        a split exists, so values below 2 still produce two leaves.
    """

    def __init__(self, max_depth = 5, min_samples_split = 2, max_leafs = 20):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.leafs_cnt = 0
        self.depth = 0
        # Fitted state lives here from the start so print_tree() also works
        # on a tree that has not been fitted yet.
        self.model = {}
        self.rest = {}
        self.leaf_sum = 0
        # Number of nodes already created during growth that are still
        # waiting to be resolved (each becomes at least one leaf).
        self._pending = 0


    def __repr__(self):
        return f'MyTreeClf class: max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}'


    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the label distribution in ``y``."""
        n_samples = len(y)
        # Zero counts (possible for categorical labels) contribute nothing.
        return -np.sum([(cnt/n_samples)*np.log2(cnt/n_samples) if cnt > 0 else 0
                        for cnt in y.value_counts(sort=False)])


    def _get_best_split(self, X, y):
        """Find the threshold split with the highest information gain.

        Candidate thresholds are the midpoints between consecutive distinct
        values of every column. Ties keep the first candidate encountered.

        Returns
        -------
        tuple
            ``(column, threshold, information_gain)``, or ``(None, None, 0.0)``
            when no column has two distinct values and no split exists.
        """
        n_samples = len(y)
        S0 = self._entropy(y)
        col_name, split_value, max_ig = None, None, -1e15
        for col in X:
            column = X[col]
            values = sorted(column.unique())
            dividers = [(values[i]+values[i+1])/2 for i in range(len(values)-1)]
            for divider in dividers:
                split_right = y[column > divider]
                split_left = y[column <= divider]
                S_right = (len(split_right)/n_samples)*self._entropy(split_right)
                S_left = (len(split_left)/n_samples)*self._entropy(split_left)
                ig = S0 - S_right - S_left
                if ig > max_ig:
                    col_name, split_value, max_ig = col, divider, ig
        if col_name is None:
            # Every feature is constant on this node: nothing to split on.
            return None, None, 0.0
        return col_name, split_value, max_ig


    def _is_leaf(self, X, y):
        """Return True when the node must not be split regardless of tree limits."""
        if len(X) == 1:
            return True
        if len(X) < self.min_samples_split:
            return True
        if self._entropy(y) == 0:
            return True
        return False


    def _add_leaf(self, y, level):
        """Store a leaf holding the mean of ``y`` in the next free slot of ``level``."""
        value = np.sum(y)/len(y)
        self.leaf_sum += value
        self.leafs_cnt += 1
        self.model[level][2**(level-1) - self.rest[level]] = value
        # The leaf uses one slot on its own level and blocks the 2**i slots
        # its would-be descendants occupy i levels further down.
        for i in range(self.max_depth - level + 2):
            self.rest[level + i] -= 2**i


    def _split(self, X: pd.DataFrame, y: pd.Series, data: pd.Series, side: str, depth: int, verbose: bool):
        """Resolve one child of a split node: grow it further or make it a leaf.

        ``data`` is the boolean mask selecting the child's rows from ``X``/``y``
        and ``depth`` is the level of the parent node.
        """
        X_node, y_node = X[data], y[data]
        # Splitting replaces this node by two children, so the tree ends with
        # at least (finished leaves + nodes still pending + 2) leaves.
        fits_leaf_budget = self.leafs_cnt + self._pending + 2 <= self.max_leafs
        if not self._is_leaf(X_node, y_node) and fits_leaf_budget and depth < self.max_depth:
            if verbose:
                print(side.lower(), len(X_node), depth+1)
            self._splitter(X_node, y_node, verbose, depth)
        else:
            if verbose:
                print(f'\n{side} Done | size: {len(X_node)} | entrope: {self._entropy(y_node)} | depth: {depth+1} | val: {np.sum(y_node)/len(y_node)}\n')
            self._add_leaf(y_node, depth + 1)


    def _splitter(self, X, y, verbose, depth = 0):
        """Split the node holding ``X``/``y`` and recurse into both children.

        ``depth`` is the level of the parent; the node itself is placed on
        level ``depth + 1``. The left child is resolved before the right one.
        """
        depth += 1
        split = self._get_best_split(X, y)
        if split[0] is None:
            # No usable threshold on any feature: the node becomes a leaf.
            self._add_leaf(y, depth)
            return
        self.model[depth][2**(depth-1) - self.rest[depth]] = list(split[:2])
        self.rest[depth] -= 1
        if verbose:
            print(split)
        left = X[split[0]] <= split[1]
        right = X[split[0]] > split[1]

        # While the left subtree grows, the right child is still waiting and
        # must be counted against the leaf budget.
        self._pending += 1
        self._split(X, y, left, 'Left', depth, verbose)
        self._pending -= 1

        self._split(X, y, right, 'Right', depth, verbose)


    def fit(self, X: pd.DataFrame, y: pd.Series, verbose = False):
        """Grow the tree on features ``X`` and binary targets ``y``.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix; must share its index with ``y``.
        y : pd.Series
            Target values whose mean is stored in each leaf.
        verbose : bool
            When True, print every chosen split and every finished leaf.

        Raises
        ------
        ValueError
            If ``X`` has no rows.
        """
        if len(X) == 0:
            raise ValueError('Cannot fit MyTreeClf on an empty dataset')

        self.leafs_cnt = 0
        self.depth = 0
        self.leaf_sum = 0
        self._pending = 0

        # Pre-allocate every level of a complete binary tree, including the
        # extra level (max_depth + 1) that can only hold leaves.
        levels = range(1, self.max_depth + 2)
        self.model = {level: [[0, 0] for _ in range(2**(level-1))] for level in levels}
        self.rest = {level: 2**(level-1) for level in levels}

        self._splitter(X, y, verbose)


    def predict_proba(self, X: pd.DataFrame):
        """Return, for every row of ``X``, the value stored in the leaf it reaches.

        The result is a ``pd.Series`` with a fresh 0..n-1 index.

        Raises
        ------
        AttributeError
            If the tree has not been fitted.
        """
        if not self.model:
            raise AttributeError('MyTreeClf is not fitted yet; call fit() first')

        probas = []
        for _, row in X.iterrows():
            index = 0
            # Walk the levels that were allocated at fit time.
            for level in range(1, len(self.model) + 1):
                node = self.model[level][index]
                if isinstance(node, list):
                    col, val = node[0], node[1]
                    # Left child is slot 2*i, right child is slot 2*i + 1.
                    index = 2*index if row[col] <= val else 2*index + 1
                else:
                    probas.append(node)
                    break
        return pd.Series(probas, dtype=float)


    def predict(self, X: pd.DataFrame):
        """Return hard labels: 1 where the leaf value exceeds 0.5, otherwise 0."""
        y_proba = self.predict_proba(X)
        return (y_proba > 0.5).astype('int64')


    def print_tree(self):
        """Print each level of the stored tree, the leaf count and the sum of leaf values."""
        for layer in self.model:
            print(f'{layer} - {self.model[layer]}')
        print(self.leafs_cnt)
        print(round(self.leaf_sum, 6))

In [7]:
# Build a small tree for a smoke test and print its repr.
# max_leafs=1 is below the two leaves a single split produces; the fit
# output further down still reports two leaves.
tree = MyTreeClf(max_depth=3, min_samples_split=2, max_leafs=1)
print(tree)

MyTreeClf class: max_depth=3, min_samples_split=2, max_leafs=1


In [8]:
# Fit on the full dataset with verbose logging of the chosen splits and leaves.
tree.fit(X, y, verbose=True)

('variance', 0.320165, 0.39961186274479576)

Left Done | size: 657 | entrope: 0.6988212030641663 | depth: 2 | val: 0.8112633181126332


Right Done | size: 715 | entrope: 0.49291577961610966 | depth: 2 | val: 0.1076923076923077



In [9]:
# Print the per-level node lists, then the leaf count and the rounded sum of leaf values.
tree.print_tree()

1 - [['variance', 0.320165]]
2 - [0.8112633181126332, 0.1076923076923077]
3 - [[0, 0], [0, 0], [0, 0], [0, 0]]
4 - [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]
2
0.918956


In [10]:
# For every training row, the value stored in the leaf that the row reaches.
tree.predict_proba(X)

0       0.107692
1       0.107692
2       0.107692
3       0.107692
4       0.107692
          ...   
1367    0.107692
1368    0.811263
1369    0.811263
1370    0.811263
1371    0.811263
Length: 1372, dtype: float64

In [11]:
# Hard labels for the training rows: 1 where the leaf value exceeds 0.5, otherwise 0.
tree.predict(X)

0       0
1       0
2       0
3       0
4       0
       ..
1367    0
1368    1
1369    1
1370    1
1371    1
Length: 1372, dtype: int64