In [1]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=1000, n_features=14, n_informative=10, noise=15, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]

In [3]:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=14, n_informative=10, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]
df = pd.read_csv('./data/data_banknote_authentication.txt', header=None)
df.columns = ['variance', 'skewness', 'curtosis', 'entropy', 'target']
X, y = df.iloc[:,:4], df['target']

In [4]:
class Node:
    def __init__(self, value, left=None, right=None):
        self.value = value
        self.left = left
        self.right = right

    def __str__(self):
        return f"Node: ({self.value}), ({self.right}), ({self.left})"

In [50]:
class MyTreeClf:
    def __init__(self, max_depth=5, min_samples_split=2, max_leafs=20, bins=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs

        self.leafs_cnt = 0
        self.root = None
        self.bins = bins
        self.splitters = None


    def __repr__(self):
        return f"MyTreeClf class: max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}"

    def _get_splitters(self, x):
        values = np.array(sorted(x.unique()))
        splitters = np.array([(values[i] + values[i+1])/2 for i in range(len(values)-1)])
        return splitters
    
    def _get_splitters_wbins(self, X, y):
        self.splitters = {}
        if self.bins:
            native_splitters = {col: self._get_splitters(X[col]) for col in X.columns}
            for col, splitters in native_splitters.items():
                if len(splitters) <= self.bins - 1:
                    self.splitters[col] = splitters
                else:
                    hist = np.histogram(X[col], bins=self.bins)[1][1:-1]
                    self.splitters[col] = hist

    def _entropy(self, col_targ):
        p0 = (col_targ.iloc[:, 1] == 0).sum() / col_targ.shape[0]
        p1 = col_targ.iloc[:, 1].sum() / col_targ.shape[0]
        S = -p0*np.log2(p0+1e-15) - p1*np.log2(p1+1e-15)
        return S

    def _get_ig(self, x, y, split):
        col_targ = pd.concat([x, y], axis=1)
        S0 = self._entropy(col_targ)

        left_sub = col_targ.loc[col_targ[x.name] <= split, :]
        right_sub = col_targ.loc[col_targ[x.name] > split, :]
        S1, S2 = self._entropy(left_sub), self._entropy(right_sub)

        IG = S0 - left_sub.shape[0]/col_targ.shape[0]*S1 - right_sub.shape[0]/col_targ.shape[0]*S2
        return IG
        

    def _get_best_split(self, X, y):
        cols = X.columns
        if self.bins is None:
            splitters = {col: self._get_splitters(X[col]) for col in cols}
        else:
            splitters = self.splitters
        best_col = None
        best_split = None
        best_ig = 0

        for col, splits in splitters.items():
            x = X[col]
            igs = np.array([self._get_ig(x, y, split) for split in splits])
            max_idx = igs.argmax()
            max_ig, max_split = igs[max_idx], splits[max_idx]
            if max_ig > best_ig:
                best_col = col
                best_split = max_split
                best_ig = max_ig
        
        if best_col is None:
            best_col = "None"
            best_split = X.min().min()-1
            best_ig = 0
        return best_col, best_split, best_ig

    def is_leaf(self, data, depth):
        return (all(data.iloc[:, -1] == 1)) or\
               (all(data.iloc[:, -1] == 0)) or\
               (depth >= self.max_depth-1) or\
               (data.shape[0] < self.min_samples_split) or\
               (self.leafs_cnt >= self.max_leafs-1)

    
    def _fit(self, X, y, depth=0):
        best_col, best_split, best_ig = self._get_best_split(X, y)
        root = Node((best_col, best_split))

        col_targ = pd.concat([X, y], axis=1)

        if (best_split > col_targ[best_col].max() or best_split < col_targ[best_col].min()):
            value = col_targ.iloc[:, -1].sum() / col_targ.shape[0]
            return Node(('leaf', value))

        left_sub = col_targ.loc[col_targ[best_col] <= best_split, :]
        right_sub = col_targ.loc[col_targ[best_col] > best_split, :]


        if self.is_leaf(left_sub, depth):
            value = left_sub.iloc[:, -1].sum() / left_sub.shape[0]
            root.left = Node(('left', value))
            self.leafs_cnt += 1
        else:
            X, y = left_sub.drop(left_sub.columns[-1], axis=1), left_sub.iloc[:, -1]
            root.left = self._fit(X, y, depth+1)

        if self.is_leaf(right_sub, depth):
            value = right_sub.iloc[:, -1].sum() / right_sub.shape[0]
            root.right = Node(('right', value))
            self.leafs_cnt += 1
        else:
            X, y = right_sub.drop(right_sub.columns[-1], axis=1), right_sub.iloc[:, -1]
            root.right = self._fit(X, y, depth+1)
        return root

    def fit(self, X, y):
        self._get_splitters_wbins(X, y)
        self.root = self._fit(X, y)

    
    def _predict_proba(self, x, root):
        if (root.right is None and root.left is None):
            return root.value[1]
        
        feat, split = root.value
        if x[feat] <= split:
            return self._predict_proba(x, root.left)
        else:
            return self._predict_proba(x, root.right)

    def predict_proba(self, X):
        y_pred_logits = np.array([self._predict_proba(X.iloc[i, :], self.root) for i in range(X.shape[0])])
        return y_pred_logits

    def predict(self, X):
        y_pred = (self.predict_proba(X) > 0.5).astype(int)
        return y_pred


    def _print_tree(self, root, intend):
        if (root is None):
            return None
        
        feat, split = root.value
        if (root.left is None and root.right is None):
            print('  '*intend, end='')
            print(f"{feat} = {split}")
        else:
            print('  '*intend, end='')
            print(f"{feat} > {split}")

        self._print_tree(root.left, intend+1)
        self._print_tree(root.right, intend+1)

    def print_tree(self):
        self._print_tree(self.root, 0)

    def _sum_leafs(self, root):
        if (root is None):
            return 0
        if (root.left is None and root.right is None):
            return root.value[1]
        return self._sum_leafs(root.left) + self._sum_leafs(root.right)
    
    def sum_leafs(self):
        return self._sum_leafs(self.root)



In [51]:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=14, n_informative=10, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]
df = pd.read_csv('./data/data_banknote_authentication.txt', header=None)
df.columns = ['variance', 'skewness', 'curtosis', 'entropy', 'target']
X, y = df.iloc[:,:4], df['target']

In [52]:
col_tag = pd.concat([X, y], axis=1)
col_tag.loc[:, y.name]

0       0
1       0
2       0
3       0
4       0
       ..
1367    1
1368    1
1369    1
1370    1
1371    1
Name: target, Length: 1372, dtype: int64

In [53]:
my_tree = MyTreeClf(5, 200, 10, 4)
my_tree.fit(X, y)
print(my_tree.leafs_cnt)
print(round(my_tree.sum_leafs(), 6))
my_tree.print_tree()

{'variance': array([-3.575375, -0.10865 ,  3.358075]), 'skewness': array([-7.091925, -0.41075 ,  6.270425]), 'curtosis': array([ 0.517275,  6.32065 , 12.124025]), 'entropy': array([-5.798775, -3.04935 , -0.299925])}


  p0 = (col_targ.iloc[:, 1] == 0).sum() / col_targ.shape[0]
  p1 = col_targ.iloc[:, 1].sum() / col_targ.shape[0]
  p0 = (col_targ.iloc[:, 1] == 0).sum() / col_targ.shape[0]
  p1 = col_targ.iloc[:, 1].sum() / col_targ.shape[0]
  p0 = (col_targ.iloc[:, 1] == 0).sum() / col_targ.shape[0]
  p1 = col_targ.iloc[:, 1].sum() / col_targ.shape[0]


KeyError: None

In [48]:
y_pred_logits = my_tree.predict_proba(X)
y_pred = my_tree.predict(X)
(y_pred == y).sum() / len(y)

AttributeError: 'NoneType' object has no attribute 'right'

In [56]:
col_tag.min()

variance    -7.0421
skewness   -13.7731
curtosis    -5.2861
entropy     -8.5482
target       0.0000
dtype: float64