In [1]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [115]:
class Node:
    def __init__(self, value, left=None, right=None):
        self.value = value
        self.left = left
        self.right = right


In [145]:
class MyTreeClf:
    def __init__(self, max_depth=5, min_samples_split=2, max_leafs=20):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.leafs_cnt = 0
        self.root = None

    def __repr__(self):
        return f"MyTreeClf class: max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}"
    
    @staticmethod
    def prob(y):
        return np.sum((y == 1)) / len(y)



    def _fit(self, X, y, depth):

        if (depth >= self.max_depth or self.prob(y) in (0, 1) or len(y) < self.min_samples_split or self.leafs_cnt >= self.max_leafs-1):
            self.leafs_cnt += 1
            return Node(("leaf", self.prob(y)))
        
        best_feat, best_split, ig = self.get_best_split(X, y)
        X['y'] = y
        left = X.loc[X[best_feat] <= best_split, :]
        right = X.loc[X[best_feat] > best_split, :]
        left_X, left_y = left.drop('y', axis=1), left['y']
        right_X, right_y = left.drop('y', axis=1), left['y']
        
        left_subtree = self._fit(left_X, left_y, depth+1)
        right_subtree = self._fit(right_X, right_y, depth+1)
        return Node((best_feat, best_split), left_subtree, right_subtree)
        

    def fit(self, X, y):
        self.root = self._fit(X, y, 0)

    def _print_tree(self, root, intend):
        if (root is None):
            return None
        feat, split = root.value
        if (root.left is None and root.right is None):
            print('  '*intend, end='')
            print(f"{feat} = {split}")
        else:
            print('  '*intend, end='')
            print(f"{feat} > {split}")
        self._print_tree(root.left, intend+1)
        self._print_tree(root.right, intend+1)

    def print_tree(self):
        self._print_tree(self.root, 0)


    @staticmethod
    def entropy(x):
      p = np.bincount(x) / len(x)
      return -1 * np.sum(p * np.log2(p + 1e-15))

    def information_gain(self, x, y):
        N = len(x) + len(y)
        s0 = self.entropy(np.concatenate((x, y)))
        sx, sy = self.entropy(x), self.entropy(y)
        return s0 - len(x) / N * sx - len(y) / N * sy

    def get_best_split(self, X, y):
        cols = X.columns.tolist()
        X, y = np.asarray(X), np.asarray(y)
        col_name, split_value, ig = None, 0, 0
        for col_index in range(X.shape[1]):
            values = X[:, col_index]
            uniques = np.sort(np.unique(values))
            ents = []
            for index in range(len(uniques)):
                sep = np.mean(uniques[index : index + 2])
                current_ig = self.information_gain(y[values <= sep], y[values > sep])

                if current_ig > ig:
                    col_name = cols[col_index]
                    split_value = sep
                    ig = current_ig

        return col_name, split_value, ig



In [136]:
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=1000, n_features=14, n_informative=10, noise=15, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]

In [137]:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=14, n_informative=10, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]

In [146]:
df = pd.read_csv('./data/data_banknote_authentication.txt', header=None)
df.columns = ['variance', 'skewness', 'curtosis', 'entropy', 'target']
X, y = df.iloc[:,:4], df['target']

In [147]:
my_decision_tree = MyTreeClf(5, 5, 10)
my_decision_tree.fit(X, y)


In [148]:
my_decision_tree.print_tree()

variance > 0.320165
  skewness > 5.86535
    curtosis > 6.21865
      variance > -0.36205
        leaf = 1.0
        leaf = 1.0
      variance > -0.36205
        leaf = 1.0
        leaf = 1.0
    curtosis > 6.21865
      variance > -0.36205
        leaf = 1.0
        leaf = 1.0
      variance > -0.36205
        leaf = 1.0
        leaf = 1.0
  skewness > 5.86535
    curtosis > 6.21865
      variance > -0.36205
        leaf = 1.0
        leaf = 1.0
      leaf = 0.9945205479452055
    leaf = 0.9481765834932822


In [13]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
y_true = np.array([1, 0, 0, 1, 1])
y_pred = np.array([0.5, 0.6, 0.2, 0.1, 0.7])

df = pd.DataFrame({'true': y_true, 'pred': y_pred}).sort_values(by='pred')
print(roc_auc_score(y_true, y_pred))

0.5
