In [20]:
import numpy as np
import math

In [21]:
def gini(counts):
    """Compute the Gini impurity of a vector of per-class sample counts.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    samples that fall in class ``k``.

    Args:
        counts: Array-like of shape (K,) holding the number of samples in
            each class. NumPy arrays, lists and tuples are all accepted.

    Returns:
        The Gini impurity. It is 0.0 when every sample belongs to a single
        class, and also 0.0 when ``counts`` is empty or sums to zero (there
        is nothing to be impure about).
    """
    # Coerce so that plain Python sequences work as well as ndarrays.
    counts = np.asarray(counts)
    total = counts.sum()
    if total == 0:
        # Avoid 0/0 for an empty partition.
        return 0.0
    p = counts / total
    return 1.0 - np.sum(p * p)

def entropy(counts):
    """Compute the Shannon entropy (in bits) of per-class sample counts.

    The entropy is ``-sum(p_k * log2(p_k))`` over the classes with a
    non-zero share ``p_k`` of the samples.

    Args:
        counts: Array-like with the number of samples in each class. NumPy
            arrays, lists and tuples are all accepted.

    Returns:
        The entropy in bits. It is 0.0 when every sample belongs to a single
        class, and also 0.0 when ``counts`` is empty or sums to zero.
    """
    # Coerce so that plain Python sequences work as well as ndarrays.
    counts = np.asarray(counts)
    total = counts.sum()
    if total == 0:
        # Same convention as gini(): an empty partition has no impurity.
        # Without this guard the division below is 0/0 and emits a warning.
        return 0.0
    p = counts / total
    p = p[p > 0]   # avoid log(0)
    # Adding 0.0 turns the "-0.0" produced for a pure node into plain 0.0.
    return -np.sum(p * np.log2(p)) + 0.0


In [24]:
def best_split_one_feature(x, y):
    """Find the threshold on one feature that minimizes weighted Gini impurity.

    The samples are sorted by feature value and every boundary between two
    consecutive *distinct* values is tried as a split. The candidate
    threshold is the midpoint of those two values; the samples before the
    boundary form the left partition and the rest form the right partition.

    Args:
        x: Array-like of shape (N,) with the feature value of each sample.
        y: Array-like of shape (N,) with the class label of each sample.

    Returns:
        A tuple ``(threshold, weighted_impurity)`` for the best split, where
        ``weighted_impurity`` is the size-weighted average of the Gini
        impurities of the two partitions. Returns ``(None, np.inf)`` when no
        split exists, i.e. there are fewer than two samples or all feature
        values are equal.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Sort by feature to scan thresholds once (stable sort keeps tie order).
    order = np.argsort(x, kind="mergesort")
    x = x[order]
    classes, y_enc = np.unique(y[order], return_inverse=True)   # Encode labels to 0...K-1
    K = len(classes)
    N = len(x)
    if N <= 1:
        return None, np.inf  # no split

    parent_counts = np.bincount(y_enc, minlength=K)

    # Running class counts: row j holds the counts of the first j+1 sorted samples.
    one_hot = np.zeros((N, K), dtype=int)
    one_hot[np.arange(N), y_enc] = 1
    left_table = np.cumsum(one_hot, axis=0)

    # Midpoints are taken in floating point: adding two fixed-width integer
    # scalars (e.g. int8 features) can wrap around and give a wrong threshold.
    x_mid = x.astype(np.float64) if np.issubdtype(x.dtype, np.integer) else x

    best_t = None
    best_after_imp = np.inf  # we minimize weighted impurity after the split

    # Splitting before sorted position i puts i samples left and N - i right;
    # both sides are non-empty because 1 <= i <= N - 1.
    for i in range(1, N):
        # Equal neighbours cannot be separated by a threshold.
        if x[i-1] == x[i]:
            continue

        left_counts = left_table[i-1]
        right_counts = parent_counts - left_counts

        weighted_after = (i/N) * gini(left_counts) + ((N - i)/N) * gini(right_counts)

        # Strict comparison: on ties the earliest (smallest) threshold wins.
        if weighted_after < best_after_imp:
            best_after_imp = weighted_after
            best_t = (x_mid[i-1] + x_mid[i]) / 2   # The threshold is the middle point

    return best_t, best_after_imp

# Worked example: five samples, three classes (0, 1, 2).
# The call below is the cell's last expression, so its (threshold, impurity)
# result is what the notebook displays.
x=[1, 3, 5, 6, 8]
y=[0, 1, 0, 0, 2]

best_split_one_feature(x, y)

(np.float64(7.0), np.float64(0.30000000000000004))

In [26]:
def best_split(X, y):
    """Search every feature column for the split with the lowest impurity.

    Args:
        X: 2-D array-like of shape (N, D): N samples, D features.
        y: Array-like with the class label of each sample.

    Returns:
        A tuple ``(feature_index, threshold, impurity)`` describing the best
        split found by ``best_split_one_feature`` over all columns. If no
        column offers a split, the result is ``(None, None, np.inf)``.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    _, n_features = X.shape  # X must be two-dimensional

    # Running winner as (feature index, threshold, impurity).
    winner = (None, None, np.inf)
    for col in range(n_features):
        threshold, impurity = best_split_one_feature(X[:, col], y)
        # Strict "<" keeps the lowest-index feature when impurities tie.
        if impurity < winner[2]:
            winner = (col, threshold, impurity)
    return winner


# Worked example: four samples (rows) with two features (columns).
X = [
    [1, 6],
    [2, 4],
    [3, 3],
    [1, 0]
]

# Class label of each row of X.
y = [0, 0, 1, 1]

# Prints the (feature index, threshold, weighted impurity) of the best split.
print(best_split(X, y))

(1, np.float64(3.5), np.float64(0.0))


In [27]:
from dataclasses import dataclass
from typing import Optional, Tuple, Union, List
@dataclass
class Node:
    """A single node of a binary decision tree.

    A node counts as a leaf as soon as ``proba`` is set; see ``is_leaf``.
    """

    # Split description (presumably the feature index and threshold chosen
    # by best_split -- confirm against the tree-building code).
    feature: Optional[int] = None
    threshold: Optional[float] = None

    # Child nodes.
    left: Optional["Node"] = None
    right: Optional["Node"] = None

    # Leaf payload: class probabilities and predicted label.
    proba: Optional[np.ndarray] = None
    label: Optional[int] = None

    # Bookkeeping about the node itself.
    depth: int = 0
    n_samples: int = 0
    impurity: float = 0.0

    def is_leaf(self) -> bool:
        """Return True when this node carries class probabilities."""
        return not (self.proba is None)