In [1]:
import numpy as np

In [2]:
def entropy(y):

    """
    Binary entropy (level of impurity) of a set of 0/1 targets.

    y = array of targets in a feature branch (left or right or parent), ex. [0,1,0,0,1]

    Returns 0 when y is empty or contains a single class; otherwise returns
    -p1*log2(p1) - (1-p1)*log2(1-p1), where p1 is the fraction of ones in y.
    """

    n_samples = len(y)
    n_positive = np.sum(y)

    # An empty node or a pure node (all zeros / all ones) has no impurity.
    if n_samples == 0 or n_positive == 0 or n_positive == n_samples:
        return 0

    p1 = n_positive / n_samples
    p0 = 1 - p1

    return -p1 * np.log2(p1) - p0 * np.log2(p0)


In [3]:
def split_feature(node_to_split, rows_considered, feature_id):

    """
    Partition the rows in rows_considered by the value of one feature.

    For every row index in rows_considered, looks up node_to_split[row, feature_id]:
    rows whose value equals 1 go to the left branch, all other rows go to the
    right branch. The order of rows_considered is preserved inside each branch.

    Returns a tuple (left_branch, right_branch) of lists of row indices.
    """

    ones_rows, other_rows = [], []

    for row_idx in rows_considered:
        # Pick the destination list first, then append once.
        destination = ones_rows if node_to_split[row_idx, feature_id] == 1 else other_rows
        destination.append(row_idx)

    return ones_rows, other_rows

In [4]:
def InformationGain(x, y, rows_considered, feature_id):

    """
    Information gain obtained by splitting the node made of rows_considered
    on the feature feature_id.

    x               = full feature matrix, indexed as x[row, feature]
    y               = full array of 0/1 targets (indexed with lists of row indices)
    rows_considered = row indices that belong to the node being split
    feature_id      = column of x to split on

    Returns entropy(parent) - (w_left * entropy(left) + w_right * entropy(right)),
    where the parent is the node itself (y restricted to rows_considered) and the
    weights are the fractions of the node's rows that fall in each branch.
    Returns 0.0 when rows_considered is empty, since nothing can be gained.
    """

    left_indices, right_indices = split_feature(x, rows_considered, feature_id)

    y_left, y_right = y[left_indices], y[right_indices]

    len_left, len_right = len(y_left), len(y_right)
    len_total = len_left + len_right

    # An empty node cannot be split; bail out before dividing by zero below.
    if len_total == 0:
        return 0.0

    # The parent is the node being split, i.e. only the considered rows.
    # Using the whole of y here would report the wrong gain for every
    # node below the root.
    entropy_parent = entropy(y[rows_considered])
    entropy_left, entropy_right = entropy(y_left), entropy(y_right)

    left_weight, right_weight = len_left / len_total, len_right / len_total

    ig = entropy_parent - (left_weight * entropy_left + right_weight * entropy_right)

    return ig

## Find the best feature to split on

In [5]:
def next_split(x, y, rows_considered):

    """
    Returns the index of the feature with the highest information gain for the
    node made of rows_considered.

    Returns -1 when there is nothing to split: rows_considered is empty, or the
    targets of the considered rows are all zeros or all ones.
    """

    # Stop conditions: empty node or pure node.
    node_targets = y[rows_considered]
    positives = np.sum(node_targets)

    if len(rows_considered) == 0 or positives == 0 or positives == len(node_targets):
        return -1

    # Score every feature and keep the one with the largest gain
    # (argmax returns the first index on ties).
    gains = [
        InformationGain(x, y, rows_considered=rows_considered, feature_id=candidate)
        for candidate in range(x.shape[1])
    ]

    return np.array(gains).argmax()

## Recursively generating the tree

In [6]:
def build_tree_recursive(x, y, rows_considered, current_depth, tree, max_depth=None):

    """
    Recursively splits the node made of rows_considered and records every
    split in tree (a list that is appended to in place).

    x               = full feature matrix, indexed as x[row, feature]
    y               = full array of 0/1 targets
    rows_considered = row indices that belong to the current node
    current_depth   = depth of the caller; pass 0 for the root
    tree            = list receiving one (left_indices, right_indices, best_feature)
                      tuple per split, in depth-first (left before right) order
    max_depth       = depth at which recursion stops; defaults to the number
                      of features, x.shape[1]

    Returns "reached max depth" when the depth limit stops the recursion,
    -1 when the node is not split (empty or pure node, or a split that would
    leave one branch empty), and None after a node has been split.
    """

    current_depth += 1

    if max_depth is None:
        max_depth = x.shape[1]

    # >= rather than == so a start depth already past the limit also stops.
    if current_depth >= max_depth:
        return "reached max depth"

    best_feature = next_split(x, y, rows_considered)

    # next_split returns -1 for an empty or pure node. Using -1 as a column
    # index would silently split on the last feature, so stop here instead.
    if best_feature == -1:
        return -1

    left_indices, right_indices = split_feature(x, rows_considered, best_feature)

    # A split that sends every row to one side does not separate anything.
    if len(left_indices) == 0 or len(right_indices) == 0:
        return -1

    tree.append((left_indices, right_indices, best_feature))

    build_tree_recursive(x, y, left_indices, current_depth, tree, max_depth)
    build_tree_recursive(x, y, right_indices, current_depth, tree, max_depth)

# Example

In [7]:
# One row per sample; the three binary feature columns are, in order:
# brown cap, tapering stalk shape, solitary.
x_train = np.array([
    [1, 1, 1],
    [1, 0, 1],
    [1, 0, 0],
    [1, 0, 0],
    [1, 1, 1],
    [0, 1, 1],
    [0, 0, 0],
    [1, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
])

# Binary target for each sample, aligned with the rows of x_train.
y_train = np.array([1, 1, 0, 0, 1, 0, 0, 1, 1, 0])

x_train, y_train

(array([[1, 1, 1],
        [1, 0, 1],
        [1, 0, 0],
        [1, 0, 0],
        [1, 1, 1],
        [0, 1, 1],
        [0, 0, 0],
        [1, 0, 1],
        [0, 1, 0],
        [1, 0, 0]]),
 array([1, 1, 0, 0, 1, 0, 0, 1, 1, 0]))

In [8]:
# The root node contains every training row.
root_indices = np.arange(len(y_train))

# Information gain of splitting the root on each of the three features.
info_gain0, info_gain1, info_gain2 = (
    InformationGain(x_train, y_train, root_indices, feature_id=feature)
    for feature in range(3)
)

print("Information Gain from splitting the root on brown cap: ", info_gain0)
print("Information Gain from splitting the root on tapering stalk shape: ", info_gain1)
print("Information Gain from splitting the root on solitary: ", info_gain2)


Information Gain from splitting the root on brown cap:  0.034851554559677034
Information Gain from splitting the root on tapering stalk shape:  0.12451124978365313
Information Gain from splitting the root on solitary:  0.2780719051126377


<img src="./images/Screenshot 2023-07-28 at 21.16.10.png"></img>

In [9]:
# Pick the feature with the highest information gain at the root.
# next_split returns -1 instead of a feature index when the node is empty or pure.
best_feature = next_split(x_train, y_train, root_indices)
print("Best feature to split on: %d" % best_feature)

Best feature to split on: 2


In [10]:
# build_tree_recursive appends one (left_indices, right_indices, best_feature)
# tuple to this list for every split it performs, so start from an empty list.
decision_tree = []
build_tree_recursive(x_train, y_train, root_indices, current_depth=0, tree = decision_tree)

In [11]:
# Each entry is (rows sent left, rows sent right, feature index used for the split).
decision_tree

[([0, 1, 4, 5, 7], [2, 3, 6, 8, 9], 2),
 ([0, 1, 4, 7], [5], 0),
 ([8], [2, 3, 6, 9], 1)]

<img src="./images/Screenshot 2023-07-31 at 14.40.23.png"></img>