# Random Forest

In [29]:
import numpy as np
import pandas as pd

## Some Helper Functions

In [42]:
def compute_entropy(labels: np.ndarray) -> float:
    """Return the binary entropy (in bits) of a set of labels.

    The labels are assumed to be either 0 or 1, so the fraction of ones is
    obtained by summing the labels and dividing by how many there are.
    An empty set, or a set in which every label is the same, has zero entropy.
    """

    count = labels.shape[0]

    # an empty set carries no uncertainty (and would divide by zero below)
    if count == 0:
        return 0.0

    # share of labels equal to 1 and share equal to 0
    p_one = labels.sum() / count
    p_zero = 1 - p_one

    # a pure set has zero entropy; bail out early so log2(0) is never evaluated
    if p_one == 0 or p_zero == 0:
        return 0.0

    return -1 * p_zero * np.log2(p_zero) - p_one * np.log2(p_one)

def compute_information_gain(old_labels: np.ndarray, new_labels_left: np.ndarray, new_labels_right: np.ndarray) -> float:
    """Computes the information gained from splitting up the labels in old_labels to the two new sets.

    The gain is the entropy of old_labels minus the size-weighted sum of the
    entropies of the two partitions, each weighted by its share of old_labels.

    Args:
        old_labels: labels before the split.
        new_labels_left: labels assigned to the left partition.
        new_labels_right: labels assigned to the right partition.

    Returns:
        The reduction in entropy achieved by the split, or 0.0 when
        old_labels is empty.
    """

    total = old_labels.shape[0]

    # splitting an empty set gains nothing; returning here also avoids the
    # division by zero in the weights below
    if total == 0:
        return 0.0

    # compute old entropy
    old_entropy = compute_entropy(old_labels)

    # compute new entropy as the weighted sum of the partition entropies
    left_weight = new_labels_left.shape[0] / total
    right_weight = new_labels_right.shape[0] / total
    new_entropy = (left_weight * compute_entropy(new_labels_left)) + (right_weight * compute_entropy(new_labels_right))

    return old_entropy - new_entropy

def split_on_attribute(X: np.ndarray, y: np.ndarray, split_attribute_index: int, split_value: float) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Splits the data in X and y according to the chosen split attribute index (a column in X) and
        the defined split value.

    Rows whose value in the split column is less than or equal to split_value
    go to the left partition; all other rows go to the right partition. The
    original row order is kept inside each partition.

    Args:
        X: feature matrix with one row per sample.
        y: labels, one per row of X.
        split_attribute_index: column of X to compare against split_value.
        split_value: threshold for the comparison.

    Returns:
        (X_left, X_right, y_left, y_right). An empty partition keeps the
        column count and dtype of X instead of collapsing to a 1-D float array.
    """

    X = np.asarray(X)
    y = np.asarray(y)

    if X.shape[0] == 0:
        # no rows at all: an empty mask keeps both partitions empty without
        # indexing a column that may not exist
        goes_left = np.zeros(0, dtype=bool)
    else:
        # one vectorised comparison instead of a Python-level loop over rows
        goes_left = X[:, split_attribute_index] <= split_value

    goes_right = ~goes_left

    return X[goes_left], X[goes_right], y[goes_left], y[goes_right]

def find_best_split(X: np.ndarray, y: np.ndarray) -> tuple[int, float, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Finds the best way to split up the current dataset for the highest information gain.

    A random subset of int(sqrt(d)) attributes (d = number of columns of X) is
    drawn without replacement. Each candidate attribute is split at the mean of
    its own column, and the candidate with the highest information gain wins.

    Args:
        X: feature matrix with one row per sample.
        y: labels, one per row of X.

    Returns:
        (attribute_index, split_value, X_left, X_right, y_left, y_right) for
        the best candidate. If no candidate is accepted, the initial values
        (index 0, split value 0 and four empty arrays) are returned.
    """

    N = X.shape[0]
    d = X.shape[1]

    # -np.inf rather than np.NINF: the NINF alias was removed in NumPy 2.0
    highest_information_gain = -np.inf
    attribute_index = 0
    split_value = 0
    X_left = np.array([])
    X_right = np.array([])
    y_left = np.array([])
    y_right = np.array([])

    # loop through a random choice of attributes
    candidate_attributes = np.random.choice(np.arange(d), int(np.power(d, 0.5)), replace=False).tolist()
    for new_attribute_index in candidate_attributes:

        # first calculate the value to split on, which will be the mean in this example;
        # it must come from the candidate's own column, not from the best attribute so far
        new_split_value = X[:, new_attribute_index].sum() / N

        # build partitions
        X_left_tmp, X_right_tmp, y_left_tmp, y_right_tmp = split_on_attribute(X, y, new_attribute_index, new_split_value)

        # then find the information gain for this split
        new_info_gain = compute_information_gain(y, y_left_tmp, y_right_tmp)

        # if this is higher than any previous gain, save off the result
        if new_info_gain > highest_information_gain:
            highest_information_gain = new_info_gain
            attribute_index = new_attribute_index
            split_value = new_split_value
            X_left = X_left_tmp
            X_right = X_right_tmp
            y_left = y_left_tmp
            y_right = y_right_tmp

    return attribute_index, split_value, X_left, X_right, y_left, y_right

# sanity check: entropy of three 0 labels and six 1 labels
print(
    compute_entropy(
        np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
    )
)

# sanity check: gain from splitting a balanced label set into [0, 0] and [0, 1, 1, 1]
print(
    compute_information_gain(
        np.array([0, 0, 0, 1, 1, 1]),
        np.array([0, 0]),
        np.array([0, 1, 1, 1]),
    )
)

# sanity check: split five samples on column 0 at the threshold 3
print(
    split_on_attribute(
        np.array([[3, 10], [1, 22], [2, 28], [5, 32], [4, 32]]),
        np.array([1, 1, 0, 0, 1]),
        0,
        3,
    )
)

0.9182958340544896
0.4591479170272448
(array([[ 3, 10],
       [ 1, 22],
       [ 2, 28]]), array([[ 5, 32],
       [ 4, 32]]), array([1, 1, 0]), array([0, 1]))
