In [1]:
import numpy as np
import numpy.typing as npt

In [2]:
# Read the two labeled text files and stack their rows into a single array.
data_113 = np.loadtxt('dataset/20151026_113_labeled.txt')
data_114 = np.loadtxt('dataset/20151026_114_labeled.txt')
data = np.concatenate([data_113, data_114])

# The first six columns are used as features, the seventh column as the label.
features, labels = data[:, :6], data[:, 6]

# Sanity check: report the shape of each array on its own line.
print(features.shape, labels.shape, sep='\n')

(600, 6)
(600,)


In [3]:
def calc_entropy(labels: npt.NDArray[np.float64]) -> float:
    """Compute the base-2 Shannon entropy of an array of labels.

    Args:
        labels: Array of labels; every distinct value is treated as one class.

    Returns:
        The entropy as a Python ``float``. An empty array yields ``0.0``.
    """
    # A single np.unique call yields the count of every class, instead of
    # re-scanning the whole array once per distinct label.
    _, counts = np.unique(labels, return_counts=True)
    total = len(labels)
    # Start from a float so the empty-input result is a float, not the int 0.
    entropy = 0.0
    for count in counts:
        prob = count / total
        entropy -= prob * np.log2(prob)
    return float(entropy)


def calc_info_gain(features: npt.NDArray[np.float64], labels: npt.NDArray[np.float64], split_value: float) -> float:
    """Information gain obtained by splitting ``labels`` at a threshold.

    Args:
        features: Values compared against ``split_value``; the resulting
            boolean masks are used to index ``labels``.
        labels: Labels to partition.
        split_value: Threshold. Entries with ``features < split_value`` go to
            the left partition, entries with ``features >= split_value`` to
            the right partition.

    Returns:
        Entropy of ``labels`` minus the size-weighted entropies of the two
        partitions.
    """
    gain = calc_entropy(labels)
    total = len(labels)

    below = labels[features < split_value]
    at_or_above = labels[features >= split_value]
    entropy_below = calc_entropy(below)
    entropy_at_or_above = calc_entropy(at_or_above)

    # Subtract each partition's entropy, weighted by its share of the labels.
    gain = gain - (len(below) / total) * entropy_below
    gain = gain - (len(at_or_above) / total) * entropy_at_or_above
    return gain


def choose_best_feature(features: npt.NDArray[np.float64], labels: npt.NDArray[np.float64]) -> tuple[int, float]:
    """Find the feature column and threshold with the highest information gain.

    Every distinct value of every column is tried as a split threshold.

    Args:
        features: 2-D array; ``features[:, i]`` is the i-th feature column.
        labels: Labels indexed by masks built from a feature column, so it is
            expected to hold one entry per row of ``features``.

    Returns:
        ``(feature_index, split_value)`` of the best split found, or
        ``(-1, -1.0)`` when no candidate has a gain strictly greater than zero.
    """
    best_feature_index = -1
    best_split_value = -1.0
    best_info_gain = 0
    for feature_index in range(features.shape[1]):
        feature = features[:, feature_index]
        # A repeated value yields exactly the same gain as its first occurrence
        # and can never win under the strict ">" below, so each distinct value
        # is evaluated once. First-occurrence order is kept so that ties are
        # resolved exactly as a scan over every row would resolve them.
        _, first_seen = np.unique(feature, return_index=True)
        for value in feature[np.sort(first_seen)]:
            info_gain = calc_info_gain(feature, labels, value)
            if info_gain > best_info_gain:
                best_feature_index = feature_index
                best_split_value = value
                best_info_gain = info_gain
    return best_feature_index, best_split_value