In [1]:
import numpy as np

In [2]:
"""
Create data for decision tree
"""
def create_data(discrete=False):
    """Build the 14-sample weather data set used to grow the decision tree.

    Args:
        discrete: when True, the temperature and humidity columns are replaced
            by bin codes (temperature: <70 -> 1, 70..79 -> 2, >=80 -> 3;
            humidity: <76 -> 1, >=76 -> 2). Outlook and windy are left as is.

    Returns:
        A tuple ``(data, label, feature_name)``: a float32 array of shape
        (14, 4), a float32 array of shape (14, 1) and the array of the four
        column names, in column order.
    """
    feature_name = np.array(['outlook', 'temperature', 'humidity', 'windy'])

    # One row per sample; columns follow the order of `feature_name`.
    data = np.array([
        [1., 85., 85., 0.],
        [1., 80., 90., 1.],
        [2., 83., 78., 0.],
        [3., 70., 96., 0.],
        [3., 68., 80., 0.],
        [3., 65., 70., 1.],
        [2., 64., 65., 1.],
        [1., 72., 95., 0.],
        [1., 69., 70., 0.],
        [3., 75., 80., 0.],
        [1., 75., 70., 1.],
        [2., 72., 90., 1.],
        [2., 81., 75., 0.],
        [3., 71., 80., 1.],
    ])

    # Class of each sample, stored as a column vector aligned with `data`.
    label = np.array(
        [0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.]
    ).reshape(-1, 1)

    if discrete:
        # Build every mask from untouched copies so that a freshly written bin
        # code can never be re-binned by a later threshold.
        temperature = data[:, 1].copy()
        humidity = data[:, 2].copy()

        low = temperature < 70
        middle = np.logical_and(70. <= temperature, temperature <= 79.)
        high = temperature >= 80.
        data[low, 1] = 1.
        data[middle, 1] = 2.
        data[high, 1] = 3.

        data[humidity < 76, 2] = 1.
        data[humidity >= 76, 2] = 2.

    return data.astype(np.float32), label.astype(np.float32), feature_name

In [3]:
"""
Compute the entropy
"""
def get_entropy(label):
    """Return the Shannon entropy, in bits, of the values found in `label`.

    The input is squeezed before counting, so an (n, 1) column vector and a
    flat sequence holding the same values give the same result.
    """
    _, occurrences = np.unique(np.squeeze(label), return_counts=True)
    # Relative frequency of every distinct value.
    probabilities = occurrences / occurrences.sum()
    return -(probabilities * np.log2(probabilities)).sum()

In [4]:
"""
Select the best feature for splitting
Return: the index of the best feature
"""
def get_best_feature(data, label):
    """Pick the feature column with the highest information gain ratio.

    For every column the information gain (entropy of `label` minus the
    weighted entropy of `label` within each distinct value of the column) is
    divided by the split information (entropy of the column's own values).

    Args:
        data: 2-D array, one row per sample and one column per feature.
        label: array of class labels whose first axis is aligned with the
            rows of `data`.

    Returns:
        The column index with the largest gain ratio (first one on ties).

    Raises:
        ValueError: if `data` has no feature columns (raised by np.argmax).
    """
    num_data = data.shape[0]
    num_feature = data.shape[1]
    base_entropy = get_entropy(label)
    information_gains_ratio = np.zeros(num_feature)
    for i in range(num_feature):
        column = data[:, i]

        # Conditional entropy of the label given this feature.
        feature_entropy = 0.
        for value in np.unique(column):
            temp_label = label[column == value]
            p_feature = temp_label.shape[0] / num_data
            feature_entropy += p_feature * get_entropy(temp_label)

        # A column holding a single value has zero split information. Dividing
        # by it yields NaN, and np.argmax would then select that useless column
        # because NaN compares as the maximum. Leave its ratio at 0 instead.
        split_information = get_entropy(column)
        if split_information > 0:
            information_gains_ratio[i] = (base_entropy - feature_entropy) / split_information
    return np.argmax(information_gains_ratio)

In [5]:
"""
Create decision tree
"""
def create_tree(tree, data, label, feature_name, pre_feature='Root', pre_value='Root'):
    """Recursively grow a decision tree, depth first.

    Args:
        tree: list that receives the branches of this node; it is mutated in
            place and also returned.
        data: 2-D array, one row per sample and one column per feature.
        label: array of class labels aligned with the rows of `data`.
        feature_name: 1-D array with the name of each column of `data`.
        pre_feature: name of the feature the parent split on. Passed along on
            recursion but not otherwise used here.
        pre_value: string form of the parent's split value. Passed along on
            recursion but not otherwise used here.

    Returns:
        A leaf value (a single label) when the node is pure or when no feature
        is left to split on; otherwise `tree`, holding one
        ``((feature_name, value), subtree)`` entry per distinct value of the
        chosen feature.
    """
    # Flatten with ravel rather than squeeze: squeezing a one-sample (1, 1)
    # label gives a 0-d array, which cannot be indexed with [0].
    flat_label = np.ravel(label)
    distinct_label, label_count = np.unique(flat_label, return_counts=True)

    # Pure node: every sample carries the same label.
    if distinct_label.shape[0] == 1:
        return flat_label[0]

    # Mixed labels but nothing left to split on: fall back to the majority
    # label (the smallest label wins a tie, since np.unique sorts its output).
    if data.shape[1] == 0:
        return distinct_label[np.argmax(label_count)]

    # Get the best feature for splitting.
    index_best_feature = get_best_feature(data, label)
    split_feature_name = feature_name[index_best_feature]
    best_column = data[:, index_best_feature]
    # The chosen feature is consumed by this split; children never see it.
    new_feature_name = np.delete(feature_name, index_best_feature)

    # Create the tree layer depth first, one branch per distinct value.
    for value in np.unique(best_column):
        mask = best_column == value
        new_data = np.delete(data[mask], index_best_feature, axis=1)
        new_label = label[mask]
        subtree = create_tree(list(), new_data, new_label, new_feature_name,
                              pre_feature=split_feature_name, pre_value=str(value))
        tree.append(((split_feature_name, value), subtree))
    return tree

In [6]:
# Build the discretised weather data set (temperature and humidity as bin codes).
data, label, feature_name = create_data(discrete=True)
# Start from an empty list; create_tree appends one ((feature, value), subtree)
# entry per branch to it and returns it.
tree = create_tree(list(), data, label, feature_name)
# Bare expression so the notebook shows the resulting tree as the cell output.
tree

[(('outlook', 1.0), [(('humidity', 1.0), 1.0), (('humidity', 2.0), 0.0)]),
 (('outlook', 2.0), 1.0),
 (('outlook', 3.0), [(('windy', 0.0), 1.0), (('windy', 1.0), 0.0)])]