In [3]:
import numpy as np
import pandas as pd


def create_data():
    data_value = np.array(
        [
            ["long", "thick", 175, "no", "man"],
            ["short", "medium", 168, "no", "man"],
            ["short", "thin", 178, "yes", "man"],
            ["short", "thick", 172, "no", "man"],
            ["long", "medium", 163, "no", "man"],
            ["short", "thick", 180, "no", "man"],
            ["long", "thick", 173, "yes", "man"],
            ["short", "thin", 174, "no", "man"],
            ["long", "thin", 164, "yes", "woman"],
            ["long", "medium", 158, "yes", "woman"],
            ["long", "thick", 161, "yes", "woman"],
            ["short", "thin", 166, "yes", "woman"],
            ["long", "thin", 158, "no", "woman"],
            ["short", "medium", 163, "no", "woman"],
            ["long", "thick", 161, "yes", "woman"],
            ["long", "thin", 164, "no", "woman"],
            ["short", "medium", 172, "yes", "woman"],
        ]
    )
    columns = np.array(["hair", "voice", "height", "ear_stud", "labels"])
    data = pd.DataFrame(data_value.reshape(17, 5), columns=columns)
    return data

In [4]:
data = create_data()
data

Unnamed: 0,hair,voice,height,ear_stud,labels
0,long,thick,175,no,man
1,short,medium,168,no,man
2,short,thin,178,yes,man
3,short,thick,172,no,man
4,long,medium,163,no,man
5,short,thick,180,no,man
6,long,thick,173,yes,man
7,short,thin,174,no,man
8,long,thin,164,yes,woman
9,long,medium,158,yes,woman


In [5]:
import math


def get_Ent(data):

    num_sample = len(data)  
    label_counts = {}  
    for i in range(num_sample):
        each_data = data.iloc[i, :]
        current_label = each_data["labels"] 

        if current_label not in label_counts.keys():
            label_counts[current_label] = 0
        label_counts[current_label] += 1

    Ent = 0.0  
    for key in label_counts:
        prob = float(label_counts[key]) / num_sample
        Ent -= prob * math.log(prob, 2) 
    return Ent

In [6]:
base_ent = get_Ent(data)
base_ent

0.9975025463691153

In [7]:
def get_gain(data, base_ent, feature):

    feature_list = data[feature]   
    unique_value = set(feature_list)  
    feature_ent = 0.0

    for each_feature in unique_value:
        temp_data = data[data[feature] == each_feature]
        weight = len(temp_data) / len(feature_list)   
        temp_ent = weight * get_Ent(temp_data)
        feature_ent = feature_ent + temp_ent

    gain = base_ent - feature_ent  
    return gain

In [8]:
get_gain(data, base_ent, "hair")

0.062200515199107964

The consistent value

In [9]:
def get_splitpoint(data, base_ent, feature):
    #sort the continues value and turn it to float type
    continues_value = data[feature].sort_values().astype(np.float64)
    continues_value = [i for i in continues_value]  # delete the index
    t_set = []
    t_ent = {}


    for i in range(len(continues_value) - 1):
        temp_t = (continues_value[i] + continues_value[i + 1]) / 2
        t_set.append(temp_t)

    for each_t in t_set:
        # greater than the split point
        temp1_data = data[data[feature].astype(np.float64) > each_t]
        # less than the split point
        temp2_data = data[data[feature].astype(np.float64) < each_t]
        weight1 = len(temp1_data) / len(data)
        weight2 = len(temp2_data) / len(data)
        temp_ent = (
            base_ent - weight1 * get_Ent(temp1_data) - weight2 * get_Ent(temp2_data)
        )
        t_ent[each_t] = temp_ent
    print("t_ent:", t_ent)
    final_t = max(t_ent, key=t_ent.get)
    return final_t

In [10]:
final_t = get_splitpoint(data, base_ent, "height")
final_t

t_ent: {158.0: 0.1179805181500242, 159.5: 0.1179805181500242, 161.0: 0.2624392604045631, 162.0: 0.2624392604045631, 163.0: 0.3856047022157598, 163.5: 0.15618502398692893, 164.0: 0.3635040117533678, 165.0: 0.33712865788827096, 167.0: 0.4752766311586692, 170.0: 0.32920899348970845, 172.0: 0.5728389611412551, 172.5: 0.4248356349861979, 173.5: 0.3165383509071513, 174.5: 0.22314940393447813, 176.5: 0.14078143361499595, 179.0: 0.06696192680347068}


172.0

In [11]:
def choice_1(x, t):
    if x > t:
        return ">{}".format(t)
    else:
        return "<{}".format(t)


deal_data = data.copy()

deal_data["height"] = pd.Series(
    map(lambda x: choice_1(int(x), final_t), deal_data["height"])
)
deal_data

Unnamed: 0,hair,voice,height,ear_stud,labels
0,long,thick,>172.0,no,man
1,short,medium,<172.0,no,man
2,short,thin,>172.0,yes,man
3,short,thick,<172.0,no,man
4,long,medium,<172.0,no,man
5,short,thick,>172.0,no,man
6,long,thick,>172.0,yes,man
7,short,thin,>172.0,no,man
8,long,thin,<172.0,yes,woman
9,long,medium,<172.0,yes,woman


In [12]:
def choose_feature(data):

    num_features = len(data.columns) - 1
    base_ent = get_Ent(data)
    best_gain = 0.0  
    best_feature = data.columns[0]
    for i in range(num_features):  
        temp_gain = get_gain(data, base_ent, data.columns[i])  
        if temp_gain > best_gain:  
            best_gain = temp_gain
            best_feature = data.columns[i]
            
    return best_feature

In [13]:
choose_feature(deal_data)

'height'

Construct the tree

In [14]:
def create_tree(data):
    feature_list = data.columns[:-1].tolist()
    label_list = data.iloc[:, -1]
    if len(data["labels"].value_counts()) == 1:
        leaf_node = data["labels"].mode().values
        return leaf_node  
    if len(feature_list) == 1:
        leaf_node = data["labels"].mode().values
        return leaf_node  
    best_feature = choose_feature(data) 
    tree = {best_feature: {}}
    feat_values = data[best_feature]
    unique_value = set(feat_values)
    for value in unique_value:
        temp_data = data[data[best_feature] == value]
        temp_data = temp_data.drop([best_feature], axis=1)
        tree[best_feature][value] = create_tree(temp_data)
    return tree

In [15]:
tree = create_tree(deal_data)
tree

{'height': {'>172.0': array(['man'], dtype=object),
  '<172.0': {'ear_stud': {'no': {'voice': {'medium': array(['man'], dtype=object),
      'thick': array(['man'], dtype=object),
      'thin': array(['woman'], dtype=object)}},
    'yes': array(['woman'], dtype=object)}}}}

In [64]:
def classify(tree, test):

    first_feature = list(tree.keys())[0] 
    feature_dict = tree[first_feature]  
    labels = test.columns.tolist()
    value = test[first_feature][0]
    for key in feature_dict.keys():
        if value == key:
            if type(feature_dict[key]).__name__ == "dict":  
                class_label = classify(feature_dict[key], test)  
            else:
                class_label = feature_dict[key]
    return class_label

In [68]:
test = pd.DataFrame(
    {"hair": ["long"], "voice": ["thick"], "height": [163], "ear_stud": ["yes"]}
)
test

Unnamed: 0,hair,voice,height,ear_stud
0,long,thick,163,yes


In [69]:
test["height"] = pd.Series(map(lambda x: choice_1(int(x), final_t), test["height"]))
test

Unnamed: 0,hair,voice,height,ear_stud
0,long,thick,<172.0,yes


In [71]:
classify(tree, test)

array(['woman'], dtype=object)