In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the PlayTennis table from a CSV file in the current working directory.
# NOTE(review): the rendered output of this cell shows an 'Unnamed: 0' header. If that is a
# real column (a saved row index) rather than the DataFrame index, it is handed to
# find_most_informative_feature as a candidate feature, because that function considers
# every non-label column — confirm, and if so consider loading with index_col=0.
ds = pd.read_csv('./PlayTennis.csv')
ds  # bare last expression so the notebook renders the frame

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [3]:
def calc_total_entropy(dataset, label, class_list):
    """Return the base-2 Shannon entropy of the ``label`` column of ``dataset``.

    Parameters:
        dataset: pandas DataFrame containing the ``label`` column.
        label: name of the target column.
        class_list: iterable of class values to account for. Classes that do
            not occur in ``dataset`` contribute nothing to the entropy.

    Returns:
        The entropy in bits; 0.0 for an empty dataset.
    """
    total_row = dataset.shape[0]
    if total_row == 0:
        # No rows means no uncertainty to measure; also avoids dividing by zero.
        return 0.0

    total_entr = 0.0
    for c in class_list:
        total_class_count = dataset[dataset[label] == c].shape[0]
        if total_class_count == 0:
            # lim p->0 of p*log2(p) is 0, but evaluating it literally yields
            # 0 * -inf = NaN, which would poison every information gain.
            continue
        class_probability = total_class_count / total_row
        total_entr += -class_probability * np.log2(class_probability)

    return total_entr

# calc_total_entropy(ds, 'Play Tennis', ['Yes', 'No'])

In [4]:
def calc_entropy(feature_value_data, label, class_list):
    """Return the base-2 entropy of ``label`` within one feature-value subset.

    Parameters:
        feature_value_data: DataFrame holding only the rows that share a single
            value of some feature.
        label: name of the target column.
        class_list: iterable of class values to account for.

    Returns:
        Sum of ``-p * log2(p)`` over the classes that occur in the subset;
        classes with no rows are skipped.
    """
    n_rows = feature_value_data.shape[0]
    label_column = feature_value_data[label]
    accumulated = 0

    for cls in class_list:
        n_matching = int((label_column == cls).sum())
        if n_matching == 0:
            # An absent class adds nothing to the entropy.
            continue
        p = n_matching / n_rows
        accumulated += -p * np.log2(p)

    return accumulated

# calc_entropy(ds[ds['Temperature'] == 'Hot'], 'Play Tennis', ['Yes', 'No'])

In [5]:
def calc_info_gain(feature_name, dataset, label, class_list):
    """Return the information gain obtained by splitting ``dataset`` on a feature.

    The gain is the entropy of the whole dataset minus the weighted average
    entropy of the subsets produced by each distinct value of ``feature_name``.

    Parameters:
        feature_name: column to split on.
        dataset: DataFrame containing the feature and label columns.
        label: name of the target column.
        class_list: iterable of class values.
    """
    n_rows = dataset.shape[0]
    column = dataset[feature_name]

    def weighted_entropy(value):
        # Entropy of one branch, scaled by the share of rows that fall into it.
        subset = dataset[column == value]
        weight = subset.shape[0] / n_rows
        return weight * calc_entropy(subset, label, class_list)

    conditional_entropy = sum((weighted_entropy(v) for v in column.unique()), 0.0)
    return calc_total_entropy(dataset, label, class_list) - conditional_entropy

# calc_info_gain('Temperature', ds, 'Play Tennis', ['Yes', 'No'])

In [6]:
def find_most_informative_feature(dataset, label, class_list):
    """Return the name of the non-label column with the highest information gain.

    Every column except ``label`` is treated as a candidate. On ties the
    earliest column wins, and ``None`` is returned when no candidate has a
    gain greater than -1 (for example when there are no feature columns).
    """
    best_feature, best_gain = None, -1

    for candidate in dataset.columns.drop(label):
        gain = calc_info_gain(candidate, dataset, label, class_list)
        if gain > best_gain:
            best_feature, best_gain = candidate, gain

    return best_feature

# find_most_informative_feature(ds, 'Play Tennis', {'Yes', 'No'})

In [7]:
def generate_sub_tree(feature_name, dataset, label, class_list):
    """Build one level of the tree for ``feature_name``.

    Each distinct value of the feature becomes a branch. A branch whose rows
    all carry the same class becomes a leaf holding that class, and its rows
    are removed from the returned dataset. Any other branch is marked with the
    placeholder "?" so the caller can expand it later.

    Returns:
        (branches, remaining): the ``{feature_value: class_or_"?"}`` mapping and
        the dataset restricted to rows of the still-unresolved branches.
    """
    branches = {}
    rows_per_value = dataset[feature_name].value_counts(sort=False).to_dict()

    for value, n_rows in rows_per_value.items():
        subset = dataset[dataset[feature_name] == value]
        pure_classes = [
            cls for cls in class_list
            if subset[subset[label] == cls].shape[0] == n_rows
        ]
        if pure_classes:
            # If several classes qualify, the last one in class_list is kept.
            branches[value] = pure_classes[-1]
            dataset = dataset[dataset[feature_name] != value]
        else:
            branches[value] = "?"

    return branches, dataset


In [8]:
def make_tree(root, prev_feature_value, dataset, label, class_list):
    """Recursively grow the decision tree in place inside ``root``.

    Parameters:
        root: dict to attach the new sub-tree to (mutated in place).
        prev_feature_value: branch key in ``root`` that this sub-tree hangs
            from, or ``None`` when building the top of the tree.
        dataset: DataFrame with the rows that reach this node.
        label: name of the target column.
        class_list: iterable of class values.

    Raises:
        ValueError: at the top of the tree, when no feature can be selected.

    An empty ``dataset`` leaves ``root`` untouched. A node that cannot be split
    any further becomes a leaf holding the most frequent class of its rows.
    """
    total_rows = dataset.shape[0]
    if total_rows == 0:
        return

    max_info_feature = find_most_informative_feature(dataset, label, class_list)
    if max_info_feature is None:
        # No feature could be selected; previously this crashed with
        # KeyError(None) inside generate_sub_tree.
        if prev_feature_value is None:
            raise ValueError("dataset has no usable feature columns to split on")
        root[prev_feature_value] = dataset[label].mode().iloc[0]
        return

    tree, dataset = generate_sub_tree(max_info_feature, dataset, label, class_list)

    if prev_feature_value is not None:
        # Replace the "?" placeholder of the parent branch with the new sub-tree.
        root[prev_feature_value] = {max_info_feature: tree}
    else:
        # Add to the root of the tree.
        root[max_info_feature] = tree
    next_root = tree

    # Iterate over a snapshot because branches are rewritten during the loop.
    for node, branch in list(next_root.items()):
        if branch == "?":
            feature_value_data = dataset[dataset[max_info_feature] == node]
            if feature_value_data.shape[0] == total_rows:
                # The split kept every row in one branch, so recursing would
                # receive identical data and never terminate (e.g. identical
                # feature rows with conflicting labels). Settle on the
                # majority class instead.
                next_root[node] = feature_value_data[label].mode().iloc[0]
            else:
                make_tree(next_root, node, feature_value_data, label, class_list)

In [9]:
def id3(dataset_m, label):
    """Build and return a decision tree (nested dicts) for ``dataset_m``.

    The input frame is copied before use, and the class values are taken from
    the distinct entries of the ``label`` column.
    """
    working = dataset_m.copy()
    decision_tree = dict()
    make_tree(decision_tree, None, working, label, working[label].unique())
    return decision_tree

In [10]:
# Build the decision tree from the full table, with 'Play Tennis' as the target column.
tree = id3(ds, 'Play Tennis')

In [11]:
def predict(tree, instance):
    """Return the class the tree assigns to ``instance``.

    Parameters:
        tree: nested dict of the form ``{feature: {value: subtree_or_class}}``,
            or a bare class value (leaf).
        instance: mapping-like object (dict or pandas Series) giving the value
            of each feature.

    Returns:
        The leaf reached by following the instance's feature values, or
        ``None`` when the tree is empty or the instance has a feature value
        with no branch in the tree.
    """
    node = tree
    while isinstance(node, dict):
        if not node:
            # An empty tree (e.g. built from an empty dataset) cannot decide
            # anything; previously next(iter({})) raised StopIteration here.
            return None
        feature = next(iter(node))  # each internal node has a single feature key
        branches = node[feature]
        feature_value = instance[feature]
        if feature_value not in branches:
            return None
        node = branches[feature_value]
    return node

In [12]:
def evaluate(tree, dataset, label):
    """Return the fraction of rows in ``dataset`` that the tree classifies correctly.

    Parameters:
        tree: decision tree as accepted by ``predict``.
        dataset: DataFrame with the feature columns and the ``label`` column.
            Any index is accepted; rows are visited in order.
        label: name of the column holding the true class.

    Returns:
        Accuracy in [0, 1]; 0.0 for an empty dataset.
    """
    total = len(dataset)
    if total == 0:
        # Avoid dividing by zero when there is nothing to score.
        return 0.0

    correct_predict = 0
    # Use the row yielded by iterrows directly: its index is a label, not a
    # position, so feeding it to .iloc breaks on any non-default index.
    for _, row in dataset.iterrows():
        if predict(tree, row) == row[label]:
            correct_predict += 1

    return correct_predict / total

In [13]:
# Accuracy is measured on ds, the same rows the tree was built from, so this is a training-set score.
evaluate(tree, ds, 'Play Tennis')

1.0