In [1]:
import pandas as pd
# Load the training examples into a DataFrame. The path is relative, so it is
# resolved against the current working directory of the kernel.
df_tennis=pd.read_csv('PlayTennis.csv')

In [2]:
def entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Parameters
    ----------
    probs : iterable of float
        Probability of each outcome. The values are not checked to sum to 1.

    Returns
    -------
    float
        ``sum(-p * log2(p))`` over the non-zero entries of ``probs``
        (the int ``0`` when ``probs`` is empty).

    Raises
    ------
    ValueError
        If any probability is negative (``math.log`` rejects it).
    """
    import math
    # An outcome with probability 0 contributes nothing (p*log(p) -> 0 as p -> 0),
    # but math.log(0, 2) raises ValueError, so zero terms are skipped explicitly.
    return sum(-prob * math.log(prob, 2) for prob in probs if prob != 0)

In [3]:
def entropy_of_list(a_list):
    """Compute the entropy of the value distribution found in ``a_list``.

    Each distinct value is counted, the counts are printed, the counts are
    turned into relative frequencies, and those frequencies are passed to
    ``entropy``.

    Parameters
    ----------
    a_list : iterable supporting ``len()``
        The values (class labels) to measure.

    Returns
    -------
    Whatever ``entropy`` returns for the relative frequencies.
    """
    from collections import Counter
    label_counts = Counter(iter(a_list))
    print("No and Yes Classes:",label_counts)
    total = len(a_list)
    fractions = []
    for occurrences in label_counts.values():
        fractions.append(occurrences / total)
    return entropy(fractions)

In [4]:
# Entropy of the target column over the whole data set, i.e. before any split.
total_entropy = entropy_of_list(df_tennis['PlayTennis']) 
print("Entropy of given PlayTennis Data Set:",total_entropy)

No and Yes Classes: Counter({'Yes': 9, 'No': 5})
Entropy of given PlayTennis Data Set: 0.9402859586706309


In [5]:
def information_gain(df, split_attribute_name, target_attribute_name):
    """Return how much splitting ``df`` on one attribute lowers target entropy.

    The frame is grouped by ``split_attribute_name``; every group is printed.
    For each group the entropy of the target column and the group's share of
    all rows are computed, and the share-weighted sum of group entropies is
    subtracted from the entropy of the target column over the whole frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to evaluate.
    split_attribute_name : str
        Column to group by.
    target_attribute_name : str
        Column whose entropy is measured.

    Returns
    -------
    Entropy of the target over ``df`` minus the weighted entropy after the split.
    """
    print("Information Gain Calculation of ",split_attribute_name)
    partitions = df.groupby(split_attribute_name)
    for value, rows in partitions:
        print("Name:",value)
        print("Group:",rows)

    total_rows = len(df.index)
    # Two aggregations on the target column per group: its entropy, and the
    # fraction of all rows that fall into the group.
    aggregations = {
        target_attribute_name: [entropy_of_list, lambda x: len(x)/total_rows]
    }
    per_value = partitions.agg(aggregations)[target_attribute_name]
    per_value.columns = ['Entropy', 'PropObservations']

    # The split entropy is evaluated before the parent entropy so that the
    # counts printed by entropy_of_list appear in the same order as before.
    weighted_terms = per_value['Entropy'] * per_value['PropObservations']
    entropy_after = sum(weighted_terms)
    entropy_before = entropy_of_list(df[target_attribute_name])
    return entropy_before - entropy_after

In [10]:
def id3(df, target_attribute_name, attribute_names, default_class=None):
    """Build an ID3 decision tree for ``df`` as nested dictionaries.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows for this node.
    target_attribute_name : str
        Column holding the class label.
    attribute_names : list of str
        Columns still available for splitting.
    default_class : optional
        Label returned when ``df`` has no rows. Recursive calls pass the
        majority class of the parent node.

    Returns
    -------
    A class label for a leaf, or ``{attribute: {value: subtree, ...}}`` for an
    internal node. Only attribute values that occur in ``df`` get a branch.
    """
    from collections import Counter
    class_counts = Counter(label for label in df[target_attribute_name])

    # No rows at all: nothing to learn from, fall back to the caller's default.
    if df.empty:
        return default_class

    # Every row has the same label: pure leaf.
    if len(class_counts) == 1:
        return next(iter(class_counts))

    # Most frequent label at this node (ties go to the label seen first).
    # max(class_counts.keys()) would pick the alphabetically largest label,
    # which is unrelated to how often it occurs.
    majority_class = class_counts.most_common(1)[0][0]

    # Mixed labels but nothing left to split on: predict this node's own
    # majority class rather than the parent's default (None at the top level).
    if not attribute_names:
        return majority_class

    gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
    print("Gain of all attributes",gainz)
    # list.index returns the first maximum, so ties keep the earliest attribute.
    index_of_max = gainz.index(max(gainz))
    best_attr = attribute_names[index_of_max]
    print("Best",best_attr)
    tree = {best_attr:{}}
    print("Tree:",tree)
    remaining_attribute_names = [i for i in attribute_names if i != best_attr]
    print("Remaining Attributes",remaining_attribute_names)
    for attr_val, data_subset in df.groupby(best_attr):
        subtree = id3(data_subset,
                      target_attribute_name,
                      remaining_attribute_names,
                      majority_class)
        tree[best_attr][attr_val] = subtree
    return tree

In [11]:
# Start from every column name, then drop the target column in place so that
# only the predictor columns remain. The list is printed before and after.
attribute_names = list(df_tennis.columns) 
print("List of Attributes:", attribute_names) 
attribute_names.remove('PlayTennis')  
print("Predicting Attributes:", attribute_names)

List of Attributes: ['PlayTennis', 'Outlook', 'Temperature', 'Humidity', 'Wind']
Predicting Attributes: ['Outlook', 'Temperature', 'Humidity', 'Wind']


In [12]:
# Build the decision tree on the full data set and pretty-print the nested
# dictionary that id3 returns. id3 and information_gain print their own
# progress output before the final tree is shown.
from pprint import pprint 
tree = id3(df_tennis,'PlayTennis',attribute_names) 
print("\n\nThe Resultant Decision Tree is :\n") 
pprint(tree)

Information Gain Calculation of  Outlook
Name: Overcast
Group:    PlayTennis   Outlook Temperature Humidity    Wind
2         Yes  Overcast         Hot     High    Weak
6         Yes  Overcast        Cool   Normal  Strong
11        Yes  Overcast        Mild     High  Strong
12        Yes  Overcast         Hot   Normal    Weak
Name: Rain
Group:    PlayTennis Outlook Temperature Humidity    Wind
3         Yes    Rain        Mild     High    Weak
4         Yes    Rain        Cool   Normal    Weak
5          No    Rain        Cool   Normal  Strong
9         Yes    Rain        Mild   Normal    Weak
13         No    Rain        Mild     High  Strong
Name: Sunny
Group:    PlayTennis Outlook Temperature Humidity    Wind
0          No   Sunny         Hot     High    Weak
1          No   Sunny         Hot     High  Strong
7          No   Sunny        Mild     High    Weak
8         Yes   Sunny        Cool   Normal    Weak
10        Yes   Sunny        Mild   Normal  Strong
No and Yes Classes: Cou