In [1]:
import pandas as pd 
from pandas import DataFrame  
df_tennis = pd.read_csv("weather.csv") 

In [3]:
df_tennis

Unnamed: 0,id,outlook,temperature,humidity,wind,play
0,1,sunny,hot,high,weak,no
1,2,sunny,hot,high,strong,no
2,3,overcast,hot,high,weak,yes
3,4,rainy,mild,high,weak,yes
4,5,rainy,cool,normal,weak,yes
5,6,rainy,cool,normal,strong,no
6,7,overcast,cool,normal,strong,yes
7,8,sunny,mild,high,weak,no
8,9,sunny,cool,normal,weak,yes
9,10,rainy,mild,normal,weak,yes


In [4]:
def entropy(probs): 
    import math
    
    return sum([-prob*math.log(prob, 2) for prob in probs])


def entropy_of_list(a_list): 
    from collections import Counter     
    
    cnt = Counter(x for x in a_list)
    print("No and Yes Classes:",a_list.name,cnt)     
    
    num_instances = len(a_list)*1.0     
    probs = [x / num_instances for x in cnt.values()]     
    
    return entropy(probs) # Call Entropy: 

In [5]:
total_entropy = entropy_of_list(df_tennis['play']) 
print("Entropy of given PlayTennis Data Set:",total_entropy)

No and Yes Classes: play Counter({'yes': 9, 'no': 5})
Entropy of given PlayTennis Data Set: 0.9402859586706309


In [6]:
def information_gain(df, split_attribute_name, target_attribute_name, trace=0): 
    df_split = df.groupby(split_attribute_name) 
    
    for name,group in df_split:         
        print(name)         
        print(group)    
        nobs = len(df.index) * 1.0
        df_agg_ent = df_split.agg({target_attribute_name : [entropy_of_list, lambda x: len(x)/nobs] })[target_attribute_name] 
        df_agg_ent.columns = ['Entropy', 'PropObservations'] 
        new_entropy = sum(df_agg_ent['Entropy'] * df_agg_ent['PropObservations'] )
        old_entropy = entropy_of_list(df[target_attribute_name])  
    
    return old_entropy - new_entropy

In [7]:
print('Info-gain for Outlook is :'+str( information_gain(df_tennis, 'outlook', 'play')),"\n") 
print('\n Info-gain for Humidity is: ' + str( information_gain(df_tennis, 'humidity', 'play')),"\n") 
print('\n Info-gain for Wind is:' + str( information_gain(df_tennis, 'wind', 'play')),"\n") 
print('\n Info-gain for Temperature is:' + str( information_gain(df_tennis , 'temperature','play')),"\n")

overcast
    id   outlook temperature humidity    wind play
2    3  overcast         hot     high    weak  yes
6    7  overcast        cool   normal  strong  yes
11  12  overcast        mild     high  strong  yes
12  13  overcast         hot   normal    weak  yes
No and Yes Classes: play Counter()
No and Yes Classes: play Counter({'yes': 4})
No and Yes Classes: play Counter({'yes': 3, 'no': 2})
No and Yes Classes: play Counter({'no': 3, 'yes': 2})
No and Yes Classes: play Counter({'yes': 9, 'no': 5})
rainy
    id outlook temperature humidity    wind play
3    4   rainy        mild     high    weak  yes
4    5   rainy        cool   normal    weak  yes
5    6   rainy        cool   normal  strong   no
9   10   rainy        mild   normal    weak  yes
13  14   rainy        mild     high  strong   no
No and Yes Classes: play Counter()
No and Yes Classes: play Counter({'yes': 4})
No and Yes Classes: play Counter({'yes': 3, 'no': 2})
No and Yes Classes: play Counter({'no': 3, 'yes': 2})
No and

In [8]:
def id3(df, target_attribute_name, attribute_names, default_class=None):
    from collections import Counter     
    cnt = Counter(x for x in df[target_attribute_name])
    
    if len(cnt) == 1:         
        return next(iter(cnt)) 
    elif df.empty or (not attribute_names): 
             return default_class 
    else:
        gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names] 
        index_of_max = gainz.index(max(gainz)) 
        best_attr = attribute_names[index_of_max] 
        tree = {best_attr:{}}
        remaining_attribute_names = [i for i in attribute_names if i != best_attr]
        for attr_val, data_subset in df.groupby(best_attr): 
            subtree = id3(data_subset,
                          target_attribute_name,                         
                          remaining_attribute_names,                         
                          default_class)
            tree[best_attr][attr_val] = subtree 
        return tree

In [9]:
attribute_names = list(df_tennis.columns)
print("List of Attributes:", attribute_names) 
attribute_names.remove('id')
attribute_names.remove('play') 
print("Predicting Attributes:", attribute_names)


List of Attributes: ['id', 'outlook', 'temperature', 'humidity', 'wind', 'play']
Predicting Attributes: ['outlook', 'temperature', 'humidity', 'wind']


In [10]:
from pprint import pprint 
tree = id3(df_tennis,'play',attribute_names)
print("\n\nThe Resultant Decision Tree is :\n") 
pprint(tree) 

overcast
    id   outlook temperature humidity    wind play
2    3  overcast         hot     high    weak  yes
6    7  overcast        cool   normal  strong  yes
11  12  overcast        mild     high  strong  yes
12  13  overcast         hot   normal    weak  yes
No and Yes Classes: play Counter()
No and Yes Classes: play Counter({'yes': 4})
No and Yes Classes: play Counter({'yes': 3, 'no': 2})
No and Yes Classes: play Counter({'no': 3, 'yes': 2})
No and Yes Classes: play Counter({'yes': 9, 'no': 5})
rainy
    id outlook temperature humidity    wind play
3    4   rainy        mild     high    weak  yes
4    5   rainy        cool   normal    weak  yes
5    6   rainy        cool   normal  strong   no
9   10   rainy        mild   normal    weak  yes
13  14   rainy        mild     high  strong   no
No and Yes Classes: play Counter()
No and Yes Classes: play Counter({'yes': 4})
No and Yes Classes: play Counter({'yes': 3, 'no': 2})
No and Yes Classes: play Counter({'no': 3, 'yes': 2})
No and