In [3]:

import pandas as pd
from pandas import DataFrame
# Load the training examples from 'tennis.csv' (path is relative to the
# notebook's working directory) into the frame every later cell reads.
df_tennis = pd.read_csv('tennis.csv')
# Print the whole frame as plain text so the raw examples can be inspected.
print(df_tennis)


     outlook temperature humidity    wind playtennis
0      sunny         hot     high    weak         no
1      sunny         hot     high  strong         no
2   overcast         hot     high    weak        yes
3       rain        mild     high    weak        yes
4       rain        cool   normal    weak        yes
5       rain        cool   normal  strong         no
6   overcast        cool   normal  strong        yes
7      sunny        mild     high    weak         no
8      sunny        cool   normal    weak        yes
9       rain        mild   normal    weak        yes
10     sunny        mild   normal  strong        yes
11  overcast        mild     high  strong        yes
12  overcast         hot   normal    weak        yes
13      rain        mild     high  strong         no


In [6]:
# Candidate split attributes: every column of the data set except the
# 'playtennis' target. list.remove raises ValueError if that column is absent.
attribute_names = list(df_tennis.columns)
attribute_names.remove('playtennis')

print(attribute_names)

['outlook', 'temperature', 'humidity', 'wind']


In [7]:
def entropy_of_list(lst):
    """Return the entropy of the empirical distribution of the items in ``lst``.

    Each distinct item is counted, the counts are turned into relative
    frequencies (count / len(lst)), and those frequencies are handed to
    ``entropy``.

    Parameters
    ----------
    lst : sized iterable of hashable items
        The observations (for example one column of class labels).

    Returns
    -------
    Whatever ``entropy`` returns for the list of relative frequencies.
    """
    from collections import Counter

    # Tally occurrences by walking the items one at a time.
    label_counts = Counter()
    for label in lst:
        label_counts[label] += 1

    # Float denominator so the division below is always true division.
    total = float(len(lst))
    frequencies = [occurrences / total for occurrences in label_counts.values()]
    return entropy(frequencies)

In [8]:
def entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Computes ``sum(-p * log2(p))`` over the given probabilities.

    Parameters
    ----------
    probs : iterable of float
        Probabilities of the individual outcomes.

    Returns
    -------
    float
        The entropy in bits (the int ``0`` when ``probs`` is empty).

    Raises
    ------
    ValueError
        If a probability is negative (logarithm of a negative number).
    """
    import math

    # A zero probability contributes nothing (p * log p -> 0 as p -> 0), so it
    # is skipped instead of being passed to the logarithm, which would raise.
    # Negative values are deliberately NOT filtered: they are invalid input and
    # still raise ValueError from math.log2.
    return sum(-prob * math.log2(prob) for prob in probs if prob != 0)

In [9]:
# Entropy (in bits) of the 'playtennis' column over the whole data set.
# Not referenced again in the cells visible here.
total_entropy = entropy_of_list(df_tennis['playtennis'])


In [13]:
def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    """Compute the information gain of splitting ``df`` on one attribute.

    The gain is the entropy of the target column before the split minus the
    weighted average of the target entropies inside each group produced by
    the split, each group weighted by its share of the rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The examples at the current node.
    split_attribute_name : str
        Column to group by.
    target_attribute_name : str
        Column holding the class label.
    trace : int, optional
        Accepted for interface compatibility; the body does not consult it.

    Returns
    -------
    The entropy before the split minus the weighted entropy after it.

    Side effects
    ------------
    Always prints the attribute name and its gain.
    """
    row_total = float(len(df.index))

    # One row per attribute value: the target entropy inside that group and
    # the fraction of all rows that fall into it.
    per_value = df.groupby(split_attribute_name).agg(
        {target_attribute_name: [entropy_of_list, lambda grp: len(grp) / row_total]}
    )
    per_value.columns = ['Entropy', 'propobservations']

    weighted_terms = per_value['Entropy'] * per_value['propobservations']
    entropy_after = sum(weighted_terms)
    entropy_before = entropy_of_list(df[target_attribute_name])

    gain = entropy_before - entropy_after
    print(split_attribute_name, 'IG :', gain)
    return gain

In [14]:
def id3(df, target_attribute_name, attribute_names, default_class = None):
    """Build an ID3 decision tree for ``df`` as nested dictionaries.

    Parameters
    ----------
    df : pandas.DataFrame
        The examples that reach the current node.
    target_attribute_name : str
        Column holding the class label.
    attribute_names : list of str
        Columns still available for splitting.
    default_class : optional
        Label to return when ``df`` has no rows.

    Returns
    -------
    Either a class label (leaf) or a dict of the form
    ``{attribute: {attribute_value: subtree, ...}}``.

    Side effects
    ------------
    ``information_gain`` prints one line per candidate attribute; a blank
    line is printed after each group of candidates.
    """
    from collections import Counter

    class_counts = Counter(x for x in df[target_attribute_name])

    # No examples reached this node: only the caller's default is available.
    if df.empty:
        return default_class

    # Pure node: every example carries the same label.
    if len(class_counts) == 1:
        return next(iter(class_counts))

    # Most frequent label among the examples at this node. (Taking max() of
    # the labels themselves would pick the alphabetically last label, not the
    # most common one.)
    majority_class = class_counts.most_common(1)[0][0]

    # Nothing left to split on while labels are still mixed: answer with this
    # node's own majority rather than the default inherited from the parent.
    if not attribute_names:
        return majority_class

    gain = [
        information_gain(df, attr, target_attribute_name) for attr in attribute_names
    ]
    print()
    # list.index returns the first maximum, so ties go to the earlier attribute.
    best_attr = attribute_names[gain.index(max(gain))]

    remaining_attribute_names = [name for name in attribute_names if name != best_attr]

    tree = {best_attr: {}}
    for attr_val, data_subset in df.groupby(best_attr):
        tree[best_attr][attr_val] = id3(
            data_subset,
            target_attribute_name,
            remaining_attribute_names,
            majority_class,
        )

    return tree

In [16]:
from pprint import pprint
# Build the tree over the full data set; id3 prints the information gain of
# every candidate attribute at each node it splits.
tree = id3(df_tennis, 'playtennis', attribute_names)
print("\n\nThe Resultant Decision Tree is:\n")
# pprint lays the nested dict out across lines so the branches are readable.
pprint(tree)

outlook IG : 0.2467498197744391
temperature IG : 0.029222565658954647
humidity IG : 0.15183550136234136
wind IG : 0.04812703040826927

temperature IG : 0.01997309402197489
humidity IG : 0.01997309402197489
wind IG : 0.9709505944546686

temperature IG : 0.5709505944546686
humidity IG : 0.9709505944546686
wind IG : 0.01997309402197489



The Resultant Decision Tree is:

{'outlook': {'overcast': 'yes',
             'rain': {'wind': {'strong': 'no', 'weak': 'yes'}},
             'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}
