In [None]:
import pandas as pd
from collections import Counter
import math
from pprint import pprint

In [None]:
# Read the training examples from disk and show the whole table.
training_csv = 'id3_ds.csv'
df_tennis = pd.read_csv(training_csv)
print(df_tennis)

     outlook temperature humidity    wind playtennis
0      sunny         hot     high    weak         no
1      sunny         hot     high  strong         no
2   overcast         hot     high    weak        yes
3       rain        mild     high    weak        yes
4       rain        cool   normal    weak        yes
5       rain        cool   normal  strong         no
6   overcast        cool   normal  strong        yes
7      sunny        mild     high    weak         no
8      sunny        cool   normal    weak        yes
9       rain        mild   normal    weak        yes
10     sunny        mild   normal    weak        yes
11  overcast        mild     high  strong        yes
12  overcast         hot   normal    weak        yes
13      rain        mild     high  strong         no


In [None]:
def entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        probs: Iterable of probabilities, one per outcome.

    Returns:
        The sum of ``-p * log2(p)`` over the given probabilities. Outcomes
        with probability 0 contribute nothing.

    Raises:
        ValueError: If a probability is negative (raised by ``math.log``).
    """
    # By convention 0 * log2(0) is 0. Skipping zeros also avoids the
    # "math domain error" that math.log raises for an argument of 0.
    return sum(-prob*math.log(prob,2) for prob in probs if prob != 0)


In [None]:
def entropy_of_list(a_list):
    """Compute the entropy, in bits, of the labels contained in ``a_list``.

    Prints the label counts (and a blank line) as a side effect, then passes
    the relative frequency of each distinct label to ``entropy``.

    Args:
        a_list: Sized iterable of hashable labels, e.g. a pandas Series or a
            plain list.

    Returns:
        The value ``entropy`` returns for the label frequencies.
    """
    cnt=Counter(x for x in a_list)
    # Only pandas objects expose .name; fall back to None so that plain
    # lists and tuples work instead of raising AttributeError.
    print("No and Yes class:",getattr(a_list,'name',None),cnt)
    num_instances=len(a_list)*1.0
    probs=[count/num_instances for count in cnt.values()]
    print()
    return entropy(probs)

In [None]:
# Show the target column, then measure the entropy of its labels.
target_labels = df_tennis['playtennis']
print(target_labels)
total_entropy = entropy_of_list(target_labels)
print("entropy of given playtennis dataset:", total_entropy)

0      no
1      no
2     yes
3     yes
4     yes
5      no
6     yes
7      no
8     yes
9     yes
10    yes
11    yes
12    yes
13     no
Name: playtennis, dtype: object
No and Yes class: playtennis Counter({'yes': 9, 'no': 5})

entropy of given playtennis dataset: 0.9402859586706309


In [None]:
def information_gain(data,split_attribute_name,target_attribute,trace=0):
    """Return the entropy reduction obtained by splitting ``data`` on a column.

    Prints every partition produced by the split. ``entropy_of_list`` is
    called once per partition and once for the whole target column, and it
    prints the label counts each time.

    Args:
        data: DataFrame holding both the split column and the target column.
        split_attribute_name: Name of the column to group by.
        target_attribute: Name of the column whose entropy is measured.
        trace: Accepted for interface compatibility; not used here.

    Returns:
        Entropy of the full target column minus the size-weighted sum of the
        per-partition entropies.
    """
    print("info gain calculation of ",split_attribute_name)
    partitions = data.groupby(split_attribute_name)
    for value, rows in partitions:
        print(value)
        print(rows)

    total_rows = len(data.index)*1.0
    # One entropy and one row fraction per distinct value of the split column.
    entropy_by_value = partitions.agg(
        {target_attribute: lambda labels: entropy_of_list(labels)}
    )[target_attribute]
    weight_by_value = partitions.agg(
        {target_attribute: lambda labels: len(labels)/total_rows}
    )[target_attribute]

    # Built-in sum keeps the same left-to-right float accumulation order.
    entropy_after_split = sum(entropy_by_value*weight_by_value)
    entropy_before_split = entropy_of_list(data[target_attribute])
    print()
    return entropy_before_split-entropy_after_split

In [None]:
# Information gain of splitting the full data set on 'outlook'.
outlook_gain = information_gain(df_tennis, 'outlook', 'playtennis')
print("info gain for outlook is :" + str(outlook_gain), "\n")

info gain calculation of  outlook
overcast
     outlook temperature humidity    wind playtennis
2   overcast         hot     high    weak        yes
6   overcast        cool   normal  strong        yes
11  overcast        mild     high  strong        yes
12  overcast         hot   normal    weak        yes
rain
   outlook temperature humidity    wind playtennis
3     rain        mild     high    weak        yes
4     rain        cool   normal    weak        yes
5     rain        cool   normal  strong         no
9     rain        mild   normal    weak        yes
13    rain        mild     high  strong         no
sunny
   outlook temperature humidity    wind playtennis
0    sunny         hot     high    weak         no
1    sunny         hot     high  strong         no
7    sunny        mild     high    weak         no
8    sunny        cool   normal    weak        yes
10   sunny        mild   normal    weak        yes
No and Yes class: playtennis Counter({'yes': 4})

No and Yes class: p

In [None]:
# Information gain of splitting the full data set on 'humidity'.
print()
humidity_gain = information_gain(df_tennis, 'humidity', 'playtennis')
print("info gain for humidity is :" + str(humidity_gain), "\n")



info gain calculation of  humidity
high
     outlook temperature humidity    wind playtennis
0      sunny         hot     high    weak         no
1      sunny         hot     high  strong         no
2   overcast         hot     high    weak        yes
3       rain        mild     high    weak        yes
7      sunny        mild     high    weak         no
11  overcast        mild     high  strong        yes
13      rain        mild     high  strong         no
normal
     outlook temperature humidity    wind playtennis
4       rain        cool   normal    weak        yes
5       rain        cool   normal  strong         no
6   overcast        cool   normal  strong        yes
8      sunny        cool   normal    weak        yes
9       rain        mild   normal    weak        yes
10     sunny        mild   normal    weak        yes
12  overcast         hot   normal    weak        yes
No and Yes class: playtennis Counter({'no': 4, 'yes': 3})

No and Yes class: playtennis Counter({'yes': 

In [None]:
# Information gain of splitting the full data set on 'temperature'.
print()
temperature_gain = information_gain(df_tennis, 'temperature', 'playtennis')
print("info gain for temperature is :" + str(temperature_gain), "\n")


info gain calculation of  temperature
cool
    outlook temperature humidity    wind playtennis
4      rain        cool   normal    weak        yes
5      rain        cool   normal  strong         no
6  overcast        cool   normal  strong        yes
8     sunny        cool   normal    weak        yes
hot
     outlook temperature humidity    wind playtennis
0      sunny         hot     high    weak         no
1      sunny         hot     high  strong         no
2   overcast         hot     high    weak        yes
12  overcast         hot   normal    weak        yes
mild
     outlook temperature humidity    wind playtennis
3       rain        mild     high    weak        yes
7      sunny        mild     high    weak         no
9       rain        mild   normal    weak        yes
10     sunny        mild   normal    weak        yes
11  overcast        mild     high  strong        yes
13      rain        mild     high  strong         no
No and Yes class: playtennis Counter({'yes': 3, 'no

In [None]:
def id3(df,target_attribute_name,attribute_names,default_class=None):
    """Build an ID3 decision tree as nested dictionaries.

    Args:
        df: DataFrame of examples containing the target column and every
            column named in ``attribute_names``.
        target_attribute_name: Name of the column holding the class label.
        attribute_names: List of column names still available for splitting.
        default_class: Label returned when ``df`` has no rows.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{attribute: {attribute_value: subtree, ...}}``.
    """
    cnt=Counter(x for x in df[target_attribute_name])

    # Pure node: every example carries the same label.
    if len(cnt)==1:
        return next(iter(cnt))

    # No examples at all: only the caller-supplied default is available.
    if df.empty:
        return default_class

    # Majority label of this node. max(cnt.keys()) would pick the
    # lexicographically greatest label regardless of how often it occurs.
    majority_class=cnt.most_common(1)[0][0]

    # Attributes exhausted but labels still mixed: answer with this node's
    # own majority label rather than the default inherited from the parent
    # (which is None at the root).
    if not attribute_names:
        return majority_class

    # Split on the attribute with the highest information gain; on ties the
    # first attribute in attribute_names wins.
    gains=[information_gain(df,attr,target_attribute_name) for attr in attribute_names]
    best_attr=attribute_names[gains.index(max(gains))]
    remaining_attribute_names=[name for name in attribute_names if name!=best_attr]

    tree={best_attr:{}}
    for attr_val,data_subset in df.groupby(best_attr):
        tree[best_attr][attr_val]=id3(data_subset,target_attribute_name,remaining_attribute_names,majority_class)
        print()
    return tree

In [None]:
# Start from every column name of the training frame.
attribute_names = [column for column in df_tennis.columns]
print("list of attributes:", attribute_names)


list of attributes: ['outlook', 'temperature', 'humidity', 'wind', 'playtennis']


In [None]:
# Rebuild the list instead of calling .remove(): a second run of this cell
# would otherwise raise ValueError because 'playtennis' is already gone.
attribute_names=[name for name in attribute_names if name!='playtennis']
print("pridicting attributes:",attribute_names)

pridicting attributes: ['outlook', 'temperature', 'humidity', 'wind']


In [None]:
# Grow the decision tree from the full training frame.
tree = id3(
    df_tennis,
    target_attribute_name="playtennis",
    attribute_names=attribute_names,
)
print()

info gain calculation of  outlook
overcast
     outlook temperature humidity    wind playtennis
2   overcast         hot     high    weak        yes
6   overcast        cool   normal  strong        yes
11  overcast        mild     high  strong        yes
12  overcast         hot   normal    weak        yes
rain
   outlook temperature humidity    wind playtennis
3     rain        mild     high    weak        yes
4     rain        cool   normal    weak        yes
5     rain        cool   normal  strong         no
9     rain        mild   normal    weak        yes
13    rain        mild     high  strong         no
sunny
   outlook temperature humidity    wind playtennis
0    sunny         hot     high    weak         no
1    sunny         hot     high  strong         no
7    sunny        mild     high    weak         no
8    sunny        cool   normal    weak        yes
10   sunny        mild   normal    weak        yes
No and Yes class: playtennis Counter({'yes': 4})

No and Yes class: p

In [None]:
# pprint() on a string emits its repr (quotes and literal "\n" escapes), so
# the heading goes through print(); pprint is kept for the nested tree dict.
print("\n\n the result decison tree is:\n")
pprint(tree)

'\n\n the result decison tree is:\n'
{'outlook': {'overcast': 'yes',
             'rain': {'wind': {'strong': 'no', 'weak': 'yes'}},
             'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}


In [None]:

def classify(instance,tree,default=None):
    """Predict the label of ``instance`` by walking a nested-dict tree.

    Args:
        instance: Mapping-like object (e.g. a dict or a DataFrame row)
            indexed by attribute name.
        tree: Either a class label or a dict of the form
            ``{attribute: {attribute_value: subtree_or_label}}``.
        default: Value returned when the instance's value for a tested
            attribute has no branch in the tree.

    Returns:
        The label stored at the leaf that is reached, or ``default``.
    """
    # A tree consisting of a single leaf is the answer itself.
    if not isinstance(tree,dict):
        return tree

    attribute=next(iter(tree))
    branches=tree[attribute]
    value=instance[attribute]
    if value not in branches:
        return default

    result=branches[value]
    if isinstance(result,dict):
        # Pass default down; otherwise an unseen value below the root
        # would yield None instead of the caller's default.
        return classify(instance,result,default)
    return result


In [None]:
# Classify every row of the test file with the learned tree ('?' marks rows
# the tree has no branch for), then show the rows without the true label.
test_frame = pd.read_csv('PlayTennisTest.csv')
test_frame['predicted'] = test_frame.apply(classify, axis=1, args=(tree, '?'))
df_new = test_frame.drop(["playtennis"], axis=1)
print(df_new)

    outlook temperature humidity    wind predicted
0  overcast         hot   normal    weak       yes
1      rain        mild     high  strong        no
2     sunny        mild     high    weak        no
3     sunny        cool   normal    weak       yes
