In [1]:
import pandas as pd  
import csv 

In [2]:
# Load the play-tennis training examples from the CSV in the working directory.
train_data = pd.read_csv(filepath_or_buffer="playtennis.csv")

In [3]:
# DataFrame.shape is (number of rows, number of columns).
rows, cols = train_data.shape
print("Rows:", rows)
print("Columns:", cols)

Rows: 14
Columns: 5


In [4]:
# Preview the first five rows of the training data.
train_data.head(n=5)

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rainy,Mild,High,Weak,Yes
4,Rainy,Cool,Normal,Weak,Yes


In [5]:
# Dump the complete training table as plain text.
print("Training Data:\n", train_data, sep=" ")

Training Data:
      Outlook Temperature Humidity   Windy Play Tennis
0      Sunny         Hot     High    Weak          No
1      Sunny         Hot     High  Strong          No
2   Overcast         Hot     High    Weak         Yes
3      Rainy        Mild     High    Weak         Yes
4      Rainy        Cool   Normal    Weak         Yes
5      Rainy        Cool   Normal  Strong          No
6   Overcast        Cool   Normal  Strong         Yes
7      Sunny        Mild     High    Weak          No
8      Sunny        Cool   Normal    Weak         Yes
9      Rainy        Mild   Normal    Weak         Yes
10     Sunny        Mild   Normal  Strong         Yes
11  Overcast        Mild     High  Strong         Yes
12  Overcast         Hot   Normal    Weak         Yes
13     Rainy        Mild     High  Strong          No


In [6]:
def entropy(probs):
    """Return the Shannon entropy, in bits, of a probability distribution.

    Args:
        probs: Iterable of probabilities.

    Returns:
        The sum of ``-p * log(p, 2)`` over the non-zero entries of ``probs``;
        0 when ``probs`` is empty.

    Raises:
        ValueError: If an entry is negative (raised by ``math.log``).
    """
    import math
    # A zero-probability outcome contributes nothing to the entropy
    # (p * log p -> 0 as p -> 0); skipping it avoids math.log(0), which
    # raises ValueError. Negative entries are deliberately not skipped.
    return sum(-prob * math.log(prob, 2) for prob in probs if prob != 0)
def entropy_of_list(a_list):
    """Print a class-frequency report for ``a_list`` and return its entropy.

    Args:
        a_list: Sized collection of hashable class labels. The notebook
            passes a pandas Series such as the 'Play Tennis' column.

    Returns:
        The value of ``entropy`` applied to the relative frequency of each
        distinct label in ``a_list``.
    """
    from collections import Counter
    cnt = Counter(x for x in a_list)
    print("\nClasses:",cnt)
    num_instances = len(a_list)
    print("\n Number of Instances of the Current Sub Class is {0}:".format(num_instances ))
    # probs follows the iteration order of cnt, so probs[i] belongs to the
    # i-th key of cnt.
    probs = [x / num_instances for x in cnt.values()]
    print(probs)
    print("\n Classes:",list(cnt.keys()))
    # Report every class next to its own probability. Pairing min(cnt) with
    # min(probs) (and max with max) mislabels the classes whenever the
    # alphabetically smaller label is the more frequent one, and min()/max()
    # fail outright on an empty input.
    for label, prob in zip(cnt, probs):
        print(" \n Probabilities of Class {0} is {1}:".format(label, prob))
    return entropy(probs)
# Show the label column, then measure the entropy of the whole data set.
print(
    "\n  INPUT DATA SET FOR ENTROPY CALCULATION:\n",
    train_data['Play Tennis'],
)
total_entropy = entropy_of_list(train_data['Play Tennis'])
print(
    "\n Total Entropy of Play Tennis Data Set:",
    total_entropy,
)


  INPUT DATA SET FOR ENTROPY CALCULATION:
 0      No
1      No
2     Yes
3     Yes
4     Yes
5      No
6     Yes
7      No
8     Yes
9     Yes
10    Yes
11    Yes
12    Yes
13     No
Name: Play Tennis, dtype: object

Classes: Counter({'Yes': 9, 'No': 5})

 Number of Instances of the Current Sub Class is 14:
[0.35714285714285715, 0.6428571428571429]

 Classes: ['No', 'Yes']
 
 Probabilities of Class No is 0.35714285714285715:
 
 Probabilities of Class Yes is 0.6428571428571429:

 Total Entropy of Play Tennis Data Set: 0.9402859586706309


In [7]:
def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    """Return the entropy reduction obtained by splitting ``df`` on one column.

    Prints the groups and the intermediate tables as it goes.

    Args:
        df: DataFrame containing both the split column and the target column.
        split_attribute_name: Column whose distinct values define the groups.
        target_attribute_name: Column holding the class labels.
        trace: Accepted but not used by this implementation.

    Returns:
        The entropy of the target column over all rows minus the sum of each
        group's target entropy weighted by the group's share of the rows.
    """
    print("Information Gain Calculation of ",split_attribute_name)
    partitions = df.groupby(split_attribute_name)
    print("split:",type(partitions))
    for group_key, group_rows in partitions:
        print("Name:\n",group_key)
        print("Group:\n",group_rows)

    nobs = len(df.index)
    print("NOBS",nobs)

    # One row per group: the entropy of its labels and its fraction of rows.
    per_group_stats = partitions.agg(
        {target_attribute_name: [entropy_of_list, lambda x: len(x)/nobs]}
    )[target_attribute_name]
    print(per_group_stats.columns)
    print("the entropy and the probability value for each attribute is",per_group_stats)
    per_group_stats.columns = ['Entropy', 'PropObservations']

    # Size-weighted sum of the per-group entropies, accumulated in row order.
    weighted_entropy = sum(
        group_entropy * group_share
        for group_entropy, group_share in zip(
            per_group_stats['Entropy'], per_group_stats['PropObservations']
        )
    )
    overall_entropy = entropy_of_list(df[target_attribute_name])
    return overall_entropy - weighted_entropy


# Report the information gain of each candidate attribute. The label text for
# every attribute is kept exactly as it is printed in the notebook output.
for gain_label, split_attr in (
    ('Info-gain for Outlook is :', 'Outlook'),
    ('\n Info-gain for Humidity is: ', 'Humidity'),
    ('\n Info-gain for Wind is:', 'Windy'),
    ('\n Info-gain for Temperature is:', 'Temperature'),
):
    print(gain_label + str(information_gain(train_data, split_attr, 'Play Tennis')), "\n")

Information Gain Calculation of  Outlook
split: <class 'pandas.core.groupby.generic.DataFrameGroupBy'>
Name:
 Overcast
Group:
      Outlook Temperature Humidity   Windy Play Tennis
2   Overcast         Hot     High    Weak         Yes
6   Overcast        Cool   Normal  Strong         Yes
11  Overcast        Mild     High  Strong         Yes
12  Overcast         Hot   Normal    Weak         Yes
Name:
 Rainy
Group:
    Outlook Temperature Humidity   Windy Play Tennis
3    Rainy        Mild     High    Weak         Yes
4    Rainy        Cool   Normal    Weak         Yes
5    Rainy        Cool   Normal  Strong          No
9    Rainy        Mild   Normal    Weak         Yes
13   Rainy        Mild     High  Strong          No
Name:
 Sunny
Group:
    Outlook Temperature Humidity   Windy Play Tennis
0    Sunny         Hot     High    Weak          No
1    Sunny         Hot     High  Strong          No
7    Sunny        Mild     High    Weak          No
8    Sunny        Cool   Normal    Weak  

In [8]:
def id3(df, target_attribute_name, attribute_names, default_class=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Args:
        df: DataFrame of training examples.
        target_attribute_name: Column holding the class labels.
        attribute_names: List of columns that may still be used for splitting.
        default_class: Label returned when ``df`` is empty or no attributes
            remain while the labels are still mixed.

    Returns:
        A class label when the node is a leaf, otherwise a dict of the form
        ``{best_attr: {attr_val: subtree, ...}}`` where each subtree is again
        a label or a dict.
    """
    from collections import Counter
    cnt = Counter(x for x in df[target_attribute_name])
    if len(cnt) == 1:
        # Every example carries the same label: this node is a pure leaf.
        return next(iter(cnt))
    elif df.empty or (not attribute_names):
        return default_class
    else:
        # Fallback label for the children: the most frequent class at this
        # node. max(cnt.keys()) would pick the lexicographically largest
        # label regardless of how often it occurs. On a tie, most_common
        # keeps the label that was encountered first.
        default_class = cnt.most_common(1)[0][0]
        gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
        # Split on the attribute with the highest gain (first one on ties).
        index_of_max = gainz.index(max(gainz))
        best_attr = attribute_names[index_of_max]
        tree = {best_attr:{}}
        remaining_attribute_names = [i for i in attribute_names if i != best_attr]
        # Grow one subtree per observed value of the chosen attribute.
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset,
                        target_attribute_name,
                        remaining_attribute_names,
                        default_class)
            tree[best_attr][attr_val] = subtree
        return tree

In [9]:
# Start from every column name, then drop the label column so that only the
# predictor columns remain.
attribute_names = list(train_data.columns)
print("List of Attributes:", attribute_names)
del attribute_names[attribute_names.index('Play Tennis')]
print("Predicting Attributes:", attribute_names)

List of Attributes: ['Outlook', 'Temperature', 'Humidity', 'Windy', 'Play Tennis']
Predicting Attributes: ['Outlook', 'Temperature', 'Humidity', 'Windy']


In [10]:
from pprint import pprint
# Build the decision tree for the 'Play Tennis' label from the predictor columns.
tree = id3(
    df=train_data,
    target_attribute_name='Play Tennis',
    attribute_names=attribute_names,
)
print("\n\nThe Resultant Decision Tree is :\n")
pprint(tree)

Information Gain Calculation of  Outlook
split: <class 'pandas.core.groupby.generic.DataFrameGroupBy'>
Name:
 Overcast
Group:
      Outlook Temperature Humidity   Windy Play Tennis
2   Overcast         Hot     High    Weak         Yes
6   Overcast        Cool   Normal  Strong         Yes
11  Overcast        Mild     High  Strong         Yes
12  Overcast         Hot   Normal    Weak         Yes
Name:
 Rainy
Group:
    Outlook Temperature Humidity   Windy Play Tennis
3    Rainy        Mild     High    Weak         Yes
4    Rainy        Cool   Normal    Weak         Yes
5    Rainy        Cool   Normal  Strong          No
9    Rainy        Mild   Normal    Weak         Yes
13   Rainy        Mild     High  Strong          No
Name:
 Sunny
Group:
    Outlook Temperature Humidity   Windy Play Tennis
0    Sunny         Hot     High    Weak          No
1    Sunny         Hot     High  Strong          No
7    Sunny        Mild     High    Weak          No
8    Sunny        Cool   Normal    Weak  

In [11]:
# The first key of the tree is the attribute chosen for the root split.
attribute = list(tree)[0]
print("Best Attribute :\n", attribute)
print("Tree Keys:\n", tree[attribute].keys())

Best Attribute :
 Outlook
Tree Keys:
 dict_keys(['Overcast', 'Rainy', 'Sunny'])
