3. **Write a program to demonstrate the working of the decision-tree-based ID3 algorithm. Use an appropriate data set to build the decision tree, and apply this knowledge to classify a new sample.**

In [None]:
import pandas as pd
from pandas import DataFrame
from math import log
from collections import Counter
from pprint import pprint

In [None]:
# Load the training examples from a CSV given by a relative path.
df_tennis = pd.read_csv('PlayTennis.csv')
# Display the name of the first column (the class label used as the target below).
df_tennis.columns[0]

'PlayTennis'

In [None]:
# Print the complete table; the dataset is small enough to read in full.
print('\nGiven Play tennis Dataset: \n\n', df_tennis)


Given Play tennis Dataset: 

    PlayTennis   Outlook Temperature Humidity    Wind
0          No     Sunny         Hot     High    Weak
1          No     Sunny         Hot     High  Strong
2         Yes  Overcast         Hot     High    Weak
3         Yes      Rain        Mild     High    Weak
4         Yes      Rain        Cool   Normal    Weak
5          No      Rain        Cool   Normal  Strong
6         Yes  Overcast        Cool   Normal  Strong
7          No     Sunny        Mild     High    Weak
8         Yes     Sunny        Cool   Normal    Weak
9         Yes      Rain        Mild   Normal    Weak
10        Yes     Sunny        Mild   Normal  Strong
11        Yes  Overcast        Mild     High  Strong
12        Yes  Overcast         Hot   Normal    Weak
13         No      Rain        Mild     High  Strong


In [None]:
def entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        probs: Iterable of probabilities, one per outcome.

    Returns:
        The sum of ``-p * log(p, 2)`` over the non-zero entries of ``probs``.
        An empty ``probs`` yields ``0``.

    Raises:
        ValueError: If an entry is negative (raised by ``math.log``).
    """
    # A zero-probability outcome contributes nothing to the entropy
    # (p * log p -> 0 as p -> 0). Skipping it also avoids the
    # "math domain error" that log(0) would raise.
    return sum([-prob * log(prob, 2) for prob in probs if prob != 0])

In [None]:
def entropy_of_list(a_list):
    """Compute the entropy of the empirical value distribution in ``a_list``.

    Each distinct value's relative frequency (its count divided by the number
    of items) is used as its probability, and the resulting list of
    probabilities is handed to ``entropy``.

    Args:
        a_list: Sized iterable of hashable values (e.g. a column of labels).

    Returns:
        Whatever ``entropy`` returns for the relative frequencies.
    """
    total = float(len(a_list))
    frequencies = Counter(item for item in a_list)
    return entropy([count / total for count in frequencies.values()])

In [None]:
def information_gain(df, split_attribute_name, target_attribute_name):
    """Return the reduction in target entropy obtained by splitting on an attribute.

    The rows of ``df`` are grouped by ``split_attribute_name``. For every group
    the entropy of the target column and the group's share of all rows are
    computed; the share-weighted sum of the group entropies is then subtracted
    from the entropy of the target column over the whole frame.

    Args:
        df: DataFrame holding both the split column and the target column.
        split_attribute_name: Name of the column to group by.
        target_attribute_name: Name of the class-label column.

    Returns:
        Entropy before the split minus the weighted entropy after it.
    """
    total_rows = float(len(df.index))

    def share_of_rows(group):
        # Fraction of all observations that fall into this group.
        return len(group) / total_rows

    grouped = df.groupby(split_attribute_name)
    per_value = grouped.agg(
        {target_attribute_name: [entropy_of_list, share_of_rows]}
    )[target_attribute_name]
    per_value.columns = ['Entropy', 'PropObservations']

    entropy_after = sum(per_value['Entropy'] * per_value['PropObservations'])
    entropy_before = entropy_of_list(df[target_attribute_name])
    return entropy_before - entropy_after

In [None]:
def id3(df, target_attribute_name, attribute_names, default_class=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Args:
        df: DataFrame of training examples for the current node.
        target_attribute_name: Name of the class-label column.
        attribute_names: List of column names still available for splitting.
        default_class: Label returned when ``df`` contains no examples.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{attribute: {attribute_value: subtree_or_label, ...}}``.
    """
    class_counts = Counter(label for label in df[target_attribute_name])

    # No examples reached this node: fall back to the label supplied by the caller.
    if not class_counts:
        return default_class

    # Pure node: every example carries the same label.
    if len(class_counts) == 1:
        return next(iter(class_counts))

    # Most frequent label at this node. (Taking max() over the keys would pick
    # the lexicographically largest label, not the majority one.)
    majority_class = class_counts.most_common(1)[0][0]

    # Out of attributes but the labels still disagree: predict this node's majority.
    if not attribute_names:
        return majority_class

    # Split on the attribute with the highest information gain; ties resolve
    # to the earliest attribute in ``attribute_names``.
    gains = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
    best_attr = attribute_names[gains.index(max(gains))]
    remaining_attribute_names = [name for name in attribute_names if name != best_attr]

    tree = {best_attr: {}}
    for attr_val, data_subset in df.groupby(best_attr):
        tree[best_attr][attr_val] = id3(
            data_subset,
            target_attribute_name,
            remaining_attribute_names,
            majority_class,
        )
    return tree

In [None]:
# Start from every column name; the target column is dropped in a later cell.
attribute_names = list(df_tennis.columns)

In [None]:
# Show all columns, including the target, before filtering.
print("List of attributes: ", attribute_names)

List of attributes:  ['PlayTennis', 'Outlook', 'Temperature', 'Humidity', 'Wind']


In [None]:
# Drop the target column so only predictor attributes remain. The list is
# rebuilt rather than mutated with .remove(), so re-running this cell is
# harmless (a second .remove('PlayTennis') would raise ValueError).
attribute_names = [name for name in attribute_names if name != 'PlayTennis']

In [None]:
# Show the attributes that remain available for splitting.
print("Predicting Attributes: ", attribute_names)

Predicting Attributes:  ['Outlook', 'Temperature', 'Humidity', 'Wind']


In [None]:
# Build the tree; the result is a nested dict {attribute: {value: subtree or label}}.
tree = id3(df_tennis, 'PlayTennis', attribute_names)

In [None]:
# pprint lays the nested dict out over several lines so the branches are readable.
print("\n\nThe Resultant Decistion Tree is: \n")
pprint(tree)



The Resultant Decistion Tree is: 

{'Outlook': {'Overcast': 'Yes',
             'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},
             'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}
