#### Name: Ananya Godse   SAP ID: 60009220161

# Decision Tree Classifier - Tennis Dataset

### Importing the libraries and the dataset

In [1]:
import numpy as np
import pandas as pd
import pprint

In [2]:
df = pd.read_excel("tennis.xlsx")
df

Unnamed: 0,Day,Outlook,Temperature,Humidity,Wind,Play Tennis?
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes
5,D6,Rain,Cool,Normal,Strong,Yes
6,D7,Overcast,Cool,Normal,Strong,No
7,D8,Sunny,Mild,High,Weak,Yes
8,D9,Sunny,Cool,Normal,Weak,No
9,D10,Rain,Mild,Normal,Weak,Yes


In [3]:
df.drop(columns=["Day"], inplace=True)
df

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis?
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,Yes
6,Overcast,Cool,Normal,Strong,No
7,Sunny,Mild,High,Weak,Yes
8,Sunny,Cool,Normal,Weak,No
9,Rain,Mild,Normal,Weak,Yes


### Calculating the entropy

This function calculates the entropy of the entire dataset, that is, the sum of probability of 'Yes' multiplied by log to the base 2 (probability of 'Yes') and the probability of 'No' multiplied by log to the base 2 (probability of 'No')

In [4]:
def entropy(df):
    class_col = df.keys()[-1]
    class_labels = df[class_col].unique()
    entropy = 0
    for label in class_labels:
        p = df[class_col].value_counts()[label]/len(df[class_col])
        entropy += -p * np.log2(p)
        
    return entropy

In [5]:
print(f"Entropy of the dataset: {entropy(df)}")

Entropy of the dataset: 0.9402859586706311


This function calculates the entropy of each attribute:

In [6]:
def attribute_entropy(df, attribute):
    class_col = df.keys()[-1]
    class_labels = df[class_col].unique()
    attribute_values = df[attribute].unique()
    attribute_entropy = 0
    for value in attribute_values:
        entropy = 0
        for label in class_labels:
            num = len(df[attribute][df[attribute]==value][df[class_col]==label])
            den = len(df[attribute][df[attribute]==value])
            p = num/den
            entropy += -p * np.log2(p + 0.000001)
        attribute_entropy += (den/len(df[attribute])) * entropy
    
    return attribute_entropy

As an example

In [7]:
print(f"Entropy of Outlook: {attribute_entropy(df, 'Outlook')}")

Entropy of Outlook: 0.8363903963670357


### Finding the best splitting attribute using Information Gain

This function calculates the information gain for each attribute and returns the attribute where the information gain is the largest. 

In [8]:
def information_gain(df):
    info_gain = []
    for attribute in df.keys()[:-1]:
        gain = entropy(df) - attribute_entropy(df, attribute)
        info_gain.append(gain)
    
    best_split = df.keys()[:-1][np.argmax(info_gain)]
    return best_split

In [9]:
print(f"Attribute with the greatest information gain: {information_gain(df)}")

Attribute with the greatest information gain: Outlook


### Building the tree

This function creates a subtable so that the tree can be grown from the current node. 

In [10]:
def sub_table(df, attribute, value):
    return df[df[attribute] == value].reset_index(drop = True)

This is the actual build tree function that is called recursively until we obtain a pure class at each leaf node. At every call, it filters the original dataset to return only rows that are useful for determining the next node.

In [11]:
def build_tree(df, tree=None):
    node = information_gain(df)
    attribute_values = np.unique(df[node])
    class_col = df.keys()[-1]
    
    if tree is None:
        tree = {}
        tree[node] = {}
        
    for value in attribute_values:
        subtable = sub_table(df, node, value)
        label, counts = np.unique(subtable[class_col], return_counts=True)
        if len(counts) == 1: #stopping condition - pure class
            tree[node][value] = label[0]
        else:
            tree[node][value] = build_tree(subtable)
    
    return tree
            

In [12]:
tree = build_tree(df)

### Printing the Decision Tree

In [13]:
pprint.pprint(tree)

{'Outlook': {'Overcast': {'Temperature': {'Cool': 'No',
                                          'Hot': 'Yes',
                                          'Mild': 'Yes'}},
             'Rain': {'Humidity': {'High': {'Wind': {'Strong': 'No',
                                                     'Weak': 'Yes'}},
                                   'Normal': 'Yes'}},
             'Sunny': {'Temperature': {'Cool': 'No',
                                       'Hot': 'No',
                                       'Mild': 'Yes'}}}}


### Creating a dummy dataset to test the tree

In [14]:
test_df = {'Outlook':['Sunny', 'Rain', 'Overcast'], 'Temperature': ['Hot', 'Mild', 'Mild'],
           'Humidity': ['High', 'High', 'Normal'], 'Wind': ['Strong', 'Weak', 'Strong']}
test_df = pd.DataFrame(test_df)

### The Predict Function

This function traverses the decision tree to find out what the label for the unseen data should be.

In [15]:
def predict(test, tree):
    for node in tree.keys():
        value = test[node]
        tree = tree[node][value]
        prediction = 0
        if type(tree) is dict:
            prediction = predict(test, tree)
        else:
            prediction = tree
    return prediction

In [16]:
y_labels = []

for i in range(len(test_df)):
    test = test_df.iloc[i,:]
    prediction = predict(test, tree)
    y_labels.append(prediction)
    
print(y_labels)

['No', 'Yes', 'Yes']
