In [2]:
import pandas as pd 
import numpy as np 

In [3]:
# Load the weather / play-tennis table from a CSV file in the working directory.
df=pd.read_csv('weather_data_tennis.csv')
# Preview the first rows to confirm the columns were read as expected.
df.head()

Unnamed: 0,Outlook,Temp,Humidity,Wind,Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [4]:
class Node:
    """One vertex of the decision tree.

    A freshly created node is an empty, non-leaf placeholder; the tree
    builder fills in the fields that apply.
    """

    def __init__(self):
        # Set to True by the builder when this node ends a path.
        self.leaf_node = False
        # Predicted class label; only meaningful when leaf_node is True.
        self.ans = None
        # Name of the feature this node splits on (internal nodes).
        self.attribute = None
        # Maps each value of `attribute` to the child Node for that value.
        self.branches = dict()

class ID3DecisionTree:
    """Thin container that holds the root node of a decision tree."""

    def __init__(self):
        # No tree yet; the caller assigns the built root node afterwards.
        self.root = None

def calc_info_gain(data, feature, target):
    """Information gain obtained by splitting `data` on `feature`.

    Computed as the entropy of the `target` column over all rows minus the
    size-weighted average entropy of `target` inside each subset produced by
    the distinct values of `feature`.

    Parameters
    ----------
    data : pandas.DataFrame containing both `feature` and `target` columns.
    feature : name of the column to split on.
    target : name of the class-label column.

    Returns
    -------
    The information gain in bits.
    """
    n_rows = len(data)
    labels = data[target].unique()

    def _entropy(probabilities):
        # Shannon entropy in bits; zero-probability terms contribute nothing
        # and are skipped so log2(0) is never evaluated.
        total = 0
        for p in probabilities:
            if p == 0:
                continue
            total += -p * np.log2(p)
        return total

    # entropy of the class distribution before splitting
    parent_entropy = _entropy(
        len(data[data[target] == label]) / n_rows for label in labels
    )

    # sum over branches of (|Sv| / |S|) * entropy(Sv)
    weighted_child_entropy = 0
    for value in data[feature].unique():
        subset = data[data[feature] == value]
        subset_size = len(subset)
        subset_entropy = _entropy(
            len(subset[subset[target] == label]) / subset_size for label in labels
        )
        weighted_child_entropy += subset_entropy * (subset_size / n_rows)

    return parent_entropy - weighted_child_entropy

def calc_gain_ratio(data, feature, target):
    """Gain ratio of `feature`: information gain divided by split information.

    Split information is the entropy of how the rows are distributed across
    the distinct values of `feature`; it ignores the target column and
    penalises features that fragment the data into many small branches.

    Parameters
    ----------
    data : pandas.DataFrame containing both `feature` and `target` columns.
    feature : name of the candidate split column.
    target : name of the class-label column.

    Returns
    -------
    The gain ratio, or 0.0 when the split information is zero (the feature
    has a single distinct value, or `data` is empty). Such a split cannot
    separate any rows, and dividing by zero would otherwise give NaN or raise
    ZeroDivisionError.
    """
    info_gain=calc_info_gain(data,feature,target)

    total_rows=len(data)
    split_info=0
    # sum over branches of (|Sv| / |S|) * -log2(|Sv| / |S|)
    for branch in data[feature].unique():
        p_branch=len(data[data[feature]==branch])/total_rows
        if p_branch==0:
            continue
        split_info+=-p_branch*np.log2(p_branch)

    # -0.0 also compares equal to 0, so the single-value case (p == 1) is caught.
    if split_info==0:
        return 0.0

    return info_gain/split_info
def recursive_split(data,target):
    """Recursively build a decision (sub)tree for `data` and return its root Node.

    At each level the feature with the highest gain ratio is chosen, one
    branch is created per distinct value of that feature, and the chosen
    column is dropped before recursing into each branch.

    Recursion stops with a leaf when:
      * every row has the same class label (leaf answer = that label), or
      * no usable feature remains - either only the target column is left or
        no feature produced a comparable gain ratio (leaf answer = the most
        frequent class label in `data`).

    Parameters
    ----------
    data : pandas.DataFrame holding the feature columns and the `target` column.
    target : name of the class-label column.

    Returns
    -------
    Node : root of the tree built from `data`.

    Raises
    ------
    ValueError : if `data` has no rows.
    """
    if len(data)==0:
        raise ValueError("cannot build a decision tree from an empty dataset")

    #pure class means a leaf node
    if len(data[target].unique())==1:
        new_node=Node()
        new_node.leaf_node=True
        new_node.ans=data[target].unique()[0]
        return new_node

    best_gain_ratio=float('-inf')
    best_feat=None
    for feature in data.columns:
        if feature==target:
            continue
        curr_gain_ratio=calc_gain_ratio(data,feature,target)
        # a NaN gain ratio never compares greater, so it is never selected
        if curr_gain_ratio>best_gain_ratio:
            best_gain_ratio=curr_gain_ratio
            best_feat=feature

    #labels are still mixed but nothing is left to split on:
    #fall back to a leaf predicting the majority class
    if best_feat is None:
        new_node=Node()
        new_node.leaf_node=True
        new_node.ans=data[target].value_counts().idxmax()
        return new_node

    #choosing a feature and creating a node
    new_node=Node()
    new_node.attribute=best_feat

    #making a branch for each unique value in best feature
    for x in data[best_feat].unique():
        #filtering data and dropping the best feature column for down the tree
        new_data=data[data[best_feat]==x].drop(best_feat, axis=1)
        new_node.branches[x]=recursive_split(new_data,target)

    return new_node

    

    

In [5]:
# Build the tree from df, using 'Tennis' as the class-label column and every
# other column as a candidate feature, and attach it to the container object.
tree = ID3DecisionTree()
tree.root = recursive_split(df, 'Tennis')


In [6]:
# Show the full training table as plain text for reference.
print(df)

     Outlook  Temp Humidity    Wind Tennis
0      Sunny   Hot     High    Weak     No
1      Sunny   Hot     High  Strong     No
2   Overcast   Hot     High    Weak    Yes
3       Rain  Mild     High    Weak    Yes
4       Rain  Cool   Normal    Weak    Yes
5       Rain  Cool   Normal  Strong     No
6   Overcast  Cool   Normal  Strong    Yes
7      Sunny  Mild     High    Weak     No
8      Sunny  Cool   Normal    Weak    Yes
9       Rain  Mild   Normal    Weak    Yes
10     Sunny  Mild   Normal  Strong    Yes
11  Overcast  Mild     High  Strong    Yes
12  Overcast   Hot   Normal    Weak    Yes
13      Rain  Mild   Normal  Strong     No


In [7]:
# def traverse_using_dfs(curr ,**features): 
#     if curr.leaf_node:
#         return ans

#     # return traverse_using_dfs(curr.branches[features[curr.attribute]] ,**features)
        

In [8]:
def traverse_using_dfs(curr ,u_Outlook,u_Temp,u_Humidity,u_Wind): 
    """Walk the tree from `curr` down to a leaf and return that leaf's answer.

    At every internal node the branch matching the supplied value for the
    node's attribute is followed.

    Parameters
    ----------
    curr : node to start from (needs leaf_node, ans, attribute, branches).
    u_Outlook, u_Temp, u_Humidity, u_Wind : values for the four features.

    Returns
    -------
    The `ans` of the leaf reached, or None if a node tests an attribute other
    than 'Outlook', 'Temp', 'Humidity' or 'Wind'.

    Raises
    ------
    KeyError : if a node has no branch for the supplied value.
    """
    # lookup table: attribute name -> value supplied for it
    supplied={
        'Outlook':u_Outlook,
        'Temp':u_Temp,
        'Humidity':u_Humidity,
        'Wind':u_Wind,
    }

    node=curr
    while not node.leaf_node:
        if node.attribute not in supplied:
            return None
        node=node.branches[supplied[node.attribute]]
    return node.ans

In [23]:
#taking user inputs
# Surrounding whitespace is stripped because branch lookup in the tree is an
# exact match on the typed value; a stray space would raise KeyError.
user_outlook=input("Outlook = ").strip()
user_temp = input('temp = ').strip()
user_humidity = input('humidity = ').strip()
user_wind = input('wind = ').strip()

#output the predicted class for the entered conditions
print(f"Prediction is {traverse_using_dfs(tree.root,user_outlook,user_temp,user_humidity,user_wind)}")

Outlook =  Rain
temp =  Mild
humidity =  Normal
wind =  Strong


Prediction is No


In [10]:
# Outlook = Rain
# temp =  Mild
# humidity =  Normal
# wind =  Strong

