In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [3]:
# Load the PlayTennis table and discard rows with missing values in one
# chained expression, so re-running the cell always starts from the file.
df = pd.read_csv('/content/PlayTennis.csv').dropna()
df

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [4]:
def get_probabilities(df, target):
    """Return the relative frequency of every distinct value in ``df[target]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to summarise.
    target : str
        Name of the column whose value distribution is wanted.

    Returns
    -------
    array
        One entry per distinct value, each equal to that value's count
        divided by ``len(df)``, in the order ``Series.value_counts`` yields.
    """
    value_frequencies = df[target].value_counts()
    n_rows = len(df)
    return (value_frequencies / n_rows).values

def get_entropy(probs):
    """Compute the base-2 Shannon entropy of a sequence of probabilities.

    Parameters
    ----------
    probs : iterable of numbers
        Probabilities to combine. Entries equal to zero are skipped so that
        ``log2(0)`` is never evaluated.

    Returns
    -------
    number
        ``-sum(p * log2(p))`` over the non-zero entries; ``0`` when there
        are none.
    """
    weighted_logs = sum(p * np.log2(p) for p in probs if p != 0)
    return -weighted_logs
    
def get_parent_information(df, target):
    """Return the entropy of the ``target`` column over all rows of ``df``.

    This is the entropy of the node before any split: the distribution of
    ``df[target]`` is obtained with ``get_probabilities`` and passed to
    ``get_entropy``.
    """
    return get_entropy(get_probabilities(df, target))

def get_feature_information(df, feature, target):
    """Return the weighted target entropy after partitioning ``df`` by ``feature``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of the current node.
    feature : str
        Column whose distinct values define the partition.
    target : str
        Column holding the class label.

    Returns
    -------
    number
        Sum over the distinct values of ``feature`` of
        ``(rows with that value / rows counted) * entropy of target in
        those rows``; ``0`` when ``feature`` has no counted values.
    """
    per_value = df[feature].value_counts()
    values = per_value.index.tolist()
    sizes = per_value.values.tolist()
    n_counted = sum(sizes)

    weighted_entropy = 0
    for value, size in zip(values, sizes):
        # Target entropy restricted to the rows that carry this feature value.
        subset = df[df[feature] == value]
        subset_entropy = get_entropy(get_probabilities(subset, target))
        weighted_entropy += size / n_counted * subset_entropy
    return weighted_entropy

In [5]:
# criteria for tree splitting
# information gain
def get_max_information_gain(parent_info, feature_infos):
    """Return the position of the feature with the largest information gain.

    Parameters
    ----------
    parent_info : number
        Entropy of the target before splitting.
    feature_infos : sequence of numbers
        Weighted target entropy after splitting on each candidate feature.

    Returns
    -------
    int
        Index into ``feature_infos`` of the largest
        ``parent_info - feature_info``; the first such index on ties.

    Raises
    ------
    ValueError
        If ``feature_infos`` is empty.
    """
    gains = [parent_info - info for info in feature_infos]
    # max() keeps the first of several equal maxima, so ties resolve to the
    # lowest index.
    return max(range(len(gains)), key=gains.__getitem__)

# gain ratio
def get_max_gain_ratio(parent_info, feature_infos, feature_entropies):
    """Return the position of the feature with the largest gain ratio.

    Parameters
    ----------
    parent_info : number
        Entropy of the target before splitting.
    feature_infos : sequence of numbers
        Weighted target entropy after splitting on each candidate feature.
    feature_entropies : sequence of numbers
        Entropy of each candidate feature's own value distribution, used as
        the denominator of the ratio.

    Returns
    -------
    int
        Index of the largest ``(parent_info - feature_info) / feature_entropy``;
        the first such index on ties.

    Raises
    ------
    ValueError
        If the inputs are empty.
    """
    info_gains = np.array([parent_info - info for info in feature_infos])
    # Floor the denominators at 1E-8 so a feature whose entropy is zero does
    # not cause a division by zero.
    denominators = np.clip(np.array(feature_entropies), 1E-8, None)
    ratios = (info_gains / denominators).tolist()
    return ratios.index(max(ratios))

In [6]:
# Choose the feature on which to split the data
def get_split(df, target, splitting_criterion = 'information_gain'):
    """Choose the column of ``df`` on which to split with respect to ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of the current node; every column except ``target`` is treated
        as a candidate feature.
    target : str
        Name of the class-label column.
    splitting_criterion : str, optional
        ``'information_gain'`` ranks candidates by information gain. Any
        other value ranks them by gain ratio.

    Returns
    -------
    str
        Name of the selected feature column (the first one on ties).

    Raises
    ------
    ValueError
        If ``target`` is not a column of ``df``, or if ``df`` has no column
        other than ``target``.
    """
    features = df.columns.tolist()
    features.remove(target)
    if not features:
        raise ValueError(
            f'no candidate feature columns left besides target {target!r}'
        )

    # The pre-split entropy is the same for every candidate, so compute it once.
    parent_info = get_parent_information(df, target)
    feature_infos = [
        get_feature_information(df, feature, target) for feature in features
    ]

    # Pick the winner once, after all candidates have been scored.
    if splitting_criterion == 'information_gain':
        feature_idx = get_max_information_gain(parent_info, feature_infos)
    else:
        # Entropy of each feature's own value distribution; only the gain
        # ratio needs it.
        feature_entropies = [
            get_entropy(get_probabilities(df, feature)) for feature in features
        ]
        feature_idx = get_max_gain_ratio(parent_info, feature_infos, feature_entropies)

    return features[feature_idx]

In [7]:
class TreeNode:
    """Node of a tree that can print itself as an indented outline."""

    def __init__(self, data, path_data):
        """Create a detached node.

        Parameters
        ----------
        data
            Value shown for this node when the tree is printed.
        path_data
            Label printed in front of the value for non-root nodes.
        """
        self.val = data
        self.parent = None      # assigned by whoever attaches this node
        self.path = path_data
        self.children = []      # child nodes, in insertion order

    def get_level(self):
        """Return how many ancestors this node has (0 for a root)."""
        depth = 0
        ancestor = self.parent
        while ancestor:
            depth += 1
            ancestor = ancestor.parent
        return depth

    def print_tree(self):
        """Print this node, then each child subtree, indented by depth."""
        space = '       ' * self.get_level()

        if not self.parent:
            # Root: there is no incoming label to show.
            print(f'{space} {self.val}')
        else:
            print(f'{space} {self.path}: {self.val}')

        for child in (self.children or ()):
            child.print_tree()

In [9]:
def decision_tree_ID3(df, target, root = None, entity = None):
    """Recursively build a decision tree for ``target`` from the rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; every column except ``target`` is a candidate feature.
    target : str
        Name of the class-label column.
    root : optional
        Not read by this function; kept so existing calls that pass it
        continue to work.
    entity : optional
        Feature value that leads from the parent node to the node being
        built; stored as the node's ``path``.

    Returns
    -------
    TreeNode
        A leaf holding a class label, or an internal node holding the name
        of the splitting feature with one child per distinct feature value.

    Raises
    ------
    ValueError
        If no feature columns remain and ``df[target]`` has no values to
        vote on.
    """
    labels = df[target]

    # Pure node: every row carries the same label.
    if labels.nunique() == 1:
        return TreeNode(labels.unique()[0], entity)

    # Features exhausted while labels are still mixed (rows that agree on
    # every feature but disagree on the label): fall back to a majority-vote
    # leaf instead of calling get_split with nothing to split on.
    if not any(column != target for column in df.columns):
        label_counts = labels.value_counts()
        if label_counts.empty:
            raise ValueError(
                f'cannot create a leaf: column {target!r} has no values'
            )
        return TreeNode(label_counts.idxmax(), entity)

    splitting_feature = get_split(df, target, splitting_criterion = 'information_gain')
    node = TreeNode(splitting_feature, entity)

    # One subtree per distinct value of the chosen feature.
    for value in df[splitting_feature].unique().tolist():
        df_child = (
            df.loc[df[splitting_feature] == value, :]
            .reset_index(drop = True)
            .drop(columns = splitting_feature)
        )

        # Recursively build the child tree and link it to this node.
        child = decision_tree_ID3(df_child, target, root = None, entity = value)
        node.children.append(child)
        child.parent = node
    return node
            
# Build the ID3 tree that predicts the 'play' column, then print it.
target = 'play'
tree = decision_tree_ID3(df, target)
tree.print_tree()

 outlook
        sunny: humidity
               high: no
               normal: yes
        overcast: yes
        rainy: windy
               False: yes
               True: no
