In [1]:
import numpy as np
import math
import pandas as pd

def read_data(filename):
    data = pd.read_csv(filename)
    print(data, "\n")

    headers = list(data.columns)
    metadata = headers[:-1]  # Exclude the last column which is the target
    traindata = np.array(data.values)

    return (metadata, traindata)

class Node:
    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []
        self.answer = ""
        
    def __str__(self):
        return self.attribute

def subtables(data, col, delete):
    unique_items = np.unique(data[:, col])
    count = np.zeros((unique_items.shape[0], 1), dtype=np.int32)
    dict = {}

    for x in range(unique_items.shape[0]):
        for y in range(data.shape[0]):
            if data[y, col] == unique_items[x]:
                count[x] += 1
                
    for x in range(unique_items.shape[0]):
        dict[unique_items[x]] = np.empty((int(count[x]), data.shape[1]), dtype="|S32")
        pos = 0
        for y in range(data.shape[0]):
            if data[y, col] == unique_items[x]:
                dict[unique_items[x]][pos] = data[y]
                pos += 1       
        if delete:
            dict[unique_items[x]] = np.delete(dict[unique_items[x]], col, 1)
        
    return unique_items, dict

def entropy(S):
    items, counts = np.unique(S, return_counts=True)

    if items.size == 1:
        return 0
    
    probs = counts / len(S)
    entropy = - np.sum(probs * np.log2(probs))
    return entropy

def gain_ratio(data, col):
    items, dict = subtables(data, col, delete=False) 
                
    total_size = data.shape[0]
    entropies = np.zeros((items.shape[0], 1))
    intrinsic = np.zeros((items.shape[0], 1))
    
    for x in range(items.shape[0]):
        ratio = dict[items[x]].shape[0] / total_size
        entropies[x] = ratio * entropy(dict[items[x]][:, -1])
        intrinsic[x] = ratio * math.log2(ratio)
        
    total_entropy = entropy(data[:, -1])
    iv = - np.sum(intrinsic)
    
    for x in range(entropies.shape[0]):
        total_entropy -= entropies[x]
        
    return total_entropy / iv

def create_node(data, metadata):
    if (np.unique(data[:, -1])).shape[0] == 1:
        node = Node("")
        node.answer = np.unique(data[:, -1])[0]
        return node
        
    gains = np.zeros((data.shape[1] - 1, 1))
    
    for col in range(data.shape[1] - 1):
        gains[col] = gain_ratio(data, col)
        
    split = np.argmax(gains)
    
    node = Node(metadata[split])    
    metadata = np.delete(metadata, split, 0)    
    
    items, dict = subtables(data, split, delete=True)
    
    for x in range(items.shape[0]):
        child = create_node(dict[items[x]], metadata)
        node.children.append((items[x], child))
    
    return node

def empty(size):
    s = ""
    for x in range(size):
        s += "   "
    return s

def print_tree(node, level):
    if node.answer != "":
        print(empty(level), node.answer)
        return
    print(empty(level), node.attribute)
    for value, n in node.children:
        print(empty(level + 1), value)
        print_tree(n, level + 2)

# Usage example:
metadata, traindata = read_data("3rd program tennisdata.csv ")
data = np.array(traindata)
node = create_node(data, metadata)
print_tree(node, 0)


     outlook temperature humidity    wind playtennis
0      sunny         hot     high    weak         no
1      sunny         hot     high  strong         no
2   overcast         hot     high    weak        yes
3       rain        mild     high    weak        yes
4       rain        cool   normal    weak        yes
5       rain        cool   normal  strong         no
6   overcast        cool   normal  strong        yes
7      sunny        mild     high    weak         no
8      sunny        cool   normal    weak        yes
9       rain        mild   normal    weak        yes
10     sunny        mild   normal  strong        yes
11  overcast        mild     high  strong        yes
12  overcast         hot   normal    weak        yes
13      rain        mild     high  strong         no 

 outlook
    overcast
       b'yes'
    rain
       wind
          b'strong'
             b'no'
          b'weak'
             b'yes'
    sunny
       humidity
          b'high'
             b'no'
      