In [88]:
import numpy as np
from matplotlib import pyplot as plt
from pprint import pprint

# class definitions

class Tree:

    edge_counter = 0
    
    def entropy(X):

        # compute probability distribution of attribute X
        unique, counts = np.unique(X, return_counts=True)
        px = counts / len(X)

        # return the log base 2 entropy
        return -np.dot(px, np.log2(px))

    def compute_gain(S, i):
        """ computes the information gain H(S) - H(S|i)
        """
        H_S = entropy(S)

        # compute the conditional entropy H(S|i)
        avg_H = 0
        unique, counts = np.unique(i, return_counts=True)
        s = 1 / len(S)
        for j in range(len(unique)):
            idx = np.where(unique[j] == i)[0]
            avg_H += len(idx) * s * entropy(S[idx])

        return H_S - avg_H

    def majority_leaf(data):
        unique, counts = np.unique(data[:,0], return_counts=True)
        return unique[np.argmax(counts)]
    
    def ID3(self, d, n, data):
        """ this function implements the ID3 algorithm based on the
            definition given in Lab2.
        input: 
            d := maximum depth of the tree, will be reduced on each level
            n := the maximum number of nodes. The static edge_counter variable
            shall not exceed the number of n - 1 edges.
        data := 2d numpy array {columns: features} X {rows: samples}
   
        output:
            tree := the binary decision tree for the dataset
        """
    
    
        # return a leaf in case of
        #   - uniform labels
        #   - maximal depth
        #   - maximum number of n nodes with n-1 edges reached
        if (len(np.unique(data[:,0])) == 1) or (d == 1) or (self.edge_counter >= n-1):
            return majority_leaf(data)
        
        # else investigate node further
        else:
            
            # return a leaf if no more attributes are left 
            if data.shape[1] == 1:
                return majority_leaf(data)
            
            # compute the information gain IG for each attribute
            gains = [compute_gain(data[:,0], data[:,x]) for x in range(1, data.shape[1])]    
            
            # get the attribute with highest IG
            split_column = np.argmax(gains)
            
            # compute separate row indices for 'y' and 'n'
            # according to the attribute with highest IG
            yea_idx = np.where(data[:,split_column] == 'y')[0]
            nea_idx = np.where(data[:,split_column] == 'n')[0]

            # return a leaf if samples are empty
            if len(nea_idx) == 0:
                return majority_leaf(data)
            if len(yea_idx) == 0:
                return majority_leaf(data)
            
            # split the data and remove the attribute with highest IG
            data_yea = np.delete(data[yea_idx,:], split_column, 1)
            data_nea = np.delete(data[nea_idx,:], split_column, 1)

            # label the decision node
            decision = 'attr_' + str(split_column)

            # instantiate sub-tree for the decision node
            tree = {decision: []}
            
            # increase the edge counter and reduce the max depth
            self.edge_counter +=2
            d -= 1

            yea = self.ID3(d, n, data_yea)
            nea = self.ID3(d, n, data_nea)

            if yea == nea:
                tree = yea
            else:
                tree[decision].append(yea)
                tree[decision].append(nea)
            return tree

# function definitions

def load_data(fname):
    data = np.genfromtxt(fname, delimiter=',', dtype=str)
        
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            if data[i][j] == '?':
                
                # for each element set a random choice
                data[i][j] = np.random.choice(['y', 'n'])
    return data
        
def split_data(data, split):
    
    # create permuted indices for training and test sets
    perm = np.random.permutation(np.indices((len(data),))[0])
    
    # calculate the number of rows according to the given percentage
    nrows = int(split * len(data))
    training_set = data[perm[:nrows]]
    test_set = data[perm[nrows:]]
    return training_set, test_set

def learning_curve(d, n, training_set, test_set):
    # you will probably need additional helper functions
    return plot

# main

data = load_data('house-votes-84.data')

k = np.array([1,32,4,5])

# set seed before splitting the data
np.random.seed(666)
tr_s, te_s = split_data(data, .7)

n = 20
d = 16

Tree = Tree()

pprint(Tree.ID3(d, n, tr_s))

Tree.edge_counter = 0
pprint(Tree.ID3(d, n, data))

{'attr_3': ['democrat',
            {'attr_9': [{'attr_2': ['republican',
                                    {'attr_1': ['democrat', 'republican']}]},
                        {'attr_8': ['democrat', 'republican']}]}]}
{'attr_3': ['democrat',
            {'attr_2': [{'attr_1': [{'attr_7': ['republican',
                                                {'attr_6': ['democrat',
                                                            'republican']}]},
                                    'republican']},
                        'republican']}]}


### References:
idea for building a tree: https://github.com/SebastianMantey/Decision-Tree-from-Scratch/blob/master/notebooks/handling%20continuous%20and%20categorical%20variables.ipynb