In [12]:
from collections import Counter
import numpy as np
from math import log
import copy

In [13]:
class TreeNode:
    """
    A single node of a rooted tree whose children are keyed by a branch value.

    The node itself attaches no meaning to its fields; see the comments in
    ``__init__`` for how ``DecisionTree`` in this file uses them.
    """

    def __init__(self, attr):
        """Create a childless node carrying ``attr`` and no label yet."""
        # Prediction stored on the node; stays None until a caller assigns it.
        self.label = None
        # Caller-supplied payload: DecisionTree passes the attribute name an
        # internal node splits on, a branch value, or None for plain leaves.
        self.attr = attr
        # Maps a branch value to the child node reached through that branch.
        self.children = dict()

    def add_child(self, val, node):
        """Attach ``node`` under key ``val``, replacing any child already stored there."""
        self.children.update({val: node})
        

In [14]:
class DecisionTree():
    """
    ID3-style decision tree over categorical attributes.

    Samples are indexed by attribute name (``sample[attr]``) and ``attrs`` maps
    each attribute name to the values it may take.  The tree is made of
    ``TreeNode`` objects: an internal node's ``attr`` names the attribute it
    tests, its ``children`` are keyed by attribute value, and its ``label``
    holds the majority label of the training samples that reached it (used as
    a fallback prediction).
    """

    # Impurity measures understood by ``info_gain``.
    _MEASURES = ('gini_index', 'majority_error', 'Entropy')

    def __init__(self, gain_type, depth):
        """
        Parameters
        ----------
        gain_type : str
            Impurity measure: 'gini_index', 'majority_error' or 'Entropy'.
        depth : int
            Maximum number of splits along any root-to-leaf path.
        """
        self.gain_type = gain_type
        self.depth = depth

    def info_gain(self, labels):
        """
        Return the impurity of ``labels`` under ``self.gain_type``.

        Despite its name this is the impurity of one label collection, not a
        gain; ``select_feature`` turns it into a gain.  An empty collection has
        impurity 0.0.

        Raises
        ------
        ValueError
            If ``self.gain_type`` is not one of the supported measures.
        """
        if self.gain_type not in self._MEASURES:
            raise ValueError(
                "unknown gain_type %r; expected one of %s"
                % (self.gain_type, ', '.join(self._MEASURES))
            )

        n = len(labels)
        if n == 0:
            # Nothing to be impure about; also avoids indexing an empty
            # most-common list for the majority-error measure.
            return 0.0

        counts = Counter(labels).values()

        if self.gain_type == 'gini_index':
            return 1 - sum((c / n) ** 2 for c in counts)

        if self.gain_type == 'majority_error':
            return 1 - (max(counts) / n)

        # 'Entropy' (base 2)
        return -sum(c / n * log(c / n, 2) for c in counts)

    def select_feature(self, S, attrs, labels):
        """
        Return the attribute in ``attrs`` with the largest impurity reduction.

        Only the values listed in ``attrs[attr]`` contribute to the weighted
        impurity.  Ties go to the attribute that comes first when iterating
        ``attrs``.  Returns None when ``attrs`` is empty.
        """
        total = len(labels)
        impurity_D = self.info_gain(labels)

        # -inf so that any computed gain, however small, beats the start value.
        max_gain = float('-inf')
        best_attr = None

        for attr in attrs:
            weighted_impurity = 0

            for v in attrs[attr]:
                Sv_labels = [label for i, label in enumerate(labels) if S[i][attr] == v]
                if not Sv_labels:
                    # Empty partitions carry zero weight; skipping them also
                    # avoids dividing by zero when ``labels`` is empty.
                    continue
                weighted_impurity += (len(Sv_labels) / total) * self.info_gain(Sv_labels)

            # Computed once per attribute, after all of its values are summed,
            # so an attribute with no listed values never reuses a stale gain.
            gain_x = impurity_D - weighted_impurity

            if gain_x > max_gain:
                max_gain = gain_x
                best_attr = attr

        return best_attr

    def common_label(self, label_list):
        """
        Return the most frequent label in ``label_list``.

        Raises IndexError when ``label_list`` is empty.
        """
        return Counter(label_list).most_common(1)[0][0]

    def ID3(self, S, attrs, labels, depth):
        """
        Recursively build the subtree for samples ``S`` with ``labels``.

        Recursion stops with a leaf carrying the majority label when all labels
        agree, when ``attrs`` is empty, or when ``depth`` reaches 0.  A negative
        ``depth`` never equals 0, so only the first two conditions apply then.

        Returns the root ``TreeNode`` of the subtree.
        """
        common_label = self.common_label(labels)

        if len(set(labels)) == 1 or not attrs or depth == 0:
            leaf = TreeNode(None)
            leaf.label = common_label
            return leaf

        best_attr = self.select_feature(S, attrs, labels)

        root = TreeNode(best_attr)
        # Remember the majority label so prediction can fall back to it when an
        # instance carries a value this node has no branch for.
        root.label = common_label

        # The remaining attributes are the same for every branch and are never
        # mutated below, so build the reduced mapping once.
        sub_attrs = {name: values for name, values in attrs.items() if name != best_attr}

        for v in attrs[best_attr]:
            S_val = []
            S_val_labels = []
            for sample, label in zip(S, labels):
                if sample[best_attr] == v:
                    S_val.append(sample)
                    S_val_labels.append(label)

            if not S_val:
                # No training sample takes this value: predict the parent's majority.
                new_branch = TreeNode(v)
                new_branch.label = common_label
                root.add_child(v, new_branch)
            else:
                root.add_child(v, self.ID3(S_val, sub_attrs, S_val_labels, depth - 1))

        return root

    def build_tree(self, S, attrs, labels):
        """Train on samples ``S`` / ``labels`` and store the result in ``self.root``."""
        self.root = self.ID3(S, attrs, labels, self.depth)

    def fit(self, instance):
        """
        Predict the label of a single ``instance``.

        Despite its name this does not train anything; it walks the tree built
        by ``build_tree``.  If the instance has a value with no matching branch,
        the walk stops and the current node's ``label`` is returned instead of
        looping forever.

        Raises AttributeError if ``build_tree`` has not been called, and
        whatever ``instance[attr]`` raises (KeyError for dicts) when the
        instance lacks an attribute the tree tests.
        """
        node = self.root

        while node.children:
            value = instance[node.attr]
            if value not in node.children:
                return node.label
            node = node.children[value]

        return node.label

    def cls(self, S):
        """Return the list of predicted labels for every sample in ``S``."""
        return [self.fit(s) for s in S]

    def Error(self, pred_labels, true_labels):
        """
        Return the fraction of ``true_labels`` that were not predicted correctly.

        Pairs are formed with ``zip``; the denominator is ``len(true_labels)``,
        so missing predictions count as errors.  Raises ZeroDivisionError when
        ``true_labels`` is empty.
        """
        correct = sum(1 for pred, true in zip(pred_labels, true_labels) if pred == true)
        return 1 - correct / len(true_labels)

In [15]:
if __name__ == '__main__':

    # Presumably the class labels found in the last CSV column; not referenced
    # again in this cell.
    label = ['unacc', 'acc', 'good', 'vgood']

    # Attribute name -> values that attribute may take.
    attrs = {'buying' : ['vhigh', 'high', 'med', 'low'],
                   'maint' : ['vhigh', 'high', 'med', 'low'],
                   'doors' : ['2', '3', '4', '5more'],
                   'persons' : ['2', '4', 'more'],
                   'lug_boot' : ['small', 'med', 'big'],
                   'safety' : ['low', 'med', 'high']}

    # Column order of the CSV files; the final column is the label.
    attr_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']

    train_x=[]
    train_y=[]

    with open ('train.csv' ,'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank line would otherwise become a sample with no
                # attributes and the label '', which breaks tree construction.
                continue

            s = {}
            term = line.split(',')
            for idx, item in enumerate(term[:-1]):
                s[attr_names[idx]] = item

            train_x.append(s)
            train_y.append(term[-1])

    tree = DecisionTree("Entropy", 6)
    tree.build_tree(train_x, attrs, train_y)
    

In [16]:
# Load the test set: every column but the last is an attribute value (named via
# attr_names from the training cell); the last column is the label.
test_x=[]
test_y=[]

with open ('test.csv' ,'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            # A blank line would otherwise become a sample with no attributes
            # and the label '', which makes prediction fail with a KeyError.
            continue

        s = {}
        term = line.split(',')
        for idx, item in enumerate(term[:-1]):
            s[attr_names[idx]] = item

        test_x.append(s)
        test_y.append(term[-1])

In [17]:
# Train a depth-3 tree with the Gini criterion and score it on the training set.
tree = DecisionTree("gini_index", 3)
tree.build_tree(train_x, attrs, train_y)

train_predicted = tree.cls(train_x)
train_error = tree.Error(train_predicted, train_y)

# Single-element list holding the error measured above.
train_errors = [train_error]

In [18]:
# Display the list of training errors.
train_errors

[0.17600000000000005]

In [19]:
# Train a depth-4 tree with the Gini criterion and score it on the test set.
tree = DecisionTree("gini_index", 4)
tree.build_tree(train_x, attrs, train_y)

test_predicted = tree.cls(test_x)
test_error = tree.Error(test_predicted, test_y)

# Single-element list holding the error measured above.
test_errors = [test_error]

In [20]:
# Display the list of test errors.
test_errors

[0.1332417582417582]