In [59]:
from collections import Counter
import numpy as np
from math import log
import copy

In [60]:
class TreeNode:
    
    """
    A class for nodes in a rooted Tree
    """
    def __init__(self, attr):
        self.label = None
        self.attr = attr
        self.children ={}
        
    def add_child(self,val,node):
        self.children[val]=node
        

In [61]:
class DecisionTree():
    
    def __init__(self, gain_type, depth):
        
        self.gain_type = gain_type
        self.depth = depth
        
    def info_gain(self, labels):

        measure = 0

        if self.gain_type == 'gini_index':
            n = len(labels)
            counter = Counter(labels)
            measure= 1 - sum((counter[count]/n)**2 for count in counter)
        
            return measure
  
        elif self.gain_type == 'majority_error':
            n = len(labels)
            counter = Counter(labels)
            majority = counter.most_common(1)[0][1]
            measure = 1 - (majority/n)
        
            return measure
    
        elif self.gain_type== 'Entropy':
            n=len(labels)
            counter = Counter(labels)
            measure= -sum(counter[count]/n * log(counter[count]/n, 2) for count in counter)

            return measure
        
    def select_feature(self, S, attrs, labels):
        
        entropy_D = self.info_gain(labels)
        
        max_gain = -1
        best_attr = None

        for attr in attrs:
            entropy_x= 0
            
            for v in attrs[attr]:
                    Sv_labels = [label for i, label in enumerate(labels) if S[i][attr] == v]
                    entropy_x += (len(Sv_labels)/len(labels)) * self.info_gain(Sv_labels)
                    gain_x=entropy_D-entropy_x
                    
            if gain_x > max_gain:
                max_gain = gain_x
                best_attr = attr

        return best_attr
     
    def common_label(self, label_list):
        
        count = Counter(label_list)
        C_L=count.most_common(1)[0][0]
        
        return C_L

    def ID3(self, S, attrs, labels, depth):
        
        common_label = self.common_label(labels)
        
         # If all examples have same label; or Attributes empty; or max_depth
            
        if len(set(labels)) == 1 or not attrs or depth == 0:
            leaf = TreeNode(None)
            leaf.label = common_label
            return leaf
        
        best_attr = self.select_feature(S, attrs, labels)

        root = TreeNode(best_attr)

        for v in attrs[best_attr]:
            new_branch = TreeNode(v)

            S_val = [val for i, val in enumerate(S) if S[i][best_attr] == v]
            S_val_labels = [label for i, label in enumerate(labels) if S[i][best_attr] == v]

            if not S_val:
                new_branch.label = common_label
                root.add_child(v, new_branch)
            else:
                sub_attrs = copy.deepcopy(attrs)
                sub_attrs.pop(best_attr)

                root.add_child(v, self.ID3(S_val, sub_attrs, S_val_labels, depth - 1))

        return root

    def build_tree(self, S, attrs, labels):
        
        self.root = self.ID3(S, attrs, labels, self.depth)
        
    def fit(self, instance):
        
        root=self.root
        
        while root.children:
            attr = instance[root.attr]
            if attr in root.children:
                root = root.children[attr]

        return root.label
    
    def cls(self, S):
        pred_labels = []
        for s in S:
            pred_label=self.fit(s)
            pred_labels.append(pred_label)
            
        return pred_labels

        
    def Error(self, pred_labels, true_labels):
        count = 0
        for i, j in zip(pred_labels, true_labels):
            if i == j:
                count += 1   
        return 1 - count/len(true_labels)

In [62]:
features = {'age': [0, 1],  # converted to binary
        'job': ['admin.', 'unknown', 'unemployed', 'management', 'housemaid', 'entrepreneur', 'student', 'blue-collar', 'self-employed', 'retired', 'technician', 'services'], 
        'marital': ['married','divorced','single'], 
        'education': ['unknown', 'secondary', 'primary', 'tertiary'],
        'default': ['yes', 'no'],
        'balance': [0, 1],  # converted to binary
        'housing': ['yes', 'no'],
        'loan': ['yes', 'no'],
        'contact': ['unknown', 'telephone', 'cellular'],
        'day': [0, 1],  # converted to binary,
        'month': ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'],
        'duration': [0, 1],  # converted to binary
        'campaign': [0, 1],  # converted to binary
        'pdays': [0, 1],  # converted to binary
        'previous': [0, 1],  # converted to binary
        'poutcome': ['unknown', 'other', 'failure', 'success']}

In [63]:
if __name__ == '__main__':
    
    attrs = {}
    attr_names = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays','previous' , 'poutcome']
    attr_numeric = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
    attr_unknown=['job', 'education', 'contact', 'poutcome']
    
    train_x=[]
    train_y=[]
    with open ('train.csv' ,'r') as file:
        for line in file:
            s = {}
            term = line.strip().split(',')
            for idx, item in enumerate(term[:-1]):
                 s[attr_names[idx]] = item

            train_x.append(s)
            train_y.append(term[-1])

convert numeric variable to binary variable in train dataset

In [64]:
#make an empty list of list to extract the numeric variable in train data
Median_numeric_list = [[] for __ in range(len(attr_numeric))]

for element in train_x:
    for idx, value in enumerate(attr_numeric):
        # convert to float
        sv = float(s[value])
        Median_numeric_list[idx].append(sv)
        
for idx, med in enumerate(Median_numeric_list):
       Median_numeric_list[idx] = np.median(med)

#convert a numerical feature to a binary one
for (attr, med) in zip(attr_numeric, Median_numeric_list):
        #feature is bigger (or less) than median
        for item in train_x:
            item[attr] = 'bigger' if float(item[attr]) >= float(med) else 'less'

In [65]:
for item in train_x:
    for idx, val in enumerate(item):
        attr = item[val]
        
        if val not in attrs:
            attrs[val] = []
        if attr not in attrs[val]:
                attrs[val].append(attr)

In [66]:
#test data
test_x=[]
test_y=[]
    
with open ('test.csv' ,'r') as file:
    for line in file:
        s = {}
        term = line.strip().split(',')
        for idx, item in enumerate(term[:-1]):
            s[attr_names[idx]] = item

        test_x.append(s)
        test_y.append(term[-1])    

Treat unknow as missing value

In [67]:
#extract the unknown variables as missing value
unknown_list = [[] for __ in range(len(attr_unknown))]

for element in train_x:
    for idx, ele in enumerate(attr_unknown):
        unknown_list[idx].append(element[ele])
        
#use majority replace the unknow       
unknown_list= [Counter(each_unknow).most_common(1)[0][0] for each_unknow in unknown_list]
for (attr, each_unknow) in zip(attr_unknown, unknown_list):
    for item in train_x:
        item[attr] = each_unknow

for (attr, each_unknow) in zip(attr_unknown,unknown_list):
    for element in test_x:
        element[attr] = each_unknow

convert numeric variable to binary variable in test dataset

In [68]:
for (attr, med) in zip(attr_numeric, Median_numeric_list):
        #feature is bigger (or less) than median
        for item in test_x:
            item[attr] = 'bigger' if float(item[attr]) >= float(med) else 'less'

In [258]:
clssifier = DecisionTree(gain_type="gini_index", depth=13)
clssifier.build_tree(train_x, attrs, train_y)

In [259]:
train_errors = []
train_predicted = clssifier.cls(train_x)
train_error = clssifier.Error(train_predicted, train_y)
train_errors.append(train_error)

In [260]:
train_errors

[0.09079999999999999]

In [261]:
test_errors = []
test_predicted = clssifier.cls(test_x)
test_error = clssifier.Error(test_predicted, test_y)
test_errors.append(test_error)

In [262]:
test_errors

[0.13460000000000005]