In [1]:
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import common as cm

# Part 1: Information Gain

Important note: this exercise uses Pandas (for data manipulation and analysis) and Graphviz (for graph-drawing) libraries. 

This exercise consists of 3 parts. Complete the first part to get a mark of 3.0, the first two parts to get 4.0, complete all assignments to get 5.0. 

1.1 ) There are 10 objects (data) characterized with 5 binary attributes:

In [2]:
attributeNames = ["attr 1", "attr 2", "attr 3", "attr 4", "attr 5"]

data = pd.DataFrame(
    [
        [1, 0, 1, 1, 1],
        [1, 1, 0, 0, 1],
        [0, 1, 1, 1, 1],
        [1, 0, 1, 0, 1],
        [1, 0, 0, 1, 1],
        [0, 0, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 0, 0, 1, 1],
        [0, 1, 0, 0, 1],
        [0, 0, 0, 1, 1],
    ],
    columns=attributeNames,
)

1.2) Each object is assigned to either a class "0" or "1". The assignments are as follows (cl):

In [3]:
data["cl"] = [1, 1, 1, 0, 0, 1, 1, 1, 0, 0]

Hint: How one can read data (columns) in Pandas

In [4]:
print(data["cl"])
print(list(data["cl"]))
print(set(data["cl"]))
print(data["attr 1"])

0    1
1    1
2    1
3    0
4    0
5    1
6    1
7    1
8    0
9    0
Name: cl, dtype: int64
[1, 1, 1, 0, 0, 1, 1, 1, 0, 0]
{0, 1}
0    1
1    1
2    0
3    1
4    1
5    0
6    1
7    1
8    0
9    0
Name: attr 1, dtype: int64


1.3 )  Finish the below function for calculating entropy. $H(CL) = - \sum_{y \in CL}p(y)log_2p(y)$ It should return a value of entropy for an input vector CL. Assume that $log_2(0)$ is equal to 0.

In [5]:
def getEntropy(cl):
    class_num = len(set(cl))
    class_probab = [0]*class_num
    
    for val in cl:
        if val != class_num:
            class_probab[val] += 1
        else:
            class_probab[0] += 1
    for i in range(len(class_probab)):
        class_probab[i] /= len(cl)
        
    entropy = 0
    for probab in class_probab:
        if probab != 0:
            entropy += - probab * math.log(probab, 2)
    entropy = round(entropy, 5)
    return entropy

1.4 ) Calculate the entropy for the CL vector:

In [6]:
getEntropy(data["cl"])

0.97095

1.5) Finish the below function for calculating a conditional entropy: $H(CL|X) = - \sum_{x \in X} \sum_{y \in CL} p(x,y) log_2 \frac{p(x,y)}{p(x)}$. Assume that $log_2(0)$ is equal to 0 and if $p(x) = 0$, $\frac{p(x,y)}{p(x)}$ is equal to 0 as well.

In [7]:
def getConditionalEntropy(cl, attr):
    table = {0:[0, 0], 1:[0, 0]}
    # 'key' is att's value (0 or 1) and 'value' is list which contains count of belonging to that group with value of its index
    for i in range(len(attr)):
        att = attr[i]
        cl_att = cl[i]
        
        table[att][cl_att] += 1 # eg. att == 1 for cl == 0 occurs 4 times in the entire dataset, so we wanna count that
    
    for att, clas in table.items():
        clas_elems = sum(clas)
        partial_entropy = 0
        for val in clas:
            if clas_elems != 0:
                probab = (val / clas_elems)
                if probab != 0:
                    partial_entropy += -probab * math.log(probab, 2)
        table[att] = [partial_entropy, clas_elems]
    
    entropy = 0
    for att, entr_elems in table.items():
        clas_proba = entr_elems[1] / len(cl) # entr_elems[0] = partial entropy; entr_elems[1] = number of classes it belonged to
        entropy += clas_proba * entr_elems[0]
    
    return entropy

1.6 ) Calculate conditional entropies for given attribiutes.

In [8]:
print(getConditionalEntropy(data["cl"], data["attr 1"]))
print(getConditionalEntropy(data["cl"], data["attr 5"]))

0.9509775004326937
0.9709505944546686


1.7 ) Which entropy is lesser and why?

1.8) Finish the below function for calculating information gain:

In [9]:
def getInformationGain(cl, attr):
    conditionalEntropy = getConditionalEntropy(cl, attr)
    entropy = getEntropy(cl)
    information_gain = entropy - conditionalEntropy
    return round(abs(information_gain), 5)

In [10]:
print(getInformationGain(data["cl"], data["attr 1"]))
print(getInformationGain(data["cl"], data["attr 5"]))

0.01997
0.0


# Part 2: ID3 algorithm

Decision tree consists of decision nodes and leaves. Nodes split data while leaves classify objects. Consider the class "Node" provided below. It consists of 4 fields:
- attr - attribute ID (use the names in attributeNames vector)
- left - left branch, i.e., a reference to other node
- right - right branch, i.e., a reference to other node
- value - a decision. If node = None, then the node is not a leaf. If value is not None, then a node is considered a leaf. 

Method __call__ returns the decision if the node is a leaf (i.e., when value is not None). 
Otherwise, it calls either the left or the right branch of an input object, based on the attribute value (0 -> left children; 1 -> right children). In this way, we can traverse the decision tree in order to find the final decision.

In [11]:
class Node:
    def __init__(self, attr, left, right, value):
        self.attr = attr
        self.left = left
        self.right = right
        self.value = value

    def __call__(self, obj):
        if self.value is None:
            if obj[self.attr] == 0:
                return self.left(obj)
            else:
                return self.right(obj)
        else:
            return self.value
        
### EXAMPLE
def example(obj):
    root = Node(0, None, None, None)
    lChildren = Node(1, None, None, None)
    rChildren = Node(None, None, None, 2)
    root.left = lChildren
    root.right = rChildren
    llChildren = Node(None, None, None, 3)
    rrChildren = Node(None, None, None, 4)
    lChildren.left = llChildren
    lChildren.right = rrChildren
    print(root(obj))
    
example([0, 0])
example([0, 1])
example([1, 0])
example([1, 1])

3
4
2
2


2.1) Create an initial root. Set the value (decision) to 1. 

In [12]:
root = Node(0, None, None, 1)

2.2) Use a getErrorRate method in common.py auxiliary file to calculate the error rate. The decision is made based on the majority rule. In case of tie, the method takes 0 as the default class.

In [13]:
error_rate = round(cm.getErrorRate(root, data), 5)
print(error_rate)

0.4


2.3) Use printGraph method (see the common.py file) to draw the decision tree and save it in a png file.

In [14]:
cm.printGraph(root, data)

2.4) Calculate information gain for all attribiutes.

In [15]:
def printInformationGain(data):
    gains = []
    for attribute_name in attributeNames:
        gains.append(getInformationGain(data['cl'], data[attribute_name]))
    return gains
        
gains = printInformationGain(data)

2.5) Choose the best attribute to split the data. Construct two new nodes: one for $x_i$ = 0 decision and the second for $x_i$ = 1; connect them with the root (left and right branch). Remember to update the root. 

In [16]:
def get_indices_for_att(possible_val, att_values):
    indices = [i for i in range(len(att_values)) if att_values[i] == possible_val]
#     print('Indices for att = ' + str(possible_val) + ':', indices)
    return indices

def get_decision_value(att_best_gain, data):
    att_values = data[att_best_gain]
    classes = data['cl']
    possible_values = set(att_values)
    
    majority = []
    for possible_val in possible_values:
        indices = get_indices_for_att(possible_val, att_values)
        class_values = [classes[index] for index in indices]
        majority_class = max(set(class_values), key=class_values.count)
        majority.append(majority_class)
    return majority

best_gain = gains.index(max(gains))
att_best_gain = attributeNames[best_gain]
majority_classes = get_decision_value(att_best_gain, data)

left_node = Node(majority_classes[0], None, None, majority_classes[0])
right_node = Node(majority_classes[1], None, None, majority_classes[1])

root.attr = att_best_gain
root.left = left_node
root.right = right_node
root.value = None

2.6) Print the graph and calculate the error rate. What happened with the error rate?

In [17]:
cm.printGraph(root, data)
error_rate = round(cm.getErrorRate(root, data), 5)
print(error_rate)

0.3


2.7) Split the 'data' (table) based on the selected attribiute, i.e., create two new tables.

In [18]:
columns = ["attr 1", "attr 2", "attr 3", "attr 4", "attr 5", "cl"]
att_values = data[att_best_gain]

indices = get_indices_for_att(possible_val=0, att_values=att_values)
left_data = pd.DataFrame([data.values[index] for index in indices], columns=columns)

indices = get_indices_for_att(possible_val=1, att_values=att_values)
right_data = pd.DataFrame([data.values[index] for index in indices], columns=columns)

print('\nleft data:\n' + str(left_data))
print('\nright data:\n' + str(right_data))


left data:
   attr 1  attr 2  attr 3  attr 4  attr 5  cl
0       1       1       0       0       1   1
1       1       0       0       1       1   0
2       1       0       0       1       1   1
3       0       1       0       0       1   0
4       0       0       0       1       1   0

right data:
   attr 1  attr 2  attr 3  attr 4  attr 5  cl
0       1       0       1       1       1   1
1       0       1       1       1       1   1
2       1       0       1       0       1   0
3       0       0       1       1       1   1
4       1       1       1       1       1   1


2.8) Let us start with the left node. Firstly, calculate information gain for this node.

In [19]:
gains = printInformationGain(left_data)

2.9) Choose the best attribute to split the data and then update the decision tree.

In [20]:
best_gain = gains.index(max(gains))
att_best_gain = attributeNames[best_gain]
majority_classes = get_decision_value(att_best_gain, data)

left_node_1 = Node(majority_classes[0], None, None, majority_classes[0])
right_node_1 = Node(majority_classes[1], None, None, majority_classes[1])

left_node.attr = att_best_gain
left_node.left = left_node_1
left_node.right = right_node_1
left_node.value = None

2.10) Print the graph and calculate the error rate. What happened with the error rate?

In [22]:
cm.printGraph(root, data)
error_rate = round(cm.getErrorRate(root, data), 5)
print(error_rate)

0.2


2.11) Split data (remember that we split left_data, not data).

In [23]:
att_values = left_data[att_best_gain]

indices = get_indices_for_att(possible_val=0, att_values=att_values)
left_data_1 = pd.DataFrame([left_data.values[index] for index in indices], columns=columns)

indices = get_indices_for_att(possible_val=1, att_values=att_values)
right_data_1 = pd.DataFrame([left_data.values[index] for index in indices], columns=columns)

print('\nleft data:\n' + str(left_data_1))
print('\nright data:\n' + str(right_data_1))


left data:
   attr 1  attr 2  attr 3  attr 4  attr 5  cl
0       0       1       0       0       1   0
1       0       0       0       1       1   0

right data:
   attr 1  attr 2  attr 3  attr 4  attr 5  cl
0       1       1       0       0       1   1
1       1       0       0       1       1   0
2       1       0       0       1       1   1


2.12) Repeat the whole process for the right node.

In [24]:
gains = printInformationGain(right_data)

In [25]:
best_gain = gains.index(max(gains))
att_best_gain = attributeNames[best_gain]
majority_classes = get_decision_value(att_best_gain, data)

left_node_2 = Node(majority_classes[0], None, None, majority_classes[0])
right_node_2 = Node(majority_classes[1], None, None, majority_classes[1])

right_node.attr = att_best_gain
right_node.left = left_node_2
right_node.right = right_node_2
right_node.value = None

In [26]:
cm.printGraph(root, data)
error_rate = round(cm.getErrorRate(root, data), 5)
print(error_rate)

0.1


In [27]:
att_values = right_data[att_best_gain]

indices = get_indices_for_att(possible_val=0, att_values=att_values)
left_data_2 = pd.DataFrame([right_data.values[index] for index in indices], columns=columns)

indices = get_indices_for_att(possible_val=1, att_values=att_values)
right_data_2 = pd.DataFrame([right_data.values[index] for index in indices], columns=columns)

print('\nleft data:\n' + str(left_data_2))
print('\nright data:\n' + str(right_data_2))


left data:
   attr 1  attr 2  attr 3  attr 4  attr 5  cl
0       1       0       1       0       1   0

right data:
   attr 1  attr 2  attr 3  attr 4  attr 5  cl
0       1       0       1       1       1   1
1       0       1       1       1       1   1
2       0       0       1       1       1   1
3       1       1       1       1       1   1


2.13) Let's consider left-left node. Calculate information gain for it.

In [28]:
gains = printInformationGain(left_data_1)

2.14) Will adding a new node to the tree improve its effectiveness? Why? Why not?

2.15) Calculate information gain for the left-right node.

In [29]:
gains = printInformationGain(right_data_1)

In [30]:
best_gain = gains.index(max(gains))
att_best_gain = attributeNames[best_gain]
majority_classes = get_decision_value(att_best_gain, data)

left_node_3 = Node(majority_classes[0], None, None, majority_classes[0])
right_node_3 = Node(majority_classes[1], None, None, majority_classes[1])

right_node_1.attr = att_best_gain
right_node_1.left = left_node_3
right_node_1.right = right_node_3
right_node_1.value = None

In [31]:
cm.printGraph(root, data)
error_rate = round(cm.getErrorRate(root, data), 5)
print(error_rate)

0.1


In [32]:
att_values = right_data_1[att_best_gain]

indices = get_indices_for_att(possible_val=0, att_values=att_values)
left_data_3 = pd.DataFrame([right_data_1.values[index] for index in indices], columns=columns)

indices = get_indices_for_att(possible_val=1, att_values=att_values)
right_data_3 = pd.DataFrame([right_data_1.values[index] for index in indices], columns=columns)

print('\nleft data:\n' + str(left_data_3))
print('\nright data:\n' + str(right_data_3))


left data:
   attr 1  attr 2  attr 3  attr 4  attr 5  cl
0       1       0       0       1       1   0
1       1       0       0       1       1   1

right data:
   attr 1  attr 2  attr 3  attr 4  attr 5  cl
0       1       1       0       0       1   1


2.16) What happened with the error rate? Is it necessary to keep these two newly added leaves?

2.17) Finish creating the right side of the tree

In [33]:
gains = printInformationGain(right_data_2)

# Part 3: automated construction of decision trees

3.1 Complete the following function for automated construct of decision trees, so that it returns a decision tree for the given data and attribute list. Note that this is a recursive method, i.e., calls itself.

In [34]:
def _get_left_right_node(data, attributeNames, current_node):
    gains = printInformationGain(data)
    best_gain = gains.index(max(gains))
    
    att_best_gain = attributeNames[best_gain]
    
    if max(gains) == 0:
        return None
    
    majority_classes = get_decision_value(att_best_gain, data)

    left_node = Node(majority_classes[0], None, None, majority_classes[0])
    right_node = Node(majority_classes[1], None, None, majority_classes[1])

    current_node.attr = att_best_gain
    current_node.left = left_node
    current_node.right = right_node
    current_node.value = None

    return left_node, right_node

def _get_left_right_data(data):
    gains = printInformationGain(data)
    best_gain = gains.index(max(gains))
    att_best_gain = attributeNames[best_gain]
    
    att_values = data[att_best_gain]

    indices = get_indices_for_att(possible_val=0, att_values=att_values)
    left_data = pd.DataFrame([data.values[index] for index in indices], columns=data.columns)

    indices = get_indices_for_att(possible_val=1, att_values=att_values)
    right_data = pd.DataFrame([data.values[index] for index in indices], columns=data.columns)
    
    return left_data, right_data
    
def go_left(data, attributeNames, current_node, current_depth, max_depth):
    
    if current_depth == max_depth:
        return
    
    try:
        left_node, right_node = _get_left_right_node(data, attributeNames, current_node)
        current_depth += 1
    except: # best gain == 0, None returned
        return
    
    left_data, right_data = _get_left_right_data(data)
    
    go_right(right_data, attributeNames, right_node, current_depth, max_depth)
    go_left(left_data, attributeNames, left_node, current_depth, max_depth)

def go_right(data, attributeNames, current_node, current_depth, max_depth):
    
    if current_depth == max_depth:
        return
    
    try:
        left_node, right_node = _get_left_right_node(data, attributeNames, current_node)
        current_depth += 1
    except: # best gain == 0, None returned
        return
    
    left_data, right_data = _get_left_right_data(data)
    
    go_left(left_data, attributeNames, left_node, current_depth, max_depth)
    go_right(right_data, attributeNames, right_node, current_depth, max_depth)

def create_tree(data, attributeNames, max_depth=0, current_node=None, left_data=None, right_data=None):
    data = data.reset_index().drop("index", axis=1)
    
    left_node = None
    right_node = None
    
    major = max(set(list(data['cl'].values)), key=list(data['cl'].values).count)
    current_node = Node(0, left_node, right_node, major)
        
    left_node, right_node = _get_left_right_node(data, attributeNames, current_node)

    current_depth = 1

    left_data, right_data = _get_left_right_data(data)
    go_left(left_data, attributeNames, current_node, current_depth, max_depth)

    current_depth = 1
    go_right(right_data, attributeNames, current_node, current_depth, max_depth)
            
    return current_node

3.2) Build a decision tree for a training dataset in the common.py auxiliary file, for diffrent values of max_depth.  Calculate & compare the error rates for training and validation datasets.

In [35]:
max_depth = 10

In [36]:
attributeNames_val, validation_data = cm.getValidationDataSet()
attributeNames, data = cm.getTrainingDataSet()

root = create_tree(data, attributeNames, max_depth)

print('validation data:\n' + str(validation_data))
print('\ndata:\n' + str(data))

cm.printGraph(root, data)
error_rate = round(cm.getErrorRate(root, data), 5)
print('\nError rate: ' + str(error_rate))

validation data:
   attr 1  attr 2  attr 3  attr 4  attr 5  attr 6  cl
0       0       0       0       0       0       0   0
1       0       0       0       0       0       1   0
2       0       0       0       1       1       1   1
3       0       1       1       0       0       1   0
4       0       1       0       1       0       1   0
5       1       0       0       1       1       1   1
6       1       1       1       1       0       1   1
7       1       1       0       1       0       1   1
8       1       1       0       0       1       0   0
9       0       1       1       1       1       1   1

data:
    attr 1  attr 2  attr 3  attr 4  attr 5  attr 6  cl
0        0       0       1       1       0       1   0
1        1       1       1       1       0       1   0
2        0       1       1       0       1       0   1
3        1       1       0       0       0       0   1
4        0       1       1       1       0       1   1
5        1       1       0       1       1       1  

3.3) Consider only the training data set and answer the following questions:
* What is the miximum depth of the tree (consider only the training data set)?       
* The tree building process should stop when there is no improvement in error rate (why?). Check for which value of "max_dept" there is no improvement in error rate. 

In [38]:
for i in range(10):
    max_depth = i
    root = create_tree(data, attributeNames, max_depth)
    cm.printGraph(root, data, additional_name=str(i))
    error_rate = round(cm.getErrorRate(root, data), 5)
    print('\nError rate: ' + str(error_rate))


Error rate: 0.35

Error rate: 0.35

Error rate: 0.4

Error rate: 0.4

Error rate: 0.4

Error rate: 0.35

Error rate: 0.35

Error rate: 0.35

Error rate: 0.35

Error rate: 0.35
