# MLEA : Arbres de Décisions

## ID3

### Toy Problem: Play Tennis?

#### Data

In [32]:
from io import StringIO
from graphviz import Digraph
import pandas as pd
import numpy as np
import ete3

In [33]:
# In-memory CSV holding the feature columns of the "Play Tennis" toy dataset:
# a header row followed by 14 observations of four categorical attributes.
# The class labels are not part of this text; they are defined separately
# in a later cell.
csv_data = StringIO("""
Outlook,Temperature,Humidity,Wind
Sunny,Hot,High,Weak
Sunny,Hot,High,Strong
Overcast,Hot,High,Weak
Rain,Mild,High,Weak
Rain,Cool,Normal,Weak
Rain,Cool,Normal,Strong
Overcast,Cool,Normal,Strong
Sunny,Mild,High,Weak
Sunny,Cool,Normal,Weak
Rain,Mild,Normal,Weak
Sunny,Mild,Normal,Strong
Overcast,Mild,High,Strong
Overcast,Hot,Normal,Weak
Rain,Mild,High,Strong
""")

In [34]:
# Parse the in-memory CSV text; the first non-blank line is the header row.
df = pd.read_csv(
    csv_data,
    sep=',',
    header=0,
)

In [35]:
# Show the index labels of the rows whose Outlook is 'Sunny', then display
# the whole frame as the cell result.
print(df.index[df['Outlook'] == 'Sunny'])
df

Int64Index([0, 1, 7, 8, 10], dtype='int64')


Unnamed: 0,Outlook,Temperature,Humidity,Wind
0,Sunny,Hot,High,Weak
1,Sunny,Hot,High,Strong
2,Overcast,Hot,High,Weak
3,Rain,Mild,High,Weak
4,Rain,Cool,Normal,Weak
5,Rain,Cool,Normal,Strong
6,Overcast,Cool,Normal,Strong
7,Sunny,Mild,High,Weak
8,Sunny,Cool,Normal,Weak
9,Rain,Mild,Normal,Weak


In [36]:
# The feature matrix is the whole frame; the class labels are given
# separately, one entry per row of the frame, in row order.
x = df
labels = np.array([
    "No", "No", "Yes", "Yes", "Yes", "No", "Yes",
    "No", "Yes", "Yes", "Yes", "Yes", "Yes", "No",
])

### Implementation

In [54]:
class Node:
    """Tree node used to store the result of the ID3 construction.

    A node keeps the attribute it splits on (``attribute``), an optional
    class label (``label``), the attribute of the node it was created from
    (``parent_attrib``) and its children, keyed by the edge value leading
    to each of them. Every node also holds the shared ``dot`` graph object
    so that it can register itself for rendering.
    """

    def __init__(self, dot, parent_attrib):
        self.dot = dot
        self.parent_attrib = parent_attrib
        self._children = {}
        self.label = None
        self.attribute = None

    def setLabel(self, label):
        """Store the class label carried by this node.

        Parameters
        ----------

        label:
            Value of the label.

        """
        self.label = label

    def update(self, attribute):
        """Set the attribute of this node and register it in the dot graph.

        The attribute is declared as a graph node, and, when the node was
        created with a parent attribute, an edge from that parent attribute
        to this one is added as well.

        Parameters
        ----------

        attribute:
            Value of the attribute of the node.

        """
        self.attribute = attribute

        # The attribute name is used both as the graph node identifier and
        # as its displayed text, which keeps the rendering code trivial.
        self.dot.node(attribute, attribute)

        if self.parent_attrib is None:
            # Nothing to link to: this node was created without a parent.
            return
        self.dot.edge(self.parent_attrib, self.attribute, constraint='false')

    def add(self, idx, attribute=None):
        """Create a child node, store it under the edge value ``idx`` and
        return it.

        The child shares this node's dot graph and records this node's
        current attribute as its parent attribute.

        Parameters
        ----------

        idx:
            Name of the edge added between self and the new node.

        attribute:
            Not used by this method.

        """
        new_node = Node(self.dot, self.attribute)
        self._children[idx] = new_node
        return new_node
    
class ID3:
    """ID3 decision tree built from categorical attributes.

    The tree is built at construction time from a pandas DataFrame of
    attributes and a sequence of class labels. Rows are addressed by
    position everywhere (row ``i`` of ``x`` goes with ``y[i]``), so the
    index labels of the DataFrame do not matter.

    After construction, ``root`` is the root ``Node`` of the tree and
    ``dot`` is the Graphviz graph the nodes registered themselves in.
    """

    @staticmethod
    def entropy(s, unique_labels):
        """Computes the entropy (in bits) of a collection of labels.

        Parameters
        ----------

        s:
            Array-like of labels on which the entropy is computed.

        unique_labels:
            Iterable of the distinct labels to account for.

        Returns
        -------

        The entropy of ``s``; 0.0 when ``s`` is empty or contains a single
        distinct label.

        """
        s = np.asarray(s)
        nb_rows = len(s)
        if nb_rows == 0:
            return 0.0

        entropy = 0.0
        for label in unique_labels:
            label_count = np.count_nonzero(s == label)
            if label_count == 0:
                # By convention 0 * log2(0) = 0: absent labels contribute
                # nothing, so no epsilon is needed to protect the logarithm.
                continue
            proba = label_count / nb_rows
            entropy -= proba * np.log2(proba)
        return entropy

    @staticmethod
    def gain(s, target_attrib, attributes, x, y, unique_labels):
        """Computes the information gain obtained by splitting the rows
        ``s`` on the attribute ``target_attrib``.

        Parameters
        ----------

        s:
            Positions of the rows forming the subset to split.

        target_attrib:
            Name of the column of ``x`` used for the split.

        attributes:
            Mapping from attribute name to the collection of values this
            attribute can take.

        x:
            DataFrame containing the attribute columns.

        y:
            Array-like of labels, one per row of ``x``.

        unique_labels:
            Iterable of the distinct labels.

        Returns
        -------

        The entropy of the subset minus the weighted entropies of the
        partitions induced by each attribute value; 0.0 when ``s`` is empty.

        """
        rows = np.asarray(s, dtype=int)
        nb_rows = len(rows)
        if nb_rows == 0:
            return 0.0

        # Plain arrays give purely positional selection, consistent with
        # the way the labels are indexed.
        column = np.asarray(x[target_attrib])[rows]
        subset_labels = np.asarray(y)[rows]

        gain = ID3.entropy(subset_labels, unique_labels)
        for value in attributes[target_attrib]:
            value_labels = subset_labels[column == value]
            if len(value_labels) == 0:
                continue
            factor = len(value_labels) / nb_rows
            gain -= factor * ID3.entropy(value_labels, unique_labels)

        return gain

    @staticmethod
    def _majority_label(labels):
        """Returns the most frequent value of a non-empty label array."""
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def __init__(self, x, y):
        """Builds the decision tree.

        Parameters
        ----------

        x:
            DataFrame with one categorical column per attribute.

        y:
            Array-like of labels, one per row of ``x``.

        Raises
        ------

        ValueError
            If ``y`` is empty or its length differs from the number of
            rows of ``x``.

        """
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("cannot build a decision tree without samples")
        if len(x) != len(y):
            raise ValueError("x and y must contain the same number of rows")

        unique_labels = np.unique(y)

        # This map stores, for every attribute, its distinct values in
        # order of first appearance (keeps the child order deterministic).
        unique_attributes = {
            column_id: list(dict.fromkeys(np.asarray(x[column_id]).tolist()))
            for column_id in x.columns
        }

        self.dot = Digraph(comment='The Round Table')
        self.root = Node(self.dot, None)

        # When all the labels are equal, __build turns the root into a
        # single labelled leaf, so no special case is needed here.
        s = list(range(len(y)))
        self.__build(self.root, x, y, s, unique_attributes, unique_labels)

    def __build(self, node, x, y, s, attributes, unique_labels):
        """Recursively grows the subtree rooted at ``node`` from the rows
        ``s`` (positions), using the attributes still available.
        """
        rows = np.asarray(s, dtype=int)
        subset_labels = y[rows]

        unique = np.unique(subset_labels)
        if unique.shape[0] < 2:
            # Pure subset: the node is a leaf.
            node.setLabel(unique[0])
            return

        if not attributes:
            # Nothing left to split on: fall back to the majority label.
            node.setLabel(ID3._majority_label(subset_labels))
            return

        # Choose the best attribute for classification. max() keeps the
        # first attribute in case of ties.
        best_attribute = max(
            attributes,
            key=lambda attr: ID3.gain(s, attr, attributes, x, y, unique_labels),
        )
        values = attributes[best_attribute]
        remaining = {
            attr: attr_values
            for attr, attr_values in attributes.items()
            if attr != best_attribute
        }

        # The node must know its attribute before children are created:
        # each child records it as its parent attribute, which is what the
        # graph edges are drawn from.
        node.update(best_attribute)

        column = np.asarray(x[best_attribute])[rows]
        for value in values:
            # Adds a child node pointed by value, and
            # then we can recursively create it.
            child = node.add(value)
            new_s = rows[column == value].tolist()
            if not new_s:
                # No sample takes this value here: predict the majority
                # label of the current subset.
                child.setLabel(ID3._majority_label(subset_labels))
                continue
            self.__build(child, x, y, new_s, remaining, unique_labels)

    def pretty_print(self):
        """Renders the Graphviz graph of the tree with ``Digraph.render()``.

        Rendering needs the Graphviz ``dot`` executable to be on the PATH.
        """
        self.root.dot.render()

In [55]:
# Build the decision tree on the toy dataset, then render its graph.
# Rendering shells out to the Graphviz `dot` executable, which must be
# installed and reachable through the PATH.
tree = ID3(x, labels)
tree.pretty_print()

ExecutableNotFound: failed to execute ['dot', '-Tpdf', '-O', 'Digraph.gv'], make sure the Graphviz executables are on your systems' PATH