# Decision trees on iris dataset

The iris dataset contains sepal length, sepal width, petal length and petal width for classifying flowers between 3 classes. 

Here, we will be using decision trees to classify a given input into one of the 3 classes.

In [44]:
# sklearn for only the dataset, pandas for managing the dataset and numpy for processing
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

from IPython.display import display
import matplotlib.pyplot as plt

In [2]:
# Importing the dataset
data = load_iris()

x = data['data']
y = data['target']
col_names = data['feature_names']

x = pd.DataFrame(x, columns=col_names)
x['target'] = y

tgt_types = x.loc[(x['sepal length (cm)'] > 5)]['target'].unique()

In [57]:
# Splitting the dataset
train, test = train_test_split(x, test_size=25, train_set=125)

TypeError: Invalid parameters passed: {'train_set': 125}

In [58]:
def gini(x, col_name=None, thresh_val=None, debug=False):
    
    if(col_name == None):
        tot_no = x.shape[0]
        class_loss = np.zeros((tgt_types.size,1))
        
        for i in range(tgt_types.size):
            class_no = x.loc[x['target'] == tgt_types[i]].shape[0]
            class_p  = class_no / tot_no
            class_loss[i, 0] = class_p * (1 - class_p)

        return sum(class_loss)

    upper = x.loc[x[col_name] >  thresh_val]
    lower = x.loc[x[col_name] <= thresh_val]


    class_loss = np.zeros((tgt_types.size, 2))

    upper_tot = upper.shape[0]   
    lower_tot = lower.shape[0] 

    if(upper_tot == 0 or lower_tot == 0):
        return 1

    for i in range(tgt_types.size):
        upper_cls_no = upper.loc[upper['target'] == tgt_types[i]].shape[0]
        lower_cls_no = lower.loc[lower['target'] == tgt_types[i]].shape[0]
        
        upper_cls_p  = upper_cls_no / upper_tot
        lower_cls_p  = lower_cls_no / lower_tot

        class_loss[i, 0] = upper_cls_p * (1 - upper_cls_p)
        class_loss[i, 1] = lower_cls_p * (1 - lower_cls_p)

    gini = np.sum(class_loss, axis=0)

    impurity = (upper_tot*gini[0]/x.shape[0]) + (lower_tot*gini[1]/x.shape[0])

    if debug:
        plt.scatter(upper[col_name], upper['target'], color="green")
        plt.scatter(lower[col_name], lower['target'], color="blue")
        plt.ylabel("Class")
        plt.xlabel(col_name)
        plt.axline((thresh_val, 0), (thresh_val, 2))
        plt.title("Split : %.2f, Impurity : %.3f"%(thresh_val, impurity))
        plt.show()

        
    return impurity

In [59]:
def split_data(x, col_name, thresh):
    upper = x.loc[x[col_name] > thresh].drop(col_name, axis=1)
    lower = x.loc[x[col_name] <= thresh].drop(col_name, axis=1)

    return (upper, lower)

In [60]:
def find_best_split(x):
    inputs = x.drop('target', axis=1)
    outputs = x['target']
    
    min_impurity = 1
    min_col = ""
    min_thresh = -1

    for col_name in inputs.columns:

        values = x.sort_values(col_name)[col_name].unique()
        thresholds = [(values[i]+values[i+1])/2 for i in range(values.shape[0]-1)]

        for i in thresholds:
            impurity = gini(x, col_name, i, False)

            if(impurity < min_impurity):
                min_impurity = impurity
                min_col      = col_name
                min_thresh   = i

    return (min_impurity, min_col, min_thresh)

In [61]:
class node():
    def __init__(self, parent, level=None, name=None, debug=False):
        self.parent = parent
        
        if(level == None):
            if(parent == None):
                self.level = 0
            else:
                self.level = parent.level + 1
        else:
            self.level = level
        
        self.leaf   = False             # Is this node a leaf node?
        self.state  = None              # The output state (if a leaf node)
        self.col    = None              # Threshold column name
        self.thresh = 0                 # Threshold value
        
        self.upper  = None              # Upper child node
        self.lower  = None              # Lower child node

        self.debug  = debug             # If true, displays debug information

        if(name == None):
            if(parent == None):
                self.name = "root_node"
            else:
                self.name = "level%d_node" % self.level
        else:
            self.name = name

        if(debug):
            print(" %15s : Initialised node with level : %2d" % (self.name, self.level))

    # Turn the node into a leaf node, with set output
    def make_leaf(self, output):
        self.leaf  = True
        self.state = output

        if(self.debug):
            print(" %15s : Made into leaf node, with output '%d'" % (self.name, self.state))

    # Train with data (set column and threshold). If the gini impurity has deteriorated or not improved, them the node is made a leaf node.
    # If we do not want the node to be automatically turned into a leaf node, we can set force_branch to True
    def train(self, x, force_branch=False):
        data_impurity = gini(x)
        impurity, self.col, self.thresh = find_best_split(x)

        if(self.debug):
            print(" %15s : Trained. Impurity before : %.2f, Impurity : %.2f, Column : '%s', Threshold : %.2f" % (self.name, data_impurity, impurity, self.col, self.thresh))
        
        if(impurity >= data_impurity and not force_branch):
            self.make_leaf(x.mode()['target'][0])
            self.col    = None
            self.thresh = None

    # Split the data into two, if its not a leaf node
    def split(self, x):
        if(self.leaf):
            if(self.debug):
                print(" %15s : Cant split, is a leaf node."%self.name)
            return False

        if(self.debug):
            print(" %15s : Splitting input..."%self.name)
        
        return split_data(x, self.col, self.thresh)
    
    # Attach the upper child node
    def attach_upper(self, upper_node):
        if(self.leaf):
            print(" %15s : Cant attach, is a leaf node"%self.name)
            return False
        
        self.upper = upper_node
        return True

        if(self.debug):
            print(" %15s : Attached upper node '%s'"%(self.name, self.upper.name))

    # Attach the lower child node
    def attach_lower(self, lower_node):
        if(self.leaf):
            print(" %15s : Cant attach, is a leaf node"%self.name)
            return False

        self.lower = lower_node
        return True

        if(self.debug):
            print(" %15s : Attached upper node '%s'"%(self.name, self.lower.name))

    # Classify a given data sample
    def classify(self, x):
        if(self.leaf):
            return self.state

        if(x[self.col] > self.thresh):
            if(self.debug):
                print(" %15s : Moving to upper node"%self.name)
            return self.upper.classify(x)
        else:
            if(self.debug):
                print(" %15s : Moving to lower node"%self.name)
            return self.lower.classify(x)

    def make_children(self, debug=None):
        if(self.debug):
            print(" %15s : Making children nodes..."%self.name)

        if(debug==None):
            debug=self.debug

        upper = node(self, name='%s+'%(self.name), debug=debug)
        lower = node(self, name='%s-'%(self.name), debug=debug)

        self.attach_upper(upper)
        self.attach_lower(lower)

        return upper, lower

    def train_children(self, x):
        if(self.debug):
            print(" %15s : Training children"%self.name)

        upper_ds, lower_ds = self.split(x)
        upper_gini, lower_gini = gini(upper_ds), gini(lower_ds)
        
        self.upper.train(upper_ds, upper_gini)
        self.lower.train(lower_ds, lower_gini)

    def get_children(self):
        return self.upper, self.lower

    def set_debug(self, debug=True, propagate=False):
        if(self.debug != debug):
            print(" %15s : Setting debug to %s"%(self.name, debug))
        self.debug=debug
        if(propagate):
            if(not self.leaf):
                self.upper.set_debug(debug, True)
                self.lower.set_debug(debug, True)

In [62]:
# Recurseive depth-first search, while building the tree
def build_tree(data, level, root_node=None, debug=False):
    if(level == 0):
        leaf_out = data.mode()['target'][0]
        root_node.make_leaf(leaf_out)
        return
    
    if(root_node == None):
        root_node = node(None, name='root', debug=debug)
    
    root_node.train(data)
    
    if(root_node.leaf):
        return

    root_node.make_children()
    upper_ds, lower_ds = root_node.split(data)
    
    build_tree(upper_ds, level-1, root_node.upper)
    build_tree(lower_ds, level-1, root_node.lower)

    return root_node

In [67]:
root = build_tree(train, 2, debug=False)
root.set_debug(False, propagate=True)

In [68]:
error = 0

for i in test.index:
    inp = test.loc[i]
    out = root.classify(inp)
    if(out != inp['target']):
        error = error+1

print(" Test set : %d/%d were wrongly classified. " % (error,test.shape[0]))

 Test set : 3/25 were wrongly classified. 


In [69]:
error = 0

for i in train.index:
    inp = train.loc[i]
    out = root.classify(inp)
    if(out != inp['target']):
        error = error+1

print(" Train set : %d/%d were wrongly classified. " % (error,train.shape[0]))

 Train set : 3/125 were wrongly classified. 
