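# Decision trees on the breast cancer dataset

The breast cancer dataset (as shipped with scikit-learn) contains numeric measurements of cell nuclei — the mean, error and worst values of quantities such as radius, texture, perimeter and area — for classifying tumours into 2 classes.

Here, we will be using decision trees to classify a given input into one of the 2 classes.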

---

## Importing libraries and data pre-processing

Libraries :
- sklearn
  - Importing the breast cancer dataset
  - Splitting dataset into train and test sets
- pandas
  - Dataset is stored as pandas dataframe
- numpy
- matplotlib
  - For visualisation

In [1]:
# sklearn for only the dataset, pandas for managing the dataset and numpy for processing
from sklearn.datasets import *
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

from time import time

from IPython.display import display
import matplotlib.pyplot as plt

### About the dataset


In [2]:
# Load the dataset bundle and pull out the pieces used below
data = load_breast_cancer()

col_names = data['feature_names']
y = data['target']

# Feature matrix as a DataFrame with one named column per feature
x = pd.DataFrame(data['data'], columns=col_names)
display(x)

# The label column is appended only after the feature table has been displayed
x['target'] = y

# Distinct class labels present in the data
tgt_types = x['target'].unique()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [3]:
# Splitting the dataset
# Hold out 60 rows as the test set; the remaining rows are used for training.
# NOTE(review): no random_state is passed, so the split presumably differs on
#   every run — confirm, and pin a seed if reproducible results are wanted.
train, test = train_test_split(x, test_size=60)

## The code!
I have used an object-oriented approach, and have implemented a class for each node in the decision tree (inspired by the Keras and PyTorch APIs). The tree is built using a recursive depth-first algorithm, which fits each node to the data as it goes.

In [4]:
def gini(x, col_name=None, thresh_val=None, debug=False):
    """Return the Gini impurity of ``x``, or of a threshold split of ``x``.

    The impurity of a set of rows is ``sum_k p_k * (1 - p_k)`` where ``p_k`` is
    the fraction of rows whose ``'target'`` equals class ``k``. Classes are read
    from the rows themselves; a class that does not occur contributes zero.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows to evaluate. Must contain a ``'target'`` column and, when a split
        is requested, the column ``col_name``.
    col_name : str, optional
        Column to split on. When omitted, the impurity of ``x`` itself is
        returned.
    thresh_val : number, optional
        Split threshold: rows with ``x[col_name] > thresh_val`` form the upper
        partition, rows with ``x[col_name] <= thresh_val`` the lower one.
    debug : bool
        When true, scatter-plot both partitions and the threshold line.

    Returns
    -------
    float
        Without ``col_name``: the impurity of ``x`` (``0.0`` for an empty
        frame). With ``col_name``: the size-weighted impurity of the two
        partitions, or ``1.0`` when either partition is empty so that such a
        split never beats a real one.
    """

    def _impurity(frame):
        # value_counts(normalize=True) gives p_k for every class that occurs
        probs = frame['target'].value_counts(normalize=True).to_numpy()
        return float(np.sum(probs * (1.0 - probs)))

    if col_name is None:
        return _impurity(x)

    upper = x.loc[x[col_name] >  thresh_val]
    lower = x.loc[x[col_name] <= thresh_val]

    upper_tot = upper.shape[0]
    lower_tot = lower.shape[0]

    # A split that leaves one side empty separates nothing: report the worst score
    if upper_tot == 0 or lower_tot == 0:
        return 1.0

    # Weight each partition's impurity by its share of the rows in x
    total = x.shape[0]
    impurity = (upper_tot * _impurity(upper) / total) + (lower_tot * _impurity(lower) / total)

    if debug:
        plt.scatter(upper[col_name], upper['target'], color="green")
        plt.scatter(lower[col_name], lower['target'], color="blue")
        plt.ylabel("Class")
        plt.xlabel(col_name)
        plt.axline((thresh_val, 0), (thresh_val, 2))
        plt.title("Split : %.2f, Impurity : %.3f"%(thresh_val, impurity))
        plt.show()

    return impurity

In [5]:
def split_data(x, col_name, thresh):
    """Partition ``x`` on ``col_name`` at ``thresh``.

    Returns the tuple ``(upper, lower)``: the rows with a value strictly above
    the threshold and the rows with a value at or below it. The column that was
    split on is removed from both partitions.
    """
    above = x[col_name] > thresh
    at_or_below = x[col_name] <= thresh

    # Drop the split column once, then select the rows for each side
    remaining = x.drop(col_name, axis=1)

    return (remaining.loc[above], remaining.loc[at_or_below])

In [6]:
def find_best_split(x):
    """Search every feature column of ``x`` for the lowest-impurity split.

    For each column other than ``'target'``, the candidate thresholds are the
    midpoints between consecutive distinct values of that column. Each
    candidate is scored with ``gini`` and the first one with the strictly
    lowest score is kept.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows to split; must contain a ``'target'`` column.

    Returns
    -------
    tuple
        ``(impurity, column, threshold)`` of the best split. When no candidate
        scores below 1, the initial values ``(1, "", -1)`` are returned.
    """
    min_impurity = 1
    min_col = ""
    min_thresh = -1

    for col_name in x.columns.drop('target'):

        # np.unique returns the distinct values already sorted, so only this
        # one column is sorted instead of the whole frame
        values = np.unique(x[col_name].to_numpy())
        thresholds = (values[:-1] + values[1:]) / 2

        for thresh in thresholds:
            impurity = gini(x, col_name, thresh, False)

            if impurity < min_impurity:
                min_impurity = impurity
                min_col      = col_name
                min_thresh   = thresh

    return (min_impurity, min_col, min_thresh)

In [7]:
class DecisionNode():
    """One node of a binary decision tree.

    A node is either a decision node, which sends a sample to its ``upper``
    child when ``sample[col] > thresh`` and to its ``lower`` child otherwise,
    or a leaf node, which returns a fixed output ``state``.
    """

    def __init__(self, parent, level=None, name=None, debug=False):
        """Create an untrained node.

        ``level`` defaults to 0 for a node without parent and to the parent's
        level plus one otherwise. ``name`` defaults to ``"root_node"`` or
        ``"level<N>_node"`` and is only used in printed messages.
        """
        self.parent = parent

        if level is None:
            self.level = 0 if parent is None else parent.level + 1
        else:
            self.level = level

        self.leaf   = False             # Is this node a leaf node?
        self.state  = None              # The output state (if a leaf node)
        self.col    = None              # Threshold column name
        self.thresh = 0                 # Threshold value

        self.upper  = None              # Upper child node
        self.lower  = None              # Lower child node

        self.debug  = debug             # If true, displays debug information

        if name is None:                # The name is displayed if debug is turned on
            self.name = "root_node" if parent is None else "level%d_node" % self.level
        else:
            self.name = name

        if debug:
            print(" %15s : Initialised node with level : %2d" % (self.name, self.level))

    # Turn the node into a leaf node, with provided output
    def make_leaf(self, output):
        self.leaf  = True
        self.state = output

        if self.debug:
            print(" %15s : Made into leaf node, with output '%d'" % (self.name, self.state))

    # Train with data (set column and threshold).
    # If the gini impurity has deteriorated or has not improved, then the node is made into a leaf node.
    # force_decision can be set to True to disable automatic conversion to leaf nodes
    def train(self, x, force_decision=False):
        # gini(x) may come back as a one-element array; collapse it to a plain float
        data_impurity = float(np.sum(gini(x)))
        impurity, self.col, self.thresh = find_best_split(x)

        if self.debug:
            print(" %15s : Trained. Impurity before : %.2f, Impurity : %.2f, Column : '%s', Threshold : %.2f" % (self.name, data_impurity, impurity, self.col, self.thresh))

        if impurity >= data_impurity and not force_decision:
            # The leaf outputs the most frequent label; only the target column is needed for that
            self.make_leaf(x['target'].mode().iloc[0])
            self.col    = None
            self.thresh = None

    # Split the data into two, if its not a leaf node.
    # Returns False for a leaf node, else the (upper, lower) datasets.
    def split(self, x):
        if self.leaf:
            if self.debug:
                print(" %15s : Cant split, is a leaf node."%self.name)
            return False

        if self.debug:
            print(" %15s : Splitting input..."%self.name)

        return split_data(x, self.col, self.thresh)

    # Attach the upper child node. Returns False (and attaches nothing) on a leaf node.
    def attach_upper(self, upper_node):
        if self.leaf:
            print(" %15s : Cant attach, is a leaf node"%self.name)
            return False

        self.upper = upper_node

        # Report before returning, otherwise the message can never be printed
        if self.debug:
            print(" %15s : Attached upper node '%s'"%(self.name, self.upper.name))
        return True

    # Attach the lower child node. Returns False (and attaches nothing) on a leaf node.
    def attach_lower(self, lower_node):
        if self.leaf:
            print(" %15s : Cant attach, is a leaf node"%self.name)
            return False

        self.lower = lower_node

        # Report before returning, otherwise the message can never be printed
        if self.debug:
            print(" %15s : Attached lower node '%s'"%(self.name, self.lower.name))
        return True

    # Classify a given data sample
    # If the node is a leaf node, it returns the assigned state
    # Else, it uses the decision criteria to call either the upper or lower child node.
    def classify(self, x):
        if self.leaf:
            if self.debug:
                print(" %15s : Leaf node - returning result %d"%(self.name, self.state))
            return self.state

        if self.debug:
            # The printed operator matches the strict comparison used below
            print(" %15s : Classifying criteria - %s > %.2f"%(self.name, self.col, self.thresh))

        if x[self.col] > self.thresh:
            if self.debug:
                print(" %15s : Moving to upper node"%self.name)
            return self.upper.classify(x)

        if self.debug:
            print(" %15s : Moving to lower node"%self.name)
        return self.lower.classify(x)

    # Create and link upper and lower child nodes.
    # The children are named after this node with a '+' (upper) or '-' (lower) suffix.
    def make_children(self, debug=None):
        if self.debug:
            print(" %15s : Making children nodes..."%self.name)

        if debug is None:
            debug = self.debug

        upper = DecisionNode(self, name='%s+'%(self.name), debug=debug)
        lower = DecisionNode(self, name='%s-'%(self.name), debug=debug)

        self.attach_upper(upper)
        self.attach_lower(lower)

        return upper, lower

    # Returns the upper and lower child nodes
    def get_children(self):
        return self.upper, self.lower

    # Change debug state.
    # Setting propagate to True will propagate the change in debug state down its child nodes.
    def set_debug(self, debug=True, propagate=False):
        if self.debug != debug:
            print(" %15s : Setting debug to %s"%(self.name, debug))
        self.debug = debug

        if propagate and not self.leaf:
            # A decision node may not have had its children attached yet
            for child in (self.upper, self.lower):
                if child is not None:
                    child.set_debug(debug, True)

In [8]:
# Recursive depth-first construction of the tree
def build_tree(data, level, node=None, debug=False):
    """Build a decision tree on ``data`` and return the node it was built from.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows, including the ``'target'`` column.
    level : int
        Number of decision levels still allowed below this point. At 0 the
        node is turned into a leaf that outputs the most frequent target.
    node : DecisionNode, optional
        Node to fit. When omitted, a new node named ``'root'`` is created.
    debug : bool
        Debug flag given to a newly created root node.

    Returns
    -------
    DecisionNode
        The fitted node (the root of the subtree), whether it ended up as a
        decision node or as a leaf.
    """
    # If no parent node is passed, create a new node, called the 'root' node.
    # This happens first so that a level-0 call without a node also works.
    if node is None:
        node = DecisionNode(None, name='root', debug=debug)

    # If at the last level, make the node a leaf node
    if level == 0:
        node.make_leaf(data['target'].mode().iloc[0])
        return node

    # Train the node
    node.train(data)

    # If the node decided to be a leaf node, stop building further.
    # The node is still returned so the caller always gets a usable tree.
    if node.leaf:
        return node

    # Else, make child nodes, and split the dataset between these nodes
    node.make_children()
    upper_ds, lower_ds = node.split(data)

    # Build the tree down from these nodes, using the split dataset.
    # Note : Its a depth-first search because the upper node is called first, and only after
    #   its tree has been built, is the lower node called
    build_tree(upper_ds, level-1, node.upper)
    build_tree(lower_ds, level-1, node.lower)

    return node

In [9]:
# The level is the maximum number of decision nodes from root to the end (excluding leaf nodes)
# Fit a tree with level 5 on the training set, with debug output enabled,
# and report the elapsed wall-clock time of the fit.
tick = time()
root = build_tree(train, 5, debug=True)
print(" Fitting data took %.3f seconds"%(time()-tick))

            root : Initialised node with level :  0
            root : Trained. Impurity before : 0.47, Impurity : 0.14, Column : 'worst radius', Threshold : 16.80
            root : Making children nodes...
           root+ : Initialised node with level :  1
           root- : Initialised node with level :  1
            root : Splitting input...
           root+ : Trained. Impurity before : 0.11, Impurity : 0.06, Column : 'mean texture', Threshold : 14.99
           root+ : Making children nodes...
          root++ : Initialised node with level :  2
          root+- : Initialised node with level :  2
           root+ : Splitting input...
          root++ : Trained. Impurity before : 0.04, Impurity : 0.02, Column : 'worst concavity', Threshold : 0.22
          root++ : Making children nodes...
         root+++ : Initialised node with level :  3
         root++- : Initialised node with level :  3
          root++ : Splitting input...
         root+++ : Trained. Impurity before : 0.00, 

In [10]:
num_samples = 5

# Pick distinct row positions of the test set at random and trace each row through the tree
for i in np.random.choice(test.shape[0], num_samples, replace=False):
    sample = test.iloc[i]

    print(" Testing on a sample : test set index '%d'"%i)
    display(sample)

    result = root.classify(sample)

    print()
    print(" Classification result : %d, Label : %d" % (result, sample['target']))
    print()

# Switch debug output off for the root and, through propagation, its descendants
root.set_debug(False, propagate=True)

 Testing on a sample : test set index '22'


mean radius                 12.720000
mean texture                17.670000
mean perimeter              80.980000
mean area                  501.300000
mean smoothness              0.078960
mean compactness             0.045220
mean concavity               0.014020
mean concave points          0.018350
mean symmetry                0.145900
mean fractal dimension       0.055440
radius error                 0.295400
texture error                0.883600
perimeter error              2.109000
area error                  23.240000
smoothness error             0.007337
compactness error            0.011740
concavity error              0.005383
concave points error         0.005623
symmetry error               0.019400
fractal dimension error      0.001180
worst radius                13.820000
worst texture               20.960000
worst perimeter             88.870000
worst area                 586.800000
worst smoothness             0.106800
worst compactness            0.096050
worst concav

            root : Classifying criteria - worst radius >= 16.80
            root : Moving to lower node
           root- : Classifying criteria - worst concave points >= 0.14
           root- : Moving to lower node
          root-- : Classifying criteria - radius error >= 1.05
          root-- : Moving to lower node
         root--- : Classifying criteria - area error >= 38.61
         root--- : Moving to lower node
        root---- : Classifying criteria - smoothness error >= 0.00
        root---- : Moving to upper node
       root----+ : Leaf node - returning result 1

 Classification result : 1, Label : 1

 Testing on a sample : test set index '44'


mean radius                  19.020000
mean texture                 24.590000
mean perimeter              122.000000
mean area                  1076.000000
mean smoothness               0.090290
mean compactness              0.120600
mean concavity                0.146800
mean concave points           0.082710
mean symmetry                 0.195300
mean fractal dimension        0.056290
radius error                  0.549500
texture error                 0.663600
perimeter error               3.055000
area error                   57.650000
smoothness error              0.003872
compactness error             0.018420
concavity error               0.037100
concave points error          0.012000
symmetry error                0.019640
fractal dimension error       0.003337
worst radius                 24.560000
worst texture                30.410000
worst perimeter             152.900000
worst area                 1623.000000
worst smoothness              0.124900
worst compactness        

            root : Classifying criteria - worst radius >= 16.80
            root : Moving to upper node
           root+ : Classifying criteria - mean texture >= 14.99
           root+ : Moving to upper node
          root++ : Classifying criteria - worst concavity >= 0.22
          root++ : Moving to upper node
         root+++ : Leaf node - returning result 0

 Classification result : 0, Label : 0

 Testing on a sample : test set index '55'


mean radius                  19.730000
mean texture                 19.820000
mean perimeter              130.700000
mean area                  1206.000000
mean smoothness               0.106200
mean compactness              0.184900
mean concavity                0.241700
mean concave points           0.097400
mean symmetry                 0.173300
mean fractal dimension        0.066970
radius error                  0.766100
texture error                 0.780000
perimeter error               4.115000
area error                   92.810000
smoothness error              0.008482
compactness error             0.050570
concavity error               0.068000
concave points error          0.019710
symmetry error                0.014670
fractal dimension error       0.007259
worst radius                 25.280000
worst texture                25.590000
worst perimeter             159.800000
worst area                 1933.000000
worst smoothness              0.171000
worst compactness        

            root : Classifying criteria - worst radius >= 16.80
            root : Moving to upper node
           root+ : Classifying criteria - mean texture >= 14.99
           root+ : Moving to upper node
          root++ : Classifying criteria - worst concavity >= 0.22
          root++ : Moving to upper node
         root+++ : Leaf node - returning result 0

 Classification result : 0, Label : 0

 Testing on a sample : test set index '47'


mean radius                 12.470000
mean texture                18.600000
mean perimeter              81.090000
mean area                  481.900000
mean smoothness              0.099650
mean compactness             0.105800
mean concavity               0.080050
mean concave points          0.038210
mean symmetry                0.192500
mean fractal dimension       0.063730
radius error                 0.396100
texture error                1.044000
perimeter error              2.497000
area error                  30.290000
smoothness error             0.006953
compactness error            0.019110
concavity error              0.027010
concave points error         0.010370
symmetry error               0.017820
fractal dimension error      0.003586
worst radius                14.970000
worst texture               24.640000
worst perimeter             96.050000
worst area                 677.900000
worst smoothness             0.142600
worst compactness            0.237800
worst concav

            root : Classifying criteria - worst radius >= 16.80
            root : Moving to lower node
           root- : Classifying criteria - worst concave points >= 0.14
           root- : Moving to lower node
          root-- : Classifying criteria - radius error >= 1.05
          root-- : Moving to lower node
         root--- : Classifying criteria - area error >= 38.61
         root--- : Moving to lower node
        root---- : Classifying criteria - smoothness error >= 0.00
        root---- : Moving to upper node
       root----+ : Leaf node - returning result 1

 Classification result : 1, Label : 1

 Testing on a sample : test set index '36'


mean radius                  18.310000
mean texture                 20.580000
mean perimeter              120.800000
mean area                  1052.000000
mean smoothness               0.106800
mean compactness              0.124800
mean concavity                0.156900
mean concave points           0.094510
mean symmetry                 0.186000
mean fractal dimension        0.059410
radius error                  0.544900
texture error                 0.922500
perimeter error               3.218000
area error                   67.360000
smoothness error              0.006176
compactness error             0.018770
concavity error               0.029130
concave points error          0.010460
symmetry error                0.015590
fractal dimension error       0.002725
worst radius                 21.860000
worst texture                26.200000
worst perimeter             142.200000
worst area                 1493.000000
worst smoothness              0.149200
worst compactness        

            root : Classifying criteria - worst radius >= 16.80
            root : Moving to upper node
           root+ : Classifying criteria - mean texture >= 14.99
           root+ : Moving to upper node
          root++ : Classifying criteria - worst concavity >= 0.22
          root++ : Moving to upper node
         root+++ : Leaf node - returning result 0

 Classification result : 0, Label : 0

            root : Setting debug to False
           root+ : Setting debug to False
          root++ : Setting debug to False
         root+++ : Setting debug to False
         root++- : Setting debug to False
        root++-+ : Setting debug to False
        root++-- : Setting debug to False
          root+- : Setting debug to False
         root+-+ : Setting debug to False
         root+-- : Setting debug to False
           root- : Setting debug to False
          root-+ : Setting debug to False
         root-++ : Setting debug to False
         root-+- : Setting debug to False
        

In [11]:
# Count the rows of the held-out test set whose prediction differs from the label
error = sum(
    1
    for inp in (test.loc[i] for i in test.index)
    if root.classify(inp) != inp['target']
)

print(" Test set : %d/%d were wrongly classified. " % (error,test.shape[0]))

# Same count on the rows the tree was fitted on
error = sum(
    1
    for inp in (train.loc[i] for i in train.index)
    if root.classify(inp) != inp['target']
)

print(" Train set : %d/%d were wrongly classified. " % (error,train.shape[0]))

 Test set : 4/60 were wrongly classified. 
 Train set : 4/509 were wrongly classified. 
