In [1]:
import numpy as np
import math
import pandas as pd
from sklearn import datasets

In [2]:
iris = datasets.load_iris()  #importing iris dataset

In [3]:
feature = pd.DataFrame(iris.data) #converting dataset into pandas DataFrame

In [4]:
feature.columns = iris.feature_names #Defining column names of dataset

In [5]:
feature['target'] = iris.target #merging target into features (appended as the last column, which run() relies on)

In [6]:
feature.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [7]:
# Base-2 entropy of a list of class counts; used later to compute the information gain
def entropy(arr):
    """Return the base-2 entropy of the distribution described by the counts in ``arr``.

    Each count is turned into a proportion of ``sum(arr)``. Counts that are
    not greater than zero are skipped, so an all-zero (or empty) list gives 0.
    """
    total = sum(arr)
    result = 0
    for count in arr:
        if count > 0:
            share = count/total
            # subtract p*log2(p); the running subtraction keeps a pure node at 0.0
            result -= share*(math.log(share,2))
    return result

In [8]:
# Gini index of a list of class counts: takes a list as input and returns a scalar
def gini(arr):
    """Return the Gini index ``1 - sum(p_k**2)`` for the class counts in ``arr``.

    ``p_k`` is each count divided by ``sum(arr)``. Counts that are not greater
    than zero are ignored, so an all-zero (or empty) list returns 1.
    """
    total = sum(arr)
    squared_shares = sum((count/total)**2 for count in arr if count > 0)
    return 1-squared_shares

In [9]:
# Gini index of a two-way split. It takes the class-count lists of both children as arguments
def gini_split(child1,child2):
    """Return the size-weighted average of the Gini indices of two child nodes.

    Each child's Gini index (from ``gini``) is weighted by that child's total
    count, and the weighted sum is divided by the combined count of both
    children.
    """
    size1, size2 = sum(child1), sum(child2)
    weighted_impurity = (size1*gini(child1)) + (size2*gini(child2))
    return weighted_impurity/(size1+size2)

In [10]:
# Information gain of a two-way split. It takes three class-count lists: parent, child1, child2
def info_gain(parent,child1,child2):
    """Return the entropy reduction achieved by splitting ``parent`` into two children.

    The result is the parent's entropy minus the children's entropies, each
    weighted by the child's share of the parent's total count.
    """
    parent_total = sum(parent)
    weight1 = sum(child1)/parent_total
    weight2 = sum(child2)/parent_total
    children_entropy = weight1*entropy(child1) + weight2*entropy(child2)
    return entropy(parent) - children_entropy
    

In [11]:
# This function is used to find the split information, which is later used to calculate the gain ratio.
# It takes a number (the parent's total sample count) and a list of child sample counts as arguments.
def split_info(sum_parent,children):
    """Return the base-2 entropy of the child sizes relative to ``sum_parent``.

    ``sum_parent`` is a number (the parent's total count) and ``children`` is
    an iterable of child totals. Children whose total is not greater than
    zero contribute nothing.
    """
    info = 0
    for size in children:
        if not size > 0:
            continue
        fraction = size/sum_parent
        info -= fraction*(math.log(fraction,2))
    return info

In [12]:
# This function is used to calculate gain ratio. It takes three lists as input arguments
def gain_ratio(parent,child1,child2):
    """Return the gain ratio (information gain / split information) of a two-way split.

    ``parent``, ``child1`` and ``child2`` are lists of per-class counts.

    When one child holds every sample, the split information is 0. Dividing
    by it would raise ``ZeroDivisionError`` for Python floats or produce nan
    for NumPy floats. Such a split separates nothing, so 0.0 is returned
    instead.
    """
    infogain = info_gain(parent,child1,child2)
    splitinfo = split_info(sum(parent),[sum(child1),sum(child2)])
    if splitinfo == 0:
        # degenerate split: all samples ended up in a single child
        return 0.0
    return infogain/splitinfo
    
    

In [13]:
# Prints every entry of a list together with its position.
def print_values(arr):
    """Print one ``Count of <position> = <value>`` line per element of ``arr``."""
    for position, value in enumerate(arr):
        print('Count of',position,'=',value)
    return

In [14]:
# This function counts how many rows of a dataframe belong to each class label
def find_values(feature, n_classes=3):
    """Return per-class row counts of ``feature`` as a list of ints.

    Parameters
    ----------
    feature : pandas.DataFrame
        Must contain a ``'target'`` column holding integer class labels.
    n_classes : int, optional
        Number of class labels to count; labels ``0 .. n_classes-1`` are
        considered. Defaults to 3, the value that was previously hard-coded.

    Returns
    -------
    list of int
        ``result[k]`` is the number of rows whose target equals ``k``
        (0 when the class is absent or the frame is empty).
    """
    target = feature['target']
    # Summing the boolean mask counts matches directly, without building a
    # filtered frame and a value_counts() Series for every label.
    return [int((target == label).sum()) for label in range(n_classes)]

In [15]:
def _best_gini_split(feature):
    """Find the threshold split of ``feature`` with the lowest weighted Gini index.

    Every column except the last one is scanned (assumes the last column is
    the target). Candidate thresholds are the midpoints between neighbouring
    sorted values of a column. A candidate that leaves one side without any
    counted samples is ignored because it does not partition the node.

    Returns a tuple ``(column, upper_counts, lower_counts, upper_data,
    lower_data)`` for the best candidate, where "upper" is the ``> threshold``
    side, or ``None`` when no candidate separates the rows.
    """
    best = None
    best_gini = float('inf')
    for i in feature.columns[:-1]:     #Iterates on every feature
        temp1 = np.sort(feature[i])
        lis = np.unique((temp1[:-1] + temp1[1:])/2)    #Unique middle points of neighbouring values
        for j in lis:                    #Iterating on middle points
            x = feature[feature[i]>j]
            y = feature[feature[i]<=j]

            node1 = find_values(x)
            node2 = find_values(y)
            if sum(node1) == 0 or sum(node2) == 0:
                # One side is empty: choosing it would make the gain ratio
                # undefined and recurse on an unchanged node forever.
                continue
            store = gini_split(node1,node2)
            if store < best_gini:
                best_gini = store
                # Boolean-mask selections are new frames, so no extra copy is needed.
                best = (i, node1, node2, x, y)
    return best


def run(feature,level):
    """Recursively build and print a binary decision tree over ``feature``.

    At each node the per-class counts and entropy are printed. The split is
    chosen by the lowest weighted Gini index, while the printed quality
    figure for the chosen split is its gain ratio. The ``> threshold`` child
    is expanded first, then the ``<= threshold`` child.

    A node is reported as a leaf when its entropy is 0, or when no threshold
    separates its rows (for example, identical feature values with different
    labels).

    Parameters
    ----------
    feature : pandas.DataFrame
        Feature columns followed by a ``'target'`` column.
    level : int
        Depth of the current node; pass 0 for the root.
    """
    total_class = find_values(feature) #Total classes in the given dataframe
    current_entropy = entropy(total_class)

    # Only search for a split when the node is impure.
    best = _best_gini_split(feature) if current_entropy != 0 else None

    if best is None: #current node is a leaf node
        print('Level', level)
        print_values(total_class)
        print('Current Entropy is =', current_entropy)
        print('Reached at leaf Node')
        print()
        return

    split_f, left_node, right_node, left_data, right_data = best

    print('Level', level)
    print_values(total_class)
    print('Current Entropy is =', current_entropy)
    print('Splliting on', split_f,'with Gain Ratio',gain_ratio(total_class,left_node,right_node))
    print()


    run(left_data,level+1)
    run(right_data,level+1)


    return
    
            

In [16]:
run(feature,0)

Level 0
Count of 0 = 50
Count of 1 = 50
Count of 2 = 50
Current Entropy is = 1.584962500721156
Splliting on petal length (cm) with Gain Ratio 0.9999999999999999

Level 1
Count of 0 = 0
Count of 1 = 50
Count of 2 = 50
Current Entropy is = 1.0
Splliting on petal width (cm) with Gain Ratio 0.6933647985912662

Level 2
Count of 0 = 0
Count of 1 = 1
Count of 2 = 45
Current Entropy is = 0.15109697051711368
Splliting on petal length (cm) with Gain Ratio 0.2622302372762406

Level 3
Count of 0 = 0
Count of 1 = 0
Count of 2 = 43
Current Entropy is = 0.0
Reached at leaf Node

Level 3
Count of 0 = 0
Count of 1 = 1
Count of 2 = 2
Current Entropy is = 0.9182958340544896
Splliting on sepal length (cm) with Gain Ratio 1.0

Level 4
Count of 0 = 0
Count of 1 = 0
Count of 2 = 2
Current Entropy is = 0.0
Reached at leaf Node

Level 4
Count of 0 = 0
Count of 1 = 1
Count of 2 = 0
Current Entropy is = 0.0
Reached at leaf Node

Level 2
Count of 0 = 0
Count of 1 = 49
Count of 2 = 5
Current Entropy is = 0.4450648

## Implementing Decision Tree using sklearn

In [17]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pydotplus


In [18]:
data = pd.DataFrame(iris.data)
label = iris.target

In [19]:
data.columns = iris.feature_names

In [20]:
instance = DecisionTreeClassifier()

In [21]:
# Fix the seed: the tree permutes features at each split, so without
# random_state the fitted tree (and the exported PDF) can differ between runs.
instance = DecisionTreeClassifier(random_state=0)

In [22]:
instance.fit(data, label)

In [23]:
dot_data = export_graphviz(instance, out_file=None,feature_names = iris.feature_names,class_names = iris.target_names)

In [24]:
graph = pydotplus.graph_from_dot_data(dot_data)

In [25]:
graph.write_pdf('iris.pdf')

True