# Importing the required packages

In [1]:
! pip install pydotplus
! pip install graphviz



In [2]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.tree import export_graphviz
import pydotplus

# Loading the dataset 

In [3]:
iris_data = datasets.load_iris()

# Converting the dataset into dataframe

In [4]:
df_iris = pd.DataFrame(iris_data.data,columns= ['sl','sw','pl','pw'])

In [5]:
df_iris['target'] = iris_data.target

In [6]:
t_name = iris_data.target_names
df_iris['names_target'] = df_iris['target'].apply(lambda x: t_name[0] if x == 0 else (t_name[1] if x == 1 else t_name[2]))

# Analyzing the dataFrame

In [7]:
df_iris.isnull().sum()

sl              0
sw              0
pl              0
pw              0
target          0
names_target    0
dtype: int64

In [8]:
df_iris.head(2)

Unnamed: 0,sl,sw,pl,pw,target,names_target
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa


# Splitting the data into target and feature variables


In [9]:
df = df_iris.drop(['target', 'names_target'], axis = 1)
target = df_iris['names_target']

# Changing the continuous value into categorical values

In [10]:
def change_label(df, x, attr):
    second_limit = df[attr].mean()
    first_limit = 0.5 * second_limit
    third_limit = 1.5* second_limit
    if x < first_limit:
        return 'a'
    elif x < second_limit:
        return 'b'
    elif x < third_limit:
        return 'c' 
    else:
        return 'd'

In [11]:
df['sl_i']= df_iris['sl'].apply(lambda x: change_label(df_iris,x, 'sl'))
df['sw_i'] = df_iris['sw'].apply(lambda x: change_label(df_iris, x, 'sw'))
df['pl_i'] = df_iris['pl'].apply(lambda x: change_label(df_iris, x, 'pl'))
df['pw_i'] = df_iris['pw'].apply(lambda x: change_label(df_iris, x, 'pw'))

In [12]:
df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)

In [13]:
df

Unnamed: 0,sl_i,sw_i,pl_i,pw_i
0,b,c,a,a
1,b,b,a,a
2,b,c,a,a
3,b,c,a,a
4,b,c,a,a
...,...,...,...,...
145,c,b,c,d
146,c,b,c,d
147,c,b,c,d
148,c,c,c,d


# Converting into categorical values

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Defining the categorical values 
categories = ['sl_i','sw_i','pl_i','pw_i']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',one_hot,categories)],remainder='passthrough')
transformed_x = transformer.fit_transform(df)

In [15]:
y= pd.DataFrame(iris_data.target, columns= ["values"])

In [16]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size = 0.25, random_state = 42)

In [17]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)

# Implementation

In [18]:
y["values"].value_counts().values

array([50, 50, 50], dtype=int64)

In [31]:
import math   
def entropy(list_val): #Implementation of entropy
    val=0.0
    for i in list_val:
        val-=(i/sum(list_val))*(math.log(i/sum(list_val),2))
        
    return val
    pass


def build_tree(df,y,features,level):
    if len(features)==0: #In case all the features are over
        return
    
    if len(set(y["values"]))==0: #Avoiding the case of no outputs 
        return
    
    if  len(set(y["values"]))==1: #Case of pure node 
        info_y=y["values"].value_counts()
        E_y=entropy(list(info_y.values))
        
        print("Level:",level)
        index=0
        for i in info_y.index:
            print("Count of "+str(i)+" : "+str(info_y.values[index]))
            index+=1
        
        print("Current Entropy :",E_y)
        print("Reached Leaf Node")
        print()
        dot_data = export_graphviz(clf,filled = True, rounded = True)
        graph = pydotplus.graph_from_dot_data(dot_data)
        graph.write_pdf("iris.pdf")
        return
    
    max_gain=0.0
    best_feature=""
    info_y=y["values"].value_counts() #Determing values of differnent classes of outputs
    N=sum(list(info_y.values))        
    E_y=entropy(list(info_y.values)) #Entropy of the node to calculate gain ratio
    
    for i in features:
        possible_values=set(df[i])
        split_info=0.0
        E_f=0.0
        for j in possible_values:
            f=y[df[i]==j].value_counts() #Subset of Y and feature==val
            
            f_val=list(f.values)
            
            E_f+=(sum(f_val)/N)*entropy(f_val) #finding the entropy of the feature
            
            if sum(f_val)!=0: # If condition to avoid error in log function
                split_info-=(sum(f_val)/N)*(math.log(sum(f_val)/N,2))
            
            else:
                split_info-=0.0
        
        gain_ratio=(E_y-E_f)/split_info #gain ratio for each featute 
        
        if gain_ratio>max_gain: #Updating the best feature
            best_feature=i
            max_gain=gain_ratio
    
    print("Level:",level)
    index=0
    for i in info_y.index:
        print("Count of "+str(i)+" : "+str(info_y.values[index]))
        index+=1
        
    print("Current Entropy :",E_y)
    if i in features:
        features.remove(best_feature) #removing the best feature from the list
    print("Splitting on "+best_feature+" with gain ratio "+str(max_gain))
    print()
    
    #Itreating of all possible values of the best features
    #Calling recursively with respect to each value of feature
    if len(best_feature) != 0:
        for i in set(df[best_feature]):
            build_tree(df,y[df[best_feature]==i],features,level+1)


In [32]:
build_tree(df, y, list(set(df.columns)), 0)


Level: 0
Count of 0 : 50
Count of 1 : 50
Count of 2 : 50
Current Entropy : 1.584962500721156
Splitting on pw_i with gain ratio 0.7350016280496156

Level: 1
Count of 2 : 45
Count of 1 : 1
Current Entropy : 0.15109697051711368
Splitting on sw_i with gain ratio 0.031037861792700953

Level: 2
Count of 2 : 17
Count of 1 : 1
Current Entropy : 0.3095434291503252
Splitting on pl_i with gain ratio 0.057914261762502306

Level: 3
Count of 2 : 9
Current Entropy : 0.0
Reached Leaf Node



  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  gain_ratio=(E_y-E_f)/split_info #gain ratio for each featute
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  build_tree(df,y[df[best_feature]==i],features,level+1)
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_coun

Level: 3
Count of 2 : 8
Count of 1 : 1
Current Entropy : 0.5032583347756457
Splitting on  with gain ratio 0.0

Level: 2
Count of 2 : 28
Current Entropy : 0.0
Reached Leaf Node



  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  gain_ratio=(E_y-E_f)/split_info #gain ratio for each featute
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  build_tree(df,y[df[best_feature]==i],features,level+1)


Level: 1
Count of 1 : 39
Count of 2 : 5
Current Entropy : 0.5107878229540133
Splitting on pl_i with gain ratio 0.2488471906913506

Level: 2
Count of 2 : 1
Current Entropy : 0.0
Reached Leaf Node



  build_tree(df,y[df[best_feature]==i],features,level+1)
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  gain_ratio=(E_y-E_f)/split_info #gain ratio for each featute
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  gain_ratio=(E_y-E_f)/split_info #gain ratio for each featute
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  build_tree(df,y[df[best_feature]==i],features,level+1)


Level: 2
Count of 1 : 38
Count of 2 : 4
Current Entropy : 0.4537163391869448
Splitting on sw_i with gain ratio 0.04070432026142338

Level: 3
Count of 1 : 7
Current Entropy : 0.0
Reached Leaf Node

Level: 3
Count of 1 : 31
Count of 2 : 4
Current Entropy : 0.512709142030877
Splitting on sl_i with gain ratio 0.012981006561098145

Level: 4
Count of 1 : 17
Count of 2 : 3
Current Entropy : 0.6098403047164004
Splitting on  with gain ratio 0.0

Level: 4
Count of 1 : 14
Count of 2 : 1
Current Entropy : 0.35335933502142136
Splitting on  with gain ratio 0.0

Level: 2
Count of 1 : 1
Current Entropy : 0.0
Reached Leaf Node



  build_tree(df,y[df[best_feature]==i],features,level+1)
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  gain_ratio=(E_y-E_f)/split_info #gain ratio for each featute
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  gain_ratio=(E_y-E_f)/split_info #gain ratio for each featute
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  gain_ratio=(E_y-E_f)/split_info #gain ratio for each featute
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  build_tree(df,y[df[b

Level: 1
Count of 0 : 49
Current Entropy : 0.0
Reached Leaf Node



  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  gain_ratio=(E_y-E_f)/split_info #gain ratio for each featute
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  f=y[df[i]==j].value_counts() #Subset of Y and feature==val
  build_tree(df,y[df[best_feature]==i],features,level+1)


Level: 1
Count of 1 : 10
Count of 0 : 1
Current Entropy : 0.4394969869215134
Splitting on sw_i with gain ratio 1.0

Level: 2
Count of 0 : 1
Current Entropy : 0.0
Reached Leaf Node



  build_tree(df,y[df[best_feature]==i],features,level+1)


Level: 2
Count of 1 : 10
Current Entropy : 0.0
Reached Leaf Node

