In [17]:
# Import the libraries (pandas, numpy, etc.) that are used throughout the code.
from sklearn import datasets
import pandas as pd
import math
import numpy as np
import sys

# Convert a continuous value into one of four categories ('a', 'b', 'c', 'd') so that a decision tree can be built from the discretised dataset.
def label(val, *boundaries):
    """Map a continuous value to one of the category letters 'a' to 'd'.

    The first three entries of ``boundaries`` are read as exclusive upper
    limits for 'a', 'b' and 'c' respectively; a value that is not below any
    of them is labelled 'd'.
    """
    # Boundaries are looked up one comparison at a time, so only the
    # thresholds that are actually needed get indexed.
    for position, letter in enumerate('abc'):
        if val < boundaries[position]:
            return letter
    return 'd'
# Compute the three boundary values used to split a feature into categories (the mean, and the average of the mean with the minimum and with the maximum), then call label() on every value of the feature.
def toLabel(df, old_feature_name):
    """Return the column ``old_feature_name`` of ``df`` as category letters.

    The three thresholds handed to ``label`` are the midpoint of the column
    minimum and mean, the mean itself, and the midpoint of the mean and the
    column maximum.
    """
    column = df[old_feature_name]
    mean_value = column.mean()
    lower_cut = (column.min() + mean_value)/2
    upper_cut = (column.max() + mean_value)/2
    # label is applied element-wise; the thresholds travel as extra
    # positional arguments.
    return column.apply(label, args=(lower_cut, mean_value, upper_cut))

# info() calculates the entropy of the given labels.
def info(y):
    """Return the entropy, in bits, of the values held in ``y``.

    ``y`` has to support ``len()`` and ``value_counts()``. When ``y`` is
    empty the loop never runs and 0 is returned.
    """
    total = len(y)
    entropy = 0
    for count in y.value_counts():
        share = count/total
        # log2(count) - log2(total) is the log of this value's share; it is
        # never positive, so subtracting the weighted term adds to the entropy.
        entropy = entropy - share*(math.log2(count) - math.log2(total))
    return entropy

# Receives the input data, the results, the set of unused features and the current level, then recursively prints the structure of the decision tree.
def build_tree(df, y, unused_features, lev=0):
    """Recursively print a decision tree whose splits are chosen by gain ratio.

    Args:
        df: DataFrame holding the categorical feature columns.
        y: Target values indexed like ``df``; boolean masks built from
            ``df`` columns are applied to it.
        unused_features: Set of column names of ``df`` that may still be
            used for a split. The set is not modified.
        lev: Depth of the current node, printed with every node.

    Returns:
        None. The tree is only printed, never materialised.
    """
    # Base case 1: every feature has already been used on this path.
    if len(unused_features) == 0:
        print("Level ", lev)
        print("No more features to split on ")
        return

    y_info = info(y)

    # Base case 2: zero entropy means the node is pure, i.e. a leaf.
    if y_info == 0:
        print("Level ", lev)
        print("Current Entropy is=", y_info)
        print("Reached Leaf Node")
        return

    best_feature = None
    gain_ratio = 0
    total = len(df)

    for f in unused_features:
        column = df[f]
        inf = 0
        split_info = 0
        for value in set(column):
            mask = (column == value)
            num = int(mask.sum())
            # Weighted entropy of the targets that fall into this branch.
            inf += (num/total)*info(y[mask])
            split_info -= (num/total)*(math.log2(num) - math.log2(total))

        # A feature with a single distinct value in this subset has zero
        # split info; it cannot separate anything and dividing by it would
        # raise ZeroDivisionError, so it is skipped.
        if split_info == 0:
            continue

        gain = (y_info - inf)/split_info
        if gain > gain_ratio:
            gain_ratio = gain
            best_feature = f

    # No remaining feature reduces the entropy: stop here instead of looking
    # up a non-existent column.
    if best_feature is None:
        print("Level ", lev)
        print("Current Entropy is=", y_info)
        print("No feature gives a positive gain ratio")
        return

    print("Level ", lev)
    print("Current Entropy is=", y_info)
    print("Splitting on feature", best_feature, "with gain ratio", gain_ratio)

    # Children get their own feature set so the caller's set is never
    # mutated and the same feature is not split on twice along a path.
    remaining_features = unused_features - {best_feature}
    best_column = df[best_feature]
    for value in set(best_column):
        mask = (best_column == value)
        build_tree(df[mask], y[mask], remaining_features, lev + 1)
    return

# Load the iris dataset into a DataFrame with short names for the four
# measurement columns.
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=["sl", "sw", 'pl', 'pw'])
# Add one '<name>_labeled' column per continuous column, holding the category
# letters produced by toLabel.
df = df.assign(**{name + '_labeled': toLabel(df, name) for name in df.columns})
# Keep only the labelled columns; the continuous ones are no longer needed.
df = df.drop(columns=['sl', 'sw', 'pl', 'pw'])
# The class of every sample, as a single-column DataFrame.
y = pd.DataFrame(iris.target)
# Every labelled column is initially available for splitting.
unused_features = set(df.columns)

build_tree(df, y, unused_features)






       

Level  0
Current Entropy is= 1.5849625007211559
Splitting on feature pw_labeled with gain ratio 0.6996382036222091
Level  1
Current Entropy is= 0.8631205685666303
Splitting on feature pl_labeled with gain ratio 0.4334099495621061
Level  2
Current Entropy is= 0.6581912658132184
Splitting on feature sl_labeled with gain ratio 0.12674503775809312
Level  3
Current Entropy is= 0.7837769474847011
Splitting on feature sw_labeled with gain ratio 0.07092036405148876
Level  4
No more features to split on 
Level  4
No more features to split on 
Level  4
No more features to split on 
Level  3
Current Entropy is= 0.0
Reached Leaf Node
Level  3
Current Entropy is= 0.0
Reached Leaf Node
Level  3
Current Entropy is= 0.0
Reached Leaf Node
Level  2
Current Entropy is= 0.0
Reached Leaf Node
Level  2
Current Entropy is= 0.0
Reached Leaf Node
Level  1
Current Entropy is= 0.0
Reached Leaf Node
Level  1
Current Entropy is= 0.0
Reached Leaf Node
Level  1
Current Entropy is= 0.0
Reached Leaf Node


34