In [92]:
from sklearn import datasets
import pandas as pd
import numpy as np

In [93]:
iris = datasets.load_iris()

In [94]:
df = pd.DataFrame(iris.data)
df.columns = ["sl", "sw", 'pl', 'pw']

In [95]:
#Function to find label for a value
#if MIN_Value <=val < (m + Mean_Value) / 2 then it is assigned label a
#if (m + Mean_Value) <=val < Mean_Value then it is assigned label b
#if (Mean_Value) <=val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <=val <= MAX_Value  then it is assigned label d

def label(val, *boundaries):
    if (val < boundaries[0]):
        return 0
    elif (val < boundaries[1]):
        return 1
    elif (val < boundaries[2]):
        return 2
    else:
        return 3

#Function to convert a continuous data into labelled data
#There are 4 lables  - 0, 1, 2, 3
def toLabel(df, old_feature_name):
    second = df[old_feature_name].mean()
    minimum = df[old_feature_name].min()
    first = (minimum + second)/2
    maximum = df[old_feature_name].max()
    third = (maximum + second)/2
    return df[old_feature_name].apply(label, args= (first, second, third))

In [96]:
#Convert all columns to labelled data
df['sl_labeled'] = toLabel(df, 'sl')
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw')
df

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,1,2,0,0
1,4.9,3.0,1.4,0.2,0,1,0,0
2,4.7,3.2,1.3,0.2,0,2,0,0
3,4.6,3.1,1.5,0.2,0,2,0,0
4,5.0,3.6,1.4,0.2,0,2,0,0
5,5.4,3.9,1.7,0.4,1,3,0,0
6,4.6,3.4,1.4,0.3,0,2,0,0
7,5.0,3.4,1.5,0.2,0,2,0,0
8,4.4,2.9,1.4,0.2,0,1,0,0
9,4.9,3.1,1.5,0.1,0,2,0,0


In [97]:
df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)

In [98]:
#changing feature names after labelling
df.columns = ["sl", "sw", 'pl', 'pw']
df.head()

Unnamed: 0,sl,sw,pl,pw
0,1,2,0,0
1,0,1,0,0
2,0,2,0,0
3,0,2,0,0
4,0,2,0,0


In [99]:
import math as m

In [100]:
def entropybs(df,y):
    l=len(y)
    total_entropy = 0
    for i in set(y):
        Di=(y==i).sum() #count of Di in output taken i as class.
        total_entropy+= ((-1*Di/l)*m.log(Di/l))
    return total_entropy

In [101]:

    # function for finding entropy after splitting
def entropyas(df,y,b_f):
    total_entropy=0
    l=len(y)
    for v in set(df[b_f]):    #v=a,b,c,d
        data=df[(df[b_f]==v)]  #new df of selected labels
        new_y=y[(df[b_f]==v)]  #class corresponding to that label
        Di=len(data)
        for i in set(new_y):
            count=(new_y==i).sum() #count of rows.
           # if(count!=0):  #if not leaf node
            total_entropy+=(Di/l*((-1*count/Di)*m.log(count/Di)))
    return total_entropy

In [102]:
# function for split info
def split_info(df,y,s_f):
    split_info=0
    l=len(y)
    for v in set(df[s_f]):
        data=df[(df[s_f]==v)] #new df of selected labels
       # new_y=y[(df[s_f]==v)] #class corresponding to that label
        Di=len(data)
        split_info+=((-1*Di/l)*m.log(Di/l))
    return split_info

In [103]:
def gain(df,y,bf):
    entropy_beforesplit=entropybs(df,y)
    entropy_aftersplit=entropyas(df,y,bf)
    info_gain=entropy_beforesplit-entropy_aftersplit
    gain_ratio=info_gain/split_info(df,y,bf)
    return gain_ratio

In [104]:
def build_tree(df, y, unused_features,level):
    print()
    #base case
    # 1. unused is empty
    if(len(unused_features)==0):
        print("level:",level)
        print("reached leaf node")
        print("current entropy : 0.0")
        for i in set(y):
            Di=(y==i).sum()
            print("count of",i,":",Di)
        return
        
    # 2. y contains only one distinct value(pure node)
    label_left=set(y)
    if (len(label_left)==1):
        print("levl:",level)
        print("reached leaf node")
        print("current entropy: 0")
        for i in set(y):
            Di=(y==i).sum()
            print("count of",i,":",Di)
        return
    
        
    
   #small calculation
    best_feature = ""
    max_gain=0
    for f in unused_features:         #finding best feature with max gain
        gain_ratio=gain(df,y,f)
        if gain_ratio>max_gain:
            max_gain=gain_ratio
            best_feature=f
   
    
        
    # here you should know the best feature
    # print it out

    print("level:",level)
    print("Best Feature:", best_feature)
    print("gain ratio:",max_gain)
    print("Entropy:",entropybs(df,y))
    for i in set(y):
        Di=(y==i).sum()
        print("count of",i,":",Di)
    
    
    # remove best feature from unused features
    unused_features.remove(best_feature)
    
    b_f = best_feature
    # loop over possible values of best feature
    for v in set(df[b_f]):    #v=a,b,c,d
        data=df[(df[b_f]==v)]  #new df of selected labels
        new_y=y[(df[b_f]==v)]   #class according to new df
        # call build tree recursively
        build_tree(data,new_y,unused_features,level+1) 
    
    return
    
    
    

In [105]:
#main
y = iris.target
unused_features = list(df.columns)
build_tree(df, y, unused_features,0)



level: 0
Best Feature: pw
gain ratio: 0.6996382036222091
Entropy: 1.0986122886681096
count of 0 : 50
count of 1 : 50
count of 2 : 50

levl: 1
reached leaf node
current entropy: 0
count of 0 : 50

levl: 1
reached leaf node
current entropy: 0
count of 1 : 10

level: 1
Best Feature: pl
gain ratio: 0.43340994956210666
Entropy: 0.5982695885852573
count of 1 : 40
count of 2 : 16

levl: 2
reached leaf node
current entropy: 0
count of 1 : 1

level: 2
Best Feature: sl
gain ratio: 0.1267450377580933
Entropy: 0.45622342016761397
count of 1 : 39
count of 2 : 8

levl: 3
reached leaf node
current entropy: 0
count of 2 : 1

levl: 3
reached leaf node
current entropy: 0
count of 1 : 14

level: 3
Best Feature: sw
gain ratio: 0.0709203640514889
Entropy: 0.5432727813369008
count of 1 : 23
count of 2 : 7

level: 4
reached leaf node
current entropy : 0.0
count of 1 : 3
count of 2 : 1

level: 4
reached leaf node
current entropy : 0.0
count of 1 : 14
count of 2 : 6

level: 4
reached leaf node
current entropy

In [106]:
unused_features

[]