In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets

In [2]:
iris_data = datasets.load_iris()
iris_data

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [3]:
X = iris_data.data
Y = iris_data.target
print(X.shape)
print(Y.shape)

(150, 4)
(150,)


In [4]:
X_df = pd.DataFrame(X)
X_df.columns = iris_data.feature_names
X_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
def label(x, *boundary):
    if(x <= boundary[0]):
        return "a"
    elif(boundary[0] < x <= boundary[1]):
        return "b"
    elif(boundary[1] < x <= boundary[2]):
        return "c"
    else:
        return "d"

In [6]:
def discrete_val(x, f_name):
    min_val = x[f_name].min()
    mean_val = x[f_name].mean()
    max_val = x[f_name].max()
    b1 = (min_val+mean_val)/2
    b2 = (max_val+mean_val)/2
    return x[f_name].apply(label, args = (b1, mean_val, b2))

In [7]:
X_df["sl"] = discrete_val(X_df, "sepal length (cm)")
X_df["sw"] = discrete_val(X_df, "sepal width (cm)")
X_df["pl"] = discrete_val(X_df, "petal length (cm)")
X_df["pw"] = discrete_val(X_df, "petal width (cm)")

In [8]:
X_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),sl,sw,pl,pw
0,5.1,3.5,1.4,0.2,b,c,a,a
1,4.9,3.0,1.4,0.2,a,b,a,a
2,4.7,3.2,1.3,0.2,a,c,a,a
3,4.6,3.1,1.5,0.2,a,c,a,a
4,5.0,3.6,1.4,0.2,a,c,a,a


In [9]:
X_df.drop(["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"], axis=1, inplace = True)
X_df.head()

Unnamed: 0,sl,sw,pl,pw
0,b,c,a,a
1,a,b,a,a
2,a,c,a,a
3,a,c,a,a
4,a,c,a,a


In [10]:
set(Y)

{0, 1, 2}

In [11]:
def entropy_node(y):
    entropy = 0
    for i in set(y):
        p = len(y[y == i])/len(y)
        entropy -= (p)*np.log(p)
    return entropy    

In [12]:
def gain_ratio(x, y, f, entropy_prev):
    entropy = 0
    for i in set(x):
        entropy -= (entropy_node(y[x == i]))*(len(y[x == i])/len(y))
    return (entropy_prev - entropy)

In [14]:
level = 0
def d_tree_implement(x, y, features):
    global level
    print("\n")
    print("Level: ", level)
    for cls in range(3):
        print("Class ", cls," =", len(y[y==cls]))
        
    entropy = entropy_node(y)        
    print("Current Entropy is: ", entropy)
    if(len(set(y)) == 1):        
        level-=1
        print("Pure leaf node")
        return 0 
    elif(len(features) == 0):        
        level-=1
        print("Leaf node")
        return 0
    else:        
        max_gain = 0
        for f in features:
            gain = gain_ratio(x[f], y, f, entropy)
            if(max_gain <= gain):
                max_gain = gain
                best_f = f
        features.remove(best_f)
        print("Splitting on ", best_f, " feature")
        for val in set(x[best_f]):
            level+=1
            x1 = x[x[best_f] == val]
            y1 = y[x[best_f] == val]
            d_tree_implement(x1, y1, features)
    level-=1
    return 0

In [15]:
f = list(set(X_df.columns))
print(f)
d_tree_implement(X_df, Y, f)

['pw', 'sw', 'sl', 'pl']


Level:  0
Class  0  = 50
Class  1  = 50
Class  2  = 50
Current Entropy is:  1.0986122886681096
Splitting on  sw  feature


Level:  1
Class  0  = 7
Class  1  = 29
Class  2  = 28
Current Entropy is:  0.9624037646424741
Splitting on  sl  feature


Level:  2
Class  0  = 0
Class  1  = 15
Class  2  = 4
Current Entropy is:  0.5146531654289292
Splitting on  pl  feature


Level:  3
Class  0  = 0
Class  1  = 2
Class  2  = 0
Current Entropy is:  0.0
Pure leaf node


Level:  3
Class  0  = 0
Class  1  = 13
Class  2  = 4
Current Entropy is:  0.5455945739691843
Splitting on  pw  feature


Level:  4
Class  0  = 0
Class  1  = 1
Class  2  = 0
Current Entropy is:  0.0
Pure leaf node


Level:  4
Class  0  = 0
Class  1  = 12
Class  2  = 0
Current Entropy is:  0.0
Pure leaf node


Level:  4
Class  0  = 0
Class  1  = 0
Class  2  = 4
Current Entropy is:  0.0
Pure leaf node


Level:  2
Class  0  = 7
Class  1  = 0
Class  2  = 0
Current Entropy is:  0.0
Pure leaf node


Level:  2
Class

0