### Module Imports

In [1]:
from sklearn import datasets
import pandas as pd
import math

## Dataset - Iris

In [2]:
iris = datasets.load_iris()

# Functions

### label(val, *boundaries) => to find label for a value

In [3]:
#Function to find label for a value
#if MIN_Value <=val < (m + Mean_Value) / 2 then it is assigned label a
#if (m + Mean_Value) <=val < Mean_Value then it is assigned label b
#if (Mean_Value) <=val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <=val <= MAX_Value  then it is assigned label d

def label(val, *boundaries):
    if (val < boundaries[0]):
        return 'a'
    elif (val < boundaries[1]):
        return 'b'
    elif (val < boundaries[2]):
        return 'c'
    else:
        return 'd'


### toLabel(df, old_feature_name) => to convert a continuous data into labelled data

In [4]:
#Function to convert a continuous data into labelled data
#There are 4 lables  - a, b, c, d
def toLabel(df, old_feature_name):
    second = df[old_feature_name].mean()
    minimum = df[old_feature_name].min()
    first = (minimum + second)/2
    maximum = df[old_feature_name].max()
    third = (maximum + second)/2
    return df[old_feature_name].apply(label, args= (first, second, third))

# Input data df

In [5]:
df = pd.DataFrame(iris.data)
df.columns = ["sl", "sw", 'pl', 'pw']

## Convert all columns to labelled data

In [6]:
#Convert all columns to labelled data
df['sl_labeled'] = toLabel(df, 'sl')
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw')
# df

### drop ['sl', 'sw', 'pl', 'pw'] columns from df

In [7]:
df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)
df

Unnamed: 0,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,b,c,a,a
1,a,b,a,a
2,a,c,a,a
3,a,c,a,a
4,a,c,a,a
...,...,...,...,...
145,c,b,c,d
146,c,a,c,d
147,c,b,c,d
148,c,c,d,d


# Output data y

In [8]:
y = pd.DataFrame(iris.target)
y.columns = ['ftype']
y

Unnamed: 0,ftype
0,0
1,0
2,0
3,0
4,0
...,...
145,2
146,2
147,2
148,2


### flower_map to map: code --> species

In [9]:
flower_map = {
    0: 'Iris Setosa',
    1: 'Iris Versicolour',
    2: 'Iris Virginica'
}

# Decision Tree Node Class

In [10]:
#  A class to represent a node of a Decision Tree
class DecisionTreeNode:
    def __init__(self):
        self.depth_level = 0
        self.count_map = None
        self.entropy = 0
        self.children = []
        self.predicted_ftype = None

# Building Tree (Functions)

### print_flower_counts(count_map) => function to print flower counts

In [11]:
def print_flower_counts(count_map):
    for i in range(len(count_map)):
        fcode = count_map.index[i]
        fname = flower_map[fcode]
        count = count_map.values[i]
        
        print(f'Count of  {fcode}({fname})  =  {count}')

### get_entropy(count_map, total_count) => function to calculate entropy

In [12]:
def get_entropy(count_map, total_count):
    entropy = 0
    
    for count in count_map.values:
        probab_val = count/total_count
        log_val = math.log2(probab_val)
        
        entropy -= probab_val*log_val
        
    return entropy

### get_gain_ratio(df, y, best_feature, info_before_split) => function to calculate gain ratio

In [13]:
def get_gain_ratio(df, y, best_feature, info_before_split):
    # initialize split_info and info_after_split
    info_after_split = 0
    split_info = 0
    
    # possible values for best_feature
    possible_values = set(df[best_feature])
    
    # loop over possible values : val
    for val in possible_values:
        # find subset of y with f == val
        y_df = y[df[best_feature] == val]
        
        # weight of subset
        weight = len(y_df)/len(y)
        
        # calculate entropy of subset
        count_map = y_df.ftype.value_counts()
        entropy = get_entropy(count_map, len(y_df))
        
        # add entropy with weight in info_after_split
        info_after_split += weight * entropy
            
        # add weighted split_info in split_info
        split_info -= weight*math.log2(weight)
    
    
    # info_gain
    info_gain = info_before_split - info_after_split
    
    # gain_ratio
    gain_ratio = info_gain / split_info
    return gain_ratio
        

## build_tree(df, y, unused_features, depth_level) => function to build the tree

In [14]:
def build_tree(df, y, unused_features, depth_level):
    # Create Decision Tree Node
    node = DecisionTreeNode()
    
    # add depth level to node
    node.depth_level = depth_level
    
    # add count map to node
    node.count_map = y.ftype.value_counts()
    
    # add entropy to node
    node.entropy = get_entropy(node.count_map, len(y))
    
    
    # base cases
    # 1. y contains only one distinct value
    # 2. unused_features is empty => predict flower_type with majority
    if len(set(y['ftype'])) == 1 or len(unused_features) == 0:
        node.predicted_ftype = flower_map[node.count_map.index[0]]
        return node

    
    # Initialize best_feature and min_mistakes
    best_feature = ""
    min_mistakes = float('infinity')
    
    for f in unused_features:
        possible_values = set(df[f])
        mistakes = 0
        
        # loop over possible values : val
        for val in possible_values:
            # find subset of df & y with f == val
            val_df = df[df[f] == val]
            y_df = y[df[f] == val]
            
            # find number of mistakes in this subset
            # if we predict the most common y as the output
            # find sum of all these mistakes
            count_map = y_df.ftype.value_counts()
            mistakes += len(y_df) - count_map.values[0]
            
            
        # update best feature so that that particular feature
        # makes least number of mistakes
        if mistakes < min_mistakes:
            min_mistakes = mistakes
            best_feature = f
            
        
    # here you should know the best feature, print it out
    print("Best Feature ", best_feature)
    
    # print info_gain
    info_gain = get_gain_ratio(df, y, best_feature, node.entropy)
    print(f'Splitting on feature  {best_feature}  with gain ratio {info_gain}')
    
    print('----------------')
    print()
    
    # remove best feature from unused features
    unused_features.discard(best_feature)
    
    # loop over possible values of best feature
    possible_values = set(df[best_feature])
    for val in possible_values:
        # call build tree recursively
        new_df = df[df[best_feature] == val]
        new_y = y[df[best_feature] == val]
        
        child_node = build_tree(new_df, new_y, unused_features, depth_level +1)
        node.children.append(child_node)
        
    return node

In [15]:
# def build_tree(df, y, unused_features, depth_level):
#     # print depth level
#     print('Level ', depth_level)
    
#     # Get count map for each flower
#     count_map = y.ftype.value_counts()
    
#     # Print each flower type with its count
#     print_flower_counts(count_map)
    
#     # Print entropy
#     entropy = get_entropy(count_map, len(y))
#     print(f'Current Entropy  is = {entropy}')
    
    
#     # base cases
#     # 1. y contains only one distinct value
#     # 2. unused_features is empty
#     if len(set(y['ftype'])) == 1 or len(unused_features) == 0:
#         print('Predicted Flower type: ', flower_map[count_map.index[0]])
#         print('Reached leaf Node')
#         print('----------------')
#         print()
#         return

    
#     best_feature = ""
#     min_mistakes = float('infinity')
    
#     for f in unused_features:
#         possible_values = set(df[f])
#         mistakes = 0
        
#         # loop over possible values : val
#         for val in possible_values:
#             # find subset of df & y with f == val
#             val_df = df[df[f] == val]
#             y_df = y[df[f] == val]
            
#             # find number of mistakes in this subset
#             # if we predict the most common y as the output
#             # find sum of all these mistakes
#             count_map = y_df.ftype.value_counts()
#             mistakes += len(y_df) - count_map.values[0]
            
            
#         # update best feature so that that particular feature
#         # makes least number of mistakes
#         if mistakes < min_mistakes:
#             min_mistakes = mistakes
#             best_feature = f
            
        
#     # here you should know the best feature, print it out
#     print("Best Feature ", best_feature)
    
#     # print info_gain
#     info_gain = get_gain_ratio(df, y, best_feature, entropy)
#     print(f'Splitting on feature  {best_feature}  with gain ratio {info_gain}')
    
#     print('----------------')
#     print()
    
#     # remove best feature from unused features
#     unused_features.discard(best_feature)
    
#     # loop over possible values of best feature
#     possible_values = set(df[best_feature])
#     for val in possible_values:
#         # call build tree recursively
#         new_df = df[df[best_feature] == val]
#         new_y = y[df[best_feature] == val]
        
#         build_tree(new_df, new_y, unused_features, depth_level +1)

# Build the Decision Tree | Run Code

In [16]:
unused_features = set(df.columns)
root = build_tree(df, y, unused_features, 0)

Best Feature  pw_labeled
Splitting on feature  pw_labeled  with gain ratio 0.6996382036222091
----------------

Best Feature  pl_labeled
Splitting on feature  pl_labeled  with gain ratio 0.4334099495621067
----------------

Best Feature  sl_labeled
Splitting on feature  sl_labeled  with gain ratio 0.12674503775809332
----------------

Best Feature  sw_labeled
Splitting on feature  sw_labeled  with gain ratio 0.07092036405148876
----------------



# Print tree data - Pre-order traversal 

In [17]:
def pre_order_traversal(root):
    if root is None:
        return
    
    # print depth level
    print('Level ', root.depth_level)
    
    # Print each flower type with its count
    print_flower_counts(root.count_map)
    
    # print entropy
    print(f'Current Entropy  is = {root.entropy}')
    
    # print prediction of flower type
    if len(root.children) == 0:
        print('Reached leaf Node')
        
    print('----------------')
    print()
        
    # traverse children from left to right
    for child_node in root.children:
        pre_order_traversal(child_node)

In [18]:
pre_order_traversal(root)

Level  0
Count of  0(Iris Setosa)  =  50
Count of  1(Iris Versicolour)  =  50
Count of  2(Iris Virginica)  =  50
Current Entropy  is = 1.584962500721156
----------------

Level  1
Count of  1(Iris Versicolour)  =  40
Count of  2(Iris Virginica)  =  16
Current Entropy  is = 0.863120568566631
----------------

Level  2
Count of  1(Iris Versicolour)  =  39
Count of  2(Iris Virginica)  =  8
Current Entropy  is = 0.6581912658132185
----------------

Level  3
Count of  1(Iris Versicolour)  =  23
Count of  2(Iris Virginica)  =  7
Current Entropy  is = 0.783776947484701
----------------

Level  4
Count of  1(Iris Versicolour)  =  6
Current Entropy  is = 0.0
Reached leaf Node
----------------

Level  4
Count of  1(Iris Versicolour)  =  3
Count of  2(Iris Virginica)  =  1
Current Entropy  is = 0.8112781244591328
Reached leaf Node
----------------

Level  4
Count of  1(Iris Versicolour)  =  14
Count of  2(Iris Virginica)  =  6
Current Entropy  is = 0.8812908992306927
Reached leaf Node
-----------

# |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||