In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Tree Implementation
    
    - Class Node
        - contains some default values such as information gain , entropy threshold and feature_index on which it    divided the dataset
    - class DecisionTreeClassfier
        - it contains the functions for building the decision tree, calculating information gain , entropy with other helper functions

In [2]:
# class Node for tree
class Node:
    def __init__(self,feature_index=None,threshold=None,left=None,right=None,info_gain=None,entropy=None,value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain
        self.entropy = entropy
        self.value = value

In [3]:
# decision tree classifier class
class myDecisionTreeClassifier():
    def __init__(self, min_samples_split=2, max_depth=2,feature_list=[],target_list=[]):
        ''' constructor '''
        # initialize the root of the tree 
        self.root = None
        # stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        
        # feature list and target lists
        self.feature_list = feature_list
        self.target_list = target_list
        
    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree ''' 
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        
        # printing the current node metadata of tree
        print('\nLevel ',curr_depth)
        val,count = np.unique(Y,return_counts=True)
#         print(val,count)
#         print('Y:',Y)
#         print(dataset)
        val,count = [int(i) for i in val],[int(j) for j in count]
        for idx in range(len(val)):
#             print(int(uv),type(uv))
            print('Count of '+str(self.target_list[val[idx]])+' = '+str(count[idx]))
        
        # split until stopping conditions are met
        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if information gain is positive
            if best_split["info_gain"]>0:
                # print the node parameters
                gain_ratio = best_split["info_gain"]/best_split["entropy_curr_node"]
                print('Current Entropy is = '+str(best_split["entropy_curr_node"]))
                print('Splitting on feature '+str(self.feature_list[best_split["feature_index"]])+' with gain ratio = '+str(gain_ratio))
                
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"],left_subtree, right_subtree, best_split["info_gain"],best_split["entropy_curr_node"])
        
        # compute leaf node
        print('Current entropy is = 0.0')
        print('Reached leaf node')
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)
    
    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split '''
        
        # dictionary to store the best split
        best_split = {}
        max_info_gain = -float("inf")
        
        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if childs are not null
                if len(dataset_left)>0 and len(dataset_right)>0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute information gain
                    curr_info_gain, curr_entropy = self.information_gain(y, left_y, right_y, "gini")
                    # update the best split if needed
                    if curr_info_gain>max_info_gain:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["info_gain"] = curr_info_gain
                        best_split["entropy_curr_node"] = curr_entropy
                        max_info_gain = curr_info_gain
                        
        return best_split
    
    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''
        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        return dataset_left, dataset_right
    
    def information_gain(self, parent, l_child, r_child, mode="entropy"):
        ''' function to compute information gain '''
        
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        entropy_parent = self.entropy(parent)
        if mode=="gini":
            gain = self.gini_index(parent) - (weight_l*self.gini_index(l_child) + weight_r*self.gini_index(r_child))
        else:
            gain = entropy_parent - (weight_l*self.entropy(l_child) + weight_r*self.entropy(r_child))
        return gain,entropy_parent
    
    def entropy(self, y):
        ''' function to compute entropy '''
        class_labels = np.unique(y)
        entropy = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            entropy += -p_cls * np.log2(p_cls)
        return entropy
    
    def gini_index(self, y):
        ''' function to compute gini index '''
#         print('y =>',y)
        class_labels = np.unique(y)
#         print('here')
        gini = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            gini += p_cls**2
        return 1 - gini
    
    def get_unique_counts(self,y):
        l = np.unique(y)
        print()
    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''
        Y = list(Y)
        return max(Y, key=Y.count)

    def fit(self, X, Y):
        ''' function to train the tree '''
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)
    
    def predict(self, X):
        ''' function to predict new dataset '''
        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions
    
    def make_prediction(self, x, tree):
        ''' function to predict a single data point '''
        if tree.value!=None: return tree.value
        feature_val = x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

# IRIS Dataset

### dataset preprocessing
     - loading the dataset and doing some basic preprocessing before giving it as an input.

In [4]:
iris_data = pd.read_csv('iris.csv')
iris_data['species'].value_counts()
dd={
    'Iris-setosa':0,
    'Iris-versicolor':1,
    'Iris-virginica':2
}
iris_data['species'].replace(dd,inplace=True)
f_list = ['sepal_length','sepal_width','petal_length','petal_width']
t_list = list(dd.keys())
print(iris_data.info())
print(iris_data['species'].value_counts())
iris_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB
None
0    50
1    50
2    50
Name: species, dtype: int64


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


### testing dataset

In [5]:
X = iris_data.iloc[:, :-1].values
Y = iris_data.iloc[:, -1].values.reshape(-1,1)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=41)

In [6]:
clf = myDecisionTreeClassifier(min_samples_split=3, max_depth=3,feature_list = f_list,target_list=t_list)
clf.fit(X_train,Y_train)


Level  0
Count of Iris-setosa = 41
Count of Iris-versicolor = 39
Count of Iris-virginica = 40
Current Entropy is = 1.5846619079379884
Splitting on feature petal_length with gain ratio = 0.21292482140004135

Level  1
Count of Iris-setosa = 41
Current entropy is = 0.0
Reached leaf node

Level  1
Count of Iris-versicolor = 39
Count of Iris-virginica = 40
Current Entropy is = 0.9998844148717589
Splitting on feature petal_width with gain ratio = 0.4271560110626066

Level  2
Count of Iris-versicolor = 37
Count of Iris-virginica = 1
Current Entropy is = 0.17556502585750278
Splitting on feature petal_length with gain ratio = 0.2918949098536057

Level  3
Count of Iris-versicolor = 37
Current entropy is = 0.0
Reached leaf node

Level  3
Count of Iris-virginica = 1
Current entropy is = 0.0
Reached leaf node

Level  2
Count of Iris-versicolor = 2
Count of Iris-virginica = 39
Current Entropy is = 0.2811937964320427
Splitting on feature petal_length with gain ratio = 0.06981367359652843

Level  3
C

In [7]:
Y_pred = clf.predict(X_test) 
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, Y_pred)

0.9333333333333333

In [8]:
# # graph pdf generate   ===> python-graphiz not working for my current custom model
# from sklearn.tree import export_graphviz
# import pydotplus

# export_graphviz(clf,out_file='decision_tree.dot')
# (tr,) = pydotplus.graph_from_dot_file('decision_tree.dot')
# tr.write_pdf('demo.pdf')
# # (graph,) = pydot.graph_from_dot_file('tree_from_forest.dot')
# # graph.write_png('tree_from_forest.png')

In [9]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=10)
model.fit(X,Y)

DecisionTreeClassifier(max_depth=10)

In [10]:
from sklearn.tree import export_graphviz
import pydotplus

export_graphviz(model,out_file='tree_iris.dot')
tr = pydotplus.graph_from_dot_file('tree_iris.dot')
tr.write_pdf('demo.pdf')

InvocationException: GraphViz's executables not found

In [11]:
!pip3 install graphiz

[31mERROR: Could not find a version that satisfies the requirement graphiz[0m
[31mERROR: No matching distribution found for graphiz[0m


# XOR Dataset 
    - full xor table is used as a dataset (4 rows , 3 cols[x1,x2,y])

In [12]:
xor_dataset = pd.read_csv('xor.csv')
X = xor_dataset.iloc[:, :-1].values
Y = xor_dataset.iloc[:, -1].values.reshape(-1,1)
clf_xor = myDecisionTreeClassifier(min_samples_split=2, max_depth=3,feature_list = ['X1','X2'],target_list=['0','1'])
clf_xor.fit(X,Y)


Level  0
Count of 0 = 1
Count of 1 = 3
Current Entropy is = 0.8112781244591328
Splitting on feature X1 with gain ratio = 0.15407786335091392

Level  1
Count of 0 = 1
Count of 1 = 1
Current Entropy is = 1.0
Splitting on feature X2 with gain ratio = 0.5

Level  2
Count of 0 = 1
Current entropy is = 0.0
Reached leaf node

Level  2
Count of 1 = 1
Current entropy is = 0.0
Reached leaf node

Level  1
Count of 1 = 2
Current entropy is = 0.0
Reached leaf node


In [13]:
# # graph pdf generate
# from sklearn.tree import export_graphviz
# import pydotplus

# export_graphviz(clf,out_file='decision_tree.dot')
# (tr,) = pydotplus.graph_from_dot_file('decision_tree.dot')
# tr.write_pdf('demo.pdf')
# # (graph,) = pydot.graph_from_dot_file('tree_from_forest.dot')
# # graph.write_png('tree_from_forest.png')

 - unable to install graphiz module , so no pdf files available/uploading for tree visualization
         - tried through conda - broken packages
         - installed through pip3 but not compatabile with anaconda packages

## End Project