In [None]:
#Import necessary libraries
import numpy as np
import pandas as pd

# Getting data and preprocessing it

In [None]:
# Names of the five columns of the iris table: four measurements followed by the class label.
# NOTE(review): col_names is not referenced by read_csv or by any other cell visible here;
# the column names shown in the preview come from the header row of iris.csv.
col_names = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width', 'variety']  #features and target
# Load the raw table and preview its first ten rows
data = pd.read_csv("iris.csv")
data.head(10)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
5,5.4,3.9,1.7,0.4,Setosa
6,4.6,3.4,1.4,0.3,Setosa
7,5.0,3.4,1.5,0.2,Setosa
8,4.4,2.9,1.4,0.2,Setosa
9,4.9,3.1,1.5,0.1,Setosa


In [None]:
# List the distinct class labels stored in the 'variety' column before they are encoded
data['variety'].unique()

array(['Setosa', 'Versicolor', 'Virginica'], dtype=object)

In [None]:
#Label encoding
# Import label encoder
from sklearn import preprocessing

# label_encoder object knows
# how to understand word labels.
label_encoder = preprocessing.LabelEncoder()

# Encode labels in column 'variety': the string labels are overwritten
# with integer codes directly in `data`.
data['variety']= label_encoder.fit_transform(data['variety'])

# Confirm that the column now holds only the integer codes
data['variety'].unique()

array([0, 1, 2])

In [None]:
# Preview the table again to check the encoded 'variety' column
data.head(10)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
8,4.4,2.9,1.4,0.2,0
9,4.9,3.1,1.5,0.1,0


# Building the tree

In [None]:
#Node class
class Node:
  '''One vertex of the decision tree.

  A decision node carries a split rule (feature_index, threshold), its two
  children and the information gain of the split; a leaf node carries only
  `value`. Every attribute defaults to None.
  '''

  def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
    ''' constructor '''

    #Decision node: which feature is tested and the threshold it is compared against
    self.feature_index, self.threshold = feature_index, threshold

    #Decision node: left child and right child of this node
    self.left, self.right = left, right

    #Decision node: information gain stored for the split
    self.info_gain = info_gain

    #Leaf node: the class held by the leaf
    self.value = value

In [None]:
#Tree class
class DecisionTreeClassifier():
  '''Binary decision tree classifier grown greedily on information gain.

  Every decision node tests "feature <= threshold": rows that satisfy the test
  go to the left child, all other rows go to the right child. A leaf holds the
  most frequent class label among the training rows that reached it.
  '''

  def __init__(self, min_samples_split=2, max_depth=2):
    '''constructor

    min_samples_split -- a node holding fewer samples than this is not split
    max_depth         -- a node is split only while its depth (root = 0) is
                         <= max_depth, so leaves can sit at depth max_depth + 1
    '''

    #Initialise the root of the tree (assigned by fit)
    self.root = None

    #Stopping conditions
    self.min_samples_split = min_samples_split   #If a particular node has less than these many samples, then we won't split it further
    self.max_depth = max_depth                   #If the depth of a node exceeds the maximum depth, then we won't split it again

  def build_tree(self, dataset, curr_depth=0):
    '''recursive function to build the tree

    dataset    -- 2-D array; the last column is the target, the others are features
    curr_depth -- depth of the node being built (0 for the root)

    Returns a decision Node when a split with positive information gain
    exists and the stopping conditions allow it, otherwise a leaf Node.
    '''

    X, Y =  dataset[:,:-1], dataset[:,-1]   #Splitting data into features and target
    num_samples, num_features = np.shape(X)

    #split until stopping conditions are met
    if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
      #find the best split
      best_split = self.get_best_split(dataset, num_samples, num_features)
      #check if information gain is positive; get_best_split returns an empty
      #dict when no threshold leaves rows on both sides (e.g. all rows share the
      #same feature values), which must produce a leaf instead of a KeyError
      if best_split.get('info_gain', 0)>0:
        #recur left
        left_subtree = self.build_tree(best_split['dataset_left'],curr_depth+1)   #recursion
        #recur right
        right_subtree = self.build_tree(best_split['dataset_right'], curr_depth+1) #recursion
        #return decision node
        return Node(best_split['feature_index'], best_split['threshold'], left_subtree,
                    right_subtree, best_split['info_gain'])

    #compute leaf node
    leaf_value = self.calculate_leaf_value(Y)
    #return leaf node
    return Node(value=leaf_value)

  def get_best_split(self, dataset, num_samples, num_features):
    '''function to get best split

    Tries every distinct value of every feature as a threshold and keeps the
    split with the largest entropy-based information gain (the first one found
    wins ties).

    Returns a dict with keys 'feature_index', 'threshold', 'dataset_left',
    'dataset_right' and 'info_gain', or an empty dict when no threshold
    leaves at least one row on each side.
    '''

    #dictionary to store the best split
    best_split = {}   #initialise empty dictionary
    max_info_gain = -float('inf')

    #target column of the parent node; it is the same for every candidate split
    y = dataset[:, -1]

    #loop over all the features
    for feature_index in range(num_features):
      feature_values = dataset[:, feature_index]
      possible_threshold = np.unique(feature_values)
      #loop over all the possible values present in the data
      for threshold in possible_threshold:
        #get current split
        dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
        #check if childs are not null
        if len(dataset_right)>0 and len(dataset_left)>0:
          left_y, right_y = dataset_left[:, -1], dataset_right[:, -1]
          #compute information gain
          curr_info_gain = self.information_gain(y, left_y, right_y, "entropy")
          #update the best split if needed
          if curr_info_gain > max_info_gain:
            best_split['feature_index'] = feature_index
            best_split['threshold'] = threshold
            best_split['dataset_left'] = dataset_left
            best_split['dataset_right'] = dataset_right
            best_split['info_gain'] = curr_info_gain
            max_info_gain = curr_info_gain

    #return best split
    return best_split

  def split(self, dataset,  feature_index, threshold):
    '''function to split dataset

    dataset is a 2-D array (rows are samples). Returns (dataset_left,
    dataset_right): the rows whose value in column feature_index is
    <= threshold, and the rows whose value is > threshold. A side without rows
    is an array of length 0.
    '''

    dataset = np.asarray(dataset)
    feature_values = dataset[:, feature_index]
    #two independent masks (not one mask and its negation) so that a row whose
    #value fails both comparisons, such as NaN, lands in neither child
    dataset_left = dataset[feature_values<=threshold]
    dataset_right = dataset[feature_values>threshold]
    return dataset_left, dataset_right


  def information_gain(self, parent, l_child, r_child, mode = 'entropy'):
    '''function to compute information gain

    Impurity of the parent minus the size-weighted impurity of the two
    children. mode == 'gini' uses the gini index; any other value uses entropy.
    '''

    weight_l = len(l_child)/len(parent)
    weight_r = len(r_child)/len(parent)
    if mode == 'gini':
      gain = self.gini_index(parent) - (weight_l * self.gini_index(l_child) + weight_r * self.gini_index(r_child))
    else:
      gain = self.entropy(parent) - (weight_l * self.entropy(l_child) + weight_r * self.entropy(r_child))
    return gain

  def entropy(self, y):
    '''function to compute entropy (base-2) of the label array y'''

    class_labels = np.unique(y)
    entropy = 0
    for cls in class_labels:
      #p_cls is never 0 because cls is taken from the labels present in y
      p_cls = len(y[y==cls])/len(y)
      entropy += -p_cls * np.log2(p_cls)
    return entropy

  def gini_index(self, y):
    '''function to compute gini index: 1 - sum of squared class proportions of y'''

    class_labels = np.unique(y)
    gini = 0
    for cls in class_labels:
      p_cls = len(y[y==cls])/len(y)
      gini += p_cls**2
    return 1-gini

  def calculate_leaf_value(self, y):
    '''function to compute leaf node

    Returns the most frequent label in y; on a tie the label that appears
    first in y wins.
    '''

    y = list(y)
    return max(y, key=y.count)

  def print_tree(self, tree=None, indent=' '):
    '''function to print the tree

    Prints the subtree rooted at `tree` (the fitted root when omitted).
    The indent string is doubled at every level.
    '''

    if tree is None:
      tree = self.root

    if tree.value is not None:
      print(tree.value)

    else:
      print("X"+str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain)
      print("%sleft:" % (indent), end="")
      self.print_tree(tree.left, indent + indent)
      print("%sright:" % (indent), end="")
      self.print_tree(tree.right, indent + indent)

  def fit(self, X, Y):
    '''function to train the trees

    X -- 2-D array of features, one row per sample
    Y -- labels, either a column vector of shape (n, 1) or a 1-D vector of length n
    '''

    X = np.asarray(X)
    Y = np.asarray(Y)
    #a 1-D label vector is turned into a column so it can be appended to X
    if Y.ndim == 1:
      Y = Y.reshape(-1, 1)
    dataset = np.concatenate((X,Y), axis=1)
    self.root = self.build_tree(dataset)

  def predict(self, X):
    '''function to predict class of dataset

    Returns a list with one predicted label per row of X.
    '''

    predictions = [self.make_prediction(x, self.root) for x in X]
    return predictions

  def make_prediction(self, x, tree):
    '''function to predict a single data point

    Walks down from `tree` following the "feature <= threshold" tests until a
    leaf is reached and returns that leaf's value.
    '''

    #identity check: a leaf is any node whose value has been set
    if tree.value is not None:
      return tree.value
    feature_val = x[tree.feature_index]
    if feature_val<=tree.threshold:
      return self.make_prediction(x, tree.left)
    else:
      return self.make_prediction(x, tree.right)



# Train-Test split

In [None]:
# Features: every column of `data` except the last one, as a NumPy array
X = data.iloc[:, :-1].values
# Target: the last column, reshaped into a column vector because
# DecisionTreeClassifier.fit appends Y to X with np.concatenate(..., axis=1)
Y = data.iloc[:, -1].values.reshape(-1,1)
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split the same on every run
X_train, X_test,  Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state= 1234)


# Fit the model

In [None]:
# Grow the tree on the training split and print its structure.
# build_tree keeps splitting while curr_depth <= max_depth (root is depth 0), so
# max_depth=3 permits four levels of decision nodes, as the printed tree shows.
classifier = DecisionTreeClassifier(min_samples_split = 3, max_depth =3)
classifier.fit(X_train, Y_train)
classifier.print_tree()

X2 <= 1.9 ? 0.9264046681474137
 left:0.0
 right:X3 <= 1.7 ? 0.6350305085499441
  left:X2 <= 4.9 ? 0.24077433336810117
    left:X3 <= 1.6 ? 0.18717625687320816
        left:1.0
        right:2.0
    right:X3 <= 1.5 ? 0.4591479170272448
        left:2.0
        right:1.0
  right:X2 <= 4.8 ? 0.10306798632688519
    left:X0 <= 5.9 ? 0.9182958340544896
        left:1.0
        right:2.0
    right:2.0


# Test the model

In [None]:
# Predict a label for every held-out row (predict returns a plain list)
Y_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score
# Score the predictions against the held-out labels
accuracy_score(Y_test, Y_pred)


1.0