In [None]:
from sklearn.datasets import load_iris

In [None]:
import numpy as np
import pandas as pd

In [None]:
col = ['sepal_length','sepal_width','petal_length','petal_width']
# load the dataset once and reuse it for both the features and the labels
iris = load_iris()
df = pd.DataFrame(iris.data, columns=col)
# take the class labels from the dataset itself instead of assuming the rows
# are ordered in three consecutive groups of 50
df['type'] = iris.target
df.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,type
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [None]:
# Node class 
class Node():
  '''One node of the decision tree.

  A decision node carries the split (feature_index, threshold), its two
  children and the information gain of the split; a leaf node only
  carries value.
  '''

  def __init__(self, feature_index=None, threshold=None, left=None,
               right=None, information_gain=None, value=None):
    '''Store the given fields on the node; every field defaults to None.'''
    # --- leaf payload ---
    self.value = value

    # --- decision payload ---
    # column of the dataset the split looks at
    self.feature_index = feature_index
    # value of that column the rows are compared against
    self.threshold = threshold
    self.information_gain = information_gain

    # --- children ---
    self.left = left
    self.right = right


In [None]:
# Decision tree classifier class
class DTClassifier():
  '''Decision tree classifier that chooses splits by Gini-impurity reduction.

  Internally the training data is one 2-D array: every column except the
  last is a feature, the last column is the class label.
  '''

  def __init__(self, min_sample=2, max_depth=2):
    '''Store the stopping criteria; the tree itself is built by fit().

    min_sample -- a node is only split while it holds at least this many rows.
    max_depth  -- a node is only split while its depth (root = 0) is
                  <= max_depth, so leaves can sit as deep as max_depth + 1.
    '''
    self.root = None

    self.min_sample = min_sample
    self.max_depth = max_depth

  def tree_builder(self, dataset, current_depth=0):
    '''Recursively build the subtree for dataset and return its root Node.

    dataset       -- 2-D array, features in all columns but the last,
                     label in the last column.
    current_depth -- depth of the node being built (root = 0).
    '''
    # every column except the last is a feature; the last one is the label
    x = dataset[:, :-1]
    y = dataset[:, -1]
    # np.shape gives (samples, features)
    samples, features = np.shape(x)

    # only try to split while both stopping criteria allow it
    if samples >= self.min_sample and current_depth <= self.max_depth:

      best_split = self.create_best_split(dataset, samples, features)
      # create_best_split returns {} when no threshold puts rows on both
      # sides (e.g. all feature rows are identical); treat that the same as
      # "no gain" instead of failing on the missing key.
      if best_split.get('information_gain', 0) > 0:
        # build the left subtree first, then the right one
        left_stree = self.tree_builder(best_split["left"], current_depth+1)
        right_stree = self.tree_builder(best_split["right"], current_depth+1)

        # decision node describing the chosen split
        return Node(best_split['feature_index'], best_split['threshold'], left_stree,
                    right_stree, best_split['information_gain'])

    # no (useful) split: this node becomes a leaf holding the majority label
    leaf_value = self.get_leaf_value(y)
    return Node(value=leaf_value)

  def create_best_split(self, dataset, samples, features):
    '''Search every feature/threshold pair for the highest information gain.

    Returns a dict with the keys 'feature_index', 'threshold', 'left',
    'right' and 'information_gain', or an empty dict when no threshold
    leaves rows on both sides. samples is accepted for interface
    compatibility and is not used.
    '''
    best_split = {}
    # start below any reachable gain so the first valid split is always kept
    max_information_gain = -float("inf")
    # the parent labels are the same for every candidate split
    d = dataset[:, -1]

    for feature in range(features):
      # feature is the column index (e.g. 0 -> first feature column)
      feature_values = dataset[:, feature]
      # candidate thresholds: the distinct values this feature takes
      p_thresholds = np.unique(feature_values)

      for threshold in p_thresholds:
        # rows with value <= threshold go left, the remaining rows go right
        dataset_left = dataset[feature_values <= threshold]
        dataset_right = dataset[feature_values > threshold]

        # a split is only usable when both sides received at least one row
        if len(dataset_left) > 0 and len(dataset_right) > 0:
          left_d, right_d = dataset_left[:, -1], dataset_right[:, -1]
          current_information_gain = self.calc_information_gain(d, left_d, right_d)

          # strict '>' keeps the first split found among equally good ones
          if current_information_gain > max_information_gain:
            best_split['feature_index'] = feature
            best_split['threshold'] = threshold
            best_split['left'] = dataset_left
            best_split['right'] = dataset_right
            best_split['information_gain'] = current_information_gain
            max_information_gain = current_information_gain

    return best_split

  def calc_information_gain(self, parent, left_child, right_child):
    '''Return gini(parent) minus the size-weighted gini of the two children.'''
    # each child is weighted by the share of the parent's rows it received
    w_left_child = len(left_child)/len(parent)
    w_right_child = len(right_child)/len(parent)
    info_gain = self.calc_gini_index(parent)-(w_left_child*self.calc_gini_index(left_child) +
                                              w_right_child*self.calc_gini_index(right_child))

    return info_gain

  def calc_gini_index(self, Y):
    '''Return the Gini impurity of the label array Y: 1 - sum(p_i ** 2).'''
    labels = np.unique(Y)
    squared_sum = 0
    for label in labels:
      # p_i: fraction of the rows carrying this label
      pi = len(Y[Y==label])/len(Y)
      squared_sum += pi**2
    return 1-squared_sum

  def get_leaf_value(self, Y):
    '''Return the most frequent label in Y (the first one seen wins ties).'''
    Y = list(Y)
    return max(Y, key=Y.count)

  def print(self, tree=None, indent=" "):
    '''Print the (sub)tree rooted at tree; defaults to the fitted root.

    Leaves print their value. Decision nodes print
    "X_<feature> <= <threshold> ? <information gain>" followed by their
    left and right branches; indent is doubled at every level.
    '''
    if tree is None:
      tree = self.root
    if tree.value is not None:
      print(tree.value)
    else:
      print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.information_gain)
      print("%sleft:" % (indent), end="" )
      self.print(tree.left, indent + indent)
      print("%sright:" % (indent), end="")
      self.print(tree.right, indent + indent)

  def fit(self, x, y):
    '''Train the tree on features x (2-D) and labels y.

    y may be a column of shape (n, 1) or a flat sequence of length n; a
    flat y is turned into a column so it can be appended to x.
    '''
    y = np.asarray(y)
    if y.ndim == 1:
      y = y.reshape(-1, 1)
    # labels become the last column of the training array
    dataset = np.concatenate((x, y), axis=1)
    self.root = self.tree_builder(dataset)

  def predict(self, X):
    '''Return a list with the predicted label for every row of X.'''
    prediction = [self.get_prediction(x, self.root) for x in X]
    return prediction

  def get_prediction(self, x, tree):
    '''Walk one sample x down from tree and return the label of the leaf reached.'''
    # only leaves carry a value; identity check so falsy labels still count
    if tree.value is not None:
      return tree.value
    f_val = x[tree.feature_index]
    if f_val <= tree.threshold:
      return self.get_prediction(x, tree.left)
    else:
      return self.get_prediction(x, tree.right)


In [None]:
from sklearn.model_selection import train_test_split

# features: every column but the last; labels: the last column, kept as an (n, 1) column
X = df.iloc[:, :-1].to_numpy()
Y = df.iloc[:, [-1]].to_numpy()
# hold out 20% of the rows for testing; fixed seed so the split is repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=31
)

In [None]:
# hyper-parameters of the tree, gathered in one place
tree_params = {"min_sample": 3, "max_depth": 3}
classifier = DTClassifier(**tree_params)
classifier.fit(X_train, Y_train)
# show the learned splits
classifier.print()

X_2 <= 1.9 ? 0.3413247863247863
 left:0.0
 right:X_2 <= 4.8 ? 0.4023703242326723
  left:X_3 <= 1.6 ? 0.06463527239150524
    left:1.0
    right:X_1 <= 2.8 ? 0.4444444444444444
        left:2.0
        right:1.0
  right:X_3 <= 1.7 ? 0.03500000000000009
    left:X_0 <= 6.3 ? 0.21333333333333332
        left:2.0
        right:1.0
    right:2.0


In [None]:
from sklearn.metrics import accuracy_score

# predict the held-out rows and report the share of correct labels
Y_pred = classifier.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_pred)

0.9