# Implementing Decision Tree Classifier


## Importing Libraries


In [155]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


## Importing and preprocessing the data


In [156]:
data = pd.read_csv(r'./data/nursery.csv')


In [157]:
data.count()


parents             12960
has_nurs            12960
form                12960
children            12960
housing             12960
finance             12960
social              12960
health              12960
final evaluation    12960
dtype: int64

There is no missing data in the dataset. So, we can directly move on to the next step.


In [158]:
y = data['final evaluation']
X = data.drop(labels=['final evaluation'], axis=1)


In [159]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


## Implementing the model


In [160]:
class DecisionTreeClassifier:
    def __init__(self, max_depth=None, impurity='entropy'):
        self.max_depth = max_depth
        self.impurity_name = impurity
        self.main_tree = None

        if max_depth is not None and self.max_depth <= 0:
            raise Exception('max_depth should be positive')

        if impurity == 'entropy':
            self.impurity = self._get_entropy

        elif impurity == 'gini':
            self.impurity = self._get_gini

    def encode_X(self, X):
        return np.array(X.apply(LabelEncoder().fit_transform))

    def encode_y(self, y):
        self.label_encoder = LabelEncoder()
        return self.label_encoder.fit_transform(y)

    def decode_y(self, y):

        if self.label_encoder is None:
            raise Exception('Label encoder is not initialized')

        return self.label_encoder.inverse_transform(y)

    def _get_probablity(self, X):
        return np.bincount(X)/X.shape[0]

    def _get_entropy(self, X):
        X = self._get_probablity(X)
        X = X[X != 0]
        return np.dot(X, -np.log2(X))

    def _get_gini(self, X):
        X = self._get_probablity(X)
        return 1 - np.dot(X, X)

    def check_is_fitted(self):
        if self.main_tree is None:
            raise Exception('model is not trained')

    def get_information_gain(self, X, y):

        if len(X.shape) != 2:
            raise Exception('X should be 2D array')

        info_gain = list()
        for i in range(X.shape[1]):
            X1 = X[:, i]

            info_gain.append(self.impurity(y)
                             - np.sum(
                                 [self._get_probablity(X1)[j] *
                                  self.impurity(y[X1 == j])
                                  for j in np.unique(X1)]))

        return np.array(info_gain)

    def _name_branch(self, depth, branch_number):
        return branch_number
        # return 'depth {} sub_cat {}'.format(depth, branch_number)

    def _split_nodes(self, X, y):

        if len(X.shape) != 2:
            raise Exception('X should be 2D array')

        if len(self.count_categories(X)) == 0:
            raise Exception('model is not trained')

        self.number_of_nodes += 1
        X_feature_arg = np.argmax(self.get_information_gain(X[1:, :], y))
        feature_arg = X[0, X_feature_arg]
        leftover_features = np.delete(X[0, :], X_feature_arg).reshape(1, -1)
        X = X[1:, :]
        X1 = X[:, X_feature_arg]
        X = np.delete(X, X_feature_arg, axis=1)

        sub_nodes = dict()

        for i in self.columns_categories[feature_arg]:

            splitted_data = np.concatenate((leftover_features,
                                            X[X1 == i]), axis=0)

            splitted_labels = y[X1 == i]

            branch_name = self._name_branch(self._depth_iter, i)

            if len(splitted_labels) == 0:
                sub_nodes[branch_name] = [np.bincount(y).argmax(), [len(y)]]
                self.number_of_leafs += 1
                continue

            if np.all(splitted_labels == splitted_labels[0]):
                sub_nodes[branch_name] = [
                    splitted_labels[0], [len(splitted_labels)]]
                self.number_of_leafs += 1
                continue

            sub_nodes[branch_name] = (splitted_data, splitted_labels)

        return {(feature_arg, tuple(np.bincount(y))): sub_nodes}

    def count_categories(self, columns):
        if len(columns.shape) != 2:
            raise Exception('X should be 2D array')

        return [np.unique(columns[:, i]) for i in range(columns.shape[1])]

    def summary(self):
        pass

    def fit(self, X, y):

        if len(X.shape) != 2:
            raise Exception('X should be 2D array')

        if len(y.shape) != 1:
            raise Exception('y should be 1D array')

        if X.shape[0] != y.shape[0]:
            raise Exception('X and y should have same number of rows')

        if self.max_depth is None:
            self.max_depth = X.shape[1]

        self.columns_categories = self.count_categories(X)
        self._depth_iter = 0
        self.number_of_nodes = 0
        self.number_of_leafs = 0

        X = np.concatenate((np.arange(X.shape[1]).reshape(1, -1), X), axis=0)

        self.main_tree = self._split_nodes(X, y)
        sub_trees = [self.main_tree]

        for self._depth_iter in range(1, self.max_depth):
            sub_trees_temp = []
            for sub_tree in sub_trees:
                node = list(sub_tree.keys())[0]
                for branch in sub_tree[node]:

                    X_ys = sub_tree[node][branch]
                    if type(X_ys) == tuple:
                        sub_tree[node][branch] = self._split_nodes(
                            X_ys[0], X_ys[1])
                        sub_trees_temp.append(sub_tree[node][branch])

                sub_trees = sub_trees_temp

        # Replacing the not learned data with the maximum frequent one
        for sub_tree in sub_trees:
            node = list(sub_tree.keys())[0]
            for branch in sub_tree[node]:
                X_ys = sub_tree[node][branch]
                if type(X_ys) == tuple:
                    sub_tree[node][branch] = [np.bincount(
                        X_ys[1]).argmax(), np.bincount(X_ys[1]).tolist()]
                    self.number_of_leafs += 1
                    continue

        self.number_of_nodes += self.number_of_leafs

        return self.main_tree

    def _predict_on_tree(self, X, tree):

        self.check_is_fitted()
        if len(X.shape) != 2:
            raise Exception('X should be 2D array')

        y_predicted = list()
        for x in X:
            sub_tree = tree
            for depth in range(self.max_depth):
                if type(sub_tree) != dict:
                    break
                node = list(sub_tree.keys())[0]
                x_category = self._name_branch(depth, x[node[0]])
                sub_tree = sub_tree[node][x_category]

            y_predicted.append(sub_tree[0])

        return np.array(y_predicted)

    def _score_on_tree(self, X, y, tree):

        return np.sum(self._predict_on_tree(X, tree) == y)/y.shape[0]

    def _error_on_tree(self, X, y, tree):

        return 1 - self._score_on_tree(X, y, tree)

    def predict(self, X):
        return self._predcit_on_tree(X, self.main_tree)

    def score(self, X, y):
        return self._score_on_tree(X, y, self.main_tree)

    def error(self, X, y):
        return self._error_on_tree(X, y, self.main_tree)

    def get_depth(self, tree):
        sub_trees = [tree]
        depth = 0
        while True:

            sub_trees_temp = []
            for sub_tree in sub_trees:
                node = list(sub_tree.keys())[0]
                for branch in sub_tree[node]:

                    if type(sub_tree[node][branch]) == dict:
                        sub_trees_temp.append(sub_tree[node][branch])

            depth += 1
            sub_trees = sub_trees_temp
            if len(sub_trees) == 0:
                return depth

    def count_nodes(self, tree):
        if len(tree) != 1:
            raise Exception('tree should have one root node')

        number_of_nodes = 1
        sub_trees = [tree]
        for _ in range(self.get_depth(tree)):

            sub_trees_temp = []
            for sub_tree in sub_trees:
                node = list(sub_tree.keys())[0]
                for branch in sub_tree[node]:
                    number_of_nodes += 1
                    if type(sub_tree[node][branch]) == dict:
                        sub_trees_temp.append(sub_tree[node][branch])

            sub_trees = sub_trees_temp

        return number_of_nodes
    
    def count_leafs(self, tree):
        if len(tree) != 1:
            raise Exception('tree should have one root node')

        number_of_leafs = 0
        sub_trees = [tree]
        for _ in range(self.get_depth(tree)):

            sub_trees_temp = []
            for sub_tree in sub_trees:
                node = list(sub_tree.keys())[0]
                for branch in sub_tree[node]:
                    if type(sub_tree[node][branch]) == dict:
                        sub_trees_temp.append(sub_tree[node][branch])
                    else:
                        number_of_leafs += 1

            sub_trees = sub_trees_temp

        return number_of_leafs

    def _get_subtrees(self, tree):

        if type(tree) != dict:
            raise Exception('tree should be dict')

        sub_trees = [[tree]]

        for depth in range(1, self.max_depth):
            sub_trees_on_depth = []
            for sub_tree in sub_trees[depth-1]:
                node = list(sub_tree.keys())[0]
                for branch in sub_tree[node]:
                    sub_branch = sub_tree[node][branch]
                    if type(sub_branch) == dict:
                        sub_trees_on_depth.append(sub_branch)

            sub_trees.append(sub_trees_on_depth)

        return sub_trees

    def prune(self, X_val, y_val,
              X_train=None,
              y_train=None,
              X_test=None,
              y_test=None,
              max_prune_depth=1):

        self.check_is_fitted()

        if len(X_val.shape) != 2:
            raise Exception('X should be 2D array')
        if len(y_val.shape) != 1:
            raise Exception('y should be 1D array')
        if X_val.shape[0] != y_val.shape[0]:
            raise Exception('X and y should have the same number of rows')
        if max_prune_depth > self.max_depth:
            raise Exception('max_prune_depth should be less than max_depth')

        if (X_train is not None) and (y_train is not None):
            self.train_errors_list = [self.score(X_train, y_train)]

        if (X_test is not None) and (y_test is not None):
            self.test_errors_list = [self.score(X_test, y_test)]

        self.validation_errors_list = [self.score(X_val, y_val)]
        self.list_of_nodes = [self.number_of_nodes]

        self.main_tree_pruned = self.main_tree.copy()
        sub_trees = self._get_subtrees(self.main_tree_pruned)

        pass
        # for depth_on_prune in range(1, max_prune_depth+1):
        #     for sub_tree in sub_trees[-depth_on_prune]:
        #         node = list(sub_tree.keys())[0]
        #         back_up_sub_tree = sub_tree[node].copy()
        #         sub_tree[node] = np.argmax(np.array(node[1]))

        #         for cat in
        # Compare the validation error with the previous one
        # if self._error_on_tree(
        #         X_val,
        #         y_val,
        #         self.main_tree_puned
        # ) < self._error_on_tree(
        #         X_val,
        #         y_val,
        #         self.main_tree
        # ):

        #     self.main_tree = \
        #         self.main_tree_pruned
        #     self.validation_errors_list.append(self.error(X_val,
        #                                                   y_val))

        #     if (X_train is not None) and (y_train is not None):
        #         self.train_errors_list.append(self.error(X_train,
        #                                                  y_train))

        #     if (X_test is not None) and (y_test is not None):
        #         self.test_errors_list.append(self.error(X_test,
        #                                                 y_test))


## Pruning the tree

In [161]:
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=2/9, random_state=42)

model = DecisionTreeClassifier(max_depth=None, impurity='entropy')

X_train_encoded = model.encode_X(X_train)
X_val_encoded = model.encode_X(X_val)
X_test_encoded = model.encode_X(X_test)

y_train_encoded = model.encode_y(y_train)
y_val_encoded = model.encode_y(y_val)
y_test_encoded = model.encode_y(y_test)

model.fit(X_train_encoded,y_train_encoded)

{(7, (3050, 2989, 2, 2808, 223)): {0: [0, [3050]],
  1: {(1,
    (0,
     1300,
     0,
     1685)): {0: {(0,
      (0,
       71,
       0,
       527)): {0: {(3,
        (0,
         1,
         0,
         205)): {0: {(2,
          (0,
           1,
           0,
           47)): {0: {(4,
            (0,
             1,
             0,
             12)): {0: {(5, (0, 1, 0, 3)): {0: [1, [1]], 1: [3, [3]]}}, 1: [3,
             [5]], 2: [3, [4]]}}, 1: [3, [10]], 2: [3, [12]], 3: [3,
           [13]]}}, 1: [3, [55]], 2: [3, [48]], 3: [3, [55]]}}, 1: {(2,
        (0,
         1,
         0,
         193)): {0: {(4,
          (0,
           1,
           0,
           46)): {0: {(3,
            (0,
             1,
             0,
             9)): {0: {(5, (0, 1, 0, 1)): {0: [1, [1]], 1: [3, [1]]}}, 1: [3,
             [1]], 2: [3, [4]], 3: [3, [3]]}}, 1: [3, [18]], 2: [3, [19]]}},
        1: [3, [51]],
        2: [3, [49]],
        3: [3, [47]]}}, 2: {(4,
        (0,
         69,
      

In [162]:
model.get_depth(model.main_tree)

8

In [163]:
len(model.main_tree)

1

In [164]:
model.number_of_nodes

1107

In [165]:
model.number_of_leafs

796

In [166]:
model.count_nodes(model.main_tree)

1107

In [167]:
model.count_leafs(model.main_tree)

796

In [168]:
model.prune(X_val_encoded,y_val_encoded,max_prune_depth=3)

## Training the Decision Tree Classification model on the Training set


### Encoding the categorical data


In [169]:
model = DecisionTreeClassifier(max_depth=None, impurity='entropy')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)


In [170]:
model.fit(X_train_encoded, y_train_encoded)

{(7, (3050, 2989, 2, 2808, 223)): {0: [0, [3050]],
  1: {(1,
    (0,
     1300,
     0,
     1685)): {0: {(0,
      (0,
       71,
       0,
       527)): {0: {(3,
        (0,
         1,
         0,
         205)): {0: {(2,
          (0,
           1,
           0,
           47)): {0: {(4,
            (0,
             1,
             0,
             12)): {0: {(5, (0, 1, 0, 3)): {0: [1, [1]], 1: [3, [3]]}}, 1: [3,
             [5]], 2: [3, [4]]}}, 1: [3, [10]], 2: [3, [12]], 3: [3,
           [13]]}}, 1: [3, [55]], 2: [3, [48]], 3: [3, [55]]}}, 1: {(2,
        (0,
         1,
         0,
         193)): {0: {(4,
          (0,
           1,
           0,
           46)): {0: {(3,
            (0,
             1,
             0,
             9)): {0: {(5, (0, 1, 0, 1)): {0: [1, [1]], 1: [3, [1]]}}, 1: [3,
             [1]], 2: [3, [4]], 3: [3, [3]]}}, 1: [3, [18]], 2: [3, [19]]}},
        1: [3, [51]],
        2: [3, [49]],
        3: [3, [47]]}}, 2: {(4,
        (0,
         69,
      

##


In [171]:
model.number_of_leafs

796

In [172]:
model.number_of_nodes

1107

In [173]:
model.count_categories(X_train_encoded)


[array([0, 1, 2]),
 array([0, 1, 2, 3, 4]),
 array([0, 1, 2, 3]),
 array([0, 1, 2, 3]),
 array([0, 1, 2]),
 array([0, 1]),
 array([0, 1, 2]),
 array([0, 1, 2])]

## Evaluating the model on the training set and test set


In [174]:
model.score(X_train_encoded[1:2,:], y_train_encoded[1:2])


1.0

In [175]:
model.score(X_test_encoded, y_test_encoded)


0.6651234567901234

### Evaluate the the model with different size of training set and different hyperparameters

In [176]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

model = DecisionTreeClassifier(max_depth=None, impurity='entropy')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)
model.fit(X_train_encoded, y_train_encoded)
print('training size 50%, depth = 8, impurity = entropy')
print('Train score: {:.4}'.format(
    model.score(X_train_encoded, y_train_encoded)))
print('Test score: {:.4}'.format(model.score(X_test_encoded, y_test_encoded)))
print('Number of leafs: {}'.format(model.number_of_leafs))
print('Number of nodes(size of the tree): {}'.format(model.number_of_nodes))


training size 50%, depth = 8, impurity = entropy
Train score: 1.0
Test score: 0.6514
Number of leafs: 671
Number of nodes(size of the tree): 941


In [177]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

model = DecisionTreeClassifier(max_depth=None, impurity='entropy')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)
model.fit(X_train_encoded, y_train_encoded)
print('training size 75%, depth = 8, impurity = entropy')
print('Train score: {:.4}'.format(
    model.score(X_train_encoded, y_train_encoded)))
print('Test score: {:.4}'.format(model.score(X_test_encoded, y_test_encoded)))
print('Number of leafs: {}'.format(model.number_of_leafs))
print('Number of nodes(size of the tree): {}'.format(model.number_of_nodes))


training size 75%, depth = 8, impurity = entropy
Train score: 1.0
Test score: 0.6623
Number of leafs: 767
Number of nodes(size of the tree): 1075


In [178]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

model = DecisionTreeClassifier(max_depth=6, impurity='entropy')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)
model.fit(X_train_encoded, y_train_encoded)
print('training size 50%, depth = 6, impurity = entropy')
print('Train score: {:.4}'.format(
    model.score(X_train_encoded, y_train_encoded)))
print('Test score: {:.4}'.format(model.score(X_test_encoded, y_test_encoded)))
print('Number of leafs: {}'.format(model.number_of_leafs))
print('Number of nodes(size of the tree): {}'.format(model.number_of_nodes))


training size 50%, depth = 6, impurity = entropy
Train score: 0.9784
Test score: 0.6494
Number of leafs: 405
Number of nodes(size of the tree): 571


In [179]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

model = DecisionTreeClassifier(max_depth=6, impurity='entropy')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)
model.fit(X_train_encoded, y_train_encoded)
print('training size 75%, depth = 6, impurity = entropy')
print('Train score: {:.4}'.format(
    model.score(X_train_encoded, y_train_encoded)))
print('Test score: {:.4}'.format(model.score(X_test_encoded, y_test_encoded)))
print('Number of leafs: {}'.format(model.number_of_leafs))
print('Number of nodes(size of the tree): {}'.format(model.number_of_nodes))


training size 75%, depth = 6, impurity = entropy
Train score: 0.9754
Test score: 0.6559
Number of leafs: 416
Number of nodes(size of the tree): 589


#### Gini impurity

In [180]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

model = DecisionTreeClassifier(max_depth=None, impurity='gini')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)
model.fit(X_train_encoded, y_train_encoded)
print('training size 50%, depth = 8, impurity = gini')
print('Train score: {:.4}'.format(
    model.score(X_train_encoded, y_train_encoded)))
print('Test score: {:.4}'.format(model.score(X_test_encoded, y_test_encoded)))
print('Number of leafs: {}'.format(model.number_of_leafs))
print('Number of nodes(size of the tree): {}'.format(model.number_of_nodes))


training size 50%, depth = 8, impurity = gini
Train score: 1.0
Test score: 0.652
Number of leafs: 663
Number of nodes(size of the tree): 931


In [181]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

model = DecisionTreeClassifier(max_depth=None, impurity='gini')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)
model.fit(X_train_encoded, y_train_encoded)
print('training size 75%, depth = 8, impurity = gini')
print('Train score: {:.4}'.format(
    model.score(X_train_encoded, y_train_encoded)))
print('Test score: {:.4}'.format(model.score(X_test_encoded, y_test_encoded)))
print('Number of leafs: {}'.format(model.number_of_leafs))
print('Number of nodes(size of the tree): {}'.format(model.number_of_nodes))


training size 75%, depth = 8, impurity = gini
Train score: 1.0
Test score: 0.6623
Number of leafs: 756
Number of nodes(size of the tree): 1062


In [182]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

model = DecisionTreeClassifier(max_depth=6, impurity='gini')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)
model.fit(X_train_encoded, y_train_encoded)
print('training size 50%, depth = 6, impurity = gini')
print('Train score:', model.score(X_train_encoded, y_train_encoded))
print('Test score:', model.score(X_test_encoded, y_test_encoded))
print('Number of leafs:', model.number_of_leafs)
print('Number of nodes(size of the tree):', model.number_of_nodes)


training size 50%, depth = 6, impurity = gini
Train score: 0.979783950617284
Test score: 0.6484567901234568
Number of leafs: 410
Number of nodes(size of the tree): 578


In [183]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

model = DecisionTreeClassifier(max_depth=6, impurity='gini')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)
model.fit(X_train_encoded, y_train_encoded)
print('training size 75%, depth = 6, impurity = gini')
print('Train score: {:.4}'.format(
    model.score(X_train_encoded, y_train_encoded)))
print('Test score: {:.4}'.format(model.score(X_test_encoded, y_test_encoded)))
print('Number of leafs: {}'.format(model.number_of_leafs))
print('Number of nodes(size of the tree): {}'.format(model.number_of_nodes))


training size 75%, depth = 6, impurity = gini
Train score: 0.9756
Test score: 0.6559
Number of leafs: 408
Number of nodes(size of the tree): 580
