# Implementing Decision Tree Classifier


## Importing Libraries


In [133]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


## Importing and preprocessing the data


In [134]:
data = pd.read_csv(r'./data/nursery.csv')


In [135]:
data.count()


parents             12960
has_nurs            12960
form                12960
children            12960
housing             12960
finance             12960
social              12960
health              12960
final evaluation    12960
dtype: int64

There is no missing data in the dataset. So, we can directly move on to the next step.


In [136]:
y = data['final evaluation']
X = data.drop(labels=['final evaluation'], axis=1)


In [137]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


## Implementing the model


In [138]:
class DecisionTreeClassifier:
    def __init__(self, max_depth=None, impurity='entropy'):
        self.max_depth = max_depth
        self.impurity_name = impurity
        self.decision_tree_pathes = None

        if self.max_depth is None:
            self.max_depth = X.shape[1]

        if impurity == 'entropy':
            self.impurity = self.entropy

        elif impurity == 'gini':
            self.impurity = self.gini

    def encode_X(self, X):
        return np.array(X.apply(LabelEncoder().fit_transform))

    def encode_y(self, y):
        self.label_encoder = LabelEncoder()
        return self.label_encoder.fit_transform(y)

    def decode_y(self, y):

        if self.label_encoder is None:
            raise Exception('Label encoder is not initialized')

        return self.label_encoder.inverse_transform(y)

    def probablity(self, X):
        return np.bincount(X)/X.shape[0]

    def entropy(self, X):
        X = self.probablity(X)
        X = X[X != 0]
        return np.dot(X, -np.log2(X))

    def gini(self, X):
        X = self.probablity(X)
        return 1 - np.dot(X, X)

    def information_gain(self, X, y):

        if len(X.shape) != 2:
            raise Exception('X should be 2D array')

        info_gain = list()
        for i in range(X.shape[1]):
            X1 = X[:, i]

            info_gain.append(self.impurity(y)
                             - np.sum(
                                 [self.probablity(X1)[j] *
                                  self.impurity(y[X1 == j])
                                  for j in np.unique(X1)]))

        return np.array(info_gain)

    def _name_branch(self, depth, branch_number):
        # return branch_number
        return 'depth {} sub_cat {}'.format(depth, branch_number)

    def _split_nodes(self, X, y):

        if len(X.shape) != 2:
            raise Exception('X should be 2D array')

        if len(self.count_categories(X)) == 0:
            raise Exception('model is not trained')

        self.number_of_nodes += 1
        X_feature_arg = np.argmax(self.information_gain(X[1:, :], y))
        feature_arg = X[0, X_feature_arg]
        leftover_features = np.delete(X[0, :], X_feature_arg).reshape(1, -1)
        X = X[1:, :]
        X1 = X[:, X_feature_arg]
        X = np.delete(X, X_feature_arg, axis=1)

        sub_nodes = dict()

        for i in self.columns_categories[feature_arg]:

            splitted_data = np.concatenate((leftover_features,
                                            X[X1 == i]), axis=0)

            splitted_labels = y[X1 == i]

            branch_name = self._name_branch(self.depth_iter, i)

            if len(splitted_labels) == 0:
                sub_nodes[branch_name] = [np.bincount(y).argmax(), len(y)]
                self.number_of_leafs += 1
                continue

            if np.all(splitted_labels == splitted_labels[0]):
                sub_nodes[branch_name] = [
                    splitted_labels[0], len(splitted_labels)]
                self.number_of_leafs += 1
                continue

            sub_nodes[branch_name] = (splitted_data, splitted_labels)

        return {feature_arg: sub_nodes}

    def count_categories(self, columns):
        if len(columns.shape) != 2:
            raise Exception('X should be 2D array')

        return [np.unique(columns[:, i]) for i in range(columns.shape[1])]

    def fit(self, X, y):

        if len(X.shape) != 2:
            raise Exception('X should be 2D array')

        if len(y.shape) != 1:
            raise Exception('y should be 1D array')

        if X.shape[0] != y.shape[0]:
            raise Exception('X and y should have same number of rows')

        self.columns_categories = self.count_categories(X)
        self.depth_iter = 0
        self.number_of_nodes = 0
        self.number_of_leafs = 0

        X = np.concatenate((np.arange(X.shape[1]).reshape(1, -1), X), axis=0)

        self.decision_tree_pathes = self._split_nodes(X, y)
        sub_trees = [self.decision_tree_pathes]

        for self.depth_iter in range(1, self.max_depth):
            sub_trees_temp = []
            for sub_tree in sub_trees:
                node = list(sub_tree.keys())[0]
                for cat in sub_tree[node]:

                    X_ys = sub_tree[node][cat]
                    if type(X_ys) == tuple:
                        sub_tree[node][cat] = self._split_nodes(
                            X_ys[0], X_ys[1])
                        sub_trees_temp.append(sub_tree[node][cat])

                sub_trees = sub_trees_temp

        # Replacing the not learned data with the maximum frequent one
        for sub_tree in sub_trees:
            node = list(sub_tree.keys())[0]
            for cat in sub_tree[node]:
                X_ys = sub_tree[node][cat]
                if type(X_ys) == tuple:
                    sub_tree[node][cat] = [np.bincount(
                        X_ys[1]).argmax(), ] + np.bincount(X_ys[1]).tolist()
                    self.number_of_leafs += 1
                    continue

        self.number_of_nodes += self.number_of_leafs

        return self.decision_tree_pathes

    def predict(self, X):

        if len(X.shape) != 2:
            raise Exception('X should be 2D array')

        y_predicted = list()
        for x in X:
            sub_tree = self.decision_tree_pathes
            for depth in range(self.max_depth):
                if type(sub_tree) != dict:
                    break
                node = list(sub_tree.keys())[0]
                x_category = self._name_branch(depth, x[node])
                sub_tree = sub_tree[node][x_category]

            y_predicted.append(sub_tree[0])

        return np.array(y_predicted)

    def score(self, X, y):
        return np.sum(self.predict(X) == y)/y.shape[0]


## Training the Decision Tree Classification model on the Training set


### Encoding the categorical data


In [139]:
model = DecisionTreeClassifier(max_depth=None, impurity='entropy')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)


In [140]:
model.fit(X_train_encoded, y_train_encoded)

{7: {'depth 0 sub_cat 0': [0, 3450],
  'depth 0 sub_cat 1': {1: {'depth 1 sub_cat 0': {0: {'depth 2 sub_cat 0': {3: {'depth 3 sub_cat 0': {2: {'depth 4 sub_cat 0': {4: {'depth 5 sub_cat 0': {5: {'depth 6 sub_cat 0': [1,
               2],
              'depth 6 sub_cat 1': [2, 3]}},
            'depth 5 sub_cat 1': [2, 5],
            'depth 5 sub_cat 2': [2, 5]}},
          'depth 4 sub_cat 1': [2, 15],
          'depth 4 sub_cat 2': [2, 11],
          'depth 4 sub_cat 3': [2, 16]}},
        'depth 3 sub_cat 1': [2, 57],
        'depth 3 sub_cat 2': [2, 51],
        'depth 3 sub_cat 3': [2, 63]}},
      'depth 2 sub_cat 1': {2: {'depth 3 sub_cat 0': {3: {'depth 4 sub_cat 0': {4: {'depth 5 sub_cat 0': {5: {'depth 6 sub_cat 0': [1,
               2],
              'depth 6 sub_cat 1': [2, 3]}},
            'depth 5 sub_cat 1': [2, 5],
            'depth 5 sub_cat 2': [2, 5]}},
          'depth 4 sub_cat 1': [2, 15],
          'depth 4 sub_cat 2': [2, 12],
          'depth 4 sub_cat 3': 

##


In [141]:
model.number_of_leafs

774

In [142]:
model.number_of_nodes

1081

In [143]:
model.count_categories(X_train_encoded)


[array([0, 1, 2]),
 array([0, 1, 2, 3, 4]),
 array([0, 1, 2, 3]),
 array([0, 1, 2, 3]),
 array([0, 1, 2]),
 array([0, 1]),
 array([0, 1, 2]),
 array([0, 1, 2])]

## Evaluating the model on the training set and test set


In [144]:
model.score(X_train_encoded[1:2,:], y_train_encoded[1:2])


1.0

In [145]:
model.score(X_test_encoded, y_test_encoded)


0.6685956790123457

### Evaluate the the model with different size of training set and different hyperparameters

In [146]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

model = DecisionTreeClassifier(max_depth=None, impurity='entropy')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)
model.fit(X_train_encoded, y_train_encoded)
print('training size 50%, depth = 8, impurity = entropy')
print('Train score: {:.4}'.format(
    model.score(X_train_encoded, y_train_encoded)))
print('Test score: {:.4}'.format(model.score(X_test_encoded, y_test_encoded)))
print('Number of leafs: {}'.format(model.number_of_leafs))
print('Number of nodes(size of the tree): {}'.format(model.number_of_nodes))


training size 50%, depth = 8, impurity = entropy
Train score: 1.0
Test score: 0.6514
Number of leafs: 671
Number of nodes(size of the tree): 941


In [147]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

model = DecisionTreeClassifier(max_depth=None, impurity='entropy')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)
model.fit(X_train_encoded, y_train_encoded)
print('training size 75%, depth = 8, impurity = entropy')
print('Train score: {:.4}'.format(
    model.score(X_train_encoded, y_train_encoded)))
print('Test score: {:.4}'.format(model.score(X_test_encoded, y_test_encoded)))
print('Number of leafs: {}'.format(model.number_of_leafs))
print('Number of nodes(size of the tree): {}'.format(model.number_of_nodes))


training size 75%, depth = 8, impurity = entropy
Train score: 1.0
Test score: 0.6623
Number of leafs: 767
Number of nodes(size of the tree): 1075


In [148]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

model = DecisionTreeClassifier(max_depth=6, impurity='entropy')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)
model.fit(X_train_encoded, y_train_encoded)
print('training size 50%, depth = 6, impurity = entropy')
print('Train score: {:.4}'.format(
    model.score(X_train_encoded, y_train_encoded)))
print('Test score: {:.4}'.format(model.score(X_test_encoded, y_test_encoded)))
print('Number of leafs: {}'.format(model.number_of_leafs))
print('Number of nodes(size of the tree): {}'.format(model.number_of_nodes))


training size 50%, depth = 6, impurity = entropy
Train score: 0.9784
Test score: 0.6494
Number of leafs: 405
Number of nodes(size of the tree): 571


In [149]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

model = DecisionTreeClassifier(max_depth=6, impurity='entropy')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)
model.fit(X_train_encoded, y_train_encoded)
print('training size 75%, depth = 6, impurity = entropy')
print('Train score: {:.4}'.format(
    model.score(X_train_encoded, y_train_encoded)))
print('Test score: {:.4}'.format(model.score(X_test_encoded, y_test_encoded)))
print('Number of leafs: {}'.format(model.number_of_leafs))
print('Number of nodes(size of the tree): {}'.format(model.number_of_nodes))


training size 75%, depth = 6, impurity = entropy
Train score: 0.9754
Test score: 0.6559
Number of leafs: 416
Number of nodes(size of the tree): 589


#### Gini impurity

In [150]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

model = DecisionTreeClassifier(max_depth=None, impurity='gini')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)
model.fit(X_train_encoded, y_train_encoded)
print('training size 50%, depth = 8, impurity = gini')
print('Train score: {:.4}'.format(
    model.score(X_train_encoded, y_train_encoded)))
print('Test score: {:.4}'.format(model.score(X_test_encoded, y_test_encoded)))
print('Number of leafs: {}'.format(model.number_of_leafs))
print('Number of nodes(size of the tree): {}'.format(model.number_of_nodes))


training size 50%, depth = 8, impurity = gini
Train score: 1.0
Test score: 0.652
Number of leafs: 663
Number of nodes(size of the tree): 931


In [151]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

model = DecisionTreeClassifier(max_depth=None, impurity='gini')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)
model.fit(X_train_encoded, y_train_encoded)
print('training size 75%, depth = 8, impurity = gini')
print('Train score: {:.4}'.format(
    model.score(X_train_encoded, y_train_encoded)))
print('Test score: {:.4}'.format(model.score(X_test_encoded, y_test_encoded)))
print('Number of leafs: {}'.format(model.number_of_leafs))
print('Number of nodes(size of the tree): {}'.format(model.number_of_nodes))


training size 75%, depth = 8, impurity = gini
Train score: 1.0
Test score: 0.6623
Number of leafs: 756
Number of nodes(size of the tree): 1062


In [152]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

model = DecisionTreeClassifier(max_depth=6, impurity='gini')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)
model.fit(X_train_encoded, y_train_encoded)
print('training size 50%, depth = 6, impurity = gini')
print('Train score:', model.score(X_train_encoded, y_train_encoded))
print('Test score:', model.score(X_test_encoded, y_test_encoded))
print('Number of leafs:', model.number_of_leafs)
print('Number of nodes(size of the tree):', model.number_of_nodes)


training size 50%, depth = 6, impurity = gini
Train score: 0.979783950617284
Test score: 0.6484567901234568
Number of leafs: 410
Number of nodes(size of the tree): 578


In [153]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

model = DecisionTreeClassifier(max_depth=6, impurity='gini')
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
X_test_encoded = model.encode_X(X_test)
y_test_encoded = model.encode_y(y_test)
model.fit(X_train_encoded, y_train_encoded)
print('training size 75%, depth = 6, impurity = gini')
print('Train score: {:.4}'.format(
    model.score(X_train_encoded, y_train_encoded)))
print('Test score: {:.4}'.format(model.score(X_test_encoded, y_test_encoded)))
print('Number of leafs: {}'.format(model.number_of_leafs))
print('Number of nodes(size of the tree): {}'.format(model.number_of_nodes))


training size 75%, depth = 6, impurity = gini
Train score: 0.9756
Test score: 0.6559
Number of leafs: 408
Number of nodes(size of the tree): 580
