# Implementing Decision Tree Classifier

## Importing Libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


## Importing and preprocessing the data

In [3]:
# Load the nursery dataset from a path relative to the notebook's working directory.
data = pd.read_csv(r'./data/nursery.csv')

In [4]:
# Count of non-null entries per column; identical counts mean no column has gaps.
data.count()

parents             12960
has_nurs            12960
form                12960
children            12960
housing             12960
finance             12960
social              12960
health              12960
final evaluation    12960
dtype: int64

Every column has the same number of non-null entries (12,960), so the dataset has no missing values and we can move straight on to the next step.

In [5]:
# Feature matrix: every column except the target; target vector: the class label column.
X = data.drop(columns=['final evaluation'])
y = data['final evaluation']

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


## Implementing the model

In [232]:
class DecisionTreeClassifier:
    """Multiway decision tree grown level by level on integer-encoded features.

    Every split creates one child per distinct value of the chosen feature
    that is present in the node's samples, and a feature is removed from a
    branch once it has been used there.

    After ``fit`` the structure is stored in ``decision_tree_pathes``:

    * element 0 is ``[array([root_feature])]``;
    * every following element describes one depth level as a list of integer
      arrays, one array per node that was split on the previous level, with
      one entry per child of that node. An entry is the index of the feature
      the child is split on, ``-1`` for a pure child (single class) or ``-2``
      for an impure child that has no feature left to split on.
    """

    def __init__(self, max_depth=None, impurity='entropy'):
        """Configure the tree.

        Parameters
        ----------
        max_depth : int or None
            Number of levels to build. ``None`` makes ``fit`` use the number
            of feature columns.
        impurity : {'entropy', 'gini'}
            Impurity measure used to score candidate splits.

        Raises
        ------
        ValueError
            If ``impurity`` is not one of the supported names.
        """
        if impurity == 'entropy':
            self.impurity = self.entropy
        elif impurity == 'gini':
            self.impurity = self.gini
        else:
            # Fail here rather than with an AttributeError deep inside fit().
            raise ValueError(
                "impurity must be 'entropy' or 'gini', got {!r}".format(
                    impurity))

        self.max_depth = max_depth
        self.impurity_name = impurity
        self.decision_tree_pathes = None
        # Set by encode_y(); decode_y() needs it to invert the mapping.
        self.label_encoder = None

    def encode_X(self, X):
        """Label-encode each column of ``X`` independently.

        ``X`` must offer ``.apply`` (e.g. a pandas DataFrame). The encoder is
        refitted for every column and not retained, so the mapping cannot be
        re-applied to other data afterwards.

        Returns a NumPy array of the encoded columns.
        """
        return np.array(X.apply(LabelEncoder().fit_transform))

    def encode_y(self, y):
        """Fit a fresh label encoder on ``y``, keep it, and return the codes."""
        self.label_encoder = LabelEncoder()
        return self.label_encoder.fit_transform(y)

    def decode_y(self, y):
        """Map encoded labels back to the original labels.

        Raises
        ------
        Exception
            If ``encode_y`` has not been called on this instance yet.
        """
        # getattr keeps the guard working for instances built without __init__.
        if getattr(self, 'label_encoder', None) is None:
            raise Exception('Label encoder is not initialized')

        return self.label_encoder.inverse_transform(y)

    def probablity(self, X):
        """Relative frequency of each integer ``0..max(X)`` in the 1-D array ``X``."""
        return np.bincount(X)/X.shape[0]

    def entropy(self, X):
        """Shannon entropy (base 2) of the integer labels in ``X``."""
        X = self.probablity(X)
        # Drop zero probabilities so log2 is never evaluated at 0.
        X = X[X != 0]
        return np.dot(X, -np.log2(X))

    def gini(self, X):
        """Gini impurity of the integer labels in ``X``."""
        X = self.probablity(X)
        return 1 - np.dot(X, X)

    def information_gain(self, X, y):
        """Impurity reduction obtained by splitting ``y`` on each column of ``X``.

        Parameters
        ----------
        X : ndarray
            Integer-encoded features, shape ``(n_samples, n_features)``; a 1-D
            array is treated as a single feature column.
        y : ndarray
            Integer-encoded labels, one per row of ``X``.

        Returns
        -------
        ndarray
            One gain value per feature column.
        """
        if len(X.shape) == 1:
            # expand_dims returns a new array, so the result must be kept.
            X = np.expand_dims(X, axis=1)

        # The parent impurity is the same for every candidate feature.
        parent_impurity = self.impurity(y)

        info_gain = list()
        for i in range(X.shape[1]):
            X1 = X[:, i]
            value_probs = self.probablity(X1)

            # Impurity of the children, weighted by the share of samples
            # falling into each child.
            children_impurity = np.sum(
                [value_probs[j] * self.impurity(y[X1 == j])
                 for j in np.unique(X1)])
            info_gain.append(parent_impurity - children_impurity)

        return np.array(info_gain)

    def split_nodes(self, X, y):
        """Split a node on the feature with the highest information gain.

        Parameters
        ----------
        X : ndarray
            Node data whose first row is a header holding the original feature
            index of every column; the remaining rows are the samples.
        y : ndarray
            Labels of the samples (no header entry).

        Returns
        -------
        tuple
            ``(feature, children)`` where ``feature`` is the original index of
            the chosen feature and ``children`` is a list with one
            ``(child_X, child_y)`` pair per distinct value of that feature in
            this node, in ascending value order. ``child_X`` keeps the header
            row and no longer contains the chosen column.
        """
        X_feature_arg = np.argmax(self.information_gain(X[1:, :], y))
        feature_arg = X[0, X_feature_arg]
        leftover_features = np.delete(X[0, :], X_feature_arg).reshape(1, -1)
        X = X[1:, :]
        X1 = X[:, X_feature_arg]
        X = np.delete(X, X_feature_arg, axis=1)

        return feature_arg, [(np.concatenate((leftover_features,
                                              X[X1 == i]), axis=0),
                              y[X1 == i])
                             for i in np.unique(X1)]

    def count_leafs(self, X):
        """Return, for every column of ``X``, the array of its distinct values."""
        if len(X.shape) == 1:
            X = np.expand_dims(X, axis=1)

        return [np.unique(X[:, i]) for i in range(X.shape[1])]

    def fit(self, X, y):
        """Grow the tree and store its structure in ``decision_tree_pathes``.

        Parameters
        ----------
        X : ndarray
            Integer-encoded features, shape ``(n_samples, n_features)``; a 1-D
            array is treated as a single feature column.
        y : ndarray
            Integer-encoded labels, one per row of ``X``.

        Notes
        -----
        When ``max_depth`` is ``None`` it is set on the instance to the
        number of feature columns of ``X``.
        """
        if len(X.shape) == 1:
            X = np.expand_dims(X, axis=1)

        if self.max_depth is None:
            self.max_depth = X.shape[1]

        # Prepend a header row with the original column indices so that a
        # feature can still be identified after columns have been removed.
        X = np.concatenate((np.arange(X.shape[1]).reshape(1, -1), X), axis=0)

        root_feature, Xs_ys = self.split_nodes(X, y)
        self.decision_tree_pathes = [[np.array([root_feature])]]

        # Number of children actually produced by each split on the previous
        # level. Using the real counts (instead of the number of distinct
        # values in the full training data) keeps the grouping aligned when a
        # node's samples do not contain every value of its split feature.
        children_per_split = [len(Xs_ys)]

        for depth in range(1, self.max_depth):
            Xs_ys_new = list()
            children_per_split_new = list()
            nodes_of_depth = np.empty(0, dtype=int)
            for node_X, node_y in Xs_ys:
                if np.all(node_y == node_y[0]):
                    # Pure node: every sample has the same label.
                    nodes_of_depth = np.append(nodes_of_depth, -1)
                    continue
                if node_X.shape[1] == 0:
                    # Impure node with no feature column left to split on.
                    nodes_of_depth = np.append(nodes_of_depth, -2)
                    continue

                feature, children = self.split_nodes(node_X, node_y)
                nodes_of_depth = np.append(nodes_of_depth, feature)
                Xs_ys_new += children
                children_per_split_new.append(len(children))

            # Group this level's entries by parent node. np.split always
            # returns one trailing piece after the last cut, which is dropped.
            nodes_of_depth = np.split(
                nodes_of_depth,
                np.cumsum(children_per_split, dtype=int))[:-1]

            self.decision_tree_pathes.append(nodes_of_depth)
            Xs_ys = Xs_ys_new
            children_per_split = children_per_split_new

    def tree(self):
        """Print the stored tree structure, one depth level per section.

        Raises
        ------
        Exception
            If ``fit`` has not been called yet.
        """
        if self.decision_tree_pathes is None:
            raise Exception('Decision tree is not trained')

        print('------------------------')
        print('root(depth 1): {}'.format(self.decision_tree_pathes[0]))
        print('------------------------')

        for i in range(1, len(self.decision_tree_pathes)):
            print('depth {}: {}'.format(i+1, self.decision_tree_pathes[i][:]))
            print('------------------------')



In [236]:
# max_depth=None lets fit() default the depth to the number of feature columns.
model = DecisionTreeClassifier(max_depth=None)
# Integer-encode the categorical features and the labels before fitting.
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
model.fit(X_train_encoded, y_train_encoded)
# Scratch calls left over from development (disabled):
# model.count_leafs(X_train_encoded)
# temp = model.split_nodes(X_train_encoded, y_train_encoded)
# temp

In [237]:
# Distinct encoded values of every feature column in the training data.
model.count_leafs(X_train_encoded)

[array([0, 1, 2]),
 array([0, 1, 2, 3, 4]),
 array([0, 1, 2, 3]),
 array([0, 1, 2, 3]),
 array([0, 1, 2]),
 array([0, 1]),
 array([0, 1, 2]),
 array([0, 1, 2])]

In [238]:
# Print the fitted structure level by level (feature index per node, -1 marks a pure node).
model.tree()

------------------------
root(depth 1): [array([7])]
------------------------
depth 2: [array([-1,  1,  1])]
------------------------
depth 3: [array([0, 0, 0, 0, 3]), array([0, 0, 0, 0, 6])]
------------------------
depth 4: [array([3, 2, 4]), array([ 2,  4, -1]), array([ 4, -1, -1]), array([ 4, -1, -1]), array([ 2, -1, -1, -1]), array([4, 4, 6]), array([4, 6, 6]), array([6, 6, 6]), array([6, 6, 6]), array([4, 3, 4])]
------------------------
depth 5: [array([ 2, -1, -1, -1]), array([ 3, -1, -1, -1]), array([5, 2, 3]), array([ 3, -1, -1, -1]), array([5, 3, 3]), array([5, 2, 3]), array([5, 2, 3]), array([ 4, -1, -1, -1]), array([6, 2, 3]), array([6, 3, 3]), array([-1,  4, -1]), array([6, 2, 3]), array([-1,  3, -1]), array([ 4, -1,  3]), array([-1,  4, -1]), array([ 4, -1,  3]), array([ 4, -1,  4]), array([-1,  4, -1]), array([ 4, -1,  4]), array([ 4, -1,  4]), array([5, 3, 3]), array([ 2, -1, -1, -1]), array([5, 2, 3])]
------------------------
depth 6: [array([ 4, -1, -1, -1]), array(