# Implementing Decision Tree Classifier

## Importing Libraries

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


## Importing and preprocessing the data

In [11]:
data = pd.read_csv(r'./data/nursery.csv')

In [12]:
data.count()

parents             12960
has_nurs            12960
form                12960
children            12960
housing             12960
finance             12960
social              12960
health              12960
final evaluation    12960
dtype: int64

There is no missing data in the dataset. So, we can directly move on to the next step.

In [13]:
y =  data['final evaluation']
X = data.drop(labels=['final evaluation'], axis=1)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


## Implementing the model

In [54]:
class DecisionTreeClassifier:
    def __init__(self, max_depth=None, impurity='entropy'):
        self.max_depth = max_depth
        self.impurity_name = impurity
        self.decision_tree_pathes = None
        self.depth_iter = 0

        if impurity == 'entropy':
            self.impurity = self.entropy

        elif impurity == 'gini':
            self.impurity = self.gini

    def encode_X(self, X):
        return np.array(X.apply(LabelEncoder().fit_transform))

    def encode_y(self, y):
        self.label_encoder = LabelEncoder()
        return self.label_encoder.fit_transform(y)

    def decode_y(self, y):

        if self.label_encoder is None:
            raise Exception('Label encoder is not initialized')

        return self.label_encdoer.inverse_transform(y)

    def probablity(self, X):
        return np.bincount(X)/X.shape[0]

    def entropy(self, X):
        X = self.probablity(X)
        X = X[X != 0]
        return np.dot(X, -np.log2(X))

    def gini(self, X):
        X = self.probablity(X)
        return 1 - np.dot(X, X)

    def information_gain(self, X, y):

        if len(X.shape) == 1:
            np.expand_dims(X, axis=1)

        info_gain = list()
        for i in range(X.shape[1]):
            X1 = X[:, i]

            info_gain.append(self.impurity(y)
                             - np.sum(
                                 [self.probablity(X1)[j] *
                                  self.impurity(y[X1 == j])
                                  for j in np.unique(X1)]))

        return np.array(info_gain)

    def split_nodes(self, X, y):

        X_feature_arg = np.argmax(self.information_gain(X[1:, :], y))
        feature_arg = X[0, X_feature_arg]
        leftover_features = np.delete(X[0, :], X_feature_arg).reshape(1, -1)
        X = X[1:, :]
        X1 = X[:, X_feature_arg]
        X = np.delete(X, X_feature_arg, axis=1)

        sub_nodes = dict()

        for i in np.unique(X1):

            splitted_data = np.concatenate((leftover_features,
                                            X[X1 == i]), axis=0)

            splitted_labels = y[X1 == i]

            branch_name = 'depth {} sub_cat {}'.format(self.depth_iter, i)

            if np.all(splitted_labels == splitted_labels[0]):
                sub_nodes[branch_name] = splitted_labels[0]
                continue

            if len(splitted_data) == 0:
                sub_nodes[branch_name] = np.bincount(y).argmax()
                continue

            sub_nodes[branch_name] = (splitted_data, splitted_labels)

        return {feature_arg: sub_nodes}

    def count_leafs(self, X):
        if len(X.shape) == 1:
            X = np.expand_dims(X, axis=1)

        return [np.unique(X[:, i]) for i in range(X.shape[1])]

    def fit(self, X, y):

        if len(X.shape) == 1:
            X = np.expand_dims(X, axis=1)

        if self.max_depth is None:
            self.max_depth = X.shape[1]

        X = np.concatenate((np.arange(X.shape[1]).reshape(1, -1), X), axis=0)
        # return self.split_nodes(X,y)
        self.decision_tree_pathes = self.split_nodes(X, y)
        sub_trees = [self.decision_tree_pathes]
        X_ys = [(X, y)]
        for self.depth_iter in range(1, self.max_depth):
            sub_trees_temp = []
            for sub_tree in sub_trees:
                node = list(sub_tree.keys())[0]
                for cat in sub_tree[node]:
                    X_ys = sub_tree[node][cat]
                    if type(X_ys) == tuple:
                        sub_tree[node][cat] = self.split_nodes(
                            X_ys[0], X_ys[1])
                        sub_trees_temp.append(sub_tree[node][cat])

                    # if self.depth_iter == self.max_depth - 1:
                

                sub_trees = sub_trees_temp

        return self.decision_tree_pathes


In [58]:
model = DecisionTreeClassifier(max_depth=None)
X_train_encoded = model.encode_X(X_train)
y_train_encoded = model.encode_y(y_train)
model.fit(X_train_encoded, y_train_encoded)


{7: {'depth 0 sub_cat 0': 0,
  'depth 0 sub_cat 1': (array([[0, 1, 2, ..., 4, 5, 6],
          [2, 0, 2, ..., 1, 0, 2],
          [2, 4, 0, ..., 1, 1, 1],
          ...,
          [0, 0, 0, ..., 1, 1, 1],
          [1, 2, 0, ..., 0, 0, 1],
          [1, 0, 1, ..., 2, 1, 1]]),
   array([2, 2, 1, ..., 2, 1, 2])),
  'depth 0 sub_cat 2': (array([[0, 1, 2, ..., 4, 5, 6],
          [1, 0, 3, ..., 1, 0, 0],
          [2, 0, 1, ..., 1, 0, 2],
          ...,
          [2, 1, 2, ..., 0, 0, 2],
          [1, 4, 3, ..., 0, 0, 1],
          [0, 0, 2, ..., 2, 1, 2]]),
   array([2, 1, 1, ..., 3, 2, 2]))}}

In [33]:
a = {None: None, 2:{4:6, 7:8}}
# b = [1, 2>]
# a[b]
a[[2]]

TypeError: unhashable type: 'list'

In [237]:
model.count_leafs(X_train_encoded)

[array([0, 1, 2]),
 array([0, 1, 2, 3, 4]),
 array([0, 1, 2, 3]),
 array([0, 1, 2, 3]),
 array([0, 1, 2]),
 array([0, 1]),
 array([0, 1, 2]),
 array([0, 1, 2])]