In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [8]:
# Read the nursery dataset from the notebook-relative data directory.
data = pd.read_csv('./data/nursery.csv')

In [9]:
# Non-null entries per column; a column with gaps would report fewer
# entries than the others.
data.count(axis=0)

parents             12960
has_nurs            12960
form                12960
children            12960
housing             12960
finance             12960
social              12960
health              12960
final evaluation    12960
dtype: int64

Every column reports the same 12,960 non-null entries, so the dataset has no missing values and we can move directly on to separating the features from the target.

In [10]:
# Features are every column except the label; the label is 'final evaluation'.
X = data.drop(columns=['final evaluation'])
y = data['final evaluation']

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [20]:
class DecisionTreeClassifier:
    """Level-wise splitter for categorical features, scored by information gain.

    At every level each current node is partitioned on the feature with the
    highest information gain, and that feature is removed from the node's
    children. The tree structure itself is not recorded (there is no
    ``predict``); ``fit`` only keeps the final partitions in ``leaves_``.

    Parameters
    ----------
    max_depth : int or None
        Number of split levels. ``None`` means "one level per feature column
        of the data passed to ``fit``".
    impurity : {'entropy', 'gini'}
        Impurity measure used when scoring candidate features.

    Raises
    ------
    ValueError
        If ``impurity`` is not one of the supported names.
    """

    def __init__(self, max_depth=None, impurity='entropy'):
        impurity_functions = {'entropy': self.entropy, 'gini': self.gini}
        # Fail fast: an unknown name used to leave ``self.impurity`` unset and
        # only surfaced later as an AttributeError inside ``fit``.
        if impurity not in impurity_functions:
            raise ValueError(
                f"impurity must be one of {sorted(impurity_functions)}, "
                f"got {impurity!r}")

        self.max_depth = max_depth
        self.impurity_name = impurity
        self.impurity = impurity_functions[impurity]

    def probablity(self, X):
        """Return the relative frequency of each distinct value in Series ``X``.

        The (misspelled) method name is kept so existing callers keep working.
        """
        return X.value_counts() / X.shape[0]

    def entropy(self, X):
        """Return the Shannon entropy, in bits, of the values in Series ``X``."""
        p = self.probablity(X)
        return np.dot(p, -np.log2(p))

    def gini(self, X):
        """Return the Gini impurity of the values in Series ``X``."""
        p = self.probablity(X)
        return 1 - np.dot(p, p)

    def information_gain(self, X, y):
        """Return the information gain of every column of ``X`` w.r.t. ``y``.

        Parameters
        ----------
        X : pandas.DataFrame or pandas.Series
            Feature column(s); a Series is treated as a single-column frame.
        y : pandas.Series
            Labels, indexed like ``X``.

        Returns
        -------
        numpy.ndarray
            One gain per column, in column order (empty if ``X`` has no
            columns).
        """
        if len(X.shape) == 1:
            X = X.to_frame()

        # The parent impurity does not depend on the candidate feature, so it
        # is computed once instead of once per column.
        parent_impurity = self.impurity(y)

        info_gain = []
        for column in range(X.shape[1]):
            X1 = X.iloc[:, column]
            # Value frequencies are computed once per column rather than once
            # per distinct value.
            weights = self.probablity(X1)
            children_impurity = sum(
                weights[j] * self.impurity(y[X1 == j]) for j in X1.unique())
            info_gain.append(parent_impurity - children_impurity)

        return np.array(info_gain)

    def _partition(self, X, y, feature_arg):
        """Split ``(X, y)`` on the column at position ``feature_arg``.

        Returns one ``(X_subset, y_subset)`` pair per distinct value of that
        column; the column itself is dropped from every ``X_subset``.
        """
        X1 = X.iloc[:, feature_arg]
        remaining = X.drop(X.columns[feature_arg], axis=1)

        return [(remaining[X1 == i], y[X1 == i]) for i in X1.unique()]

    def split_nodes(self, X, y):
        """Split ``(X, y)`` on the column with the highest information gain.

        Returns
        -------
        list of (pandas.DataFrame, pandas.Series)
            One pair per distinct value of the chosen column.

        Raises
        ------
        ValueError
            If ``X`` has no feature columns left to split on.
        """
        if len(X.shape) == 1:
            X = X.to_frame()

        gains = self.information_gain(X, y)
        if gains.size == 0:
            raise ValueError("cannot split: X has no feature columns")

        return self._partition(X, y, np.argmax(gains))

    def fit(self, X, y, verbose=True):
        """Repeatedly split the data, one level at a time.

        The first level always splits the full data set; ``max_depth - 1``
        further levels follow. Nodes that have run out of feature columns are
        carried over unchanged instead of raising, so ``max_depth`` may exceed
        the number of features.

        Parameters
        ----------
        X : pandas.DataFrame or pandas.Series
            Feature column(s).
        y : pandas.Series
            Labels, indexed like ``X``.
        verbose : bool, default True
            When true, print the information-gain vector of every node that is
            split after the first level (the historical behaviour).

        Notes
        -----
        The final ``(X_subset, y_subset)`` partitions are stored in
        ``self.leaves_``. ``self.max_depth`` is never modified, so the same
        instance can be refit on data with a different number of columns.
        """
        if len(X.shape) == 1:
            X = X.to_frame()

        # Resolve the default locally rather than overwriting the
        # hyperparameter on the instance.
        depth = X.shape[1] if self.max_depth is None else self.max_depth

        if X.shape[1] == 0:
            # Nothing to split on: the whole data set is the only leaf.
            self.leaves_ = [(X, y)]
            return

        Xs_ys = self.split_nodes(X, y)
        for _ in range(depth - 1):
            # Stop early once no node has a feature left to split on.
            if not any(X_node.shape[1] for X_node, _y_node in Xs_ys):
                break

            Xs_ys_new = []
            for X_node, y_node in Xs_ys:
                if X_node.shape[1] == 0:
                    Xs_ys_new.append((X_node, y_node))
                    continue

                # Compute the gains once and reuse them for both the optional
                # report and the split itself.
                gains = self.information_gain(X_node, y_node)
                if verbose:
                    print(gains)
                Xs_ys_new += self._partition(X_node, y_node, np.argmax(gains))

            Xs_ys = Xs_ys_new

        self.leaves_ = Xs_ys


In [21]:
# Build the from-scratch classifier with entropy as the impurity measure
# and run the level-wise splitting on the training split.
model = DecisionTreeClassifier(impurity='entropy', max_depth=None)
model.fit(X_train, y_train)

[1.56908750e-01 3.56881677e-01 3.91947563e-03 9.55926707e-03
 1.31538478e-02 3.03437927e-03 8.94183068e-05]
[-0. -0. -0. -0. -0. -0. -0.]
[0.0991759  0.29528345 0.01490313 0.0329029  0.04834773 0.00895848
 0.09240787]
[1.63630443e-01 2.12208556e-02 3.59145196e-02 5.40188601e-02
 1.21968942e-02 5.77180164e-05]
[0.00034875 0.02083024 0.02137619 0.01618701 0.01010176 0.00126276]
[0.66254481 0.00741499 0.01158345 0.01588879 0.00342375 0.00105521]
[4.60318280e-01 7.75324940e-03 2.08841048e-02 3.21346411e-02
 5.57503273e-03 7.92779139e-05]
[4.59589384e-01 8.54858129e-03 1.86957269e-02 2.28019111e-02
 7.51293564e-03 2.97394901e-04]
[-0. -0. -0. -0. -0. -0.]
[-0. -0. -0. -0. -0. -0.]
[-0. -0. -0. -0. -0. -0.]
[0.19422241 0.02430694 0.04233231 0.06691322 0.01072923 0.12399227]
[0.48411798 0.0138248  0.03922685 0.04232192 0.00970589 0.10769531]
[0.21862553 0.02719944 0.04328595 0.07830672 0.0138664  0.21847812]
[0.19995614 0.01833367 0.04906448 0.07658357 0.0137579  0.19992278]
[0.00051397 0.037