# Implementing a Decision Tree Classifier

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


## Importing and preprocessing the data

In [2]:
# Load the nursery dataset from the notebook-relative data directory.
data = pd.read_csv(filepath_or_buffer='./data/nursery.csv')

In [3]:
# Number of non-null entries in each column.
data.notna().sum()

parents             12960
has_nurs            12960
form                12960
children            12960
housing             12960
finance             12960
social              12960
health              12960
final evaluation    12960
dtype: int64

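`count()` reports the number of non-null entries per column, and every column shows the same total of 12,960. The dataset therefore has no missing values, so we can move directly on to the next step.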

In [4]:
# Separate the target column from the predictor columns.
X = data.drop(columns=['final evaluation'])
y = data['final evaluation']

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Implementing the model

In [6]:
class DecisionTreeClassifier:
    """Level-by-level partitioner that splits on the most informative feature.

    At every level each node is split on the feature with the highest
    information gain, producing one child per distinct value of that feature;
    the chosen feature is then removed from the child's feature frame.

    The class currently only partitions the training data: ``fit`` returns the
    resulting leaf partitions and no prediction method is implemented.

    Parameters
    ----------
    max_depth : int or None, default None
        Number of split levels to perform. ``None`` means one level per
        feature column of the data passed to ``fit``.
    impurity : {'entropy', 'gini'}, default 'entropy'
        Impurity measure used to compute the information gain.

    Raises
    ------
    ValueError
        If ``impurity`` is not one of the supported names.
    """

    def __init__(self, max_depth=None, impurity='entropy'):
        if impurity == 'entropy':
            impurity_function = self.entropy
        elif impurity == 'gini':
            impurity_function = self.gini
        else:
            # Fail at construction time instead of leaving `self.impurity`
            # undefined and crashing later inside `information_gain`.
            raise ValueError(
                "impurity must be 'entropy' or 'gini', got {!r}".format(impurity))

        self.max_depth = max_depth
        self.impurity_name = impurity
        self.impurity = impurity_function

    def probablity(self, X):
        """Return the relative frequency of every distinct value in ``X``.

        The result is ``X.value_counts()`` divided by the number of rows, so
        it is indexed by the distinct values of ``X``.
        """
        return X.value_counts() / X.shape[0]

    # Correctly spelled alias; the original (misspelled) name is kept so that
    # existing callers continue to work.
    probability = probablity

    def entropy(self, X):
        """Return the Shannon entropy (base 2) of the values in ``X``."""
        p = self.probablity(X).to_numpy()
        # Zero-probability entries (e.g. unused categories of a categorical
        # Series) contribute nothing; dropping them avoids 0 * inf = nan.
        p = p[p > 0]
        return np.dot(p, -np.log2(p))

    def gini(self, X):
        """Return the Gini impurity of the values in ``X``."""
        p = self.probablity(X).to_numpy()
        return 1 - np.dot(p, p)

    def information_gain(self, X, y):
        """Return the information gain of ``y`` for every column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame or pandas.Series
            Feature column(s); a Series is treated as a single-column frame.
            Feature values are looked up in their own frequency table, so the
            columns are assumed to contain no missing values.
        y : pandas.Series
            Target values, index-aligned with ``X``.

        Returns
        -------
        numpy.ndarray
            One gain per column of ``X``, in column order (empty when ``X``
            has no columns).
        """
        if len(X.shape) == 1:
            X = X.to_frame()

        # The parent impurity does not depend on the feature: compute it once.
        parent_impurity = self.impurity(y)

        gains = []
        for position in range(X.shape[1]):
            feature = X.iloc[:, position]
            # Frequency table of this feature, computed once per column.
            weights = self.probablity(feature)
            children_impurity = sum(
                weights.loc[value] * self.impurity(y[feature == value])
                for value in feature.unique())
            gains.append(parent_impurity - children_impurity)

        return np.array(gains)

    def split_nodes(self, X, y):
        """Split one node on its most informative feature.

        Returns
        -------
        list of (pandas.DataFrame, pandas.Series)
            One ``(X_subset, y_subset)`` pair per distinct value of the chosen
            feature, with that feature removed from ``X_subset``. When ``X``
            has no columns left the node cannot be split and is returned
            unchanged as the single element of the list.
        """
        if len(X.shape) == 1:
            X = X.to_frame()

        # No feature left to split on: keep the node as a leaf instead of
        # calling argmax on an empty array.
        if X.shape[1] == 0:
            return [(X, y)]

        best = np.argmax(self.information_gain(X, y))
        feature = X.iloc[:, best]
        remaining = X.drop(X.columns[best], axis=1)

        children = []
        for value in feature.unique():
            mask = feature == value
            children.append((remaining[mask], y[mask]))
        return children

    def fit(self, X, y):
        """Partition the training data level by level.

        Parameters
        ----------
        X : pandas.DataFrame or pandas.Series
            Feature column(s); a Series is treated as a single-column frame.
        y : pandas.Series
            Target values, index-aligned with ``X``.

        Returns
        -------
        list of (pandas.DataFrame, pandas.Series)
            The ``(X_subset, y_subset)`` partitions that remain after the
            requested number of split levels.
        """
        if len(X.shape) == 1:
            X = X.to_frame()

        # Resolve the depth locally so the constructor argument is not
        # overwritten and a later fit on different data is unaffected.
        depth = X.shape[1] if self.max_depth is None else self.max_depth

        nodes = self.split_nodes(X, y)
        for _ in range(depth - 1):
            # Every feature has been consumed: further levels change nothing.
            if all(node_X.shape[1] == 0 for node_X, _node_y in nodes):
                break
            nodes = [child
                     for node_X, node_y in nodes
                     for child in self.split_nodes(node_X, node_y)]

        return nodes


In [13]:
# Depth 1 performs a single split level in `fit`.
model = DecisionTreeClassifier(max_depth=1)

# Relative frequency of each value of the `health` feature (column 7 of X_train).
model.probablity(X_train['health'])

recommended    0.334780
not_recom      0.332755
priority       0.332465
Name: health, dtype: float64

In [11]:
# Partition the training data; the cell output is the list of (X_subset, y_subset) pairs.
model.fit(X=X_train, y=y_train)

[(           parents     has_nurs        form children     housing     finance  \
  3334         usual     critical      foster        2    critical  convenient   
  3616         usual    very_crit    complete        3    critical      inconv   
  6127   pretentious     improper    complete        2   less_conv  convenient   
  7981   pretentious    very_crit    complete     more    critical  convenient   
  5125   pretentious       proper      foster        3    critical      inconv   
  ...            ...          ...         ...      ...         ...         ...   
  6265   pretentious     improper   completed        1  convenient  convenient   
  5734   pretentious  less_proper  incomplete        3  convenient      inconv   
  11284   great_pret     critical    complete        1    critical      inconv   
  5191   pretentious  less_proper    complete        1  convenient  convenient   
  7270   pretentious     critical   completed        3   less_conv      inconv   
  
             