In [1]:
import numpy as np
from collections import Counter

In [4]:
class Node:
    def __init__(self, feature=None, f_values=None, childs=None, label=None):
        self.feature = feature
        self.f_values = f_values
        self.childs = childs
        self.label = label
        
    def is_leaf_node(self):
        return self.label is not None

In [None]:
class DecisionTree:
    def __init__(self, min_samples_split=2):
        self.min_samples_split=min_samples_split
        self.root=None

    def fit(self, X, y):
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y):
        n_samples, n_feats = X.shape
        n_labels = len(np.unique(y))

        # check the stopping criteria
        if (n_labels == 1 or n_samples < self.min_samples_split):
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)

        # find the best split
        best_feature = self._best_split(X, y)

        # create child nodes
        childs = []
        best_column = X[:, best_feature]
        for v in np.unique(best_column):
            index = np.argwhere(best_column == v).flatten()
            child_X = np.delete(X[index], best_feature, axis=1)
            child_y = y[index]
            childs.append(self._grow_tree(child_X, child_y))
        
        return Node(best_feature, childs)


    def _best_split(self, X, y):
        best_gain = -1
        split_idx = None

        for feat_idx in X.shape[1]:
            X_column = X[:, feat_idx]
            
            gain = self._information_gain(y, X_column)
            
            if gain > best_gain:
                best_gain = gain
                split_idx = feat_idx

        return split_idx


    def _information_gain(self, y, X_column):
        # parent entropy
        parent_entropy = self._entropy(y)

        # create children
        childs_idx = self._split(X_column)
        
        # calculate the weighted avg. entropy of children
        n = len(y)
        information_gain = parent_entropy
        for ch_id in childs_idx:
            n_ch = len(ch_id) / n
            information_gain -= (n_ch / n) * self._entropy(y[ch_id])
            
        return information_gain

    def _split(self, X_column, split_thresh):
        splits = []
        for v in np.unique(X_column):
            splits.append(np.argwhere(X_column == v).flatten())
        return splits

    def _entropy(self, y):
        hist = np.bincount(y)
        ps = hist / len(y)
        return -np.sum([p * np.log(p) for p in ps if p>0])


    def _most_common_label(self, y):
        counter = Counter(y)
        value = counter.most_common(1)[0][0]
        return value

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)
        

# Test Bed

In [6]:
import pandas as pd

data = {
    'alt': ['Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes'],
    'bar': ['No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes'],
    'fri': ['No', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes'],
    'hun': ['Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes'],
    'pat': ['Some', 'Full', 'Some', 'Full', 'Full', 'Some', 'None', 'Some', 'Full', 'Full', 'None', 'Full'],
    'price': ['$$$', '$', '$', '$', '$$$', '$$', '$', '$$', '$', '$$$', '$', '$'],
    'rain': ['No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No'],
    'res': ['Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No'],
    'type': ['French', 'Thai', 'Burger', 'Thai', 'French', 'Italian', 'Burger', 'Thai', 'Burger', 'Italian', 'Thai', 'Burger'],
    'est': ['0-10', '30-60', '0-10', '10-30', '>60', '0-10', '0-10', '0-10', '>60', '10-30', '0-10', '30-60'],
    'will_wait': ['Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'Yes']
}
df = pd.DataFrame(data)
df

Unnamed: 0,alt,bar,fri,hun,pat,price,rain,res,type,est,will_wait
0,Yes,No,No,Yes,Some,$$$,No,Yes,French,0-10,Yes
1,Yes,No,No,Yes,Full,$,No,No,Thai,30-60,No
2,No,Yes,No,No,Some,$,No,No,Burger,0-10,Yes
3,Yes,No,Yes,Yes,Full,$,Yes,No,Thai,10-30,Yes
4,Yes,No,Yes,No,Full,$$$,No,Yes,French,>60,No
5,No,Yes,No,Yes,Some,$$,Yes,Yes,Italian,0-10,Yes
6,No,Yes,No,No,,$,Yes,No,Burger,0-10,No
7,No,No,No,Yes,Some,$$,Yes,Yes,Thai,0-10,Yes
8,No,Yes,Yes,No,Full,$,Yes,No,Burger,>60,No
9,Yes,Yes,Yes,Yes,Full,$$$,No,Yes,Italian,10-30,No


In [7]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier().fit(df.iloc[:, :-1], df.iloc[:, -1])

ValueError: could not convert string to float: 'Yes'