In [2]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
class Node:
    def __init__(self, feature=None, f_value=None, childs=None, label=None, most_common=None):
        self.feature = feature
        self.f_value = f_value
        self.childs = childs
        self.label = label
        self.most_common = most_common
        
    def is_leaf_node(self):
        return self.label is not None

In [4]:
class DecisionTree:
    def __init__(self, min_samples_split=1):
        self.min_samples_split=min_samples_split
        self.root=None

    def fit(self, X, y):
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, f_value=None):
        n_samples, n_feats = X.shape
        n_labels = len(np.unique(y))

        # check the stopping criteria
        if (n_labels == 1 or n_samples < self.min_samples_split):
            leaf_value = self._most_common_label(y)
            return Node(f_value=f_value, label=leaf_value, most_common=leaf_value)

        # find the best split
        best_feature = self._best_split(X, y)

        # create child nodes
        childs = {}
        best_column = X[:, best_feature]
        for v in np.unique(best_column):
            index = np.argwhere(best_column == v).flatten()
            child_X = X[index]
            child_y = y[index]
            childs[v] = self._grow_tree(child_X, child_y, v)
        
        return Node(feature=best_feature, f_value=f_value, childs=childs, most_common=self._most_common_label(y))


    def _best_split(self, X, y):
        best_gain = -1
        split_idx = None

        for feat_idx in range(X.shape[1]):
            X_column = X[:, feat_idx]
            
            gain = self._information_gain(y, X_column)
            
            if gain > best_gain:
                best_gain = gain
                split_idx = feat_idx

        return split_idx


    def _information_gain(self, y, X_column):
        # parent entropy
        parent_entropy = self._entropy(y)

        # create children
        childs_idx = self._split(X_column)
        
        # calculate the weighted avg. entropy of children
        n = len(y)
        information_gain = parent_entropy
#         print('an')
        for ch_id in childs_idx:
            n_ch = len(ch_id)
#             print(n_ch,'/', n)
            information_gain -= (n_ch / n) * self._entropy(y[ch_id])
            
        return information_gain

    def _split(self, X_column):
        splits = []
        for v in np.unique(X_column):
            splits.append(np.argwhere(X_column == v).flatten())
        return splits

    def _entropy(self, y):
#         hist = np.bincount(y)
#         ps = hist / len(y)
#         return -np.sum([p * np.log2(p) for p in ps if p>0])
        count = Counter(y)
        return -np.sum([count[i]/len(y) * np.log2(count[i]/len(y)) for i in count])


    def _most_common_label(self, y):
        counter = Counter(y)
        value = counter.most_common(1)[0][0]
        return value

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.label
        
        if x[node.feature] in node.childs:
            return self._traverse_tree(x, node.childs[x[node.feature]])
        else:
            return node.most_common
        

# Test Bed

In [25]:
import pandas as pd

data = {
    'alt': ['Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes'],
    'bar': ['No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes'],
    'fri': ['No', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes'],
    'hun': ['Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes'],
    'pat': ['Some', 'Full', 'Some', 'Full', 'Full', 'Some', 'None', 'Some', 'Full', 'Full', 'None', 'Full'],
    'price': ['$$$', '$', '$', '$', '$$$', '$$', '$', '$$', '$', '$$$', '$', '$'],
    'rain': ['No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No'],
    'res': ['Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No'],
    'type': ['French', 'Thai', 'Burger', 'Thai', 'French', 'Italian', 'Burger', 'Thai', 'Burger', 'Italian', 'Thai', 'Burger'],
    'est': ['0-10', '30-60', '0-10', '10-30', '>60', '0-10', '0-10', '0-10', '>60', '10-30', '0-10', '30-60'],
    'will_wait': ['Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'Yes']
}
df = pd.DataFrame(data)
df

Unnamed: 0,alt,bar,fri,hun,pat,price,rain,res,type,est,will_wait
0,Yes,No,No,Yes,Some,$$$,No,Yes,French,0-10,Yes
1,Yes,No,No,Yes,Full,$,No,No,Thai,30-60,No
2,No,Yes,No,No,Some,$,No,No,Burger,0-10,Yes
3,Yes,No,Yes,Yes,Full,$,Yes,No,Thai,10-30,Yes
4,Yes,No,Yes,No,Full,$$$,No,Yes,French,>60,No
5,No,Yes,No,Yes,Some,$$,Yes,Yes,Italian,0-10,Yes
6,No,Yes,No,No,,$,Yes,No,Burger,0-10,No
7,No,No,No,Yes,Some,$$,Yes,Yes,Thai,0-10,Yes
8,No,Yes,Yes,No,Full,$,Yes,No,Burger,>60,No
9,Yes,Yes,Yes,Yes,Full,$$$,No,Yes,Italian,10-30,No


In [6]:
def traverse(root, depth=0, his=[]):
    if root.feature is not None:
        if depth < len(his):
            his[depth].append((root.f_value, list(df.columns)[root.feature]))
        else:
            his.append([(root.f_value, list(df.columns)[root.feature])])
    else:
        if depth < len(his):
            his[depth].append((root.f_value, root.label))
        else:
            his.append([(root.f_value, root.label)])
    
    if root.childs is not None:
        for child_key in root.childs:
            his = traverse(root.childs[child_key], depth+1, his=his)
    
    return his

In [7]:
tree = DecisionTree(min_samples_split=1)
tree.fit(df.to_numpy()[:, :-1], df.to_numpy()[:, -1])

In [8]:
y_pred = tree.predict(df.to_numpy()[:, :-1])
# print(accuracy_score(y_pred, df.to_numpy()[:, -1]))
print(y_pred)

['Yes' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'No' 'No' 'No' 'Yes']


In [23]:
traverse(tree.root)

[[(None, 'pat'), (None, 'area_mean')],
 [('Full', 'type'),
  ('None', 'No'),
  ('Some', 'Yes'),
  ('Full', 'concave points_mean'),
  ('None', 'No'),
  ('Some', 'Yes')],
 [('Burger', 'alt'),
  ('French', 'No'),
  ('Italian', 'No'),
  ('Thai', 'fri'),
  ('Burger', 'diagnosis'),
  ('French', 'No'),
  ('Italian', 'No'),
  ('Thai', 'texture_mean')],
 [('No', 'No'),
  ('Yes', 'Yes'),
  ('No', 'No'),
  ('Yes', 'Yes'),
  ('No', 'No'),
  ('Yes', 'Yes'),
  ('No', 'No'),
  ('Yes', 'Yes')]]

# Cancer dataset

In [50]:
df = pd.read_csv('part2.csv').iloc[:, 1:-1]
df

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [51]:
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

for c in X.columns:
    X[c] = pd.cut(X[c], 20)
    
x_train, x_test, y_train, y_test = train_test_split(X.to_numpy(), y.to_numpy(), test_size=0.2, random_state=4)

my_tree = DecisionTree()
my_tree.fit(x_train, y_train)

In [52]:
y_pred = my_tree.predict(x_test)
y_pred

array(['M', 'B', 'M', 'M', 'M', 'M', 'B', 'B', 'M', 'B', 'M', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'M',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'M',
       'M', 'B', 'B', 'M', 'M', 'B', 'B', 'M', 'M', 'B', 'B', 'M', 'B',
       'B', 'B', 'B', 'B', 'M', 'B', 'B', 'M', 'B', 'B', 'B', 'B', 'B',
       'B', 'M', 'B', 'M', 'M', 'B', 'B', 'B', 'M', 'B', 'M', 'M', 'B',
       'B', 'B', 'B', 'M', 'M', 'B', 'M', 'B', 'B', 'B', 'M', 'B', 'B',
       'M', 'B', 'M', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'M', 'B',
       'B', 'B', 'B', 'B', 'B', 'M', 'M', 'B', 'M', 'B'], dtype='<U1')

In [53]:
accuracy_score(y_pred, y_test)

0.8859649122807017

In [131]:
X = df.iloc[:, 1:]
y = df.iloc[:, 0]
    
x_train, x_test, y_train, y_test = train_test_split(X.to_numpy(), y.to_numpy(), test_size=0.2, random_state=4)

sk_tree = DecisionTreeClassifier()
sk_tree.fit(x_train, y_train)
y_pred = sk_tree.predict(x_test)
accuracy_score(y_pred, y_test)

TypeError: fit() got an unexpected keyword argument 'random_state'