# Writing decision tree classifier from scratch
### Import Libraries

In [1]:
import numpy as np
import time

### Create the node class
Each node contains attribute and threshold and their left and right sides 

In [2]:
class DecisionTreeNode(object):
    # Constructor
    def __init__(self, att, thr, left, right):  
        self.attribute = att
        self.threshold = thr
        # left and right are either binary classifications or references to 
        # decision tree nodes
        self.left = left     
        self.right = right   

    def print_tree(self,indent=''):
        if self.left  in [0,1]:
            print(indent+'       ','class=',self.left)
        else:
            self.left.print_tree(indent+'    ')
        print(indent,'if x['+str(self.attribute)+'] <=',self.threshold)
        if self.right  in [0,1]:
            print(indent+'       ','class=',self.right)
        else:
            self.right.print_tree(indent+'    ')

### Create the class for decision tree classifier

Algorithm ID3(x,y):
    1. If termination condition applies return leaf with most common class in y 
    2. Determine all the thresholds 
    2. Determine the best attribute and threshold (a,t) with the highest information gain (entropy)
    3. Split the data based on the best attribute and threshold from 2
        a. Let (xl, yl) be the training examples for which x(a)<=t
        b. Let (xr, yr) be the training examples for which x(a)>t
    4. Recursivley return decision tree node where attribute = a, threshold = t, leftchild = ID3(xl,yl), rightchild = ID3 (xr,yr)

In [3]:
class DecisionTreeClassifier(object):
    # constructor
    def __init__(self, max_depth=10, min_samples_split=10, accuracy=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.accuracy = accuracy
        
    def fit(self, data, labels):
        self.root = self._build_tree(data, labels, depth=0)
        
    def predict(self, test_data):
        pred = np.zeros(len(test_data), dtype=int)
        for i in range(len(test_data)):
            pred[i] = self._predict_example(test_data[i], self.root)
        return pred
        
    def _build_tree(self, data, labels, depth):
        # check the base case (termination condition)
        mean_val = np.mean(labels)
        if depth==self.max_depth or len(data)<= self.min_samples_split or max(
                [mean_val, 1-mean_val])>= self.accuracy:
            return int(round(mean_val))
        else:
            depth += 1
            #all_thrs = np.mean(data, axis=0) # uncomment this line only to use mean as therashold
            all_thrs = self._get_all_thrs(data) #comment this line only to use mean as threshold
            # get the best attribute and threshold wth the highest gain
            best_split_col, best_split_val = self._get_best_split(data, labels, all_thrs)
            less, more = self._split_data(data, best_split_col, best_split_val)
            #recursivly build the tree
            left = self._build_tree(data[less], labels[less], depth)
            right = self._build_tree(data[more], labels[more], depth)
            

        return DecisionTreeNode(best_split_col, best_split_val, left, right)
    
    def _get_all_thrs(self, data):
        all_thrs = {}
        for index in range(data.shape[1]):
            all_thrs[index] = []
            unique_val = np.unique(data[:,index])
            
            for idx in range(len(unique_val)):
                if idx != 0:
                    current_val = unique_val[idx]
                    previous_val = unique_val[idx - 1]
                    thr = (current_val + previous_val)/2
                    all_thrs[index].append(thr)
        return all_thrs
        
    # Find the best cloumn to classify and the best threshold value
    def _get_best_split(self, data, labels, all_thrs):
        best_entropy = 999
        for col_index in range(data.shape[1]):
            for thr in all_thrs[col_index]: # comment this line only to use mean as threshold
            #thr = all_thrs[col_index] # uncomment this line and correct the indentation only to use mean as threshold
                less, more = self._split_data(data, col_index, thr)
                ent = self._entropy(labels[less], labels[more])
                
                if ent < best_entropy:
                    best_entropy = ent
                    best_split_col = col_index
                    best_split_val = thr
        return best_split_col, best_split_val
    
    def _split_data(self, data, split_col, thr):
        less = data[:,split_col] <= thr
        more = data[:, split_col] > thr
        return less, more

    # calculate entropy
    def _entropy(self, l, m):
        ent = 0
        for p in [l, m]:
            if len(p) > 0:
                pp = sum(p)/len(p)
                pn = 1 - pp
                if pp < 1 and pp > 0:
                    ent -= len(p)*(pp*np.log2(pp) + pn*np.log2(pn))
        ent = ent / (len(l) + len(m))
        return ent
        
    def _predict_example(self, example, tree):
        col_id = tree.attribute
        val = tree.threshold
    
        if example[col_id] <= val:
            answer = tree.left
        else:
            answer = tree.right
            
        if not isinstance(answer, DecisionTreeNode):
            return answer
        else:
            remaining_tree = answer
            return self._predict_example(example, remaining_tree)
     
    def display(self):
        print("model")
        self.root.print_tree()
        
    def confusion_matrix(self, pred, labels):
        cm = np.zeros((np.max(pred)+1, np.max(pred)+1))
        for i in range(len(pred)):
            cm[pred[i]][labels[i]] += 1
        return cm

### Load and prepare data

In [4]:
x = []
y = []
infile = open("gamma_ray.txt","r")
for line in infile:
    y.append(int(line[-2:-1] =='g'))
    x.append(np.fromstring(line[:-2], dtype=float,sep=','))
infile.close()
    
x = np.array(x).astype(np.float32)
y = np.array(y)

### Split data into training and testing: 80% for trianing and 20% for testing

In [5]:
ind = np.random.permutation(len(y))
split_ind = int(len(y)*0.8) # 80% trianing data
x_train = x[ind[:split_ind]]
x_test = x[ind[split_ind:]]
y_train = y[ind[:split_ind]]
y_test = y[ind[split_ind:]]

### Take a toy dataset to run the program with minimum time 

In [6]:
# Only 5000 data for training and 500 for testing
# Run this cell, only if you want to see the reuslt in quick otherwise skip this cell
x_train = x_train[:5000]
x_test = x_test[:500]
y_train = y_train[:5000]
y_test = y_test[:500]

### Fit the model

In [7]:
model = DecisionTreeClassifier(max_depth=5) # you can change the "max_dept" parameter
start = time.time()
model.fit(x_train, y_train)
elapsed_time = time.time()-start
print('Elapsed_time training  {0:.6f} '.format(elapsed_time)) 

Elapsed_time training  68.269035 


### Display the decision tree

In [8]:
model.display()

model
                        class= 1
                 if x[1] <= 37.87635040283203
                        class= 0
             if x[6] <= -47.939697265625
                        class= 1
                 if x[2] <= 3.3074498176574707
                        class= 1
         if x[4] <= 0.3012999892234802
                        class= 0
                 if x[1] <= 5.980299949645996
                        class= 1
             if x[2] <= 2.5486998558044434
                        class= 1
                 if x[3] <= 0.5571500062942505
                        class= 0
     if x[0] <= 109.766845703125
                    class= 1
             if x[1] <= 23.3114013671875
                    class= 0
         if x[6] <= -84.53669738769531
                        class= 1
                 if x[0] <= 140.51300048828125
                        class= 0
             if x[8] <= 7.210299968719482
                        class= 0
                 if x[5] <= 99.64739990234375
                

### Training accuracy and confusion matrix

In [9]:
train_pred = model.predict(x_train)
train_acc = np.sum(train_pred == y_train)/len(train_pred)
print("Confusion matrix for training:\n", model.confusion_matrix(train_pred, y_train))
print('Train accuracy: ', train_acc)

Confusion matrix for training:
 [[1304.  344.]
 [ 469. 2883.]]
Train accuracy:  0.8374


### Testing accuracy and confusion matrix

In [10]:
start = time.time()
test_pred = model.predict(x_test)
elapsed_time = time.time() - start
print('Elapsed_time testing {0:.6f}'.format(elapsed_time))

test_acc = np.sum(test_pred == y_test)/len(test_pred)
print("\nConfusion matrix for testing:\n", model.confusion_matrix(test_pred, y_test))
print('\nTest accuracy: ', test_acc)

Elapsed_time testing 0.004965

Confusion matrix for testing:
 [[146.  31.]
 [ 38. 285.]]

Test accuracy:  0.862


### Display 10 random prediction result

In [16]:
import pandas as pd
idx = np.random.randint(0, 500, 10)
dict = {"x[0]": x_test[idx,0], "x[1]": x_test[idx,1], "x[2]": x_test[idx,2], "x[3]": x_test[idx,3], "x[4]": x_test[idx,4],
        "x[5]": x_test[idx,5], "x[6]": x_test[idx,6], "x[7]": x_test[idx,7], "x[8]": x_test[idx,8], "x[9]": x_test[idx,9],
        "labels": y_test[idx], "prediction": test_pred[idx]}
df = pd.DataFrame(dict)
pd.set_option('display.width', 1000)
print(df)

         x[0]       x[1]    x[2]    x[3]    x[4]        x[5]        x[6]       x[7]       x[8]        x[9]  labels  prediction
0   97.038300  14.246600  2.3936  0.4566  0.2404 -120.310997   55.750801   7.294300  65.683998  262.899994       0           0
1   40.299900  26.432301  2.9165  0.3261  0.1921   26.835400   34.620399  26.541401  55.900002  159.427994       0           0
2   28.153400  15.252600  2.6739  0.4492  0.2405   29.793800   19.561300 -10.605700  31.651300  224.942001       1           1
3   12.670100  11.742800  2.0394  0.7123  0.3699   15.475700   -7.206700  11.194000  82.248802  189.339996       1           1
4   43.286800  12.652200  2.7271  0.2868  0.1453  -55.563202   15.108200  -6.123800  26.625601  168.076996       0           1
5  272.062988  20.124201  2.5563  0.4556  0.2319 -349.756989  203.863007 -13.878400  62.350399  184.059998       1           0
6   19.730801  18.574499  2.3314  0.4709  0.2401   12.069400  -18.302601 -12.701200   8.753000  141.173996     