In [9]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from decisiontree import DecisionTree

In [10]:
# Randomizing the order of column indexes of x, y 
# Before Randomization [0,1,2,3]
# After randomization idx = [3,0,1,2] etc
def bootstrap_sample(x, y):
    n,_ = x.shape
    idx = np.random.choice(n, n, replace=True)
    return x[idx], y[idx]

In [11]:
class RandomForest:
    def __init__(self, n_trees, max_depth, n_feats, classifier=True, criterion='entropy'):
        self.trees = []
        self.n_trees = n_trees
        self.n_feats = n_feats
        self.max_depth = max_depth
        self.criterion = criterion
        self.classifier = classifier

    def fit(self, x, y):
        '''  
        Create n_trees with same initial parameters 
        Fit all n trees to the data
        '''
        self.trees = []
        for _ in range(self.n_trees):
            x_sample, y_sample = bootstrap_sample(x, y)
            tree = DecisionTree(n_feats = self.n_feats,
                                max_depth = self.max_depth,
                                criterion = self.criterion,
                                classifier = self.classifier
                                )
            tree.fit(x_sample, y_sample)
            self.trees.append(tree)
    
    def predict(self, x):
        '''  
        Traverse Each Tree n_sample number of times
        Total Iterations = number of samples * number of trees, 
        For example for 3 trees and 4 samples, we get
        tree_preds  =  [[1, 0, 1, 0],   # Predictions of Tree 1 for samples 1 to 4
                        [0, 1, 1, 0],   # Predictions of Tree 2 for samples 1 to 4
                        [1, 1, 0, 1]]   # Predictions of Tree 3 for samples 1 to 4
        '''
        tree_preds = np.array([[t._traverse(xi,t.root) for xi in x] for t in self.trees])
        return self._vote(tree_preds)
    
    def _vote(self, predictions):
        '''
        if classification
            returns an array of most frequently occuring prediction for a given sample
            i.e, for the above tree_preds input, out will be [1,1,1,0]
        if prediction
            returns array of mean values of each column i.e, out = [.67, .67, .67, .33]
        '''
        if self.classifier:
            out = [np.bincount(x).argmax() for x in predictions.T]
        else: out = [np.mean(x) for x in predictions.T]
        return np.array(out)

In [12]:
file = pd.read_csv('../ML/Datasets/carstats.csv')
print("Total Records: ", len(file))
data = file.values
file.head()

Total Records:  1728


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [13]:
columns = []
for idx in range(len(data[0])):
    columns.append(np.unique(data[:, idx]))

for i in range(len(data[0])):
    feature = data[:, i]
    classes = columns[i]
    for idx, class_ in enumerate(classes):
        feature[feature == class_] = idx

columns.clear()
for idx in range(len(data[0])):
    columns.append(np.unique(data[:, idx]))
print("Unique Classes of Each Feature")
columns

Unique Classes of Each Feature


[array([0, 1, 2, 3], dtype=object),
 array([0, 1, 2, 3], dtype=object),
 array([0, 1, 2, 3], dtype=object),
 array([0, 1, 2], dtype=object),
 array([0, 1, 2], dtype=object),
 array([0, 1, 2], dtype=object),
 array([0, 1, 2, 3], dtype=object)]

In [14]:
x, y = data[:,:-1].astype(int), data[:,-1:].flatten().astype(int)
x_test, y_test = x[int(0.8*len(x)):], y[int(0.8*len(y)):]
x, y = x[:int(0.8*len(x))], y[:int(0.8*len(y))]
print(f"Train Data: {len(x)}, Test Data: {len(x_test)}")
x,y

Train Data: 1382, Test Data: 346


(array([[3, 3, 0, 0, 2, 1],
        [3, 3, 0, 0, 2, 2],
        [3, 3, 0, 0, 2, 0],
        ...,
        [1, 3, 3, 0, 2, 0],
        [1, 3, 3, 0, 1, 1],
        [1, 3, 3, 0, 1, 2]]),
 array([2, 2, 2, ..., 2, 2, 2]))

In [15]:
forest = RandomForest(n_trees=5,n_feats=7,max_depth=10,criterion='entropy')
forest.fit(x, y)

In [16]:
def accuracy(forest, x, y_true):
    y_pred = forest.predict(x)
    return np.sum(y_pred == y_true)/len(y_true)

print("Train Accuracy: ", accuracy(forest, x, y))
print("Test Accuracy: ", accuracy(forest, x_test, y_test))

Train Accuracy:  0.9898697539797395
Test Accuracy:  0.6763005780346821
