# Random Forest

Modify the Bagging scratch code in our lecture such that:
- Calculate for oob evaluation for each bootstrapped dataset, and also the average score
- Change the code to "without replacement"
- Put everything into a class <code>Bagging</code>.  It should have at least two methods, <code>fit(X_train, y_train)</code>, and <code>predict(X_test)</code>
- Modify the code from above to randomize features.  Set the number of features to be used in each tree to be <code>sqrt(n)</code>, and then select a subset of features for each tree.  This can be easily done by setting our DecisionTreeClassifier <code>max_features</code> to 'sqrt'

---

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                test_size=0.3, shuffle=True, random_state=42)

In [2]:
X_train.shape

(105, 4)

## Try doing the task without not putting in class

In [3]:
from sklearn.tree import DecisionTreeClassifier
import random
import numpy as np
from scipy import stats
from sklearn.metrics import classification_report, accuracy_score

B = 5
m, n = X_train.shape
boostrap_ratio = 0.8
tree_params = {'max_depth': 2, 'criterion':'gini', 'max_features': 'sqrt'}
models = [DecisionTreeClassifier(**tree_params) for _ in range(B)]

#sample size for each tree
sample_size = int(boostrap_ratio * len(X_train))

xsamples = np.zeros((B, sample_size, n)) # make sure to contain the number of trees you have upfront
ysamples = np.zeros((B, sample_size))

oob_xsamples = []
oob_ysamples = []
oob_accuracy = []
with_replacement = False


#subsamples for each model

for i in range(B):
    ##sampling with replacement; i.e., sample can occur more than once
    #for the same predictor
    idx_used = []
    oob = []
    check_set = np.zeros((m), dtype= bool)
    for j in range(sample_size):
        idx = random.randrange(m)   
        if with_replacement:  # condition here checks if the duplicate index for each bootstrap is allowed
          pass
        else:
          while idx in idx_used: # if the index already is used, then it must be regenerated and check again
            idx = random.randrange(m)

        xsamples[i, j, :] = X_train[idx]
        ysamples[i, j] = y_train[idx]
        idx_used.append(idx)
        oob.append(idx)
        

        #keep track of idx that i did not use for ith tree
    check_set[idx_used] = True
    oob_xsamples.append(X_train[~check_set])
    oob_ysamples.append(y_train[~check_set])



    models[i] = models[i].fit(xsamples[i, :], ysamples[i])
    val_yhat = models[i].predict(oob_xsamples[i])
    # print(val_yhat)
    oob_ac = accuracy_score(y_true = oob_ysamples[i], y_pred = val_yhat)
    oob_accuracy.append(oob_ac)
    print(f"Accuracy oob score of {i} tree: {oob_ac}")
    print()

print(f'Average oob score after of {B} trees : {np.mean(oob_accuracy)}')



    
#make prediction and return the probabilities
predictions = np.zeros((B, X_test.shape[0]))
for i, model in enumerate(models):
    yhat = model.predict(X_test)
    predictions[i, :] = yhat
        
yhat = stats.mode(predictions)[0][0]

print(classification_report(y_test, yhat))

Accuracy oob score of 0 tree: 0.9523809523809523

Accuracy oob score of 1 tree: 0.9047619047619048

Accuracy oob score of 2 tree: 0.9047619047619048

Accuracy oob score of 3 tree: 0.9523809523809523

Accuracy oob score of 4 tree: 0.9523809523809523

Average oob score after of 5 trees : 0.9333333333333333
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



In [4]:
#make prediction and return the probabilities
predictions = np.zeros((B, X_test.shape[0]))
for i, model in enumerate(models):
    yhat = model.predict(X_test)
    predictions[i, :] = yhat

In [5]:
stats.mode(predictions, axis = 0)[0][0]

array([1., 0., 2., 1., 1., 0., 1., 2., 1., 1., 2., 0., 0., 0., 0., 1., 2.,
       1., 1., 2., 0., 2., 0., 2., 2., 2., 2., 2., 0., 0., 0., 0., 1., 0.,
       0., 2., 1., 0., 0., 0., 2., 1., 1., 0., 0.])

In [6]:
stats.mode(predictions, axis = 0)[0][0] == y_test

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

## Now, let's put everything in class

In [13]:
class RandomForest:

  def __init__(self, B, bootstrap_ratio, with_replacement):
    self.B = B
    self.bootstrap_ratio = bootstrap_ratio
    self.with_replacement = with_replacement
    self.tree_params = {'max_depth' : 2, 'max_features': 'sqrt'}
    self.models = [DecisionTreeClassifier(**self.tree_params) for tree in range(B)]


  def fit(self, X, y):
    m, n = X.shape

    sample_size = int(self.bootstrap_ratio * m)
    X_sample = np.zeros((self.B, m, n))
    y_sample = np.zeros((self.B, m))
    print(f"Number of bootstrap sample: {sample_size}")
    print()


    oob_X_sample = []
    oob_y_sample = []
    oob_accuracy = []

    for tree in range(self.B):
      idx_used = []
      oob_idx  = []
      check_set = np.zeros((m), dtype = bool)
      
      for row in range(sample_size):
        idx = random.randrange(m)

        if self.with_replacement:
          pass
        else:
          while idx in idx_used:
            idx = random.randrange(m)


        X_sample[tree, row, :] = X[idx, :]
        y_sample[tree, row]   = y[idx]
        idx_used.append(idx)
        oob_idx.append(idx)
        # print(len(idx_used), len(np.unique(idx_used)))


      check_set[oob_idx] = True
      oob_X_sample.append(X[~check_set])
      oob_y_sample.append(y[~check_set])

      print(f"total num of idx and unique idx: {len(idx_used), len(np.unique(idx_used)) }")


      self.models[tree].fit(X_sample[tree, :], y_sample[tree])
      val_yhat = self.models[tree].predict(oob_X_sample[tree])
      oob_ac = accuracy_score(y_true = oob_y_sample[tree], y_pred = val_yhat)
      oob_accuracy.append(oob_ac)
      print(f"Accuracy oob score of {tree} tree: {oob_ac}")
      print()   


  def predict(self, X):
    predictions = np.zeros((self.B, len(X)))
    for i, model in enumerate(self.models):
      predictions[i, : ] = model.predict(X)
#       print(f"y_hat of {i} tree: {predictions[i, : ]}")

    yhat = stats.mode(predictions, axis = 0)[0][0]
    return yhat



### Try with_replacement boostrap and boostrap_ratio at 0.75, with 5 trees

In [8]:
test = RandomForest(5,0.75 ,True)
test.fit(X_train, y_train)

Number of bootstrap sample: 78

total num of idx and unique idx: (78, 58)
Accuracy oob score of 0 tree: 0.9361702127659575

total num of idx and unique idx: (78, 60)
Accuracy oob score of 1 tree: 0.8888888888888888

total num of idx and unique idx: (78, 54)
Accuracy oob score of 2 tree: 0.9019607843137255

total num of idx and unique idx: (78, 56)
Accuracy oob score of 3 tree: 0.8979591836734694

total num of idx and unique idx: (78, 53)
Accuracy oob score of 4 tree: 0.8461538461538461



In [9]:
yhat= test.predict(X_test)

In [10]:
accuracy_score(y_test, yhat)

1.0

### Try without_replacement boostrap and boostrap_ratio at 0.8, with 10 trees

In [14]:
test2 = RandomForest(10,0.8 ,False)
test2.fit(X_train, y_train)

Number of bootstrap sample: 84

total num of idx and unique idx: (84, 84)
Accuracy oob score of 0 tree: 0.9047619047619048

total num of idx and unique idx: (84, 84)
Accuracy oob score of 1 tree: 1.0

total num of idx and unique idx: (84, 84)
Accuracy oob score of 2 tree: 0.8571428571428571

total num of idx and unique idx: (84, 84)
Accuracy oob score of 3 tree: 0.6666666666666666

total num of idx and unique idx: (84, 84)
Accuracy oob score of 4 tree: 0.9047619047619048

total num of idx and unique idx: (84, 84)
Accuracy oob score of 5 tree: 0.9047619047619048

total num of idx and unique idx: (84, 84)
Accuracy oob score of 6 tree: 0.9047619047619048

total num of idx and unique idx: (84, 84)
Accuracy oob score of 7 tree: 0.9523809523809523

total num of idx and unique idx: (84, 84)
Accuracy oob score of 8 tree: 0.9047619047619048

total num of idx and unique idx: (84, 84)
Accuracy oob score of 9 tree: 0.9047619047619048



In [15]:
yhat2=  test2.predict(X_test)
accuracy_score(y_test, yhat2)

1.0

--- 
## Summary
- Notice that power of accuracy rate of random forest. Both model gives the accuracy rate at 100%!!
- Number of unique index used in with_replacement boostrap must be less than total number index. As for without replacement, the number of total indexes and unique indexes must be equal