# Implement the Random Forest Model using the previously implemented Decision Tree Model

In [1]:
from DecisionTree import DecisionTree, most_common_element # use the previously implemented DecisionTree algorithm
import numpy as np

class RandomForest:
  def __init__(self, num_trees=10, max_depth=10, min_examples_for_split=2, num_features=None):
    self.num_trees = num_trees
    self.max_depth = max_depth
    self.min_examples_for_split = min_examples_for_split
    self.num_features = num_features
    self.trees = []

  def fit(self, X, y):
    # Make a list of trees and train each tree
    self.trees = []
    for _ in range(self.num_trees):
      tree = DecisionTree(min_examples_for_split = self.min_examples_for_split, max_depth = self.max_depth, num_features = self.num_features)
      X_sample, y_sample = self._bootstrap_examples(X, y)
      tree.fit(X_sample, y_sample)
      self.trees.append(tree)

  def _bootstrap_examples(self, X, y):
    num_examples = X.shape[0]
    # Create num_example number of random indices in the range of (0, num_examples)
    # This is a random subset and not the whole dataset because its with replacement so that each example maybe repeated
    indices = np.random.choice(num_examples, num_examples, replace=True)
    return X[indices], y[indices]

  def _most_common_label(self, y):
    return most_common_element(y)

  def predict(self, X):
    # First store all the predictions for the examples from all the trees
    tree_predictions = np.array([tree.predict(X) for tree in self.trees])
    tree_predictions = np.swapaxes(tree_predictions, 0, 1)
    # Then get the majority voted label from all the trees, which is the prediction of the random forest
    y_pred = np.array([most_common_element(pred) for pred in tree_predictions])
    return y_pred

# Load in a dataset from sklearn and partition it into training and testing datasets

In [2]:
rs = 1234 # random seed for reproducibility of results
np.random.seed(rs)

from sklearn import datasets
from sklearn.model_selection import train_test_split

breast_cancer_dataset = datasets.load_breast_cancer()
X = breast_cancer_dataset.data
y = breast_cancer_dataset.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rs)

# Train the implemented Random Forest model

In [3]:
model = RandomForest(num_trees=20)
model.fit(X_train, y_train)

# Test the model on the test dataset obtained from the previous train test split

In [4]:
def accuracy(y_test, y_pred):
  acc = np.sum(y_test == y_pred) / len(y_test)
  acc *= 100
  acc = round(acc, 2)
  return acc

def calculate_metrics(y_test, y_pred):
  true_positives = np.sum(np.logical_and(y_test == 1, y_pred == 1))
  false_positives = np.sum(np.logical_and(y_test == 0, y_pred == 1))
  false_negatives = np.sum(np.logical_and(y_test == 1, y_pred == 0))
  precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
  recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
  f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
  confusion_matrix = np.array([[true_positives, false_positives], [false_negatives, len(y_test) - true_positives]])
  return precision, recall, f1_score, confusion_matrix

def print_confusion_matrix(conf_matrix):
  true_positives, false_positives, false_negatives, true_negatives = conf_matrix.ravel()
  print(f"                    Actual Positive    | Actual Negative")
  print(f"Predicted Positive |       {true_positives} (TP)     |    {false_positives} (FP)")
  print(f"Predicted Negative |       {false_negatives} (FN)     |    {true_negatives} (TN)")

In [5]:
y_pred = model.predict(X_test)

acc = accuracy(y_test, y_pred)
precision, recall, f1_score, confusion_matrix = calculate_metrics(y_test, y_pred)

print(f"Accuracy: {acc}%")
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print("Confusion Matrix:")
print_confusion_matrix(confusion_matrix)

Accuracy: 93.86%
Precision: 0.9428571428571428
Recall: 0.9565217391304348
F1 Score: 0.9496402877697843
Confusion Matrix:
                    Actual Positive    | Actual Negative
Predicted Positive |       66 (TP)     |    4 (FP)
Predicted Negative |       3 (FN)     |    48 (TN)
