In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
# Steps to Create a Random Forest
# 1- Bagging: Randomly sample subsets of the data (with replacement) for each tree.
# 2- For each tree, randomly select a subset of features at each split.
# 3- Train a decision tree on each sampled dataset.
# 4- Repeat steps 1–3 to build multiple decision trees.
# 5- Aggregating: For predictions, aggregate the outputs of all the trees (e.g., majority vote for classification or averaging for regression).

from sklearn.utils import resample

class RandomForest:
    """
    A bagged ensemble of depth-1 decision stumps for binary classification.

    Every stump is trained on a bootstrap resample of the training data and
    the ensemble predicts by majority vote.  ``DecisionStump`` predicts values
    in {-1, +1} and scores itself by comparing them with ``y`` directly, so
    ``fit`` maps the two classes found in ``y`` onto that encoding and
    ``predict`` maps the winning vote back to the original labels.

    Parameters
    ----------
    n_trees : int, default=7
        Number of stumps to train. An odd number avoids tied votes.
    random_state : int or None, default=None
        Seed for the bootstrap resampling. ``None`` draws fresh, unseeded
        samples on every call to ``fit``.

    Attributes
    ----------
    trees : list
        The fitted stumps (empty until ``fit`` is called).
    classes_ : ndarray or None
        Sorted class labels seen during ``fit``; ``None`` before fitting.
    """

    def __init__(self, n_trees=7, random_state=None):
        self.n_trees = n_trees
        self.random_state = random_state
        self.trees = []
        self.classes_ = None
        # Shared generator: its state advances on every draw, so each tree
        # receives a different bootstrap sample even when a seed is given.
        self._rng = np.random.RandomState(random_state)

    def fit(self, X, y):
        """
        Fits a random forest to the dataset (X, y).

        Raises
        ------
        ValueError
            If X and y have different lengths, are empty, or y holds more
            than two distinct classes (a single stump can only separate two).
        """
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a RandomForest on an empty dataset.")

        classes = np.unique(y)
        if classes.size > 2:
            raise ValueError(
                f"RandomForest supports at most 2 classes, got {classes.size}."
            )
        self.classes_ = classes

        # Encode the larger class as +1 and the other as -1 for the stumps.
        y_signed = np.where(y == classes[-1], 1, -1)

        # Re-seed on every fit so that refitting is reproducible.
        self._rng = np.random.RandomState(self.random_state)
        self.trees = []
        for _ in range(self.n_trees):
            stump = DecisionStump()
            X_sample, y_sample = self._bootstrap_samples(X, y_signed)
            stump.fit(X_sample, y_sample)
            self.trees.append(stump)

    def predict(self, X):
        """
        Predicts class labels for samples in X.

        Returns the labels passed to ``fit``. If ``trees`` was populated by
        hand (``classes_`` is None) the raw majority votes are returned.

        Raises
        ------
        ValueError
            If the forest holds no trees.
        """
        if not self.trees:
            raise ValueError(
                "RandomForest is not fitted yet; call fit() before predict()."
            )
        X = np.asarray(X)
        stump_predictions = np.array([stump.predict(X) for stump in self.trees])
        votes = self._majority_vote(stump_predictions)
        if self.classes_ is None:
            return votes
        # Decode the +/-1 vote back into the caller's label space.
        return np.where(votes > 0, self.classes_[-1], self.classes_[0])

    def _bootstrap_samples(self, X, y):
        """
        Applies bootstrap resampling to the dataset.

        Draws ``len(X)`` rows with replacement, keeping X and y aligned.
        """
        return resample(X, y, n_samples=len(X), replace=True, random_state=self._rng)

    def _majority_vote(self, predictions):
        """
        Returns the majority vote of the predictions.

        ``predictions`` is laid out as (n_trees, n_samples); the vote is taken
        down each column. Works for any label values (floats, negatives,
        strings), unlike ``np.bincount`` which only accepts non-negative
        integers. Ties go to the smallest label.
        """
        predictions = np.asarray(predictions)
        if predictions.ndim == 2 and predictions.shape[1] == 0:
            # No samples to vote on: apply_along_axis would raise here.
            return np.empty(0, dtype=predictions.dtype)

        def _most_common(column):
            labels, counts = np.unique(column, return_counts=True)
            return labels[np.argmax(counts)]

        return np.apply_along_axis(_most_common, axis=0, arr=predictions)

In [3]:
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset and keep the returned object around for its metadata.
breast_cancer = load_breast_cancer()

# Feature matrix and target vector under the short names used by later cells.
X = breast_cancer.data
y = breast_cancer.target

print(
    f"Breast Cancer features: {breast_cancer.feature_names}",
    f"Breast Cancer target: {breast_cancer.target_names}",
    sep="\n",
)

Breast Cancer features: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Breast Cancer target: ['malignant' 'benign']


In [5]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(
    f"Shape of X_train: {X_train.shape}, y_train: {y_train.shape}",
    f"Shape of X_test: {X_test.shape}, y_test: {y_test.shape}",
    sep="\n",
)

Shape of X_train: (455, 30), y_train: (455,)
Shape of X_test: (114, 30), y_test: (114,)


In [7]:
class DecisionStump:
    """
    A one-level decision tree: a single threshold test on a single feature.

    Labels are expected in {-1, +1}: predictions are ``polarity * (+1 if
    x < threshold else -1)`` and are compared with ``y`` as-is, so any other
    label encoding can never be matched.

    Attributes
    ----------
    feature_index : int or None
        Column of X used by the split.
    threshold : scalar or None
        Samples with a feature value strictly below it fall on the "+1" side
        (before the polarity is applied).
    polarity : int or None
        +1 or -1; -1 flips which side of the threshold predicts +1.
    alpha : None
        Not set or read by this class.
    """

    def __init__(self):
        self.feature_index = None
        self.threshold = None
        self.polarity = None
        self.alpha = None

    def fit(self, X, y):
        """
        Exhaustively searches for the split with the fewest misclassifications.

        Every (feature, unique feature value, polarity) triple is tried; the
        first one reaching the lowest error count is kept.

        Parameters
        ----------
        X : array-like of shape (m, n)
        y : array-like of length m with values in {-1, +1}

        Raises
        ------
        ValueError
            If X and y have different numbers of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        m, n = X.shape
        if y.shape[0] != m:
            raise ValueError(
                f"X and y have inconsistent lengths: {m} != {y.shape[0]}"
            )

        min_error = float('inf')
        for feature_i in range(n):
            column = X[:, feature_i]
            for threshold in np.unique(column):
                # Evaluate the split once for all samples; the polarity only flips its sign.
                side = np.where(column < threshold, 1, -1)
                for polarity in (1, -1):
                    error = np.count_nonzero(polarity * side != y)
                    # Strict '<' keeps the earliest best candidate on ties.
                    if error < min_error:
                        min_error = error
                        self.feature_index = feature_i
                        self.threshold = threshold
                        self.polarity = polarity

    def predict(self, X):
        """
        Predicts -1.0 / +1.0 for every row of X using the fitted split.

        Returns
        -------
        ndarray of float, shape (m,)

        Raises
        ------
        ValueError
            If the stump has not been fitted.
        """
        if self.feature_index is None:
            raise ValueError(
                "DecisionStump is not fitted yet; call fit() before predict()."
            )
        X = np.asarray(X)
        side = np.where(X[:, self.feature_index] < self.threshold, 1.0, -1.0)
        return self.polarity * side

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier, plot_tree

def predict(self, X):
    """
    Integer-typed threshold prediction for each row of X.

    This is a module-level function that takes ``self`` explicitly; nothing
    in this cell attaches it to a class. It reads ``feature_index``,
    ``threshold`` and ``polarity`` from the object passed as ``self``.

    Returns
    -------
    ndarray of int, shape (X.shape[0],)
        ``polarity`` when the selected feature is strictly below the
        threshold, ``-polarity`` otherwise (truncated to int).
    """
    votes = [
        int(self.polarity * (1 if X[row, self.feature_index] < self.threshold else -1))
        for row in range(X.shape[0])
    ]
    return np.array(votes, dtype=int)

In [20]:
def _majority_vote(self, predictions):
    """
    Returns the majority vote of the predictions.
    """
    predictions = predictions.astype(int)  # Force integer type
    return np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=predictions)

In [22]:
from sklearn.preprocessing import LabelEncoder

# y_train is a slice of breast_cancer.target produced by train_test_split above.
# LabelEncoder maps whatever distinct values it holds to 0..n_classes-1
# (presumably already integer class ids here, making this a no-op — confirm).
le = LabelEncoder().fit(y_train)
y_train_encoded = le.transform(y_train)

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# NOTE(review): despite the "custom" names and labels below, this cell fits
# scikit-learn's RandomForestClassifier; the RandomForest class defined earlier
# in this notebook is not exercised here.

# Encode the training labels to 0..n_classes-1 (y_train comes from breast_cancer.target).
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)  # Encode labels to integers

# Fixed seed (same value as the train/test split) so the scores are reproducible.
rf_custom = RandomForestClassifier(random_state=42)
rf_custom.fit(X_train, y_train_encoded)

# The model predicts in the encoded label space; map back to the original
# labels so the comparison with the un-encoded y_test is like-for-like.
rf_cust_predictions = le.inverse_transform(rf_custom.predict(X_test))

print(f"Custom RF Accuracy: {accuracy_score(y_test, rf_cust_predictions):.3f}")
print(f"Custom RF F1-Score: {f1_score(y_test, rf_cust_predictions, average='weighted'):.3f}")

Custom RF Accuracy: 0.965
Custom RF F1-Score: 0.965
