In [5]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from collections import Counter

In [2]:
# Load the iris dataset

X, y = load_iris(return_X_y=True)

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
class DecisionTree_():

    def __init__(self, max_depth=None):

        self.max_depth = max_depth
        self.tree = None
    
    def gini(self, y):

        classes, counts = np.unique(y, return_counts=True)
        p = counts / counts.sum()

        return 1 - np.sum(p ** 2)

    def best_split(self, X_train, y_train):

        best_gini = float("inf")
        best_features = None
        best_value = None

        n_samples, n_features = X_train.shape

        for feature in range(n_features):

            values = np.unique(X_train[:, feature])

            for value in values:

                left_mask = X_train[:, feature] <= value
                right_mask = X_train[:, feature] > value

                y_left = y_train[left_mask]
                y_right = y_train[right_mask]

                gini_left = self.gini(y_left)
                gini_right = self.gini(y_right)

                weighted_gini = (len(y_left) * gini_left + len(y_right) * gini_right) / n_samples

                if weighted_gini < best_gini:

                    best_gini = weighted_gini
                    best_features = feature
                    best_value = value
        
        return best_features, best_value

    def build_tree(self, X_train, y_train, depth=0):

        count_classes = np.bincount(y_train)
        majorities_class = np.argmax(count_classes)

        if len(set(y_train)) == 1 or len(y_train) == 0 or (self.max_depth is not None and depth >= self.max_depth):

            return {"leaf": True, "class": majorities_class}
        
        best_features, best_values = self.best_split(X_train, y_train)

        if best_features is None:

            return {"leaf": True, "class": majorities_class}

        left_mask = X_train[:, best_features] <= best_values
        right_mask = X_train[:, best_features] > best_values

        left_tree = self.build_tree(X_train[left_mask], y_train[left_mask], depth + 1)
        right_tree = self.build_tree(X_train[right_mask], y_train[right_mask], depth + 1)

        return {
            "leaf": False,
            "feature": best_features,
            "value": best_values,
            "left": left_tree,
            "right": right_tree
        }

    def fit(self, X_train, y_train):

        self.tree = self.build_tree(X_train, y_train)
    
    def predict_one(self, X_test):

        node = self.tree

        while not node["leaf"]:

            if X_test[node["feature"]] <= node["value"]:
                node = node["left"]
            else:
                node = node["right"]

        return node["class"]
    
    def predict(self, X_test):

        return np.array([self.predict_one(x) for x in X_test])


In [6]:
class RandomForest():

    def __init__(self, n_estimators, max_depth=None):

        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.tree = []
    
    def bootsrap(self, X_train, y_train):

        n_samples = X_train.shape[0]

        indices = np.random.choice(n_samples, n_samples, replace=True)

        return X_train[indices], y_train[indices]

    def fit(self, X_train, y_train):

        for i in range(self.n_estimators):

            X_sample, y_sample = self.bootsrap(X_train, y_train)

            tree = DecisionTree_(max_depth=self.max_depth)
            tree.fit(X_sample, y_sample)

            self.tree.append(tree)
    
    def predict(self, X_test):

        tree_pred = np.array([tree.predict(X_test) for tree in self.tree])

        tree_preds = tree_pred.T

        final_pred = np.array([Counter(row).most_common(1)[0][0] for row in tree_preds])

        return final_pred
    
        



In [7]:
forest_ = RandomForest(n_estimators=10, max_depth=5)

In [8]:
forest_.fit(X_train, y_train)

In [9]:
y_pred = forest_.predict(X_test)

In [10]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)

In [11]:
print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 1.00%
