In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the Iris data as one DataFrame that includes a 'target' label column.
iris = load_iris(as_frame=True)
df = iris.frame

# Separate the class labels from the feature columns.
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
class TreeNode:
    """A single node of a binary decision tree.

    Internal nodes carry a split (``feature_index``, ``threshold``) and two
    children (``left``, ``right``). Terminal nodes carry only ``value``, which
    must be passed by keyword.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, *, value=None):
        # Split description: which column is tested and against what cut-off.
        self.feature_index, self.threshold = feature_index, threshold
        # Child subtrees.
        self.left, self.right = left, right
        # Leaf value if this is a terminal node; None otherwise.
        self.value = value

In [None]:
def gini_impurity(y):
    """Return the Gini impurity of a collection of class labels.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    labels that belong to class ``k``.

    Parameters
    ----------
    y : array-like
        Class labels. Anything ``np.asarray`` accepts works: a pandas Series,
        a NumPy array or a plain list.

    Returns
    -------
    float
        0.0 when ``y`` is empty or holds a single class; larger values mean
        the labels are more evenly mixed.
    """
    total = len(y)
    if total == 0:
        # An empty set of labels has nothing to be impure about; returning
        # early also avoids dividing by zero below.
        return 0.0

    # Count every class in one vectorised pass instead of re-scanning the
    # labels once per class with a Python-level sum.
    _, counts = np.unique(np.asarray(y), return_counts=True)
    proportions = counts / total
    return float(1.0 - np.sum(proportions ** 2))

def best_split(x, y):
    """Search for the split with the lowest weighted Gini impurity.

    Every distinct value of every column is tried as a threshold: rows whose
    value is ``<= threshold`` go left, the rest go right. Candidates that
    leave either side empty are ignored. On ties the first candidate
    encountered wins, because only a strictly lower score replaces it.

    Parameters
    ----------
    x : DataFrame-like
        Feature table; columns are accessed by position through ``.iloc``.
    y : Series-like
        Labels, indexable with the boolean masks built from ``x``.

    Returns
    -------
    tuple
        ``(feature_position, threshold)`` of the best split, or
        ``(None, None)`` when no candidate separates the rows.
    """
    n_sample, n_features = x.shape
    lowest_score = float('inf')
    winner = (None, None)

    for col_pos in range(n_features):
        column = x.iloc[:, col_pos]
        for cut in set(column):
            goes_left = column <= cut
            y_left = y[goes_left]
            y_right = y[~goes_left]

            # A split that sends every row to one side is useless.
            if not len(y_left) or not len(y_right):
                continue

            impurity_left = gini_impurity(y_left)
            impurity_right = gini_impurity(y_right)
            # Size-weighted average of the two child impurities.
            score = (
                len(y_left) * impurity_left + len(y_right) * impurity_right
            ) / n_sample

            if score < lowest_score:
                lowest_score = score
                winner = (col_pos, cut)

    return winner

# Recursively build decision tree
def build_tree(X, y, depth=0, max_depth=None, min_samples_split=2):
    """Grow a decision tree top-down and return its root ``TreeNode``.

    Recursion stops with a leaf when the labels are pure, the depth limit is
    reached, there are fewer than ``min_samples_split`` rows, or
    ``best_split`` finds no usable split. Impure leaves predict the most
    frequent label (``y.mode()[0]``).

    Parameters
    ----------
    X : DataFrame
        Feature rows for this node.
    y : Series
        Labels for the same rows as ``X``.
    depth : int
        Depth of the node being built; the root is 0.
    max_depth : int or None
        Maximum depth; ``None`` means unlimited.
    min_samples_split : int
        Minimum number of rows a node needs before a split is attempted.
    """
    # All rows share one label: nothing left to separate.
    if len(set(y)) == 1:
        return TreeNode(value=y.iloc[0])

    # Depth budget exhausted, or too few rows to justify another split.
    if (max_depth is not None and depth >= max_depth) or len(y) < min_samples_split:
        return TreeNode(value=y.mode()[0])

    split_feature, split_value = best_split(X, y)
    if split_feature is None:
        # No candidate threshold separates the rows.
        return TreeNode(value=y.mode()[0])

    goes_left = X.iloc[:, split_feature] <= split_value
    goes_right = ~goes_left

    # Build the left subtree first, then the right one.
    subtrees = [
        build_tree(X[mask], y[mask], depth + 1, max_depth, min_samples_split)
        for mask in (goes_left, goes_right)
    ]

    return TreeNode(
        feature_index=split_feature,
        threshold=split_value,
        left=subtrees[0],
        right=subtrees[1],
    )

In [None]:
def predict_one(node, x):
    """Route one sample down the tree and return the value of the leaf it reaches.

    Parameters
    ----------
    node : TreeNode
        Root of the (sub)tree to evaluate.
    x : Series-like
        One sample; features are read by position through ``.iloc``.

    Returns
    -------
    The ``value`` stored on the leaf that ``x`` lands in.
    """
    current = node
    # A node is a leaf as soon as it carries a value.
    while current.value is None:
        if x.iloc[current.feature_index] <= current.threshold:
            current = current.left
        else:
            current = current.right
    return current.value

def predict(tree, X):
    """Return a list with the tree's prediction for each row of ``X``, in row order."""
    labels = []
    for _, row in X.iterrows():
        labels.append(predict_one(tree, row))
    return labels

In [6]:
# Bootstrap sampling
def bootstrap_sample(X, y, random_state=None):
    """Draw a bootstrap resample (same size, with replacement) of ``X`` and ``y``.

    Parameters
    ----------
    X : DataFrame
        Feature rows.
    y : Series
        Labels aligned row-for-row with ``X``.
    random_state : None, int or numpy.random.Generator, optional
        ``None`` (the default) draws from NumPy's global RNG, so
        ``np.random.seed`` still controls the result. An int seed or a
        ``Generator`` makes the draw reproducible without touching global
        state; pass a shared ``Generator`` to get a different resample on
        each call.

    Returns
    -------
    tuple
        ``(X_sample, y_sample)`` selected with the same row positions, so the
        features and labels stay paired.
    """
    n_samples = X.shape[0]
    if random_state is None:
        indices = np.random.choice(n_samples, size=n_samples, replace=True)
    else:
        # default_rng accepts a seed or passes an existing Generator through.
        rng = np.random.default_rng(random_state)
        indices = rng.choice(n_samples, size=n_samples, replace=True)
    return X.iloc[indices], y.iloc[indices]

# Random Forest using bagging of decision trees
class RandomForest:
    """Bagged ensemble of decision trees with majority-vote prediction.

    Each tree is grown by ``build_tree`` on a bootstrap resample of the
    training data.

    Parameters
    ----------
    n_estimators : int
        Number of trees to grow.
    max_depth : int or None
        Depth limit forwarded to ``build_tree``; ``None`` means unlimited.
    min_samples_split : int
        Minimum node size forwarded to ``build_tree``.
    """

    def __init__(self, n_estimators=10, max_depth=None, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        """Grow ``n_estimators`` trees on bootstrap resamples of ``(X, y)``.

        Trees from any earlier call are discarded first, so refitting never
        mixes models trained on different data. Returns ``self``.
        """
        self.trees = []
        for _ in range(self.n_estimators):
            X_sample, y_sample = bootstrap_sample(X, y)
            tree = build_tree(
                X_sample,
                y_sample,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
            )
            self.trees.append(tree)
        return self

    def predict(self, X):
        """Return the majority-vote label for each row of ``X`` as a list.

        Ties go to the smallest label.

        Raises
        ------
        ValueError
            If the forest holds no trees (``fit`` has not been called).
        """
        if not self.trees:
            raise ValueError("RandomForest has no trees; call fit() before predict().")

        # Collect predictions from all trees: one row per tree, one column per sample
        tree_preds = np.array([predict(tree, X) for tree in self.trees])

        # Majority voting. np.unique counts votes for any label type, whereas
        # np.bincount only accepts non-negative integers.
        y_pred = []
        for i in range(X.shape[0]):
            labels, counts = np.unique(tree_preds[:, i], return_counts=True)
            y_pred.append(labels[counts.argmax()])
        return y_pred

In [None]:
# The bootstrap resampling draws from NumPy's global RNG; seed it so the
# forest, and therefore the reported accuracy, is the same on every run.
np.random.seed(42)

rf = RandomForest(n_estimators=10, max_depth=3)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

Random Forest Accuracy: 1.0
