In [11]:
import numpy as np

class Node:
    def __init__(self, feature = None, threshold = None, left = None, right = None, value=None):
        
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

In [12]:
class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.root = None
    
    def fit(self, X, y):
        self.root = self.grow_tree(X, y)
    
    def gini_impurity(self, y):
        _, counts = np.unique(y, return_counts = True)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)
    
    def information_gain(self, parent, left_child, right_child):
        weight_left = len(left_child) / len(parent)
        weight_right = len(right_child) / len(parent)
        
        return (self.gini_impurity(parent) - (weight_left * self.gini_impurity(left_child) +
                                             weight_right * self.gini_impurity(right_child)))
    def best_split(self, X, y):
        best_gain = -1
        best_feature, best_threshold = None, None
        
        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_mask = X[:, feature] <=threshold
                right_mask = ~left_mask
                gain = self.information_gain(y, y[left_mask], y[right_mask])
                
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold
    
    def grow_tree(self, X, y, depth = 0):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))
        
        if(depth == self.max_depth or n_samples  < 2 or n_classes == 1):
            return Node(value = np.argmax(np.bincount(y)))
        
        
        feature, threshold = self.best_split(X, y)
        left_mask = X[:, feature] <=threshold
        right_mask = ~left_mask
        left = self.grow_tree(X[left_mask], y[left_mask], depth+1)
        right = self.grow_tree(X[right_mask], y[right_mask], depth+1)
        
        return Node(feature = feature, threshold=threshold, left = left, right = right)
    
    def predict(self, X):
        return np.array([self.traverse_tree(x, self.root) for x in X])
    
    def traverse_tree(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self.traverse_tree(x, node.left)
        else:
            return self.traverse_tree(x, node.right)

In [13]:
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([0, 0 ,1 ,1])
tree = DecisionTree(max_depth = 3)
tree.fit(X, y)
predictions = tree.predict(X)
print(f"Predictions: {predictions}")

Predictions: [0 0 1 1]


In [14]:
from collections import Counter
class RandomForestClassifier:
    def __init__(self, n_estimators = 10, max_depth = 10, max_features = None):
        
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.trees = []
        
    def bootstrap_sample(self, X, y):
        n_samples=X.shape[0]
        indices = np.random.choice(n_samples, size = n_samples, replace=True)
        return X[indices], y[indices]
    
    def fit(self, X, y):
        self.trees = []
        n_features = X.shape[1]
        max_features = self.max_features if self.max_features else n_features
        
        for i in range(self.n_estimators):
            
            tree = DecisionTree(max_depth = self.max_depth)
            
            X_sample, y_sample = self.bootstrap_sample(X, y)
            
            selected_features = np.random.choice(n_features, size = max_features, replace=True)
            
            X_sample_subset = X_sample[:, selected_features]
            
            tree.fit(X_sample_subset, y_sample)
            
            self.trees.append((tree, selected_features))
    
    def predict(self, X):
        tree_predictions = np.array([tree.predict(X[:, features]) for tree, features in self.trees])
        majority_votes = [Counter(tree_predictions[:, i]).most_common(1)[0][0] for i in range(X.shape[0])]
        return np.array(majority_votes)
        

In [15]:
# Home Work: Random Forest Regressor
# Upload your code into discord server

In [16]:
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([0, 0, 1, 1, 1, 1])

model = RandomForestClassifier(n_estimators=10, max_depth=5, max_features=1)
model.fit(X, y)
predictions = model.predict(X)

In [17]:
print(f"Predictions: {predictions}")

Predictions: [0 0 1 1 1 1]


In [21]:
from sklearn.datasets import make_classification
X, y = make_classification(n_samples = 1000, n_features = 10, n_informative = 5, 
                          n_redundant = 5, random_state = 42)


In [22]:
model = RandomForestClassifier(n_estimators=10, max_depth=5, max_features=1)
model.fit(X, y)
predictions = model.predict(X)

In [24]:
predictions[:5]

array([0, 1, 0, 0, 0], dtype=int64)