In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from collections import Counter


Decision Tree


In [6]:
class Node:
    """One vertex of a binary decision tree.

    The constructor only stores its arguments. ``DecisionTree`` builds
    internal nodes with ``feature``/``threshold``/``left``/``right`` and leaves
    with ``value`` alone, and treats any node whose ``value`` is not ``None``
    as a leaf.

    Attributes
    ----------
    feature : column index tested at this node (``None`` by default).
    threshold : cut-off the feature value is compared against.
    left, right : child nodes.
    value : prediction stored at a leaf (``None`` by default).
    """

    def __init__(self,feature=None,threshold=None,left=None,right=None,value=None):
        # Split description.
        self.feature, self.threshold = feature, threshold
        # Sub-trees.
        self.left, self.right = left, right
        # Leaf payload.
        self.value = value
        
        

class DecisionTree:
    """Binary classification tree built by greedy, axis-aligned splitting.

    Every internal node tests ``x[feature] < threshold``: samples that pass go
    to the left child, all others to the right. A leaf predicts the most
    common training label that reached it.

    Parameters
    ----------
    max_depth : int or None
        Maximum depth of the tree; ``None`` means no limit.
    min_samples_split : int
        A node holding fewer samples than this becomes a leaf.
    criterion : {"gini", "entropy"}
        ``"gini"`` selects the split with the lowest weighted Gini impurity,
        ``"entropy"`` the split with the highest information gain.
    """

    _CRITERIA = ("gini", "entropy")

    def __init__(self,max_depth=None, min_samples_split=4,criterion="gini"):
        self.max_depth = max_depth
        self.min_samples_split= min_samples_split
        self.criterion = criterion
        self.root = None

    def _check_criterion(self):
        """Raise ``ValueError`` if ``self.criterion`` is not supported."""
        if self.criterion not in self._CRITERIA:
            raise ValueError("Metric must be 'gini' or 'entropy'")

    def fit(self,X,y):
        """Grow the tree on the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Class labels; any hashable, sortable label type works.

        Raises
        ------
        ValueError
            If the criterion is unknown, the data is empty, or ``X`` and ``y``
            have different lengths.
        """
        self._check_criterion()
        # Accept lists / DataFrames as well as ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        if X.shape[0] != len(y):
            raise ValueError("X and y must contain the same number of samples")

        self.n_classes = len(np.unique(y))
        # Misspelled historical attribute name, kept so existing readers work.
        self.n_clsses = self.n_classes
        self.root = self._grow_tree(X,y)

    def _grow_tree(self,X,y,depth = 0):
        """Recursively build and return the subtree for the samples ``X, y``."""
        n_samples = X.shape[0]
        n_labels = len(np.unique(y))

        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or n_samples < self.min_samples_split or n_labels == 1:
            return Node(value=self._most_common_value(y))

        best_feature, best_threshold = self._best_split(X,y)

        # No threshold separates the samples into two non-empty groups.
        if best_feature is None:
            return Node(value=self._most_common_value(y))

        left_idxs = X[:,best_feature] < best_threshold
        right_idxs = ~left_idxs

        left = self._grow_tree(X[left_idxs],y[left_idxs],depth+1)
        right = self._grow_tree(X[right_idxs],y[right_idxs],depth+1)

        return Node(best_feature,best_threshold,left,right)

    def _combined_metric(self, y, X_column, threshold):
        """Score the split ``X_column < threshold``.

        Returns the weighted Gini impurity of the children (lower is better)
        for ``"gini"``, or the information gain (higher is better) for
        ``"entropy"``. A split that leaves one side empty gets the worst
        possible score for the active criterion (``+inf`` / ``-inf``) so that
        ``_best_split`` can never select it.

        Raises
        ------
        ValueError
            If ``self.criterion`` is not supported.
        """
        self._check_criterion()

        left_idxs = X_column < threshold
        y_left, y_right = y[left_idxs], y[~left_idxs]

        if len(y_left) == 0 or len(y_right) == 0:
            return -float('inf') if self.criterion == 'entropy' else float('inf')

        n = len(y)
        child_impurity = (
            (len(y_left) / n) * self._impurity(y_left)
            + (len(y_right) / n) * self._impurity(y_right)
        )

        if self.criterion == 'gini':
            return child_impurity
        # entropy: information gain = parent impurity - weighted child impurity
        return self._impurity(y) - child_impurity

    def _impurity(self,y):
        """Return the Gini impurity or Shannon entropy (base 2) of labels ``y``.

        Gini is used when ``self.criterion == "gini"``, entropy otherwise.
        An empty ``y`` has impurity 0.
        """
        if len(y) == 0:
            return 0
        # np.unique works for any sortable label type (np.bincount only
        # accepts non-negative integers) and never yields zero counts, so the
        # logarithm below needs no epsilon.
        _, counts = np.unique(y, return_counts=True)
        p = counts / len(y)

        if self.criterion == "gini":
            return 1 - np.sum(p**2)
        return -np.sum(p * np.log2(p))

    def _best_split(self,X,y):
        """Search every feature/threshold pair for the best split.

        Returns
        -------
        (feature, threshold)
            Column index and cut-off of the best split, or ``(None, None)``
            when no candidate produces two non-empty children.
        """
        maximise = self.criterion == "entropy"
        best_metric = -float("inf") if maximise else float("inf")
        best_feature = None
        best_threshold = None

        for feature in range(X.shape[1]):
            column = X[:,feature]
            # With a strict `<` test the smallest unique value would send
            # nothing to the left, so it is not a useful candidate.
            for threshold in np.unique(column)[1:]:
                metric = self._combined_metric(y,column,threshold)
                improved = metric > best_metric if maximise else metric < best_metric
                if improved:
                    best_metric = metric
                    best_feature = feature
                    best_threshold = threshold

        return best_feature,best_threshold

    def _most_common_value(self,y):
        """Return the most frequent label in ``y`` (first seen wins ties)."""
        counter = Counter(y)

        return counter.most_common(1)[0][0]

    def predict(self,X):
        """Predict a label for every row of ``X``; returns a NumPy array."""
        return np.array([self._traverse_tree(x,self.root) for x in np.asarray(X)])

    def _traverse_tree(self,x,node):
        """Walk from ``node`` down to a leaf for sample ``x``; return its value."""
        # Iterative descent: a node with a stored value is a leaf.
        while node.value is None:
            node = node.left if x[node.feature] < node.threshold else node.right
        return node.value
    


In [29]:
class RandomForest:
    """Bagging ensemble of ``DecisionTree`` classifiers with majority voting.

    Each tree is trained on a bootstrap sample (drawn with replacement, same
    size as the training set) and the forest predicts the label most trees
    agree on.

    Parameters
    ----------
    n_trees : int
        Number of trees to train.
    max_depth : int or None
        Forwarded to every ``DecisionTree``.
    min_sampels_split : int
        Forwarded to every ``DecisionTree`` as ``min_samples_split``. The
        parameter name is misspelled but kept for backward compatibility.
    max_features : any
        Stored on the instance only. NOTE: it is not used anywhere in this
        class; every tree is trained on all features.
    random_state : int or None
        Seed for the bootstrap sampling. ``None`` (default) draws from NumPy's
        global random state, so ``np.random.seed`` still controls it.
    """

    def __init__(self,n_trees=5, max_depth=None, min_sampels_split=2, max_features=None, random_state=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_sampels_split
        self.max_features = max_features
        self.random_state = random_state
        self.trees = []

    def fit(self,X,y):
        """Train ``n_trees`` trees, each on its own bootstrap sample.

        Calling ``fit`` again replaces the previously trained trees instead of
        appending to them.
        """
        # Accept lists / DataFrames as well as ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)

        # The np.random module and a RandomState instance both expose
        # `choice`, so either can serve as the sampling source.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        fitted = []
        for _ in range(self.n_trees):
            X_sample, y_sample = self._bootstrap_sample(X,y,rng)
            tree = DecisionTree(max_depth=self.max_depth,min_samples_split=self.min_samples_split)
            tree.fit(X_sample,y_sample)
            fitted.append(tree)
        # Assign only once all trees are built so a refit starts from scratch.
        self.trees = fitted

    def _bootstrap_sample(self,X,y,rng=None):
        """Return ``(X, y)`` resampled with replacement to the original size.

        ``rng`` is any object with a NumPy-style ``choice`` method; ``None``
        falls back to NumPy's global random state.
        """
        if rng is None:
            rng = np.random
        n_samples = X.shape[0]
        indices = rng.choice(n_samples,size=n_samples,replace=True)
        return X[indices], y[indices]

    def predict(self,X):
        """Return a list with the majority-vote label for every row of ``X``."""
        X = np.asarray(X)
        # Shape (n_trees, n_samples): one row of predictions per tree.
        trees_pred = np.array([tree.predict(X) for tree in self.trees])
        return [self._most_common_label(trees_pred[:,i]) for i in range(trees_pred.shape[1])]

    def _most_common_label(self,y):
        """Return the most frequent label in ``y`` (first seen wins ties)."""
        return Counter(y).most_common(1)[0][0]






from sklearn.datasets import load_iris,load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Reproducibility: RandomForest draws its bootstrap samples from NumPy's
# global random state, so seed it before fitting.
SEED = 42
np.random.seed(SEED)

# Load the scikit-learn breast cancer dataset (binary classification)
data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Initialize the Random Forest
rf = RandomForest(n_trees=3, max_depth=10)
rf.fit(X_train, y_train)

# Make predictions and calculate accuracy
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9385964912280702
