In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [3]:
# Load the iris dataset. With return_X_y=True the loader's result is a
# (features, labels) pair, unpacked here into X and y; later cells index X as a
# 2-D array (X[:, feature]) and treat y as a 1-D array of class labels.
X, y = load_iris(return_X_y=True)

In [4]:
# Split configuration, named so a reader can tune it in one place.
TEST_SIZE = 0.2      # fraction of the samples held out for evaluation
RANDOM_STATE = 42    # fixed seed so the shuffle is identical on every re-run

# Shuffle and split X / y into training and held-out test partitions.
# `train_test_split` is imported in the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [5]:
class CustomDecisionTree():
    """Minimal decision-tree classifier that splits on Gini impurity.

    The fitted tree is stored in ``self.tree`` as nested dicts. Every node
    carries ``"leaf"`` (bool) and ``"class"`` (majority label of the training
    samples that reached the node). Internal nodes additionally carry
    ``"feature"`` (column index), ``"value"`` (threshold), ``"left"`` and
    ``"right"`` (child nodes). A sample is routed left when
    ``x[feature] <= value`` and right otherwise.

    Parameters
    ----------
    max_depth : int or None, default=None
        Depth at which growth stops: a node whose depth is ``>= max_depth``
        becomes a leaf (the root has depth 0). ``None`` lets the tree grow
        until every node is pure or cannot be split any further.
    """

    def __init__(self, max_depth=None):

        self.max_depth = max_depth
        self.tree = None  # set by fit(); None until then

    def gini(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels ``y``.

        ``p_k`` is the fraction of entries equal to the k-th distinct label.
        An empty label set is treated as pure and yields ``0.0``.
        """
        y = np.asarray(y)

        if y.size == 0:
            return 0.0

        _, counts = np.unique(y, return_counts=True)
        p = counts / counts.sum()

        return 1 - np.sum(p ** 2)

    def best_split(self, X_train, y_train):
        """Find the ``(feature, value)`` threshold with the lowest weighted Gini.

        Every distinct value of every column is tried as a threshold; rows with
        ``X[:, feature] <= value`` go left and all remaining rows go right.
        Candidates that leave either side empty are skipped, because they do
        not partition the data and would make ``build_tree`` recurse on an
        unchanged (or empty) sample set.

        Returns
        -------
        tuple
            ``(feature_index, threshold)`` of the best split, or
            ``(None, None)`` when no candidate separates the samples (for
            example when all rows are identical). Ties keep the first
            candidate found, scanning features then thresholds in ascending
            order.
        """
        best_gini = float("inf")
        best_features = None
        best_values = None

        n_samples, n_features = X_train.shape

        for feature in range(n_features):

            column = X_train[:, feature]

            for value in np.unique(column):

                left_mask = column <= value
                # Complement of the left mask, mirroring the `else` branch
                # used at prediction time, so every row lands on one side.
                right_mask = ~left_mask

                n_left = np.count_nonzero(left_mask)
                n_right = n_samples - n_left

                # Degenerate candidate (e.g. the column's maximum value):
                # nothing would be separated, so it is not a valid split.
                if n_left == 0 or n_right == 0:
                    continue

                gini_left = self.gini(y_train[left_mask])
                gini_right = self.gini(y_train[right_mask])

                weighted_gini = (n_left * gini_left + n_right * gini_right) / n_samples

                if weighted_gini < best_gini:

                    best_gini = weighted_gini
                    best_features = feature
                    best_values = value

        return best_features, best_values

    def build_tree(self, X_train, y_train, depth=0):
        """Recursively build the node dict for the given samples.

        A leaf is produced when the labels are pure, when ``depth`` has
        reached ``max_depth``, or when ``best_split`` finds no valid split.

        Raises
        ------
        ValueError
            If called with zero samples (there is no majority class).
        """
        if len(y_train) == 0:
            raise ValueError("cannot build a decision tree from zero samples")

        # np.unique returns the labels sorted, so argmax over the counts picks
        # the smallest label among the most frequent ones. Unlike np.bincount
        # this is not restricted to non-negative integer labels.
        classes, counts = np.unique(y_train, return_counts=True)
        majorities_classes = classes[np.argmax(counts)]

        is_pure = len(classes) == 1
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth

        if is_pure or depth_exhausted:

            return {"leaf": True, "class": majorities_classes}

        best_features, best_values = self.best_split(X_train, y_train)

        if best_features is None:

            return {"leaf": True, "class": majorities_classes}

        left_mask = X_train[:, best_features] <= best_values
        right_mask = ~left_mask

        left_tree = self.build_tree(X_train[left_mask], y_train[left_mask], depth + 1)
        right_tree = self.build_tree(X_train[right_mask], y_train[right_mask], depth + 1)

        return {
            "leaf": False,
            "class": majorities_classes,
            "feature": best_features,
            "value": best_values,
            "left": left_tree,
            "right": right_tree,
        }

    def fit(self, X_train, y_train):
        """Grow the tree from ``X_train`` (2-D features) and ``y_train`` (labels).

        Inputs are converted with ``np.asarray`` so array-likes are accepted.
        The result is stored in ``self.tree``; nothing is returned.

        Raises
        ------
        ValueError
            If ``y_train`` is empty.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)

        self.tree = self.build_tree(X_train, y_train)

    def predict_one(self, X_test):
        """Return the predicted label for a single sample (1-D feature vector).

        Walks from the root, going left when ``X_test[feature] <= value`` and
        right otherwise, and returns the ``"class"`` of the leaf reached.
        """
        node = self.tree

        while isinstance(node, dict) and not node["leaf"]:

            feature_ = node["feature"]
            value_ = node["value"]

            if X_test[feature_] <= value_:

                node = node["left"]

            else:

                node = node["right"]

        return node["class"]

    def predict(self, X_test):
        """Return a NumPy array with one predicted label per row of ``X_test``."""
        return np.array([self.predict_one(x) for x in np.asarray(X_test)])

In [6]:
# Instantiate the hand-written tree with growth capped at depth 3, then train
# it on the training partition; fit() stores the result in decision_.tree.
decision_ = CustomDecisionTree(max_depth=3)
decision_.fit(X_train, y_train)

In [7]:
# Predict one class label per row of the held-out test partition; predict()
# collects the per-row results into a NumPy array.
y_pred = decision_.predict(X_test)