In [1]:
import numpy as np

class DecisionTreeClassifier:
    """Binary decision tree classifier that splits on minimum weighted Gini impurity.

    The fitted tree is stored in ``self.tree`` as nested dictionaries:

    * leaf:  ``{'type': 'leaf', 'class': label}``
    * node:  ``{'type': 'node', 'feature': int, 'threshold': value,
      'left': subtree, 'right': subtree}`` where samples with
      ``x[feature] <= threshold`` go left and all others go right.

    Class labels may be any values ``np.unique`` can sort (integers, including
    negative ones, floats, strings, ...).

    Parameters
    ----------
    max_depth : int or None, default None
        Maximum depth of the tree; the root is at depth 0. ``None`` means the
        tree grows until every leaf is pure or no valid split remains.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None  # populated by fit()

    def _gini(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        proportions = counts / len(y)
        return 1 - np.sum(proportions**2)

    def _majority_class(self, y):
        """Return the most frequent label in ``y``.

        Ties are resolved in favour of the smallest label, because
        ``np.unique`` returns the labels sorted and ``np.argmax`` picks the
        first maximum. For non-negative integer labels this matches
        ``np.bincount(y).argmax()``, but it also works for labels that
        ``np.bincount`` rejects (negative integers, floats, strings).
        """
        classes, counts = np.unique(y, return_counts=True)
        return classes[np.argmax(counts)]

    def _make_leaf(self, y):
        """Build a leaf node predicting the majority class of ``y``."""
        return {
            'type': 'leaf',
            'class': self._majority_class(y)
        }

    def _split(self, X, y, feature_index, threshold):
        """Partition ``X`` and ``y`` on ``X[:, feature_index] <= threshold``.

        Returns ``(X_left, X_right, y_left, y_right)``. Rows whose feature value
        is NaN satisfy neither comparison and therefore appear in neither side.
        """
        left_mask = X[:, feature_index] <= threshold
        right_mask = X[:, feature_index] > threshold
        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]

    def _find_best_split(self, X, y):
        """Find the (feature, threshold) pair with the lowest weighted Gini impurity.

        Every distinct value of every feature is tried as a threshold. Splits
        that leave one side empty are skipped. Returns ``(None, None)`` when no
        valid split exists (for example, when all rows are identical).
        """
        best_feature, best_threshold, best_gini = None, None, float('inf')
        n_samples = len(y)

        for feature_index in range(X.shape[1]):
            column = X[:, feature_index]
            for threshold in np.unique(column):
                # Only the label partitions are needed to score a candidate,
                # so avoid copying the rows of X for every threshold.
                y_left = y[column <= threshold]
                y_right = y[column > threshold]
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                gini_left = self._gini(y_left)
                gini_right = self._gini(y_right)
                gini_split = (len(y_left) / n_samples) * gini_left + (len(y_right) / n_samples) * gini_right

                # Strict comparison: on ties the earliest feature/threshold wins.
                if gini_split < best_gini:
                    best_feature = feature_index
                    best_threshold = threshold
                    best_gini = gini_split

        return best_feature, best_threshold

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)`` at ``depth``.

        Recursion stops with a leaf when the labels are pure, when ``max_depth``
        has been reached, or when no valid split can be found.
        """
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if len(np.unique(y)) == 1 or depth_exhausted:
            return self._make_leaf(y)

        feature, threshold = self._find_best_split(X, y)
        if feature is None:
            return self._make_leaf(y)

        X_left, X_right, y_left, y_right = self._split(X, y, feature, threshold)

        return {
            'type': 'node',
            'feature': feature,
            'threshold': threshold,
            'left': self._build_tree(X_left, y_left, depth + 1),
            'right': self._build_tree(X_right, y_right, depth + 1)
        }

    def fit(self, X, y):
        """Fit the decision tree to the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples; converted with ``np.asarray``.
        y : array-like of length n_samples
            Class labels; converted with ``np.asarray``.

        Returns
        -------
        DecisionTreeClassifier
            ``self``, so calls can be chained (``clf.fit(X, y).predict(X)``).

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if ``X`` and ``y`` differ in length, or if
            the training set is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features); got {X.ndim}-D input")
        if len(X) != len(y):
            raise ValueError(f"X and y have different lengths: {len(X)} != {len(y)}")
        if len(y) == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")

        self.tree = self._build_tree(X, y, depth=0)
        return self

    def _predict_one(self, x, tree):
        """Predict the class of a single sample ``x`` by walking ``tree`` to a leaf."""
        if tree['type'] == 'leaf':
            return tree['class']

        if x[tree['feature']] <= tree['threshold']:
            return self._predict_one(x, tree['left'])
        else:
            return self._predict_one(x, tree['right'])

    def predict(self, X):
        """Predict the class label of every row in ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray
            One predicted label per row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("this DecisionTreeClassifier is not fitted yet; call fit() first")
        X = np.asarray(X)
        return np.array([self._predict_one(x, self.tree) for x in X])

In [5]:

# Example usage: train the tree on a small toy dataset and print its predictions.
if __name__ == "__main__":
    # Training set: 10 samples with 5 features each; labels take 3 classes (0, 1, 2).
    X = np.array([
        [2.771244718, 1.784783929, 0.5, 1.2, 3.4],
        [1.728571309, 1.169761413, 0.6, 1.3, 3.5],
        [3.678319846, 2.81281357, 0.7, 1.1, 3.3],
        [3.961043357, 2.61995032, 0.8, 1.4, 3.6],
        [2.999208922, 2.209014212, 0.9, 1.0, 3.7],
        [7.497545867, 3.162953546, 1.5, 2.1, 4.0],
        [9.00220326, 3.339047188, 1.6, 2.2, 4.1],
        [7.444542326, 0.476683375, 1.7, 2.0, 4.2],
        [10.12493903, 3.234550982, 1.8, 2.3, 4.3],
        [6.642287351, 3.319983761, 1.9, 2.4, 4.4]
    ])
    y = np.array([0, 0, 0, 1, 1, 2, 2, 2, 2, 1])

    # Train the decision tree, capping it at depth 3.
    clf = DecisionTreeClassifier(max_depth=3)
    clf.fit(X, y)

    # Predict on the training samples themselves and print them next to the
    # true labels. The trailing "\n\n" argument leaves blank lines before the
    # next report.
    predictions = clf.predict(X)
    print("ActualClass:", y)
    print("Predictions:", predictions,"\n\n")

    # Five further samples that are not in the training set, with the labels
    # expected for them.
    X_test = np.array([
      [2.5, 1.9, 0.55, 1.15, 3.45],
      [3.8, 2.6, 0.75, 1.35, 3.55],
      [7.8, 3.0, 1.55, 2.05, 4.05],
      [8.9, 3.4, 1.65, 2.15, 4.15],
      [6.7, 3.3, 1.85, 2.25, 4.25]
    ])
    y_test = np.array([0, 1, 2, 2, 2])
    # `predictions` is reused here, so the training-set predictions above are overwritten.
    predictions = clf.predict(X_test)
    print("ActualClass:", y_test)
    print("Predictions:", predictions)
    


ActualClass: [0 0 0 1 1 2 2 2 2 1]
Predictions: [0 0 0 1 1 2 2 2 2 1] 


ActualClass: [0 1 2 2 2]
Predictions: [0 1 2 2 2]


In [None]:
- https://chatgpt.com/share/675fd724-c03c-8012-8b78-e6d3d201859d