# In [1]:
import pandas as pd
import numpy as np

class DecisionTreeNode:
    """A single node of a binary decision tree.

    The node simply stores whatever it is given: a split description
    (``feature_index`` and ``threshold``) together with its two children,
    and/or a ``label``. Every field defaults to ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, label=None, left=None, right=None):
        # Child links.
        self.left, self.right = left, right
        # Split description: which feature is compared against which value.
        self.feature_index, self.threshold = feature_index, threshold
        # Label carried by the node (``None`` when not supplied).
        self.label = label

class DecisionTreeClassifier:
    """Binary decision tree classifier grown greedily on Gini impurity.

    Every internal node tests ``x[feature_index] < threshold``: samples for
    which the test holds go to the left child, all others to the right child.

    Args:
        max_depth: Maximum depth of the tree (the root is depth 0). ``None``
            means nodes keep splitting until they are pure or no usable
            split exists.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.root = None

    def fit(self, X, y):
        """Build the tree from training data and store it in ``self.root``.

        Args:
            X: 2-D array-like of shape (num_samples, num_features).
            y: 1-D array-like of class labels, one per row of ``X``.

        Raises:
            ValueError: If ``X`` is not 2-D, the data is empty, or ``X`` and
                ``y`` disagree on the number of samples.
        """
        # Accept lists / DataFrames / Series as well as ndarrays; the tree
        # code relies on NumPy-style ``X[:, i]`` and boolean-mask indexing.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (num_samples, num_features)")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        self.root = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``(X, y)``.

        Returns:
            A ``DecisionTreeNode``: a leaf carrying the majority label when
            the node is pure, the depth limit is reached, or no usable split
            exists; otherwise an internal node with two children.
        """
        num_samples, num_features = X.shape

        # Stopping criteria.
        is_pure = len(np.unique(y)) == 1
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if is_pure or depth_exhausted:
            return DecisionTreeNode(label=self._majority_vote(y))

        # Greedily select the best split according to Gini impurity.
        best_feature, best_threshold = self._best_split(X, y, num_samples, num_features)
        if best_feature is None:
            # No threshold separates the samples (e.g. all rows identical).
            return DecisionTreeNode(label=self._majority_vote(y))

        # Use the complement of the mask so every sample lands in exactly one
        # child, following the same rule ``_predict`` uses to route inputs.
        goes_left = X[:, best_feature] < best_threshold
        left_child = self._build_tree(X[goes_left], y[goes_left], depth + 1)
        right_child = self._build_tree(X[~goes_left], y[~goes_left], depth + 1)
        return DecisionTreeNode(
            feature_index=best_feature,
            threshold=best_threshold,
            left=left_child,
            right=right_child,
        )

    def _best_split(self, X, y, num_samples, num_features):
        """Find the (feature, threshold) pair with the lowest weighted Gini.

        Candidate thresholds are the distinct values of each feature column.
        Thresholds that would leave the left side empty are skipped.

        Returns:
            ``(best_feature, best_threshold)``, or ``(None, None)`` when no
            candidate puts at least one sample on the left side.
        """
        best_feature, best_threshold = None, None
        best_gini = 1.0  # Upper bound; any real split scores strictly lower.

        for feature_index in range(num_features):
            column = X[:, feature_index]
            for threshold in np.unique(column):
                # The smallest value of a column (or a NaN threshold) sends
                # nothing left; such a split separates nothing, so skip it.
                if not (column < threshold).any():
                    continue
                gini = self._calculate_gini(column, y, threshold)
                if gini < best_gini:
                    best_gini, best_feature, best_threshold = gini, feature_index, threshold

        return best_feature, best_threshold

    @staticmethod
    def _gini_impurity(labels):
        """Return the Gini impurity of ``labels`` (0.0 for an empty array)."""
        num_labels = len(labels)
        if num_labels == 0:
            # An empty partition holds no samples and therefore no impurity;
            # returning 0.0 avoids a 0/0 division.
            return 0.0
        _, counts = np.unique(labels, return_counts=True)
        return 1.0 - sum((count / num_labels) ** 2 for count in counts)

    def _calculate_gini(self, feature_column, labels, threshold):
        """Return the size-weighted Gini impurity of splitting at ``threshold``.

        Samples with ``feature_column < threshold`` form the left partition and
        the remaining samples form the right partition. An empty partition
        contributes zero instead of producing NaN.
        """
        goes_left = feature_column < threshold
        left_labels, right_labels = labels[goes_left], labels[~goes_left]

        total = len(labels)
        if total == 0:
            return 0.0

        left_weight = len(left_labels) / total
        right_weight = len(right_labels) / total
        return (
            left_weight * self._gini_impurity(left_labels)
            + right_weight * self._gini_impurity(right_labels)
        )

    def _majority_vote(self, labels):
        """Return the most frequent label; ties go to the smallest label.

        Works for any label type ``np.unique`` can sort (negative integers,
        strings, ...), not only non-negative integers.
        """
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Args:
            X: 2-D array-like of shape (num_samples, num_features).

        Returns:
            A NumPy array with one predicted label per row.

        Raises:
            AttributeError: If the model has not been fitted yet.
        """
        if self.root is None:
            raise AttributeError("This DecisionTreeClassifier is not fitted yet; call fit() first")
        predictions = [self._predict(inputs, self.root) for inputs in np.asarray(X)]
        return np.array(predictions)

    def _predict(self, inputs, node):
        """Walk from ``node`` down to a leaf for one sample and return its label."""
        # A node with a non-None label is a leaf; otherwise follow the split.
        while node.label is None:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.label

# Hypothetical dataset: three numeric feature columns plus the binary
# 'Purchased' target column.
data = pd.DataFrame(
    {
        'Age': [25, 40, 32, 50],
        'Income': [50000, 80000, 65000, 120000],
        'HasCreditCard': [1, 0, 1, 1],  # Converted to numeric for simplicity
        'Purchased': [0, 1, 1, 1],  # Converted to numeric for simplicity
    }
)

# Target vector first, then the feature matrix (every column but the target).
y = data['Purchased'].to_numpy()
X = data.drop(columns='Purchased').to_numpy()

# Instantiate the model and train it on the full dataset.
model = DecisionTreeClassifier(max_depth=3)
model.fit(X, y)

# Predict on the training rows themselves.
predictions = model.predict(X)
print("Predictions:", predictions)

# Evaluation: accuracy is the fraction of predictions equal to the true label.
accuracy = (predictions == y).sum() / len(y)
print("Accuracy:", accuracy)


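# Captured output of the cell above:
#
# Predictions: [0 1 1 1]
# Accuracy: 1.0
#
# Source line echoed in the captured output (presumably the tail of a
# warning message whose header was not preserved):
#
#   left_gini = 1.0 - sum([(np.sum(left_labels == c) / len(left_labels)) ** 2 for c in np.unique(labels)])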
