# Decision Trees

What is Gini Impurity?
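Gini Impurity is a measure of how often a randomly chosen element from a set would be incorrectly labeled if it were randomly labeled according to the distribution of labels in the subset. For a two-class problem it ranges from 0 to 0.5 (in general, with K classes, the maximum is 1 - 1/K, e.g. about 0.667 for the three Iris classes):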

0: The set is perfectly pure (all elements belong to the same class).

0.5: The set is maximally impure for two classes (elements are evenly distributed across both classes); with K classes the maximum is 1 - 1/K.



In [2]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Helper function to calculate Gini Impurity
def gini_impurity(labels):
    """
    Return the Gini Impurity of a collection of class labels.

    The impurity is 1 - sum(p_k^2), where p_k is the fraction of labels that
    belong to class k. An empty collection yields 0.
    """
    n_labels = len(labels)
    if n_labels == 0:
        return 0

    # Only the per-class counts are needed; the class values are discarded.
    class_counts = np.unique(labels, return_counts=True)[1]
    class_fractions = class_counts / n_labels
    return 1 - np.sum(class_fractions ** 2)

# Function to find the best split based on Gini Impurity
def find_best_split(X, y):
    """
    Search all features and all observed values for the lowest-impurity split.

    A candidate threshold t on feature f sends rows with X[:, f] <= t to the
    left and rows with X[:, f] > t to the right. Each candidate is scored by
    the size-weighted Gini Impurity of the two sides.

    Returns (feature_index, threshold) for the best candidate, or
    (None, None) when no candidate leaves both sides non-empty. On ties the
    first candidate encountered is kept.
    """
    n_samples = len(y)
    lowest_gini = np.inf
    chosen_feature, chosen_threshold = None, None

    for col in range(X.shape[1]):
        column = X[:, col]

        # Every distinct observed value is a candidate threshold.
        for threshold in np.unique(column):
            left_y = y[column <= threshold]
            right_y = y[column > threshold]

            # A split that leaves one side empty separates nothing.
            if not (len(left_y) and len(right_y)):
                continue

            # Each side contributes in proportion to its share of the samples.
            score = (
                (len(left_y) / n_samples) * gini_impurity(left_y)
                + (len(right_y) / n_samples) * gini_impurity(right_y)
            )

            if score < lowest_gini:
                lowest_gini = score
                chosen_feature, chosen_threshold = col, threshold

    return chosen_feature, chosen_threshold

# Class for Decision Tree Node
class TreeNode:
    """
    A single node of the Decision Tree.

    A node whose ``label`` is not None is treated as a leaf by ``predict``.
    Otherwise it is a decision node: samples with
    ``x[feature] <= split_value`` go to ``left`` and the rest go to ``right``.
    """

    def __init__(self, feature=None, split_value=None, left=None, right=None, label=None):
        self.feature = feature  # Feature index used for splitting
        self.split_value = split_value  # Threshold value for splitting
        self.left = left  # Left child node
        self.right = right  # Right child node
        self.label = label  # Label for leaf nodes

    def __repr__(self):
        # Children are deliberately omitted so printing a root node does not
        # dump the whole tree.
        if self.label is not None:
            return f"TreeNode(label={self.label!r})"
        return f"TreeNode(feature={self.feature!r}, split_value={self.split_value!r})"

# Recursive function to build the Decision Tree
def build_tree(X, y, max_depth, current_depth=0):
    """
    Recursively build the Decision Tree using Gini Impurity.

    Parameters:
        X: 2-D array-like of feature values, one row per sample.
        y: 1-D array-like of class labels, one entry per row of X.
        max_depth: depth at which growth stops and a leaf is produced.
        current_depth: depth of the node being built (0 for the root).

    Returns:
        A TreeNode. It is a leaf carrying the majority label when the depth
        limit is reached, when all labels are identical, or when no valid
        split exists. Otherwise it is a decision node with two subtrees.

    Raises:
        ValueError: if y contains no samples.
    """
    # Accept plain Python sequences as well as NumPy arrays; the column and
    # boolean-mask indexing below requires ndarrays.
    X = np.asarray(X)
    y = np.asarray(y)

    # Fail with a clear message instead of NumPy's "argmax of an empty
    # sequence" error when there is nothing to learn from.
    if len(y) == 0:
        raise ValueError("Cannot build a decision tree from an empty dataset")

    class_labels, class_counts = np.unique(y, return_counts=True)

    # Only search for a split while depth remains and the node is impure.
    best_feature = None
    if current_depth < max_depth and len(class_labels) > 1:
        best_feature, best_split_value = find_best_split(X, y)

    # No split attempted, or none found: emit a leaf with the majority label.
    if best_feature is None:
        return TreeNode(label=class_labels[np.argmax(class_counts)])

    # Split the dataset
    left_mask = X[:, best_feature] <= best_split_value
    right_mask = X[:, best_feature] > best_split_value

    # Recursively build left and right subtrees
    left_child = build_tree(X[left_mask], y[left_mask], max_depth, current_depth + 1)
    right_child = build_tree(X[right_mask], y[right_mask], max_depth, current_depth + 1)

    # Return a decision node
    return TreeNode(feature=best_feature, split_value=best_split_value, left=left_child, right=right_child)

# Function to predict using the Decision Tree
def predict(tree, x):
    """
    Return the class label the Decision Tree assigns to one data point.

    Starting at ``tree``, descend left when x[feature] <= split_value and
    right otherwise, until a node with a label (a leaf) is reached.
    """
    node = tree
    while node.label is None:  # Still at a decision node
        goes_left = x[node.feature] <= node.split_value
        node = node.left if goes_left else node.right
    return node.label

# Wrapper function to train and use the Decision Tree
def decision_tree_classifier(X_train, y_train, max_depth):
    """
    Fit a Decision Tree on the training data.

    Delegates to build_tree, starting at depth 0, and returns the root node
    of the resulting tree.
    """
    root = build_tree(X_train, y_train, max_depth, current_depth=0)
    return root

# Example Usage with Iris Dataset
if __name__ == "__main__":
    # Load Iris dataset
    iris = load_iris()
    X = iris.data  # Features: sepal length, sepal width, petal length, petal width
    y = iris.target  # Labels: 0 (setosa), 1 (versicolor), 2 (virginica)

    # Hold out 20% of the samples for testing; the fixed seed keeps the split
    # reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the Decision Tree
    max_depth = 3
    tree = decision_tree_classifier(X_train, y_train, max_depth)

    # Predict one test sample at a time (predict handles a single data point)
    predictions = [predict(tree, x) for x in X_test]

    # Evaluate accuracy. Convert the list explicitly so the element-wise
    # comparison does not depend on NumPy's reflected list == ndarray handling.
    accuracy = np.mean(np.asarray(predictions) == y_test)
    print("Predictions:", predictions)
    print("True Labels:", y_test)
    print(f"Accuracy: {accuracy * 100:.2f}%")

Predictions: [np.int64(1), np.int64(0), np.int64(2), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(2), np.int64(1), np.int64(1), np.int64(2), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(2), np.int64(1), np.int64(1), np.int64(2), np.int64(0), np.int64(2), np.int64(0), np.int64(2), np.int64(2), np.int64(2), np.int64(2), np.int64(2), np.int64(0), np.int64(0)]
True Labels: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
Accuracy: 100.00%
