In [1]:
import numpy as np
import pandas as pd

# Gini impurity of the class labels (used to score classification splits)
def gini_impurity(data):
    """Return the Gini impurity of the labels stored in the last column.

    Args:
        data: DataFrame whose final column contains the class labels.

    Returns:
        ``1 - sum(p_k ** 2)`` where ``p_k`` is the count of class ``k``
        divided by the number of rows in ``data``.
    """
    labels = data.iloc[:, -1]
    proportions = labels.value_counts() / len(data)
    sum_of_squares = np.sum(proportions * proportions)
    return 1 - sum_of_squares

# Search one feature for the threshold with the lowest weighted Gini impurity
def best_split(data, feature):
    """Find the best binary split of ``data`` on a single feature.

    Every distinct value of the feature is tried as a threshold: rows with
    ``feature <= value`` go left, rows with ``feature > value`` go right.
    Candidates that leave either side empty are ignored.

    Args:
        data: DataFrame whose final column contains the class labels.
        feature: Name of the column to split on.

    Returns:
        A tuple ``(threshold, left, right)`` for the candidate with the
        lowest weighted Gini impurity (the first one wins on ties), or
        ``(None, None, None)`` when no candidate produces two non-empty
        sides.
    """
    column = data[feature]
    total_rows = len(data)
    lowest_impurity = float('inf')
    winner = (None, None, None)

    for candidate in column.unique():
        below = data[column <= candidate]
        above = data[column > candidate]

        # A partition with an empty side is not a usable split.
        if not (len(below) and len(above)):
            continue

        # Impurity of each side, weighted by its share of the rows.
        weighted = (
            (len(below) / total_rows) * gini_impurity(below)
            + (len(above) / total_rows) * gini_impurity(above)
        )

        if weighted < lowest_impurity:
            lowest_impurity = weighted
            winner = (candidate, below, above)

    return winner

# CART algorithm to build a binary tree
def cart(data, features, depth=1, max_depth=5):
    """Recursively build a binary decision tree from Gini-based splits.

    Args:
        data: Non-empty DataFrame whose final column contains the class
            labels.
        features: Names of the columns still available for splitting.
        depth: Depth of the node being built (the root call uses 1).
        max_depth: Depth at which recursion stops and a leaf is returned.

    Returns:
        Either a class label (a leaf) or a nested dict of the form
        ``{feature: {'threshold': t, 'left': subtree, 'right': subtree}}``.
        ``left`` is built from the rows with ``feature <= t`` and ``right``
        from the rows with ``feature > t``. A feature chosen at a node is
        not offered to that node's descendants.
    """
    labels = data.iloc[:, -1]

    # Pure node: every row carries the same class.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # Depth budget exhausted or nothing left to split on: majority vote.
    if depth >= max_depth or len(features) == 0:
        return labels.mode()[0]

    # Find the feature whose best threshold gives the lowest weighted Gini.
    total_rows = len(data)
    best_feature = None
    best_gini = float('inf')
    best_threshold = None
    left_split = None
    right_split = None

    for feature in features:
        threshold, left, right = best_split(data, feature)

        # best_split returns a None threshold when the feature cannot
        # separate the rows into two non-empty groups.
        if threshold is None:
            continue

        gini = (
            (len(left) / total_rows) * gini_impurity(left)
            + (len(right) / total_rows) * gini_impurity(right)
        )

        if gini < best_gini:
            best_gini = gini
            best_feature = feature
            best_threshold = threshold
            left_split = left
            right_split = right

    # No feature can split this node (e.g. rows with identical feature
    # values but different labels). Recursing on the None partitions would
    # crash, so emit a majority-class leaf instead.
    if best_feature is None:
        return labels.mode()[0]

    remaining = [f for f in features if f != best_feature]

    return {
        best_feature: {
            'threshold': best_threshold,
            'left': cart(left_split, remaining, depth + 1, max_depth),
            'right': cart(right_split, remaining, depth + 1, max_depth),
        }
    }

# Toy weather dataset for demonstration (replace with your own data).
# The class label column (PlayTennis) must be the last column.
data = pd.DataFrame(
    {
        'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Rainy', 'Overcast', 'Sunny', 'Sunny', 'Rainy'],
        'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Mild', 'Mild', 'Cool', 'Mild'],
        'Humidity': ['High', 'High', 'High', 'High', 'High', 'Low', 'Low', 'Low', 'Low', 'High'],
        'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
        'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No'],
    }
)

# Every column except the trailing label column is a candidate feature.
features = list(data.columns[:-1])

# Grow the tree with the depth capped at 3, then display it.
tree = cart(data, features, max_depth=3)
print("Decision Tree:", tree, sep="\n")


Decision Tree:
{'Outlook': {'threshold': 'Overcast', 'left': 'Yes', 'right': {'Temperature': {'threshold': 'Hot', 'left': 'No', 'right': 'Yes'}}}}
