In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
def split_data(data, feature, value):
    """Partition the rows of ``data`` with an equality test on one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to partition.
    feature : str
        Name of the column to test.
    value : object
        Value each row's ``feature`` entry is compared against.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(matching, remaining)``: rows where ``data[feature] == value``
        followed by rows where ``data[feature] != value``.
    """
    column = data[feature]
    matching = data[column == value]
    remaining = data[column != value]
    return matching, remaining


In [3]:
def calculate_gini(data, target='Decision'):
    """Compute the Gini impurity of the class labels in ``data``.

    The impurity is ``1 - sum(p_k ** 2)`` where ``p_k`` is the fraction of
    rows carrying class ``k``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows whose labels are evaluated.
    target : str, default 'Decision'
        Name of the label column. The default keeps existing
        ``calculate_gini(data)`` calls working unchanged.

    Returns
    -------
    float
        ``0.0`` for an empty frame or a frame with a single class; larger
        values indicate a more mixed set of labels.
    """
    total_samples = len(data)
    if total_samples == 0:
        # An empty partition has no impurity to contribute.
        return 0.0

    class_counts = data[target].value_counts()
    gini = 1.0
    # Subtract one squared proportion at a time so the floating-point result
    # stays identical to the previous per-label loop.
    for count in class_counts.to_numpy():
        p = count / total_samples
        gini -= p ** 2

    return gini



In [4]:

def find_best_split(data):
    """Find the ``feature == value`` test with the lowest weighted Gini impurity.

    Every column except the ``'Decision'`` label column is a candidate
    feature, and every distinct value in a candidate column is tried as the
    equality test.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows, including the ``'Decision'`` label column.

    Returns
    -------
    tuple
        ``(best_feature, best_value)``, or ``(None, None)`` when no candidate
        test separates the rows into two non-empty groups.
    """
    total = len(data)
    best_gini = 1.0
    best_feature = None
    best_value = None

    # Exclude the label by name rather than by position: the impurity helper
    # looks the label up by name, so relying on it being the last column
    # would split on the label itself whenever the column order differs.
    candidate_features = [column for column in data.columns if column != 'Decision']

    for feature in candidate_features:
        for value in data[feature].unique():
            left_data, right_data = split_data(data, feature, value)
            n_left, n_right = len(left_data), len(right_data)
            if n_left == 0 or n_right == 0:
                # A test that leaves one side empty does not split anything.
                continue
            weighted_gini = (n_left / total) * calculate_gini(left_data) + (n_right / total) * calculate_gini(right_data)
            # Strict comparison: on ties the first candidate encountered wins.
            if weighted_gini < best_gini:
                best_gini = weighted_gini
                best_feature = feature
                best_value = value

    return best_feature, best_value


In [5]:

def build_cart_tree(data, depth=0, max_depth=100):
    """Recursively grow a binary decision tree over ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows, including the ``'Decision'`` label column.
    depth : int, default 0
        Depth of the node being built.
    max_depth : int, default 100
        Depth at which growth stops and a leaf is emitted.

    Returns
    -------
    dict or label
        An internal node is a dict with keys ``'feature'``, ``'value'``,
        ``'left'`` (rows equal to ``value``) and ``'right'`` (all other
        rows). A leaf is the most frequent ``'Decision'`` label of the rows
        that reached it.
    """
    labels = data['Decision']

    def majority_label():
        # mode() returns the most frequent labels; take the first one.
        return labels.mode().iloc[0]

    # Stop at the depth limit or once every row carries the same label.
    if depth == max_depth or labels.nunique() == 1:
        return majority_label()

    split_feature, split_value = find_best_split(data)
    if split_feature is None:
        # No test separates the rows any further.
        return majority_label()

    matching, remaining = split_data(data, split_feature, split_value)
    child_depth = depth + 1

    return {
        'feature': split_feature,
        'value': split_value,
        'left': build_cart_tree(matching, child_depth, max_depth),
        'right': build_cart_tree(remaining, child_depth, max_depth),
    }

# Build the CART tree

In [6]:

# Load the dataset, drop the "Day" column, and replace the numeric Temp and
# Humidity readings with labelled bins so every feature is categorical.
# pd.cut uses right-closed intervals by default: (0, 70], (70, 80], (80, 100].
data = (
    pd.read_csv('../datasets/tree.csv')
    .drop("Day", axis=1)
    .assign(
        Temp=lambda frame: pd.cut(
            x=frame["Temp"], bins=[0, 70, 80, 100], labels=["Cool", "Mild", "Hot"]
        ),
        Humidity=lambda frame: pd.cut(
            x=frame["Humidity"], bins=[0, 70, 80, 100], labels=["Low", "Normal", "High"]
        ),
    )
)

# Grow the tree on the binned data, limited to three levels of splits.
cart_tree = build_cart_tree(data, max_depth=3)



In [7]:
def classify(instance, tree):
    """Predict the class of ``instance`` by walking a decision tree.

    Parameters
    ----------
    instance : mapping
        Feature name -> feature value for the sample being classified.
    tree : dict or label
        An internal node is a dict with keys ``'feature'``, ``'value'``,
        ``'left'`` and ``'right'``; anything that is not a dict is treated
        as a leaf holding the predicted label.

    Returns
    -------
    label
        The leaf reached by following ``'left'`` whenever
        ``instance[feature] == value`` and ``'right'`` otherwise.

    Raises
    ------
    KeyError
        If ``instance`` lacks a feature that a visited node tests.
    """
    node = tree
    # Leaves are detected as "not a dict" (the same test print_tree uses), so
    # non-string labels such as 0/1 or booleans are returned instead of
    # being indexed like a node.
    while isinstance(node, dict):
        feature, value = node['feature'], node['value']
        node = node['left'] if instance[feature] == value else node['right']
    return node

# Example usage for classifying a new instance.
# Feature values must use the labels produced when the data was binned:
# Temp is one of "Cool"/"Mild"/"Hot" and Humidity is one of
# "Low"/"Normal"/"High". 'Low' is only a Humidity label and would never match
# a Temp test, so the lowest Temp bin, 'Cool', is used here.
new_instance = {'Outlook': 'Sunny', 'Temp': 'Cool', 'Humidity': 'Low', 'Wind': 'Weak'}
predicted_class = classify(new_instance, cart_tree)
print("Predicted class:", predicted_class)

Predicted class: Yes


In [8]:
def print_tree(tree, indent=""):
    """Print an indented text rendering of a decision tree.

    Parameters
    ----------
    tree : dict or label
        An internal node is a dict with keys ``'feature'``, ``'value'``,
        ``'left'`` and ``'right'``; anything else is printed as a leaf.
    indent : str, default ""
        Prefix prepended to every line printed for this node; each level of
        children is indented by four more spaces.

    Notes
    -----
    Both branch headers show the node's split value: the "Left" branch holds
    the rows equal to that value and the "Right" branch holds all others.
    """
    # Guard clause: a non-dict is a leaf.
    if not isinstance(tree, dict):
        print(indent + f"Class: {tree}")
        return

    child_indent = indent + "    "
    print(indent + f"Feature: {tree['feature']}")

    left_branch = tree['left']
    split_value = tree["value"]
    print(indent + f" Left Value: {split_value}")
    print_tree(left_branch, child_indent)

    right_branch = tree['right']
    split_value = tree["value"]
    print(indent + f" Right Value: {split_value}")
    print_tree(right_branch, child_indent)


In [9]:
print(cart_tree)

{'feature': 'Outlook', 'value': 'Overcast', 'left': 'Yes', 'right': {'feature': 'Temp', 'value': 'Cool', 'left': {'feature': 'Wind', 'value': 'Weak', 'left': 'Yes', 'right': 'No'}, 'right': {'feature': 'Humidity', 'value': 'High', 'left': 'No', 'right': 'Yes'}}}


In [10]:
print_tree(cart_tree)

Feature: Outlook
 Left Value: Overcast
    Class: Yes
 Right Value: Overcast
    Feature: Temp
     Left Value: Cool
        Feature: Wind
         Left Value: Weak
            Class: Yes
         Right Value: Weak
            Class: No
     Right Value: Cool
        Feature: Humidity
         Left Value: High
            Class: No
         Right Value: High
            Class: Yes
