In [187]:
import pandas as pd
import numpy as np
import math

In [188]:
def entropy(data):
    """Return the Shannon entropy, in bits, of the ``'Decision'`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'Decision'`` column holding the class labels.

    Returns
    -------
    number
        ``-sum(p * log2(p))`` over the distinct labels, where ``p`` is the
        share of rows carrying that label. An empty frame yields ``0``.
    """
    n_rows = len(data)
    class_counts = data['Decision'].value_counts()

    bits = 0
    # value_counts() only reports labels that occur, so every share is > 0
    # and log2 is always defined here.
    for count in class_counts.to_numpy():
        share = count / n_rows
        bits -= share * math.log2(share)
    return bits

In [189]:
def gain_ratio(data, feature):
    """Compute the C4.5 gain ratio obtained by splitting ``data`` on ``feature``.

    The gain ratio is ``(H(data) - H(data | feature)) / split_info`` where
    ``H`` is the entropy of the ``'Decision'`` column and ``split_info`` is the
    entropy of the partition sizes induced by ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``feature`` and a ``'Decision'`` column.
    feature : str
        Name of the column to evaluate as a split.

    Returns
    -------
    number
        The gain ratio, or ``0`` when the split information is zero (the
        feature has at most one non-missing value, or ``data`` is empty).

    Notes
    -----
    Missing values (NaN/None) of ``feature`` are not treated as a branch.
    Rows carrying them add nothing to the conditional entropy or the split
    information, but they still count in the denominator ``len(data)``.
    """
    total_rows = len(data)
    split_entropy = 0
    intrinsic_info = 0

    # NaN never compares equal to anything, so treating it as a value would
    # select an empty subset, give prob == 0 and make math.log2(0) raise
    # "math domain error". Drop missing values before enumerating branches.
    for value in data[feature].dropna().unique():
        subset = data[data[feature] == value]
        prob = len(subset) / total_rows
        split_entropy += prob * entropy(subset)
        intrinsic_info -= prob * math.log2(prob)

    # A zero split information means the feature does not partition the data;
    # return early to avoid dividing by zero.
    if intrinsic_info == 0:
        return 0

    # The total entropy is only needed once we know the ratio is defined.
    return (entropy(data) - split_entropy) / intrinsic_info

In [190]:
def find_best_split(data):
    """Pick the feature with the highest strictly positive gain ratio.

    Every column except the last one is treated as a candidate feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame; the last column is excluded from the candidates.

    Returns
    -------
    The winning column name, or ``None`` when no candidate has a gain ratio
    greater than zero. On ties the earliest column wins.
    """
    winner = None
    top_score = 0

    for column in data.columns[:-1]:
        score = gain_ratio(data, column)
        # Only a strict improvement replaces the current winner.
        if not score > top_score:
            continue
        winner, top_score = column, score

    return winner


In [191]:
def build_c45_tree(data, depth=0, max_depth=3):
    """Recursively build a decision tree using gain ratio as the split criterion.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows; must contain a ``'Decision'`` column with the labels.
    depth : int, default 0
        Depth of the node being built (the root is 0).
    max_depth : int, default 3
        Recursion stops and a leaf is produced once ``depth == max_depth``.

    Returns
    -------
    dict or label
        An internal node is ``{'feature': name, 'subtrees': {value: child}}``.
        A leaf is the most frequent ``'Decision'`` label of the rows that
        reached it (the first entry of ``Series.mode()``).

    Notes
    -----
    Rows whose split feature is missing (NaN/None) are not routed to any
    child, so no branch is created for a missing value.
    """
    labels = data['Decision']

    # Stop when the depth budget is exhausted or the node is already pure.
    if depth == max_depth or labels.nunique() == 1:
        return labels.mode().iloc[0]

    best_feature = find_best_split(data)
    if best_feature is None:
        # No feature yields a positive gain ratio: fall back to the majority label.
        return labels.mode().iloc[0]

    column = data[best_feature]
    subtrees = {}
    # NaN never equals itself, so a NaN branch would receive an empty subset
    # and the recursive call would fail on mode().iloc[0]. Skip missing values.
    for value in column.dropna().unique():
        subtrees[value] = build_c45_tree(data[column == value], depth + 1, max_depth)

    return {'feature': best_feature, 'subtrees': subtrees}

In [192]:
# Load the training set, drop the "Day" column, and discretise the two numeric
# columns into three labelled bands each. pd.cut uses right-closed intervals by
# default, i.e. (0, 70], (70, 80], (80, 100]; values outside them become NaN.
data = (
    pd.read_csv("../datasets/tree.csv")
    .drop("Day", axis = 1)
    .assign(
        Temp=lambda frame: pd.cut(x=frame["Temp"], bins = [0,70,80,100], labels=["Cool", "Mild","Hot"]),
        Humidity=lambda frame: pd.cut(x=frame["Humidity"], bins = [0,70,80,100], labels=["Low", "Normal","High"]),
    )
)

# Fit the tree on the discretised frame, limiting it to three levels.
c45_tree = build_c45_tree(data, max_depth=3)

In [193]:
def classify(instance, tree, default=None):
    """Predict the class of ``instance`` by walking ``tree`` from the root.

    Parameters
    ----------
    instance : mapping
        Maps feature names to values; must contain every feature that is
        tested on the path taken through the tree.
    tree : dict or label
        Either an internal node of the form
        ``{'feature': name, 'subtrees': {value: child}}`` or a leaf label.
    default : label, optional
        Label returned when the instance carries a feature value that has no
        branch in the tree. When omitted, the most frequent ``'Decision'``
        label of the module-level ``data`` frame is used instead.

    Returns
    -------
    The predicted class label.

    Raises
    ------
    KeyError
        If ``instance`` lacks a feature tested along the path.
    """
    # Anything that is not an internal-node dict is a leaf label; checking for
    # dict (rather than str) keeps numeric or boolean labels working.
    if not isinstance(tree, dict):
        return tree

    feature = tree['feature']
    value = instance[feature]
    branches = tree['subtrees']

    if value not in branches:
        if default is not None:
            return default
        # The target column is 'Decision', the same column the tree is built on.
        return data['Decision'].mode().iloc[0]

    return classify(instance, branches[value], default)

In [194]:
# The keys must match the training column names exactly: classify() looks up
# each split feature by name, and the temperature column is called 'Temp'.
new_instance = {'Outlook': 'Sunny', 'Temp': 'Mild', 'Humidity': 'Low', 'Wind': 'Weak'}
predicted_class = classify(new_instance, c45_tree)
print("Predicted class:", predicted_class)

Predicted class: Yes


In [195]:
def print_tree(tree, indent=""):
    """Pretty-print a decision tree to standard output.

    Parameters
    ----------
    tree : dict or label
        An internal node ``{'feature': name, 'subtrees': {value: child}}`` or
        a leaf label.
    indent : str, default ""
        Prefix written before every line of this node; each nesting level
        adds four spaces.
    """
    # Leaves are anything that is not an internal-node dict.
    if not isinstance(tree, dict):
        print(indent + f"Class: {tree}")
        return

    print(indent + f"Feature: {tree['feature']}")
    child_indent = indent + "    "
    for branch_value, child in tree['subtrees'].items():
        print(indent + f"  Value: {branch_value}")
        print_tree(child, child_indent)

In [196]:
# Show the fitted tree as an indented Feature / Value / Class outline.
print_tree(c45_tree)

Feature: Outlook
  Value: Sunny
    Feature: Humidity
      Value: High
        Class: No
      Value: Low
        Class: Yes
  Value: Overcast
    Class: Yes
  Value: Rain
    Feature: Wind
      Value: Weak
        Class: Yes
      Value: Strong
        Class: No
