In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from statistics import mode

# Decision tree builder function
def build_decision_tree(df: pd.DataFrame) -> dict:
    """Recursively build a decision tree from a data frame.

    The last column of ``df`` is treated as the target; every other column
    is a candidate feature to split on. Features are compared by equality,
    so they are handled as categorical values.

    Args:
        df: Training rows, with the target in the last column.

    Returns:
        A leaf label (a single target value) when no further split is
        possible or needed, otherwise a dict node of the form
        ``{"column_name": <feature>, <feature value>: <subtree or label>, ...}``.
    """
    target = df.iloc[:, -1]

    # Pure node: every row has the same label, so that label is the leaf.
    if len(np.unique(target)) == 1:
        return df.iloc[0, -1]

    # No features left to split on: fall back to the most common label.
    if len(df.columns) == 1:
        return mode(target)

    # Get the best column to split on
    split_col = best_split_col(df)

    # best_split_col returns None when no remaining feature has a positive
    # gain ratio (e.g. every feature is constant in this subset, or none of
    # them reduces entropy). Indexing df[None] would raise KeyError, so stop
    # splitting here and emit a majority-vote leaf instead.
    if split_col is None:
        return mode(target)

    # Initialize the tree structure
    tree = defaultdict(dict)
    tree["column_name"] = split_col

    # One branch per observed value; the split column is dropped so it is
    # never reused further down the same path.
    for value in np.unique(df[split_col]):
        subset = df[df[split_col] == value].drop(columns=[split_col])
        tree[value] = build_decision_tree(subset)

    return tree

# Prediction function
def predict(root: dict, data: dict) -> str:
    """Walk the tree from ``root`` and return the label reached for ``data``.

    Args:
        root: A tree node (dict with a ``"column_name"`` entry plus one entry
            per feature value) or a bare leaf label.
        data: Mapping from feature name to the sample's value for it.

    Returns:
        The leaf label reached, or ``"Unknown"`` when the sample's value for
        the current feature has no branch in the tree.
    """
    current = root
    while True:
        # Anything that is not a dict is a leaf label.
        if not isinstance(current, dict):
            return current
        feature = current["column_name"]
        # Unseen feature values fall through to the "Unknown" leaf.
        current = current.get(data[feature], "Unknown")

# Determine the best split column based on information gain ratio
def best_split_col(df: pd.DataFrame) -> str:
    """Pick the feature column with the highest information gain ratio.

    Every column except the last (the target) is a candidate. A column whose
    split information is not positive is skipped, which also avoids dividing
    by zero.

    Args:
        df: Data frame with the target in the last column.

    Returns:
        The name of the winning column, or ``None`` when no candidate has a
        gain ratio strictly greater than zero.
    """
    winner = None
    top_ratio = 0

    for feature in df.columns[:-1]:
        gain = information_gain(df, feature)
        intrinsic = split_info(df, feature)

        # Skip columns that cannot be used as a divisor.
        if not intrinsic > 0:
            continue

        candidate = gain / intrinsic
        if candidate > top_ratio:
            winner, top_ratio = feature, candidate

    return winner

# Entropy calculation function
def entropy(df: pd.DataFrame) -> float:
    """Return the Shannon entropy (base 2) of the last column of ``df``.

    Args:
        df: Data frame whose last column holds the class labels.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label.
    """
    n_rows = len(df)
    _, class_counts = np.unique(df.iloc[:, -1], return_counts=True)

    acc = 0
    for n_in_class in class_counts:
        p = n_in_class / n_rows
        acc += p * np.log2(p)
    return -acc

# Information gain calculation function
def information_gain(df: pd.DataFrame, col: str) -> float:
    """Return the entropy reduction obtained by splitting ``df`` on ``col``.

    Args:
        df: Data frame with the target in the last column.
        col: Name of the feature column to split on.

    Returns:
        The entropy of ``df`` minus the size-weighted entropy of the subsets
        produced by each distinct value of ``col``.
    """
    parent_entropy = entropy(df)

    values, freqs = np.unique(df[col], return_counts=True)
    children_entropy = 0
    for value, freq in zip(values, freqs):
        # Weight each branch by the share of rows it receives.
        branch = df[df[col] == value]
        children_entropy += (freq / len(df)) * entropy(branch)

    return parent_entropy - children_entropy

# Split information calculation function
def split_info(df: pd.DataFrame, col: str) -> float:
    """Return the split information (base-2 entropy of the values) of ``col``.

    Args:
        df: Data frame containing ``col``.
        col: Name of the column whose value distribution is measured.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value in ``col``.
    """
    n_rows = len(df)
    _, branch_sizes = np.unique(df[col], return_counts=True)

    terms = []
    for size in branch_sizes:
        share = size / n_rows
        terms.append(share * np.log2(share))
    return -sum(terms)

# Load the training data and build the tree. build_decision_tree treats the
# last CSV column as the target and every preceding column as a feature.
path = "data09_1.csv"
df = pd.read_csv(path)
tree = build_decision_tree(df)

# Test sample for prediction: one value per feature name.
# NOTE(review): assumes the CSV's feature columns are named
# Outlook/Temp/Humidity/Wind — confirm against the data file.
test = {"Outlook": "Sunny", "Temp": "Mild", "Humidity": "Normal", "Wind": "Weak"}
result = predict(tree, test)

# Display results. dict(tree) converts only the top-level defaultdict to a
# plain dict; nested subtrees still print as defaultdict.
print("Prediction:", result)
print("Decision Tree:", dict(tree))


Prediction: Yes
Decision Tree: {'column_name': 'Outlook', 'Overcast': 'Yes', 'Rain': defaultdict(<class 'dict'>, {'column_name': 'Wind', 'Strong': 'No', 'Weak': 'Yes'}), 'Sunny': defaultdict(<class 'dict'>, {'column_name': 'Humidity', 'High': 'No', 'Normal': 'Yes'})}
