In [3]:
import pandas as pd
import numpy as np
from statistics import mode
from collections import defaultdict

def build_decision_tree(df: pd.DataFrame) -> dict:
    """Recursively build a decision tree from ``df``.

    The last column of ``df`` is treated as the target; every other column
    is a candidate feature to split on.

    Returns either a leaf (a single target value) or a dict-like node with
    a ``"column_name"`` entry naming the feature that is split on, plus one
    entry per distinct value of that feature mapping to the sub-tree built
    from the matching rows.
    """
    target = df.iloc[:, -1]

    # If all values in the target column are the same, return that value as a leaf node
    if len(np.unique(target)) == 1:
        return df.iloc[0, -1]

    # If there are no columns left except the target, return the most common target value
    if len(df.columns) == 1:
        return mode(target)

    split_col = best_split_col(df)

    # best_split_col returns None when no remaining column gives a positive
    # impurity reduction (for example rows with identical feature values but
    # different labels). Indexing df[None] would raise KeyError, so stop here
    # and emit a majority-vote leaf instead.
    if split_col is None:
        return mode(target)

    # defaultdict is kept so the type and repr of returned nodes stay the
    # same for existing callers.
    tree = defaultdict(dict)
    tree["column_name"] = split_col

    for value in np.unique(df[split_col]):
        # The chosen column is dropped so it cannot be selected again deeper
        # in this branch.
        subset = df[df[split_col] == value].drop(columns=[split_col])
        tree[value] = build_decision_tree(subset)

    return tree

def predict(tree: dict, data: dict) -> str:
    """Walk ``tree`` using the feature values in ``data`` and return the leaf reached.

    At each dict node the feature named by ``"column_name"`` is looked up in
    ``data`` and the matching branch is followed. Returns ``"Unknown"`` when
    the node has no branch for that feature value.
    """
    missing = object()  # sentinel: distinguishes "no branch" from any stored leaf
    current = tree
    while True:
        if not isinstance(current, dict):
            # Anything that is not a dict is a leaf, i.e. the prediction
            return current
        feature = current["column_name"]
        branch = current.get(data[feature], missing)
        if branch is missing:
            return "Unknown"  # For cases where the value does not exist in the tree
        current = branch

def best_split_col(df: pd.DataFrame) -> str:
    """Return the feature column whose split gives the largest Gini reduction.

    Every column except the last (the target) is a candidate. Only numeric,
    object, string and categorical columns are considered. Returns ``None``
    when no eligible column yields a strictly positive reduction.
    """
    best_col, best_delta = None, 0

    for col in df.columns[:-1]:
        column = df[col]
        # Comparing the dtype to 'object' alone misses text columns stored
        # with pandas' dedicated string dtype and columns of dtype
        # 'category', so those are accepted explicitly as well.
        eligible = (
            pd.api.types.is_numeric_dtype(column)
            or pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
            or isinstance(column.dtype, pd.CategoricalDtype)
        )
        if not eligible:
            continue

        delta = gini_reduction(df, col)
        # Strict comparison: ties keep the earlier column, and a column with
        # zero gain is never selected.
        if delta > best_delta:
            best_delta, best_col = delta, col

    return best_col

def gini_impurity(df: pd.DataFrame) -> float:
    """Return the Gini impurity of the last column of ``df``.

    Computed as one minus the sum of the squared relative frequencies of the
    distinct values in that column.
    """
    labels = df.iloc[:, -1]
    class_shares = labels.value_counts(normalize=True)
    squared_total = sum(share * share for share in class_shares)
    return 1 - squared_total

def gini_reduction(df: pd.DataFrame, col: str) -> float:
    """Return how much splitting ``df`` on ``col`` lowers the Gini impurity.

    The result is the impurity of ``df`` minus the weighted impurity of the
    subsets obtained by grouping rows on each distinct value of ``col``,
    where each subset is weighted by its share of the rows.
    """
    impurity_before = gini_impurity(df)

    n_rows = len(df)
    categories, counts = np.unique(df[col], return_counts=True)

    def branch_term(category, count):
        # Impurity of the rows in this branch, scaled by the branch's share of rows
        return (count / n_rows) * gini_impurity(df[df[col] == category])

    impurity_after = sum(
        branch_term(category, count)
        for category, count in zip(categories, counts)
    )
    return impurity_before - impurity_after

# Read the training data and fit a decision tree to it
path = "data09_1.csv"
df = pd.read_csv(path)
tree = build_decision_tree(df)

# Classify a single observation, given as a mapping of column name to value
test = {
    "Outlook": "Sunny",
    "Temp": "Mild",
    "Humidity": "Normal",
    "Wind": "Weak",
}
prediction = predict(tree, test)

# Show the predicted label followed by the fitted tree
print("Prediction:", prediction)
print("Decision Tree:", tree)


Prediction: Yes
Decision Tree: defaultdict(<class 'dict'>, {'column_name': 'Outlook', 'Overcast': 'Yes', 'Rain': defaultdict(<class 'dict'>, {'column_name': 'Wind', 'Strong': 'No', 'Weak': 'Yes'}), 'Sunny': defaultdict(<class 'dict'>, {'column_name': 'Humidity', 'High': 'No', 'Normal': 'Yes'})})
