In [16]:
import pandas as pd
import numpy as np
import math

# Function to calculate entropy
def calculate_entropy(data):
    """Return the Shannon entropy, in bits, of the last column of *data*.

    The last column is treated as the class label. Every distinct label
    reported by ``value_counts()`` contributes ``-p * log2(p)``, where ``p``
    is that label's count divided by the total number of rows.
    """
    target = data.iloc[:, -1]
    n_rows = len(target)
    # Relative frequency of each distinct label, in value_counts() order.
    proportions = [occurrences / n_rows for occurrences in target.value_counts()]
    return -sum(p * math.log2(p) for p in proportions)

# Function to calculate information gain
def calculate_information_gain(data, feature):
    """Return the information gain obtained by splitting *data* on *feature*.

    The gain is the entropy of the whole frame minus the size-weighted
    entropy of the partitions formed by each distinct value in the
    *feature* column.
    """
    n_rows = len(data)
    column = data[feature]

    # One weighted-entropy term per distinct value, in order of appearance.
    partition_terms = []
    for level in column.unique():
        part = data[column == level]
        partition_terms.append((len(part) / n_rows) * calculate_entropy(part))

    return calculate_entropy(data) - sum(partition_terms)

# Recursive ID3 algorithm
def id3(data, features):
    """Recursively build an ID3 decision tree from *data*.

    Args:
        data: DataFrame whose last column holds the class label. It must
            contain at least one row.
        features: Names of the columns that may still be used for splitting.

    Returns:
        Either a class label (a leaf) or a nested dict of the form
        ``{feature: {value: subtree_or_label, ...}}``.
    """
    labels = data.iloc[:, -1]

    # Pure node: every row carries the same label.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # Nothing left to split on: fall back to the most frequent label.
    if not features:
        return labels.mode()[0]

    gains = {feature: calculate_information_gain(data, feature) for feature in features}
    best_feature = max(gains, key=gains.get)

    # No feature separates the labels any further: majority label.
    if gains[best_feature] <= 0:
        return labels.mode()[0]

    # Identical for every branch, so compute it once instead of per value.
    remaining_features = [f for f in features if f != best_feature]

    branches = {}
    # Missing values never satisfy ``== value``; without dropna() they would
    # yield an empty subset and the recursive call would fail at mode()[0].
    for value in data[best_feature].dropna().unique():
        subset = data[data[best_feature] == value]
        branches[value] = id3(subset.drop(columns=[best_feature]), remaining_features)

    return {best_feature: branches}

# Prediction function
def predict(tree, instance, default=None):
    """Classify *instance* by walking *tree* from the root to a leaf.

    Args:
        tree: A class label (leaf) or a nested dict of the form
            ``{feature: {value: subtree_or_label, ...}}``.
        instance: Mapping-like object (dict, pandas Series, ...) indexed by
            feature name.
        default: Value returned when the instance carries a feature value
            that has no branch in the tree. Pass the majority class here to
            get a majority-class fallback; the default of ``None`` keeps the
            previous behaviour.

    Returns:
        The label stored at the leaf that was reached, or *default* when
        the walk hits a feature value with no matching branch.
    """
    node = tree
    while isinstance(node, dict):
        root_feature = next(iter(node))
        branches = node[root_feature]
        feature_value = instance[root_feature]
        if feature_value not in branches:
            # Unseen value: the tree stores no class statistics, so the
            # fallback has to be supplied by the caller via ``default``.
            return default
        node = branches[feature_value]
    return node

# Accuracy function
def calculate_accuracy(tree, data):
    """Return the fraction of rows in *data* that *tree* classifies correctly.

    Args:
        tree: Decision tree in the form accepted by ``predict``.
        data: DataFrame whose last column holds the true class label.

    Returns:
        A float between 0 and 1. An empty *data* yields ``0.0`` rather than
        raising ``ZeroDivisionError``.
    """
    total = len(data)
    if total == 0:
        # No rows to score; avoid dividing by zero.
        return 0.0

    # A prediction of None (unseen feature value) never equals a label,
    # so such rows simply count as misses.
    correct_predictions = sum(
        1 for _, row in data.iterrows() if predict(tree, row) == row.iloc[-1]
    )
    return correct_predictions / total

# Example dataset
features = ["Region", "Vehicle_Type", "Battery_Health", "Usage_Type"]

# One tuple per vehicle: the four feature values in the order listed in
# `features`, followed by the Buy_Recommend label.
ev_simple = pd.DataFrame(
    [
        ("Asia", "SUV", "High", "Personal", "Yes"),
        ("Europe", "Sedan", "Medium", "Fleet", "Yes"),
        ("North America", "Hatchback", "High", "Commercial", "No"),
        ("Australia", "SUV", "Low", "Personal", "Yes"),
        ("Asia", "Truck", "Medium", "Fleet", "No"),
        ("Europe", "Sedan", "High", "Fleet", "Yes"),
        ("Australia", "SUV", "Low", "Commercial", "No"),
        ("North America", "Truck", "Medium", "Personal", "Yes"),
        ("Asia", "Hatchback", "High", "Fleet", "Yes"),
        ("Europe", "SUV", "Low", "Commercial", "No"),
    ],
    columns=features + ["Buy_Recommend"],
)

# Fit the tree on every row; no hold-out split is made.
decision_tree = id3(ev_simple, features)

# Score the tree on the same rows it was trained on, so this figure is a
# training accuracy rather than an estimate on unseen data.
accuracy = calculate_accuracy(decision_tree, ev_simple)

print("Decision Tree:", decision_tree)
print("Accuracy:", round(accuracy*100, 2), "%")


Decision Tree: {'Usage_Type': {'Personal': 'Yes', 'Fleet': {'Vehicle_Type': {'Sedan': 'Yes', 'Truck': 'No', 'Hatchback': 'Yes'}}, 'Commercial': 'No'}}
Accuracy: 100.0 %
