In [1]:
import pandas as pd
import math

# Load the dataset from 'data.csv' (the path is resolved relative to the
# current working directory) into a DataFrame. The rest of this script uses
# the 'buys_computer' column as the target and every other column as a
# candidate split feature.
df = pd.read_csv('data.csv')

def entropy(probabilities):
    """Return the Shannon entropy, in bits, of a probability distribution.

    Args:
        probabilities: Iterable of probabilities (for example the result of
            ``Series.value_counts(normalize=True)``).

    Returns:
        The negated sum of ``p * log2(p)`` over all non-zero probabilities.
        Zero probabilities contribute nothing, so an empty or all-zero input
        yields 0.
    """
    # log2(0) is undefined; by convention 0 * log2(0) is treated as 0, so
    # zero entries are simply left out of the sum.
    terms = [prob * math.log2(prob) for prob in probabilities if prob != 0]
    return -sum(terms)

def information_gain(data, split_attribute_name, target_name):
    """Return the information gain obtained by splitting on one attribute.

    Args:
        data: DataFrame holding both the split attribute and the target.
        split_attribute_name: Column whose distinct values define the split.
        target_name: Column containing the class labels.

    Returns:
        Entropy of the target over all of ``data`` minus the size-weighted
        average entropy of the target within each partition induced by
        ``split_attribute_name``.
    """
    def _target_entropy(frame):
        # Entropy of the class-label distribution within `frame`.
        return entropy(frame[target_name].value_counts(normalize=True))

    # Entropy before splitting.
    baseline = _target_entropy(data)

    # Expected entropy after splitting: each partition is weighted by the
    # fraction of rows it contains.
    n_rows = len(data)
    split_column = data[split_attribute_name]
    remainder = 0
    for level in split_column.unique():
        partition = data[split_column == level]
        fraction = len(partition) / n_rows
        remainder += fraction * _target_entropy(partition)

    return baseline - remainder

def build_decision_tree(data, original_data, features, target_attribute_name, parent_node_class=None):
    """Recursively build an ID3-style decision tree as nested dictionaries.

    Args:
        data: DataFrame with the rows that reach the current node.
        original_data: Not read by this function; kept so existing callers
            that pass it positionally keep working.
        features: Names of the columns still available for splitting.
        target_attribute_name: Name of the column holding the class labels.
        parent_node_class: Most common target value at the parent node. It is
            returned as the prediction when ``data`` is empty.

    Returns:
        A leaf (a single target value) or a dictionary of the form
        ``{feature: {feature_value: subtree, ...}}``.
    """
    # An empty node has no labels of its own, so fall back to the parent's
    # majority class. This must be checked first: indexing the unique labels
    # of an empty subset would raise IndexError.
    if len(data) == 0:
        return parent_node_class

    target_values = data[target_attribute_name]

    # If all target values are the same, the node is pure: return that value.
    labels = target_values.unique()
    if len(labels) <= 1:
        return labels[0]

    # Majority class of the rows at THIS node. It is the prediction when no
    # features are left, and the fallback handed down to the children.
    majority_class = target_values.mode()[0]

    # No attributes left to split on: predict this node's own majority class
    # rather than the parent's.
    if len(features) == 0:
        return majority_class

    # Determine the best splitting criterion (attribute with the highest gain).
    information_gains = {
        feature: information_gain(data, feature, target_attribute_name)
        for feature in features
    }
    best_feature = max(information_gains, key=information_gains.get)

    # Create the tree structure
    tree = {best_feature: {}}

    # Each attribute is used at most once along any root-to-leaf path.
    remaining_features = [f for f in features if f != best_feature]

    # Expand the tree by recursing into every observed value of the attribute.
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        tree[best_feature][value] = build_decision_tree(
            subset, data, remaining_features, target_attribute_name, majority_class
        )

    return tree

# Define the target attribute name: the column of df holding the class label
# that the tree predicts.
target_attribute_name = 'buys_computer'

# Get the list of features (excluding the target attribute).
# list.remove raises ValueError if the target column is not present in df.
features = df.columns.tolist()
features.remove(target_attribute_name)

# Build the decision tree. df is passed both as the data to split and as
# original_data; no parent class is supplied for the root node.
decision_tree = build_decision_tree(df, df, features, target_attribute_name)

# Function to print the decision tree
def print_decision_tree(tree, indent=""):
    """Print a nested-dictionary decision tree, one node per line.

    Internal nodes (attribute names and attribute values) are printed on
    their own line, with children indented by two further spaces. A leaf is
    printed as ``key -> label`` on the line of the key that leads to it.

    Args:
        tree: Nested dictionaries of the form
            ``{feature: {value: subtree_or_label}}``, or a bare label when
            the whole tree is a single leaf.
        indent: Prefix written before every line at the current depth.
    """
    # A tree built from data with a single class is just a bare label.
    if not isinstance(tree, dict):
        print(f"{indent}{tree}")
        return

    for attribute, children in tree.items():
        if isinstance(children, dict):
            print(f"{indent}{attribute}")
            print_decision_tree(children, indent + "  ")
        else:
            # Format instead of concatenating so that non-string feature
            # values and labels (ints, bools, None) print without TypeError.
            print(f"{indent}{attribute} -> {children}")

# Print the decision tree built above to standard output, starting with no
# indentation at the root.
print_decision_tree(decision_tree)

credit_rating
  fair
    age
      <=30
        student
          no
            income
              high -> no
              medium -> no
              low -> no
          yes
            income
              low -> yes
              high -> yes
              medium -> yes
      30..40
        income
          high
            student
              no -> yes
              yes -> yes
          low
            student
              yes -> no
      >40
        income
          medium
            student
              no -> yes
              yes -> yes
          low -> yes
          high -> no
  excellent
    income
      high
        age
          <=30 -> no
          >40
            student
              no -> no
      low
        age
          >40
            student
              yes -> no
              no -> yes
          30..40
            student
              yes -> no
          <=30 -> no
      medium
        age
          <=30
            student
              yes -> yes
      