In [9]:
import pandas as pd
import math

def entropy(df):
    """Return the base-2 Shannon entropy of the last column of ``df``.

    The last column (``df.keys()[-1]``) is treated as the class label.
    For every distinct label the relative frequency ``p`` is computed and
    ``-p * log2(p)`` is accumulated.

    Args:
        df: A pandas DataFrame whose final column holds the class labels.

    Returns:
        The entropy in bits. ``0`` is returned when ``df`` has no rows,
        because there are no labels to iterate over.
    """
    target = df.keys()[-1]
    labels = df[target]
    # Count every label once up front instead of re-counting the whole
    # column on each loop iteration.
    counts = labels.value_counts()
    total = len(labels)
    entropy_val = 0
    # Iterate in order of first appearance so the floating-point summation
    # order (and therefore the exact result) is stable.
    for value in labels.unique():
        fraction = counts[value] / total
        entropy_val += -fraction * math.log2(fraction)
    return entropy_val

def information_gain(df, attribute):
    """Return the information gain obtained by splitting ``df`` on ``attribute``.

    The gain is the entropy of ``df`` minus the size-weighted sum of the
    entropies of the subsets produced by each distinct value of
    ``attribute``. Entropy is measured by ``entropy``, which uses the last
    column of the frame as the class label.

    Args:
        df: A pandas DataFrame whose final column holds the class labels.
        attribute: Name of the column to split on.

    Returns:
        The information gain of the split.
    """
    total_entropy = entropy(df)
    total_rows = len(df)  # loop-invariant, so computed once
    weighted_entropy = 0
    for value in df[attribute].unique():
        subset = df[df[attribute] == value]
        # Weight each subset's entropy by the share of rows it contains.
        weighted_entropy += (len(subset) / total_rows) * entropy(subset)
    return total_entropy - weighted_entropy

def id3(df, attributes, target_attribute):
    """Placeholder for the ID3 decision-tree builder.

    The ID3 algorithm is not implemented yet: none of the arguments are
    used and a fixed string is returned in place of a tree structure.

    Args:
        df: Training data (currently unused).
        attributes: Candidate attribute names (currently unused).
        target_attribute: Name of the class-label column (currently unused).

    Returns:
        The literal string ``"ID3 Decision Tree"``.
    """
    # TODO: replace this stub with a real ID3 implementation that returns
    # the actual tree structure.
    placeholder_tree = "ID3 Decision Tree"
    return placeholder_tree

def calculate_instance_counts(df, positive_label='Yes', negative_label='No'):
    """Count positive and negative instances for every attribute value.

    The last column of ``df`` is treated as the class label; every other
    column is treated as an attribute.

    Args:
        df: A pandas DataFrame whose final column holds the class labels.
        positive_label: Label value counted as a positive instance.
            Defaults to ``'Yes'``.
        negative_label: Label value counted as a negative instance.
            Defaults to ``'No'``.

    Returns:
        A dict mapping each attribute name to a dict that maps each distinct
        attribute value to a ``(positive_count, negative_count)`` tuple of
        plain ``int`` values.
    """
    instance_counts = {}
    target = df.keys()[-1]
    for attribute in df.keys()[:-1]:  # Exclude target column
        value_counts = {}
        for value in df[attribute].unique():
            # Select the class labels of the matching rows once, then count
            # both classes from that single selection.
            labels = df.loc[df[attribute] == value, target]
            value_counts[value] = (
                int((labels == positive_label).sum()),
                int((labels == negative_label).sum()),
            )
        instance_counts[attribute] = value_counts
    return instance_counts

# Build the PlayTennis example dataset as a pandas DataFrame. The last
# column ('PlayTennis') is the class label used by the helper functions.
data = {'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
        'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild'],
        'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High'],
        'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
        'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']}
df = pd.DataFrame(data)

# Calculate entropy of the entire dataset
print("Entropy of the entire dataset:", entropy(df))

# Calculate Information Gain for each attribute. The values are cached so
# the root-node selection below does not recompute them.
attributes = ['Outlook', 'Temperature', 'Humidity', 'Wind']
gains = {}
for attribute in attributes:
    gain = information_gain(df, attribute)
    gains[attribute] = gain
    print(f"Information Gain for {attribute}: {gain}")

# Find the best attribute for the root node (attribute with highest
# information gain). Ties resolve to the earliest entry in `attributes`.
best_attribute = max(attributes, key=gains.__getitem__)
print(f"\nBest attribute for root node: {best_attribute}")

# Build the ID3 decision tree
target_attribute = 'PlayTennis'
decision_tree = id3(df, attributes.copy(), target_attribute)  # Using a copy of attributes
print("\nDecision Tree:", decision_tree)

# Second level calculations (example for 'Outlook' as the root)
outlook_values = df['Outlook'].unique()
attributes_next_level = ['Temperature', 'Humidity', 'Wind']  # same for every branch
for value in outlook_values:
    subset = df[df['Outlook'] == value]
    print(f"\nSubset for Outlook = {value}")
    for attr in attributes_next_level:
        gain = information_gain(subset, attr)
        print(f"Information Gain for {attr}: {gain}")

# Calculate and print instance counts for each attribute
instance_counts = calculate_instance_counts(df)
for attribute, value_counts in instance_counts.items():
    print(f"Attribute: {attribute}")
    for value, counts in value_counts.items():
        print(f"  Value: {value}, Positive Instances: {counts[0]}, Negative Instances: {counts[1]}")
    # Flatten the (positive, negative) pairs into one list, in value order.
    single_dim_array = [count for counts in value_counts.values() for count in counts]
    print(f"  Single Dimensional Array: {single_dim_array}")
    print("-" * 20)


Entropy of the entire dataset: 0.9402859586706311
Information Gain for Outlook: 0.24674981977443933
Information Gain for Temperature: 0.02922256565895487
Information Gain for Humidity: 0.15183550136234159
Information Gain for Wind: 0.04812703040826949

Best attribute for root node: Outlook

Decision Tree: ID3 Decision Tree

Subset for Outlook = Sunny
Information Gain for Temperature: 0.5709505944546686
Information Gain for Humidity: 0.9709505944546686
Information Gain for Wind: 0.01997309402197489

Subset for Outlook = Overcast
Information Gain for Temperature: 0.0
Information Gain for Humidity: 0.0
Information Gain for Wind: 0.0

Subset for Outlook = Rain
Information Gain for Temperature: 0.01997309402197489
Information Gain for Humidity: 0.01997309402197489
Information Gain for Wind: 0.9709505944546686
Attribute: Outlook
  Value: Sunny, Positive Instances: 2, Negative Instances: 3
  Value: Overcast, Positive Instances: 4, Negative Instances: 0
  Value: Rain, Positive Instances: 3, Negative Instances: 2

Best attribute for the root node: Outlook
