In [1]:
import pandas as pd
import math

In [2]:
# Read the play-tennis table from the CSV next to the notebook and preview
# its first five rows.
# NOTE(review): the rendered preview shows an 'Unnamed: 0' header, but no such
# feature appears in the gains computed later, so it is presumably just the
# DataFrame index as rendered by the export; confirm against the CSV.
data = pd.read_csv(filepath_or_buffer='tennis.csv')
data.head(n=5)

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [3]:
def entropy(data, target_column):
    """Return the Shannon entropy, in bits, of the labels in ``data[target_column]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows whose labels are measured.
    target_column : str
        Name of the column holding the class labels.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed classes, where ``p`` is each
        class's share of the counted labels. ``0.0`` when no labels are
        counted or when every label belongs to a single class.

    Notes
    -----
    ``value_counts`` leaves missing labels out of the counts, so the
    probabilities are taken relative to the number of counted labels; this
    keeps them summing to 1 even when some labels are missing.
    """
    # Count each class once; ``tolist`` yields plain Python ints, so the
    # arithmetic below produces a plain Python float.
    class_counts = data[target_column].value_counts().tolist()
    total = sum(class_counts)
    if total == 0:
        return 0.0

    # Zero counts are skipped because log2(0) is undefined; a categorical
    # column reports its unobserved categories with a count of 0.
    # Each term is negated (rather than the final sum) and added to a 0.0
    # start value so a single-class column yields 0.0 instead of -0.0.
    return sum(
        (
            -(count / total) * math.log2(count / total)
            for count in class_counts
            if count
        ),
        0.0,
    )

# Entropy of the 'play' label over the whole dataset; this is the baseline
# that each candidate split is compared against.
target_entropy = entropy(data, target_column='play')
print("Entropy (play):", target_entropy)

Entropy (play): 0.9402859586706311


In [4]:
def information_gain(data, split_column, target_column):
    """Measure how much splitting on ``split_column`` lowers the entropy of ``target_column``.

    The gain is the entropy of the target over all of ``data`` minus the
    size-weighted sum of the target's entropy inside each group of rows that
    share one distinct value of ``split_column``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate.
    split_column : str
        Column whose distinct values define the groups.
    target_column : str
        Column holding the class labels.

    Returns
    -------
    Entropy before the split minus the weighted entropy after it.
    """
    n_rows = len(data)
    baseline = entropy(data, target_column)

    conditional = 0
    for level in data[split_column].unique():
        branch = data[data[split_column] == level]
        # Each branch contributes in proportion to the share of rows it holds.
        conditional += (len(branch) / n_rows) * entropy(branch, target_column)

    return baseline - conditional

# Score every column except the 'play' label by the information gain obtained
# when the full dataset is split on it.
gains = dict(
    (feature, information_gain(data, feature, 'play'))
    for feature in data.columns
    if feature != 'play'
)
print("Information Gain for each feature:", gains)

Information Gain for each feature: {'outlook': 0.24674981977443933, 'temp': 0.02922256565895487, 'humidity': 0.15183550136234159, 'windy': 0.04812703040826949}


In [5]:
# The root split is the feature with the largest information gain; on a tie
# the feature that appears first in `gains` wins.
best_feature = max(gains, key=lambda feature: gains[feature])
print("Best feature to split on:", best_feature)

Best feature to split on: outlook


In [6]:
# For each branch of the root split, re-score the remaining features using
# only the rows that fall into that branch.
for value in data[best_feature].unique():
    subset = data.loc[data[best_feature] == value]
    print(f"\nSubset for {best_feature} = {value}")
    # Neither the label nor the feature already split on is a candidate here.
    subset_gains = dict(
        (col, information_gain(subset, col, 'play'))
        for col in subset.columns
        if col != 'play' and col != best_feature
    )
    print("Information Gain for each feature in subset:", subset_gains)


Subset for outlook = sunny
Information Gain for each feature in subset: {'temp': 0.5709505944546686, 'humidity': 0.9709505944546686, 'windy': 0.01997309402197489}

Subset for outlook = overcast
Information Gain for each feature in subset: {'temp': -0.0, 'humidity': -0.0, 'windy': -0.0}

Subset for outlook = rainy
Information Gain for each feature in subset: {'temp': 0.01997309402197489, 'humidity': 0.01997309402197489, 'windy': 0.9709505944546686}
