In [203]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [204]:
data = {"Outlook":["Sunny", "Sunny", "Overcast", "Rain", "Rain", "Rain", "Overcast","Sunny", "Sunny", "Rain", "Sunny", "Overcast", "Overcast", "Rain"],
        "Temperature": ["Hot", "Hot", "Hot", "Mild", "Cool", "Cool", "Cool", "Mild", "Cool", "Mild", "Mild", "Mild", "Hot", "Mild"],
        "Humidity":["High","High", "High", "High", "Normal", "Normal", "Normal", "High", "Normal", "Normal", "Normal", "High", "Normal", "High"],
        "Wind":["Weak", "Strong", "Weak", "Weak", "Weak", "Strong", "Strong", "Weak", "Weak", "Weak", "Strong", "Strong", "Weak", "Strong"],
        "PlayTennis":["No", "No", "Yes", "Yes", "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "Yes", "Yes", "No"]}

In [205]:
df = pd.DataFrame(data)

In [206]:
df

Unnamed: 0,Outlook,Temperature,Humidity,Wind,PlayTennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [207]:
# LabelEncoder'ı oluşturun ve kategorik sütunu dönüştürün
label_encoder = LabelEncoder()
df = df.apply(lambda col: label_encoder.fit_transform(col) if col.dtype == "O" else col)

In [208]:
df

Unnamed: 0,Outlook,Temperature,Humidity,Wind,PlayTennis
0,2,1,0,1,0
1,2,1,0,0,0
2,0,1,0,1,1
3,1,2,0,1,1
4,1,0,1,1,1
5,1,0,1,0,0
6,0,0,1,0,1
7,2,2,0,1,0
8,2,0,1,1,1
9,1,2,1,1,1


On each node, we compute the information gain for each feature, then split the node on the feature with the higher information gain, by comparing the entropy of the node with the weighted entropy in the splitted nodes. 


Now let's write a function to compute the entropy.

In [209]:
def entropy(p):
    if p ==0 or p == 1:
        return 0
    else:
        return -p * np.log2(p) - ((1-p) * np.log2(1-p))

In [210]:
print(entropy(0.5))

1.0


In [211]:
def split_indices(df, index_feature):
    """Given a DataFrame and an index feature, return two lists for the two split nodes.
    The left node has the rows where the feature > 0, and the right node has rows where the feature = 0.
    """
    left_indices = []
    right_indices = []
    middle_indices = []
    
    for i, x in df.iterrows():
        if x[index_feature] == 0:
            left_indices.append(i)
        elif x[index_feature] == 1:
            middle_indices.append(i)
        else:
            right_indices.append(i)
    
    return left_indices, middle_indices, right_indices


In [213]:
split_indices(df,0)

  if x[index_feature] == 0:
  elif x[index_feature] == 1:


([2, 6, 11, 12], [3, 4, 5, 9, 13], [0, 1, 7, 8, 10])

Now we need another function to compute the weighted entropy in the splitted nodes. We must find:

- $w^{\text{left}}$ and $w^{\text{right}}$, the proportion of "Yes" and "No" in **each node**.
- $p^{\text{left}}$ and $p^{\text{right}}$, the proportion of "Yes" in **each split**.

Note the difference between these two definitions!! To illustrate, if we split the root node on the feature of index 0 (Outlook), then in the left node, the one that has "Yes" 2, 6, 11  and 12, we have:

$$w^{\text{left}}= \frac{4}{14} = 0.28  \text{ and } p^{\text{left}} = \frac{4}{4}$$
$$w^{\text{mid}}= \frac{5}{14} = 0.35  \text{ and } p^{\text{mid}} = \frac{3}{5}$$
$$w^{\text{right}}= \frac{5}{14} = 0.35  \text{ and } p^{\text{right}} = \frac{2}{5}$$

In [214]:
def weighted_entropy(X,y,left_indices,middle_indices, right_indices):
    """
    This function takes the splitted dataset, the indices we chose to split and returns the weighted entropy.
    """
    w_left = len(left_indices)/len(X)
    w_mid = len(middle_indices)/len(X)
    w_right = len(right_indices)/len(X)
    p_left = sum(y[left_indices])/len(left_indices)
    p_mid = sum(y[middle_indices])/len(middle_indices)
    if len(right_indices) > 0:
        p_right = sum(y[right_indices]) / len(right_indices)
    else:
        p_right = 0
    
    weighted_entropy = w_left * entropy(p_left) +w_mid * entropy(p_mid)  +w_right * entropy(p_right)
    return weighted_entropy

In [215]:
y = df.pop("PlayTennis")

In [216]:
X=df

In [217]:
X.columns

Index(['Outlook', 'Temperature', 'Humidity', 'Wind'], dtype='object')

In [221]:
left_indices, middle_indices, right_indices = split_indices(X, 3)
weighted_entropy(X, y, left_indices,middle_indices, right_indices)

  if x[index_feature] == 0:
  elif x[index_feature] == 1:


0.8921589282623617

To compute the **Information Gain** we must subtract it from the entropy in the node we chose to split (in this case, the root node). 

In [219]:
def information_gain(X, y, left_indices,middle_indices, right_indices):
    """
    Here, X has the elements in the node and y is theirs respectives classes
    """
    p_node = sum(y)/len(y)
    h_node = entropy(p_node)
    w_entropy = weighted_entropy(X,y,left_indices,middle_indices, right_indices)
    return h_node - w_entropy

In [222]:
information_gain(X, y, left_indices, middle_indices, right_indices)

0.04812703040826949

Now, let's compute the information gain if we split the root node for each feature:

In [200]:
for i, feature_name in enumerate(['Outlook', 'Temperature', 'Humidity', 'Wind']):
    left_indices,middle_indices, right_indices = split_indices(X, i)
    i_gain = information_gain(X, y, left_indices, middle_indices, right_indices)
    print(f"Feature: {feature_name}, information gain if we split the root node using this feature: {i_gain:.2f}")

Feature: Outlook, information gain if we split the root node using this feature: 0.25
Feature: Temperature, information gain if we split the root node using this feature: 0.03
Feature: Humidity, information gain if we split the root node using this feature: 0.15
Feature: Wind, information gain if we split the root node using this feature: 0.05


  if x[index_feature] == 0:
  elif x[index_feature] == 1:
  if x[index_feature] == 0:
  elif x[index_feature] == 1:
  if x[index_feature] == 0:
  elif x[index_feature] == 1:
  if x[index_feature] == 0:
  elif x[index_feature] == 1:
