In [5]:
import numpy as np
import pandas as pd

In [7]:
# Read the play-tennis data set from a CSV file (path is relative to the working directory)
df = pd.read_csv('play_tennis.csv')
# Bare last expression so the notebook renders the frame as the cell output
df

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [16]:
### Versão 1

# Entropy of the whole data set: hard-coded class proportions
# (5 "no" and 9 "yes" labels out of 14 rows)
p_no = 5 / 14
p_yes = 9 / 14

def entropy_tennis(p_no, p_yes):
  """Return the Shannon entropy, in bits, of a two-class distribution.

  Parameters
  ----------
  p_no, p_yes : float or array-like
      Probabilities of the two classes. They are used as given and are
      not checked to sum to 1.

  Returns
  -------
  numpy.float64 (ndarray for array inputs)
      -(p_no * log2(p_no) + p_yes * log2(p_yes)). A class with
      probability 0 contributes 0 (the limit of p * log2(p) as p -> 0)
      instead of turning the result into NaN. NaN inputs still
      propagate to the result.
  """
  total = 0.0
  for p in (p_no, p_yes):
    p = np.asarray(p, dtype=float)
    # Replace 0 by 1 inside the logarithm only: log2(1) == 0, so the term
    # becomes 0 * 0 == 0 rather than 0 * -inf == nan (and no warning fires).
    total = total + p * np.log2(np.where(p == 0, 1.0, p))
  # "0.0 - total" instead of "-total" so a pure distribution gives 0.0, not -0.0
  return 0.0 - total

H = entropy_tennis(p_no, p_yes)

# Ganho de informação
def gi_tennis(group_array_bin, y_bin):
  """Return the information gain, in bits, of a boolean split over a boolean target.

  Parameters
  ----------
  group_array_bin : array-like of bool
      Split indicator: True puts the row in the "True" group, False in the
      "False" group.
  y_bin : array-like of bool
      Target labels, one per row, aligned with ``group_array_bin``.

  Returns
  -------
  float
      H(y) - [p(True) * H(y | True) + p(False) * H(y | False)].
      Pure or empty groups contribute an entropy of 0 instead of NaN.
      Returns 0.0 when both inputs are empty.
  """
  group_array_bin = np.asarray(group_array_bin)
  y_bin = np.asarray(y_bin)

  def _entropy(labels):
    # Binary entropy of a label array. An empty array and a class with
    # zero probability both contribute 0 (0 * log2(0) is taken as 0).
    if labels.size == 0:
      return 0.0
    acc = 0.0
    for p in ((labels == False).mean(), (labels == True).mean()):
      if p > 0:
        acc = acc + p * np.log2(p)
    # "0.0 - acc" so a pure group yields 0.0 rather than -0.0
    return 0.0 - acc

  # Nothing to split: avoid taking the mean of an empty mask
  if group_array_bin.size == 0 and y_bin.size == 0:
    return 0.0

  # Entropy of the target before the split
  h = _entropy(y_bin)

  # Row selectors for the two sides of the split
  mask_true  = group_array_bin == True
  mask_false = group_array_bin == False

  # Fraction of rows that falls on each side
  p_group_true  = mask_true.mean()
  p_group_false = mask_false.mean()

  # Entropy of the target inside each side
  h_group_true  = _entropy(y_bin[mask_true])
  h_group_false = _entropy(y_bin[mask_false])

  # Information gain: entropy before minus weighted entropy after
  h_groups = p_group_true * h_group_true + p_group_false * h_group_false
  gi       = h - h_groups

  return gi

In [26]:
### Versão 2

def entropy_tennis_v2(y_bin):
  """Return the Shannon entropy, in bits, of a boolean label array.

  Parameters
  ----------
  y_bin : array-like of bool
      Labels; the class proportions are the fractions of False and True
      entries.

  Returns
  -------
  float
      -(p0 * log2(p0) + p1 * log2(p1)), where a class that does not occur
      contributes 0 instead of NaN. An empty array returns 0.0.
  """
  y_bin = np.asarray(y_bin)

  # An empty group has no labels to be uncertain about; returning 0.0 also
  # avoids the NaN (and warning) from taking the mean of an empty array.
  if y_bin.size == 0:
    return 0.0

  p0 = (y_bin == False).mean()
  p1 = (y_bin == True).mean()

  acc = 0.0
  for p in (p0, p1):
    # Skip absent classes: 0 * log2(0) is taken as 0 (its limit), not nan
    if p > 0:
      acc = acc + p * np.log2(p)

  # "0.0 - acc" so a pure array yields 0.0 rather than -0.0
  H = 0.0 - acc
  return H

def gi_tennis_v2(group_array_bin, y_bin):
  """Return the information gain of a boolean split, using entropy_tennis_v2.

  Parameters
  ----------
  group_array_bin : boolean array
      Split indicator, one entry per row.
  y_bin : boolean array
      Target labels, indexed by the split masks.

  Returns
  -------
  The entropy of ``y_bin`` minus the entropies of the True and False
  groups, each weighted by the fraction of rows in that group.
  """
  # Entropy of the original, unsplit labels
  parent_entropy = entropy_tennis_v2(y_bin)

  # Row selectors for the two sides of the split
  selected = group_array_bin == True
  rejected = group_array_bin == False

  # Entropy and row share of the True group
  entropy_selected = entropy_tennis_v2(y_bin[selected])
  weight_selected = selected.mean()

  # Entropy and row share of the False group
  entropy_rejected = entropy_tennis_v2(y_bin[rejected])
  weight_rejected = rejected.mean()

  split_entropy = entropy_selected * weight_selected + entropy_rejected * weight_rejected
  return parent_entropy - split_entropy


In [27]:
# Information gain of the split humidity == 'high' with respect to play == 'yes'
y_bin           = (df['play'] == 'yes').to_numpy()
group_array_bin = (df['humidity'] == 'high').to_numpy()

gi_tennis_v2(group_array_bin, y_bin)

0.15183550136234136

In [28]:
# Information gain of the split temp == 'hot' with respect to play == 'yes'
y_bin           = (df['play'] == 'yes').to_numpy()
group_array_bin = (df['temp'] == 'hot').to_numpy()

gi_tennis_v2(group_array_bin, y_bin)

0.0250781735058504

In [10]:
# Rows whose humidity is 'high'
df.loc[df['humidity'].eq('high')]

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
7,sunny,mild,high,False,no
11,overcast,mild,high,True,yes
13,rainy,mild,high,True,no


In [14]:
# Entropy of the humidity == 'high' subset (50% of the data set):
# 4 "no" and 3 "yes" labels out of its 7 rows
h_high = entropy_tennis(p_no=4 / 7, p_yes=3 / 7)

In [12]:
# Rows whose humidity is anything other than 'high'
df.loc[df['humidity'].ne('high')]

Unnamed: 0,outlook,temp,humidity,windy,play
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes
10,sunny,mild,normal,True,yes
12,overcast,hot,normal,False,yes


In [15]:
# Entropy of the humidity != 'high' subset (50% of the data set):
# 1 "no" and 6 "yes" labels out of its 7 rows
h_nhigh = entropy_tennis(p_no=1 / 7, p_yes=6 / 7)

In [17]:
# Weighted entropy after splitting on humidity:
# H(humidity) = p(humidity=high) * H(humidity=high) + p(humidity!=high) * H(humidity!=high)
# Both subsets hold half of the rows, hence the 0.5 weights.
h_all = 0.5 * h_high + 0.5 * h_nhigh
h_all

0.7884504573082896

In [21]:
# Information gain of the humidity split: entropy of the whole data set (H)
# minus the weighted entropy after the split (h_all)
G = H - h_all
# Bare last expression so the notebook shows the value
G

0.15183550136234136

In [22]:
# Rows where windy is True
df.loc[df['windy'].eq(True)]

Unnamed: 0,outlook,temp,humidity,windy,play
1,sunny,hot,high,True,no
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
10,sunny,mild,normal,True,yes
11,overcast,mild,high,True,yes
13,rainy,mild,high,True,no
