In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Decision Trees are based on Entropy
Activity: Calculate the entropy for a coin
Entropy shows the uncertainy of a random variable. The higher the entropy value, the more unncertain we are. Entropy is displayed as $H(X)$, where $X$ is a random variable

The Entropy formula is the summation of probabilities multiplied by the log of probabilities:

## Entropy of coin
Given p stands for "probability of",

for outcome in [H,T]:

$H(Coin) = \sum -p(outcome) * log_2(p(outcome)$

## Entropy of a fair coin
for p(outcome) in [p(H)=0.5, p(T)=0.5]):

$H(Coin) = \sum -p(outcome) * log_2(p(outcome)$

In [2]:
print((-0.5 * np.log2(0.5))+(-0.5 * np.log2(0.5)))

print((-0.9 * np.log2(0.9))+(-0.1 * np.log2(0.1)))

1.0
0.4689955935892812


In [3]:
def entropy(p):
    H = np.array([-i*np.log2(i) for i in p]).sum()
    return H
    
p = [.5, .5]
print(entropy(p))

p = [.9, .1]
print(entropy(p))


p = [.16667,.16667,.16667,.16667,.16667,.16667]
print(entropy(p))

1.0
0.4689955935892812
2.5849853457818157


# Activity: Obtain the following quantitites:
In groups of 3: Using the tennis dataset, obtain the following quantities:

## Entropy for PlayTennis:
Obtain the entropy of thePlayTennis (Leaf/Decision) column.

## Entropy for PlayTennis conditioned on Weak Wind factor
Obtain the entropy of conditional probability p(PlayTennis | Wind = Weak) = [2/8, 6/8]

## Entropy for PlayTennis conditioned on Strong Wind factor
Obtain the entropy of conditional probability p(PlayTennis | Wind = Strong) = [3/6, 3/6]

### Hints:

p = [9/14, 5/14] which represents the probability that a player plays tennis (9/14 days) or not (5/14 days)
Remember your Entropy function from earlier

### Solutions
Entropy(Decision) = – (9/14) . log2(9/14) – (5/14) . log2(5/14) = 0.940

In [4]:
data = pd.read_csv('./Datasets/tennis.txt', delimiter="\t", header=None, names=['Outlook', 'Temp', 'Humidity', 'Wind', 'Play'])

print(data)

     Outlook  Temp Humidity    Wind Play
1      Sunny   Hot     High    Weak   No
2      Sunny   Hot     High  Strong   No
3   Overcast   Hot     High    Weak  Yes
4       Rain  Mild     High    Weak  Yes
5       Rain  Cool   Normal    Weak  Yes
6       Rain  Cool   Normal  Strong   No
7   Overcast  Cool   Normal  Strong  Yes
8      Sunny  Mild     High    Weak   No
9      Sunny  Cool   Normal    Weak  Yes
10      Rain  Mild   Normal    Weak  Yes
11     Sunny  Mild   Normal  Strong  Yes
12  Overcast  Mild     High  Strong  Yes
13  Overcast   Hot   Normal    Weak  Yes
14      Rain  Mild     High  Strong   No


In [5]:
yes_prob = dict(data.Play.value_counts())['Yes']/len(data)
no_prob = dict(data.Play.value_counts())['No']/len(data)
print(yes_prob, no_prob)
p = [yes_prob, no_prob]
print(entropy(p))

0.6428571428571429 0.35714285714285715
0.9402859586706311


In [6]:
features = ['Wind', 'Play']
df = data[features]

In [7]:

# yes_prob = dict(data.Wind.value_counts())['Strong']/len(data)
# no_prob = dict(data.Wind.value_counts())['Weak']/len(data)
# print(yes_prob, no_prob)
# p = [yes_prob, no_prob]
# print(entropy(p))

In [8]:
## Probabilty of playing on strong wind

yes_prob = dict(df[df["Wind"]=="Strong"].Play.value_counts())['Yes']/len(df)
no_prob = dict(df[df["Wind"]=="Strong"].Play.value_counts())['No']/len(df)
print(yes_prob, no_prob)
p = [yes_prob, no_prob]
print(entropy(p))

0.21428571428571427 0.21428571428571427
0.9524538948584778


In [9]:
## Probabilty of playing on weak wind

yes_prob = dict(df[df["Wind"]=="Weak"].Play.value_counts())['Yes']/len(df)
no_prob = dict(df[df["Wind"]=="Weak"].Play.value_counts())['No']/len(df)
print(yes_prob, no_prob)
p = [yes_prob, no_prob]
print(entropy(p))

0.42857142857142855 0.14285714285714285
0.9249331694381354


Given p stands for "probability of",

for Wind = {Weak, Strong}:

$I(Decision; Wind) = H(Decision) - \sum p(Wind) * Entropy(Decision | Wind)$
We can break this down further:

$H(Decision) - \sum p(Wind) * Entropy(Decision | Wind)$

$=$

$H(Decision) - (p(Wind = Weak) * H(Decision | Wind = Weak) + p(Wind = Strong) * H(Decision | Wind = Strong)) = 0.048$

In [10]:
def calc_entropy(colm, val_1, val_2):
    yes_prob = dict(data[colm].value_counts())[val_1]/len(data)
    no_prob = dict(data[colm].value_counts())[val_2]/len(data)
    print(yes_prob, no_prob)
    p = [yes_prob, no_prob]
    return(entropy(p))


def calc_entropy2(colm1, colm2, val_1, val_2, val_3):
    yes_prob = dict(df[df[colm1]==val_3][colm2].value_counts())[val_1]/len(data)
    no_prob = dict(df[df[colm1]==val_3][colm2].value_counts())[val_2]/len(data)
    print(yes_prob, no_prob)
    p = [yes_prob, no_prob]
    return(entropy(p))

In [11]:
entropy_wind = calc_entropy("Wind", "Strong", "Weak")
entropy_decision = calc_entropy("Play", "Yes", "No")


prob_strong_wind = data.Wind.value_counts()["Strong"]/len(data.Wind)
prob_weak_wind = data.Wind.value_counts()["Weak"]/len(data.Wind)


entropy_decision_strong_wind = calc_entropy2("Wind", "Play", "Yes", "No", "Strong")
entropy_decision_weak_wind = calc_entropy2("Wind", "Play", "Yes", "No", "Weak")

0.42857142857142855 0.5714285714285714
0.6428571428571429 0.35714285714285715
0.21428571428571427 0.21428571428571427
0.42857142857142855 0.14285714285714285


In [12]:
Output = entropy_decision - (prob_weak_wind * entropy_decision_weak_wind + prob_strong_wind * entropy_decision_strong_wind)
Output

0.003558192623777545