In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Decision Trees are based on Entropy
Activity: Calculate the entropy for a coin
Entropy measures the uncertainty of a random variable. The higher the entropy value, the more uncertain we are about its outcome. Entropy is written as $H(X)$, where $X$ is a random variable.

The entropy formula is the negative sum, over all outcomes, of each outcome's probability multiplied by the base-2 log of that probability:

$H(X) = -\sum_{x} p(x) * log_2(p(x))$

## Entropy of coin
Given p stands for "probability of",

for outcome in [H,T]:

$H(Coin) = \sum -p(outcome) * log_2(p(outcome))$

## Entropy of a fair coin
for p(outcome) in [p(H)=0.5, p(T)=0.5]:

$H(Coin) = \sum -p(outcome) * log_2(p(outcome)) = -0.5 * log_2(0.5) - 0.5 * log_2(0.5) = 1$

In [2]:
# Fair coin: heads and tails are equally likely.
fair_coin_entropy = (-0.5 * np.log2(0.5)) + (-0.5 * np.log2(0.5))
print(fair_coin_entropy)

# Biased coin: one side comes up 90% of the time, the other 10%.
biased_coin_entropy = (-0.9 * np.log2(0.9)) + (-0.1 * np.log2(0.1))
print(biased_coin_entropy)

1.0
0.4689955935892812


In [3]:
def entropy(p):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Computes ``sum(-p_i * log2(p_i))`` over the entries of ``p``.

    Parameters
    ----------
    p : iterable of float
        Probability of each outcome, e.g. ``[0.5, 0.5]`` for a fair coin.

    Returns
    -------
    numpy.float64
        The entropy of ``p``; ``0.0`` when ``p`` is empty.
    """
    probabilities = np.asarray(p, dtype=float)
    # By convention 0 * log2(0) counts as 0. Evaluating it literally gives
    # 0 * -inf = nan (plus a RuntimeWarning), so outcomes with probability
    # zero are dropped before taking the logarithm.
    possible = probabilities[probabilities != 0]
    # Adding 0.0 only turns a negative zero (e.g. for p = [1.0]) into 0.0;
    # every other value is left unchanged.
    H = (-possible * np.log2(possible)).sum() + 0.0
    return H
    
# Entropy of a fair coin, a heavily biased coin, and a (rounded) fair
# six-sided die, in that order.
distributions = [
    [.5, .5],
    [.9, .1],
    [.16667, .16667, .16667, .16667, .16667, .16667],
]
for p in distributions:
    print(entropy(p))

1.0
0.4689955935892812
2.5849853457818157


# Activity: Obtain the following quantities:
In groups of 3: Using the tennis dataset, obtain the following quantities:

## Entropy for PlayTennis:
Obtain the entropy of the PlayTennis (Leaf/Decision) column.

## Entropy for PlayTennis conditioned on Weak Wind factor
Obtain the entropy of conditional probability p(PlayTennis | Wind = Weak) = [2/8, 6/8]

## Entropy for PlayTennis conditioned on Strong Wind factor
Obtain the entropy of conditional probability p(PlayTennis | Wind = Strong) = [3/6, 3/6]

### Hints:

p = [9/14, 5/14] which represents the probability that a player plays tennis (9/14 days) or not (5/14 days)
Remember your Entropy function from earlier

### Solutions
$H(Decision) = -(9/14) * log_2(9/14) - (5/14) * log_2(5/14) = 0.940$

In [4]:
# Load the tab-separated tennis dataset. No row of the file is treated as a
# header (header=None), so the column names are supplied explicitly.
data = pd.read_csv(
    './Datasets/tennis.txt',
    delimiter="\t",
    header=None,
    names=['Outlook', 'Temp', 'Humidity', 'Wind', 'Play'],
)

print(data)

     Outlook  Temp Humidity    Wind Play
1      Sunny   Hot     High    Weak   No
2      Sunny   Hot     High  Strong   No
3   Overcast   Hot     High    Weak  Yes
4       Rain  Mild     High    Weak  Yes
5       Rain  Cool   Normal    Weak  Yes
6       Rain  Cool   Normal  Strong   No
7   Overcast  Cool   Normal  Strong  Yes
8      Sunny  Mild     High    Weak   No
9      Sunny  Cool   Normal    Weak  Yes
10      Rain  Mild   Normal    Weak  Yes
11     Sunny  Mild   Normal  Strong  Yes
12  Overcast  Mild     High  Strong  Yes
13  Overcast   Hot   Normal    Weak  Yes
14      Rain  Mild     High  Strong   No


In [5]:
# Fraction of days on which tennis was / was not played.
play_counts = data.Play.value_counts()
total_days = len(data)
yes_prob = play_counts['Yes'] / total_days
no_prob = play_counts['No'] / total_days
print(yes_prob, no_prob)

# Entropy of the Play (decision) column.
p = [yes_prob, no_prob]
print(entropy(p))

0.6428571428571429 0.35714285714285715
0.9402859586706311


Given p stands for "probability of",

for Wind = {Weak, Strong}:

$I(Decision; Wind) = H(Decision) - \sum p(Wind) * H(Decision | Wind)$

We can break this down further:

$H(Decision) - \sum p(Wind) * H(Decision | Wind)$

$=$

$H(Decision) - (p(Wind = Weak) * H(Decision | Wind = Weak) + p(Wind = Strong) * H(Decision | Wind = Strong)) = 0.048$

In [6]:
def calc_entropy(colm, val_1, val_2):
    """Entropy of two values of one column of the global ``data`` frame.

    The probability of each value is its number of occurrences in
    ``data[colm]`` divided by the total number of rows in ``data``.

    Parameters
    ----------
    colm : column label in ``data``
    val_1, val_2 : the two values of that column to include

    Returns
    -------
    The result of ``entropy([p(val_1), p(val_2)])``.

    Raises
    ------
    KeyError
        If ``val_1`` or ``val_2`` does not occur in the column.
    """
    counts = data[colm].value_counts()
    total_rows = len(data)
    probabilities = [counts.loc[val_1] / total_rows, counts.loc[val_2] / total_rows]
    return entropy(probabilities)


def calc_entropy2(colm1, colm2, val_1, val_2, val_3):
    """Entropy of ``colm2`` restricted to the rows where ``colm1 == val_3``.

    Reads the global ``data`` frame. For example,
    ``calc_entropy2("Wind", "Play", "Yes", "No", "Weak")`` gives
    H(Play | Wind = Weak).

    Parameters
    ----------
    colm1 : column label used for the condition
    colm2 : column label whose entropy is measured
    val_1, val_2 : the two values of ``colm2`` to include
    val_3 : value ``colm1`` must equal for a row to be considered

    Returns
    -------
    The result of ``entropy([p(val_1 | val_3), p(val_2 | val_3)])``.

    Raises
    ------
    KeyError
        If ``val_1`` or ``val_2`` does not occur among the selected rows.
    """
    conditioned = data[data[colm1] == val_3][colm2]
    counts = conditioned.value_counts()
    # Conditional probabilities are normalised by the number of rows that
    # satisfy the condition, not by the size of the whole dataset; dividing
    # by len(data) yields values that do not sum to 1 and a wrong entropy.
    subset_size = len(conditioned)
    yes_prob = counts.loc[val_1] / subset_size
    no_prob = counts.loc[val_2] / subset_size
    p = [yes_prob, no_prob]
    ent_p = entropy(p)
    return ent_p


# hint: helper that gives the conditional probabilities of the decision
# for one value of a feature
def conditional_prob(df, c1, c2, condition):
    """Distribution of column ``c2`` among the rows where ``c1 == condition``.

    Parameters
    ----------
    df : pandas.DataFrame
    c1 : label of the feature column the condition applies to
    c2 : label of the decision column
    condition : value ``c1`` must equal

    Returns
    -------
    dict
        Maps each distinct value of ``c2`` seen in the selected rows (in
        order of first appearance) to its share of those rows. Empty when
        no row satisfies the condition.
    """
    matches_condition = df[c1] == condition
    outcomes = df.loc[matches_condition, c2]
    total = len(outcomes)
    return {
        outcome: len(df[matches_condition & (df[c2] == outcome)]) / total
        for outcome in outcomes.unique()
    }

In [7]:
## Entropy of the Play decision, H(Decision)
entropy_decision = calc_entropy("Play", "Yes", "No")


## Probability of each wind condition
wind_counts = data.Wind.value_counts()
wind_total = len(data.Wind)

prob_strong_wind = wind_counts["Strong"] / wind_total
print("p(Wind = Strong)\nWind Strong", prob_strong_wind, "\n")

prob_weak_wind = wind_counts["Weak"] / wind_total
print("p(Wind = Weak)\nWind Weak", prob_weak_wind, "\n")


## Entropy of the Wind column itself
entropy_wind = calc_entropy("Wind", "Strong", "Weak")


# print("----\n")

## Entropy/Probabilty of playing on strong wind
# print("H(Decision | Wind = Strong)")
# entropy_decision_strong_wind = calc_entropy2("Wind", "Play", "Yes", "No", "Strong")
## Entropy/Probabilty of playing on weak wind
# print("H(Decision | Wind = Weak)")
# entropy_decision_weak_wind = calc_entropy2("Wind", "Play", "Yes", "No", "Weak")


# print("-----or-----\n")


# what are the probabilities of Play given Wind is Weak?
pb_decision_weak_wind = conditional_prob(data, 'Wind', 'Play', 'Weak')
# The entropy is derived from the probabilities just computed instead of
# hard-coded numbers, so it stays correct if the dataset changes.
ep_decision_weak_wind = entropy(list(pb_decision_weak_wind.values()))
print("decision_weak_wind", pb_decision_weak_wind)
print("entropy weak wind", ep_decision_weak_wind)

# what are the probabilities of Play given Wind is Strong?
pb_decision_strong_wind = conditional_prob(data, 'Wind', 'Play', 'Strong')
ep_decision_strong_wind = entropy(list(pb_decision_strong_wind.values()))
print("decision_strong_wind", pb_decision_strong_wind)
# This value belongs to the strong-wind case, so it is labelled as such.
print("entropy strong wind", ep_decision_strong_wind)

print("\n----\n")

## Information gain of Wind: H(Decision) minus the wind-weighted
## conditional entropies
print("H(Decision) - (p(Wind = Weak) * H(Decision | Wind = Weak) + p(Wind = Strong) * H(Decision | Wind = Strong)) = 0.048")
print("Expected Output: \t\t\t\t\t = 0.048")
weighted_entropy_given_wind = (
    prob_weak_wind * ep_decision_weak_wind
    + prob_strong_wind * ep_decision_strong_wind
)
Output2 = entropy_decision - weighted_entropy_given_wind
print("Overall Entropy/Probabilty of playing based on wind\t =", Output2)

p(Wind = Strong)
Wind Strong 0.42857142857142855 

p(Wind = Weak)
Wind Weak 0.5714285714285714 

decision_weak_wind {'No': 0.25, 'Yes': 0.75}
entropy weak wind 0.8112781244591328
decision_strong_wind {'No': 0.5, 'Yes': 0.5}
entropy weak wind 1.0

----

H(Decision) - (p(Wind = Weak) * H(Decision | Wind = Weak) + p(Wind = Strong) * H(Decision | Wind = Strong)) = 0.048
Expected Output: 					 = 0.048
Overall Entropy/Probabilty of playing based on wind	 = 0.04812703040826949


In [8]:
def info_gain(df, feature, decision):
    """Print and return the information gain between a feature and a decision.

    Computes ``H(decision) - sum_v p(feature = v) * H(decision | feature = v)``
    over every distinct value ``v`` of the feature column.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset.
    feature : column label of the feature (e.g. ``'Wind'``)
    decision : column label of the target (e.g. ``'Play'``)

    Returns
    -------
    The information gain. The value is also printed, so calls that ignore
    the return value still show it.
    """
    # entropy of the decision column, H(decision)
    decision_probs = df[decision].value_counts(normalize=True).to_numpy()
    entropy_decision = entropy(decision_probs)

    # probability of each value of the feature
    # example: for Wind, the probabilities of Strong and Weak
    feature_probs = df[feature].value_counts(normalize=True)

    # weighted sum of the decision's entropy under each feature value
    S = 0
    for condition in df[feature].unique():
        prob_condition = list(conditional_prob(df, feature, decision, condition).values())
        S = S + feature_probs.loc[condition] * entropy(prob_condition)

    gain = entropy_decision - S
    print(gain)
    return gain

In [9]:
# Information gain of every feature with respect to the Play decision.
for feature in ['Wind', 'Humidity', 'Temp', 'Outlook']:
    info_gain(data, feature, 'Play')

0.04812703040826949
0.15183550136234159
0.02922256565895487
0.24674981977443933


In [11]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.tree import export_graphviz
import pydotplus

# Read the tennis data. It is a tab-separated .txt file and no row is used
# as a header, so the delimiter and the column names are passed explicitly.
data = pd.read_csv(
    './Datasets/tennis.txt',
    delimiter="\t",
    header=None,
    names=['Outlook', 'Temp', 'Humidity', 'Wind', 'Play'],
)
print(data)

# Convert each column's categories into integer codes so the data can be
# used with the decision tree.
label_encoder = preprocessing.LabelEncoder()
data_encoded = data.apply(label_encoder.fit_transform)
print(data_encoded)

# Feature columns, declared once so the names used for training and the
# names written into the exported graph cannot drift apart.
feature_columns = ['Outlook', 'Temp', 'Humidity', 'Wind']

# create our decision tree classifier with entropy; random_state is fixed
# so that re-running the cell yields the same tree
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)

# provide our feature array and target array (1-item),
# and train the model using a decision tree
clf.fit(data_encoded[feature_columns], data_encoded['Play'])

# export our decision tree into data that can be plotted
dot_data = export_graphviz(clf, out_file=None, feature_names=feature_columns)

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('tennis_tree.png')

     Outlook  Temp Humidity    Wind Play
1      Sunny   Hot     High    Weak   No
2      Sunny   Hot     High  Strong   No
3   Overcast   Hot     High    Weak  Yes
4       Rain  Mild     High    Weak  Yes
5       Rain  Cool   Normal    Weak  Yes
6       Rain  Cool   Normal  Strong   No
7   Overcast  Cool   Normal  Strong  Yes
8      Sunny  Mild     High    Weak   No
9      Sunny  Cool   Normal    Weak  Yes
10      Rain  Mild   Normal    Weak  Yes
11     Sunny  Mild   Normal  Strong  Yes
12  Overcast  Mild     High  Strong  Yes
13  Overcast   Hot   Normal    Weak  Yes
14      Rain  Mild     High  Strong   No
    Outlook  Temp  Humidity  Wind  Play
1         2     1         0     1     0
2         2     1         0     0     0
3         0     1         0     1     1
4         1     2         0     1     1
5         1     0         1     1     1
6         1     0         1     0     0
7         0     0         1     0     1
8         2     2         0     1     0
9         2     0        

True