In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math

In [26]:
# Load the tab-delimited tennis data set, supplying the column labels explicitly via `names`.
# The first column is labelled '1'; in the preview below it holds a running row number.
df = pd.read_csv('tennis.csv', delimiter='\t', names=['1', 'Outlook', 'Temp', 'Humidity', 'Wind', 'Decision'])
# Show the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,1,Outlook,Temp,Humidity,Wind,Decision
0,1,Sunny,Hot,High,Weak,No
1,2,Sunny,Hot,High,Strong,No
2,3,Overcast,Hot,High,Weak,Yes
3,4,Rain,Mild,High,Weak,Yes
4,5,Rain,Cool,Normal,Weak,Yes


In [27]:
# Remove the column labelled '1' so only the feature columns and Decision remain.
# errors='ignore' keeps this cell re-runnable: once '1' is gone, running the
# cell again is a no-op instead of raising KeyError.
df = df.drop(columns=['1'], errors='ignore')
df.head()

Unnamed: 0,Outlook,Temp,Humidity,Wind,Decision
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [28]:
# Write a function that returns the entropy of a list of probabilities, e.g. (.5, .5)

In [29]:
def get_entropy(p_list):
    """
    Return the Shannon entropy, in bits, of a sequence of probabilities.

    Entropy measures the uncertainty of the distribution as a whole: the
    larger the value, the more uncertain the outcome.

    Formula: H = -sum(p * log2(p)) over every p in ``p_list``. For a coin
    this is -[P(H)*log2(P(H)) + P(T)*log2(P(T))].

    Parameters
    ----------
    p_list : sequence of float
        Probability of each outcome, e.g. ``(.5, .5)``.

    Returns
    -------
    float
        Entropy in bits. Outcomes with probability 0 contribute nothing
        (p*log2(p) tends to 0 as p tends to 0), so ``(1, 0)`` yields 0.0
        instead of raising ``ValueError`` from ``log(0)``.

    Raises
    ------
    ValueError
        If any probability is negative (the logarithm is undefined).
    """
    # Skip p == 0 terms: math.log(0, 2) raises ValueError, and such terms
    # are 0 by convention. Negating each term (rather than the final sum)
    # keeps the all-certain case at 0.0 instead of -0.0.
    return sum(-p * math.log(p, 2) for p in p_list if p != 0)

In [30]:
# Try get_entropy on a sample distribution. The evenly split case is kept
# here, commented out, as an alternative input:
# p_list = (.5, .5)
# Skewed two-outcome distribution actually evaluated by this cell.
p_list = (.1, .9)
get_entropy(p_list)

0.4689955935892812

In [31]:
df

Unnamed: 0,Outlook,Temp,Humidity,Wind,Decision
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [34]:
df[df['Wind'] == 'Weak'][['Wind', 'Decision']]

Unnamed: 0,Wind,Decision
0,Weak,No
2,Weak,Yes
3,Weak,Yes
4,Weak,Yes
7,Weak,No
8,Weak,Yes
9,Weak,Yes
12,Weak,Yes


In [61]:
def obtain_conditional_prob(input_df, col1, condition, target, positive='Yes'):
    """
    Return the fraction of rows whose ``target`` equals ``positive`` and the
    complementary fraction, optionally restricted to ``col1 == condition``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Data containing both the ``col1`` and ``target`` columns.
    col1 : str
        Feature column to filter on.
    condition : object
        Value ``col1`` must equal. The sentinel ``'Any'`` disables the
        filter, so the fractions are taken over every row.
    target : str
        Name of the outcome column.
    positive : object, default 'Yes'
        Value of ``target`` counted as the positive outcome.

    Returns
    -------
    tuple of float
        ``(positive_fraction, negative_fraction)``.

    Raises
    ------
    ZeroDivisionError
        If no row satisfies the condition.
    """
    subset = input_df
    if condition != 'Any':
        subset = input_df[input_df[col1] == condition]

    # Narrow to the two relevant columns; this also fails fast with KeyError
    # when either column name is wrong.
    subset = subset[[col1, target]]

    total = len(subset)
    # Count positives in the caller-supplied target column rather than a
    # hard-coded 'Decision' column. int() keeps plain-Python division below.
    pos = int((subset[target] == positive).sum())
    neg = total - pos

    return (pos / total, neg / total)

In [54]:
def obtain_feature_prob(input_df, col, condition):
    """
    Return the fraction of rows in ``input_df`` whose ``col`` equals ``condition``.

    Raises ZeroDivisionError when ``input_df`` has no rows.
    """
    row_count = len(input_df[col])
    matching_rows = input_df[input_df[col] == condition]
    return len(matching_rows) / row_count

In [60]:
# Baseline: class fractions of Decision with the 'Any' condition, which the
# obtain_conditional_prob defined above treats as "do not filter on Wind".
p_prob, n_prob = obtain_conditional_prob(df, 'Wind', 'Any', 'Decision')
print(f'Positive prob: {p_prob}')
print(f'Negative prob: {n_prob}')
# Entropy of that two-way split.
entropy = get_entropy((p_prob, n_prob))
print(f'Entropy: {entropy}')

Positive prob: 0.6428571428571429
Negative prob: 0.35714285714285715
Entropy: 0.9402859586706309


In [66]:
# Same computation restricted to the rows where Wind == 'Strong'.
p_prob, n_prob = obtain_conditional_prob(df, 'Wind', 'Strong', 'Decision')
print(f'Positive prob: {p_prob}')
print(f'Negative prob: {n_prob}')
# Entropy of Decision within the Strong-wind rows.
entropy = get_entropy((p_prob, n_prob))
print(f'Entropy: {entropy}')

Positive prob: 0.5
Negative prob: 0.5
Entropy: 1.0


In [57]:
obtain_feature_prob(df, 'Wind', 'Strong')

0.42857142857142855

In [58]:
obtain_feature_prob(df, 'Wind', 'Weak')

0.5714285714285714

In [85]:
# Entropy of Decision over every row ('Any' is passed as the condition, i.e. no Wind filter is intended).
H_decision = get_entropy(obtain_conditional_prob(df, 'Wind', 'Any', 'Decision'))

In [86]:
# Weighted term for Wind == 'Weak': share of Weak rows times the entropy of Decision within them.
p_1 = obtain_feature_prob(df, 'Wind', 'Weak') * get_entropy(obtain_conditional_prob(df, 'Wind', 'Weak', 'Decision'))

In [87]:
# Weighted term for Wind == 'Strong': share of Strong rows times the entropy of Decision within them.
p_2 = obtain_feature_prob(df, 'Wind', 'Strong') * get_entropy(obtain_conditional_prob(df, 'Wind', 'Strong', 'Decision'))

In [88]:
# Information gain of Wind, computed by hand: baseline entropy minus the sum of the two weighted terms.
H_decision - (p_1 + p_2)

0.04812703040826927

In [103]:
# NOTE(review): in the cells shown, get_info_gain is only defined further down
# the notebook, so this call may raise NameError on a fresh top-to-bottom run
# -- confirm the cell order.
get_info_gain(df, 'Wind', 'Decision')

0.04812703040826927

In [93]:
df['Wind'].value_counts().keys()

'Weak'

In [118]:
df['Outlook'].value_counts()

Sunny       5
Rain        5
Overcast    4
Name: Outlook, dtype: int64

In [149]:
def obtain_conditional_prob(input_df, col, condition, target):
    """
    Return the class proportions of ``target``, optionally restricted to the
    rows where ``col == condition``.

    The first element is the share of the most frequent ``target`` value
    among the selected rows and the second is the share of all other rows,
    so the tuple is ordered (majority, rest) rather than strictly (yes, no).

    Parameters
    ----------
    input_df : pandas.DataFrame
        Data containing both the ``col`` and ``target`` columns.
    col : str
        Feature column to filter on.
    condition : object
        Value ``col`` must equal. The sentinel ``'Target'`` disables the
        filter, so the proportions are taken over every row.
    target : str
        Name of the outcome column.

    Returns
    -------
    tuple of float
        ``(majority_fraction, remaining_fraction)``.

    Raises
    ------
    IndexError
        If no row satisfies the condition.
    """
    subset = input_df
    if condition != 'Target':
        subset = input_df[input_df[col] == condition]
    subset = subset[[col, target]]

    total = len(subset)
    # value_counts() is indexed by the target labels and sorted by descending
    # count, so the top entry must be taken by position with .iloc; plain [0]
    # on a label index relies on a deprecated positional fallback.
    pos = subset[target].value_counts().iloc[0]
    neg = total - pos

    return (pos / total, neg / total)

In [105]:
def obtain_feature_prob(input_df, col, condition):
    """
    Return the share of rows in which the feature ``col`` takes the value
    ``condition``.

    Raises ZeroDivisionError when ``input_df`` has no rows.
    """
    n_rows = len(input_df[col])
    is_match = input_df[col] == condition
    n_matching = len(input_df[is_match])
    return n_matching / n_rows

In [135]:
def get_entropy(p_list):
    """
    Return the entropy, in bits, of a two-outcome probability pair.

    Only the first two entries of ``p_list`` are read. The larger the
    result, the more uncertain the outcome.
    Formula: -[p1*log2(p1) + p2*log2(p2)]

    Returns the integer 0 when either probability is 0.
    """
    first, second = p_list[0], p_list[1]

    # A certain outcome carries no uncertainty; returning early also avoids log2(0).
    if 0 in (first, second):
        return 0

    weighted_logs = first * np.log2(first) + second * np.log2(second)
    return -weighted_logs

In [138]:
def get_info_gain(df, col, decision):
    """
    Return the information gain obtained by splitting ``df`` on ``col`` with
    respect to the outcome column ``decision``.

    Computed as the entropy of ``decision`` over all rows minus the sum, over
    each distinct value of ``col``, of that value's share of rows times the
    entropy of ``decision`` within those rows.
    """
    feature_values = df[col].value_counts().keys()

    # 'Target' asks obtain_conditional_prob for the unfiltered class proportions.
    baseline_entropy = get_entropy(obtain_conditional_prob(df, col, 'Target', decision))

    weighted_entropies = [
        obtain_feature_prob(df, col, value)
        * get_entropy(obtain_conditional_prob(df, col, value, decision))
        for value in feature_values
    ]

    return baseline_entropy - (np.sum(weighted_entropies))

In [150]:
# Score every feature column (all but the last) against the last column of df.
for col in df.columns[:-1]:
    info_gain = get_info_gain(df, col, df.columns[-1])
    print(f'info gain between {col} and Decision is {info_gain}')

info gain between Outlook and Decision is 0.24674981977443933
info gain between Temp and Decision is 0.02922256565895487
info gain between Humidity and Decision is 0.15183550136234159
info gain between Wind and Decision is 0.04812703040826949


In [158]:
df.head()

Unnamed: 0,Outlook,Temp,Humidity,Wind,Decision
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [161]:
# Within the Outlook == 'Sunny' rows, score the remaining feature columns.
sunny_df = df[df['Outlook'] == 'Sunny']
for col in df.columns[:-1]:
    # Outlook is constant inside this subset, so it is skipped.
    if col == 'Outlook':
        continue
    info_gain = get_info_gain(sunny_df, col, df.columns[-1])
    print(f'info gain between {col} and Decision is {info_gain}')

info gain between Temp and Decision is 0.5709505944546686
info gain between Humidity and Decision is 0.9709505944546686
info gain between Wind and Decision is 0.01997309402197489


In [155]:
get_info_gain(df[df['Outlook'] == 'Sunny'], 'Temp', 'Decision')

0.5709505944546686

In [156]:
get_info_gain(df[df['Outlook'] == 'Overcast'], 'Temp', 'Decision')

0.0

In [159]:
get_info_gain(df[df['Outlook'] == 'Rain'], 'Temp', 'Decision')

0.01997309402197489