In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

In [3]:
lst = ['apple']*3 + ['orange']*2 + ['banana']*2
fruits = pd.Series(lst)
print(fruits)

0     apple
1     apple
2     apple
3    orange
4    orange
5    banana
6    banana
dtype: object


In [2]:
data = pd.read_excel (r'D:/Grace/data.xlsx')
print (data)

     Age  Gender Polyuria Polydipsia sudden weight loss weakness Polyphagia  \
0     40    Male       No        Yes                 No      Yes         No   
1     58    Male       No         No                 No      Yes         No   
2     41    Male      Yes         No                 No      Yes        Yes   
3     45    Male       No         No                Yes      Yes        Yes   
4     60    Male      Yes        Yes                Yes      Yes        Yes   
..   ...     ...      ...        ...                ...      ...        ...   
515   39  Female      Yes        Yes                Yes       No        Yes   
516   48  Female      Yes        Yes                Yes      Yes        Yes   
517   58  Female      Yes        Yes                Yes      Yes        Yes   
518   32  Female       No         No                 No      Yes         No   
519   42    Male       No         No                 No       No         No   

    Genital thrush visual blurring Itching Irritabi

In [4]:
probs = fruits.value_counts(normalize=True)
probs

apple     0.428571
orange    0.285714
banana    0.285714
dtype: float64

In [5]:
probs_by_hand = [3/7, 2/7, 2/7]
print(probs_by_hand)

[0.42857142857142855, 0.2857142857142857, 0.2857142857142857]


Recall that Shannon's model defines entropy as
H(x):=−∑i=1ℓ(P(t=i)×log2(P(t=i))
 
The idea with entropy is that the more heterogenous and impure a feature is, the higher the entropy. Conversely, the more homogenous and pure a feature is, the lower the entropy.

The following calculation shows how impurity of this fruit basket can be computed using the entropy criterion.

In [6]:
entropy = -1 * np.sum(np.log2(probs) * probs)
entropy

1.5566567074628228

The gini impurity index is defined as follows:
Gini(x):=1−∑i=1ℓP(t=i)2
 
The idea with Gini index is the same as in entropy in the sense that the more heterogenous and impure a feature is, the higher the Gini index.

A nice property of the Gini index is that it is always between 0 and 1, and this may make it easier to compare Gini indices across different features.

The impurity of our fruit basket using Gini index is calculated as below.

In [7]:
gini_index = 1 - np.sum(np.square(probs))
gini_index

0.653061224489796

In [8]:
lst2 = ['apple', 'orange', 'banana', 'mango', 'blueberry', 'watermelon', 'pear']
fruits2 = pd.Series(lst2)
print(fruits2)
probs2 = fruits2.value_counts(normalize=True)
probs2

0         apple
1        orange
2        banana
3         mango
4     blueberry
5    watermelon
6          pear
dtype: object


apple         0.142857
watermelon    0.142857
blueberry     0.142857
orange        0.142857
pear          0.142857
banana        0.142857
mango         0.142857
dtype: float64

In [9]:
entropy = -1 * np.sum(np.log2(probs2) * probs2)
entropy

2.807354922057604

In [10]:
gini_index = 1 - np.sum(np.square(probs2))
gini_index

0.8571428571428572

As expected, both entropy and Gini index of the second fruit basket is higher than those of the first fruit basket.

In [11]:
import pandas as pd
import io
import requests

data = pd.read_excel (r'D:/Grace/data.xlsx')
print (data)


     Age  Gender Polyuria Polydipsia sudden weight loss weakness Polyphagia  \
0     40    Male       No        Yes                 No      Yes         No   
1     58    Male       No         No                 No      Yes         No   
2     41    Male      Yes         No                 No      Yes        Yes   
3     45    Male       No         No                Yes      Yes        Yes   
4     60    Male      Yes        Yes                Yes      Yes        Yes   
..   ...     ...      ...        ...                ...      ...        ...   
515   39  Female      Yes        Yes                Yes       No        Yes   
516   48  Female      Yes        Yes                Yes      Yes        Yes   
517   58  Female      Yes        Yes                Yes      Yes        Yes   
518   32  Female       No         No                 No      Yes         No   
519   42    Male       No         No                 No       No         No   

    Genital thrush visual blurring Itching Irritabi

In [14]:
df = data

In [12]:
def compute_impurity(feature, impurity_criterion):
    """
    This function calculates impurity of a feature.
    Supported impurity criteria: 'entropy', 'gini'
    input: feature (this needs to be a Pandas series)
    output: feature impurity
    """
    probs = feature.value_counts(normalize=True)
    
    if impurity_criterion == 'entropy':
        impurity = -1 * np.sum(np.log2(probs) * probs)
    elif impurity_criterion == 'gini':
        impurity = 1 - np.sum(np.square(probs))
    else:
        raise ValueError('Unknown impurity criterion')
        
    return(round(impurity, 3))


# let's do two quick examples.
print('impurity using entropy:', compute_impurity(fruits, 'entropy'))
print('impurity using gini index:', compute_impurity(fruits, 'gini'))
# how to test for an incorrect compute_impurity_criterion value:
# print('impurity using gini index:', compute_impurity(df['stream'], 'foo'))

impurity using entropy: 1.557
impurity using gini index: 0.653


In [16]:
target_entropy = compute_impurity(df['weakness'], 'entropy')
target_entropy

0.978

In [17]:
df['weakness'].value_counts()

Yes    305
No     215
Name: weakness, dtype: int64

In [18]:
for level in df['weakness'].unique():
    print('level name:', level)
    df_feature_level = df[df['weakness'] == level]
    print('corresponding data partition:')
    print(df_feature_level)
    print('partition target feature impurity:', compute_impurity(df_feature_level['c'], 'entropy'))
    print('partition weight:', str(len(df_feature_level)) + '/' + str(len(df)))
    print('====================')

level name: Yes
corresponding data partition:
     Age  Gender Polyuria Polydipsia sudden weight loss weakness Polyphagia  \
0     40    Male       No        Yes                 No      Yes         No   
1     58    Male       No         No                 No      Yes         No   
2     41    Male      Yes         No                 No      Yes        Yes   
3     45    Male       No         No                Yes      Yes        Yes   
4     60    Male      Yes        Yes                Yes      Yes        Yes   
..   ...     ...      ...        ...                ...      ...        ...   
513   62  Female      Yes        Yes                Yes      Yes         No   
514   54  Female      Yes        Yes                Yes      Yes        Yes   
516   48  Female      Yes        Yes                Yes      Yes        Yes   
517   58  Female      Yes        Yes                Yes      Yes        Yes   
518   32  Female       No         No                 No      Yes         No   

    G

In [19]:
def comp_feature_information_gain(df, target, descriptive_feature, split_criterion):
    """
    This function calculates information gain for splitting on 
    a particular descriptive feature for a given dataset
    and a given impurity criteria.
    Supported split criterion: 'entropy', 'gini'
    """
    
    print('target feature:', target)
    print('descriptive_feature:', descriptive_feature)
    print('split criterion:', split_criterion)
            
    target_entropy = compute_impurity(df[target], split_criterion)

    # we define two lists below:
    # entropy_list to store the entropy of each partition
    # weight_list to store the relative number of observations in each partition
    entropy_list = list()
    weight_list = list()
    
    # loop over each level of the descriptive feature
    # to partition the dataset with respect to that level
    # and compute the entropy and the weight of the level's partition
    for level in df[descriptive_feature].unique():
        df_feature_level = df[df[descriptive_feature] == level]
        entropy_level = compute_impurity(df_feature_level[target], split_criterion)
        entropy_list.append(round(entropy_level, 3))
        weight_level = len(df_feature_level) / len(df)
        weight_list.append(round(weight_level, 3))

    print('impurity of partitions:', entropy_list)
    print('weights of partitions:', weight_list)

    feature_remaining_impurity = np.sum(np.array(entropy_list) * np.array(weight_list))
    print('remaining impurity:', feature_remaining_impurity)
    
    information_gain = target_entropy - feature_remaining_impurity
    print('information gain:', information_gain)
    
    print('====================')

    return(information_gain)

In [20]:
split_criterion = 'entropy'
for feature in df.drop(columns='Polyphagia').columns:
    feature_info_gain = comp_feature_information_gain(df, 'Polyphagia', feature, split_criterion)

target feature: Polyphagia
descriptive_feature: Age
split criterion: entropy
impurity of partitions: [0.544, 0.852, -0.0, 0.764, 0.997, 0.994, 0.353, 0.991, 0.544, 0.722, -0.0, 0.934, 0.918, 0.544, 0.696, 0.904, 0.985, 0.896, 0.94, 0.722, 0.764, -0.0, 0.934, -0.0, -0.0, -0.0, 0.634, 0.918, -0.0, 0.811, -0.0, -0.0, 0.65, -0.0, 0.918, 0.503, -0.0, 1.0, -0.0, -0.0, -0.0, -0.0, 0.971, -0.0, -0.0, -0.0, -0.0, 0.722, -0.0, -0.0, -0.0]
weights of partitions: [0.046, 0.035, 0.008, 0.035, 0.029, 0.042, 0.029, 0.017, 0.015, 0.01, 0.013, 0.038, 0.058, 0.015, 0.031, 0.048, 0.013, 0.031, 0.054, 0.01, 0.017, 0.008, 0.038, 0.013, 0.013, 0.006, 0.048, 0.035, 0.015, 0.015, 0.01, 0.008, 0.012, 0.004, 0.04, 0.017, 0.019, 0.015, 0.006, 0.004, 0.004, 0.017, 0.01, 0.002, 0.012, 0.002, 0.008, 0.01, 0.012, 0.002, 0.002]
remaining impurity: 0.6633420000000001
information gain: 0.3306579999999999
target feature: Polyphagia
descriptive_feature: Gender
split criterion: entropy
impurity of partitions: [0.952, 0.97