In [1]:
import numpy as np, pandas as pd
import math, random
import matplotlib, seaborn

# Model

### data

In [2]:
# Toy restaurant data set: one tuple per example, in column order
# (#follows, Patrons, Type, Label).
_example_rows = [
    (1,  'Some', 'French',  'Yes'),
    (2,  'Full', 'Thai',    'No'),
    (3,  'Some', 'Burger',  'Yes'),
    (4,  'Full', 'Thai',    'Yes'),
    (5,  'Full', 'French',  'No'),
    (6,  'Some', 'Italian', 'Yes'),
    (7,  'None', 'Burger',  'No'),
    (8,  'Some', 'Thai',    'Yes'),
    (9,  'Full', 'Burger',  'No'),
    (10, 'Full', 'Italian', 'No'),
    (11, 'None', 'Thai',    'No'),
    (12, 'Full', 'Burger',  'Yes'),
]
test_df = pd.DataFrame(_example_rows, columns=['#follows', 'Patrons', 'Type', 'Label'])
test_df

Unnamed: 0,#follows,Patrons,Type,Label
0,1,Some,French,Yes
1,2,Full,Thai,No
2,3,Some,Burger,Yes
3,4,Full,Thai,Yes
4,5,Full,French,No
5,6,Some,Italian,Yes
6,7,,Burger,No
7,8,Some,Thai,Yes
8,9,Full,Burger,No
9,10,Full,Italian,No


### partition

In [11]:
def partition(attribute, value, label, examples, type):
    """Split ``examples`` on one attribute and count the labels on each side.

    Parameters
    ----------
    attribute : hashable
        Name of the column of ``examples`` to split on.
    value : object
        Threshold (continuous split) or category (discrete split).
    label : hashable
        Name of the column holding the class label. Only the literal values
        ``'Yes'`` and ``'No'`` are counted; any other label is ignored.
    examples : pandas.DataFrame
        Rows to partition.
    type : str
        Kind of split:

        * ``"continous"`` (historical spelling, kept for existing callers) or
          ``"continuous"`` -- side A is ``attribute <= value``, side B is
          ``attribute > value``.
        * ``"discrete"`` -- side A is ``attribute == value``, side B is
          ``attribute != value``.

    Returns
    -------
    dict
        ``{'df_splitA': [n_yes, n_no], 'df_splitB': [n_yes, n_no]}``.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported split kinds (previously this
        surfaced as an ``UnboundLocalError`` further down).
    """
    # NOTE: the parameter name ``type`` shadows the builtin inside this
    # function; it is kept because callers may pass it by keyword.
    if type in ("continous", "continuous"):
        in_split_a = examples[attribute] <= value
        in_split_b = examples[attribute] > value
    elif type == "discrete":
        in_split_a = examples[attribute] == value
        in_split_b = examples[attribute] != value
    else:
        raise ValueError(
            f"unknown split type {type!r}; expected 'continous', "
            "'continuous' or 'discrete'"
        )

    def _yes_no_counts(rows):
        # Plain Python ints so downstream arithmetic/rounding stays builtin.
        labels = rows[label]
        return [int((labels == 'Yes').sum()), int((labels == 'No').sum())]

    return {
        'df_splitA': _yes_no_counts(examples.loc[in_split_a]),
        'df_splitB': _yes_no_counts(examples.loc[in_split_b]),
    }

### Entropy & Gain

In [9]:
def entropy(li_2_ratio):
    """Compute the entropy (in bits) of each group of class counts.

    Parameters
    ----------
    li_2_ratio : iterable of sequences of numbers
        One sequence of per-class counts per group, e.g. ``[[4, 0], [2, 6]]``.
        Any iterable works, including ``dict.values()``.

    Returns
    -------
    (li_entropy, li_rat) : tuple of two lists
        ``li_entropy[i]`` is the entropy of group ``i`` rounded to two
        decimals, or the integer ``0`` when the group has fewer than two
        non-empty classes. ``li_rat[i]`` is the total count of group ``i``.

    Notes
    -----
    Classes with a zero count are skipped (``0 * log2(0)`` is taken as 0), so
    groups with more than two classes, a single class, or no classes at all
    are handled instead of being misreported or raising.
    """
    li_entropy = []
    li_rat = []

    for li_ratio in li_2_ratio:
        counts = list(li_ratio)
        group_total = sum(counts)  # computed once per group
        li_rat.append(group_total)

        populated = [count for count in counts if count != 0]
        if len(populated) < 2:
            # Empty or pure group: no uncertainty.
            li_entropy.append(0)
            continue

        summation = 0
        for count in populated:
            proportion = count / group_total
            summation += proportion * math.log2(proportion)
        # Rounded here (as before) so existing downstream numbers do not shift.
        li_entropy.append(round(-1 * summation, 2))

    return li_entropy, li_rat

def gain(li_entropy, li_rat, parent_entropy=1):
    """Information gain of a split: parent entropy minus weighted child entropy.

    Parameters
    ----------
    li_entropy : sequence of numbers
        Entropy of each child group.
    li_rat : sequence of numbers
        Size (example count) of each child group, paired with ``li_entropy``.
    parent_entropy : number, optional
        Entropy of the examples before the split. The default of ``1`` keeps
        the historical behaviour, which is only exact when the parent set is
        split evenly between two classes; pass the real value otherwise.

    Returns
    -------
    number
        ``parent_entropy - sum(entropy_i * size_i / total)`` rounded to two
        decimals. When every group is empty the weighted term is taken as 0
        rather than dividing by zero.
    """
    total = sum(li_rat)
    if total == 0:
        # No examples on any side: nothing to weight, avoid 0/0.
        return round(parent_entropy, 2)

    remainder = 0
    for child_entropy, size in zip(li_entropy, li_rat):
        remainder += child_entropy * (size / total)

    return round(parent_entropy - remainder, 2)

def get_gain(attribute, a_val, label, examples, type):
    """Information gain obtained by splitting ``examples`` on one attribute value.

    The arguments are forwarded unchanged to ``partition`` (``a_val`` is its
    ``value``); see that function for the meaning of ``type``.

    The baseline is the entropy of the partitioned examples themselves,
    computed from the per-split label counts, instead of a fixed value of 1.
    For a parent set that is evenly split between the two labels the result is
    identical to the previous implementation; for unbalanced sets the gain is
    no longer overstated.

    Returns
    -------
    float
        ``parent_entropy - sum(child_entropy_i * size_i / total)`` rounded to
        two decimals, or ``0.0`` when no example lands in either split.
    """
    dict_ret = partition(attribute, a_val, label, examples, type)
    split_counts = list(dict_ret.values())

    li_entropy, li_ratio = entropy(li_2_ratio=split_counts)

    total = sum(li_ratio)
    if total == 0:
        # Nothing was counted on either side, so there is nothing to gain.
        return 0.0

    # Per-label totals across both splits give the parent distribution.
    parent_counts = [sum(per_label) for per_label in zip(*split_counts)]
    parent_entropy = entropy(li_2_ratio=[parent_counts])[0][0]

    remainder = 0
    for child_entropy, size in zip(li_entropy, li_ratio):
        remainder += child_entropy * (size / total)

    return round(parent_entropy - remainder, 2)

### get_gain Example

In [12]:
# Gain of the binary split "Patrons == 'Some'" versus every other value.
print(
    get_gain(
        attribute='Patrons',
        a_val='Some',
        label='Label',
        examples=test_df,
        type='discrete',
    )
)

0.46


In [23]:
# Distinct 'Patrons' values in order of first appearance. dict.fromkeys
# de-duplicates while preserving order, so the dict below (and its displayed
# repr) is the same on every run; iterating a set of strings is not, because
# string hashing is randomised per interpreter session.
li_patrons_element = list(dict.fromkeys(test_df['Patrons']))

# Gain of the one-vs-rest split for each distinct value.
dict_gain = {
    element: get_gain('Patrons', element, 'Label', test_df, 'discrete')
    for element in li_patrons_element
}

dict_gain

{'None': 0.19, 'Full': 0.08, 'Some': 0.46}

In [26]:
# Rank the (value, gain) pairs from highest to lowest gain.
sorted_ret = list(dict_gain.items())
sorted_ret.sort(key=lambda pair: pair[1], reverse=True)
print(sorted_ret)

[('Some', 0.46), ('None', 0.19), ('Full', 0.08)]
