In [1]:
import pandas as pd
import numpy as np

# Load the credit dataset. The class column is `Creditability`; the remaining
# columns are the attributes evaluated further down.
df = pd.read_csv('../../res/tp2/german_credit.csv', sep=',')

# Ordered bins for the continuous attributes. Each entry is a comparison that
# gets appended to the back-quoted column name to form a query expression.
# The bounds are cumulative, so a bin is selected as "does not satisfy the
# previous entry and satisfies this one"; the order of each list matters.
continuous_attrs = {'Duration of Credit (month)': ['<= 12', '<= 24', '<= 36', '<= 48', '<= 60', '> 60'],
                    'Credit Amount': ['<= 500', '<= 1000', '<= 1500', '<= 2000', '<= 2500', '<= 3000', '<= 3500', '<= 4000', 
                                      '<= 4500', '<= 5000', '<= 5500', '<= 6000', '<= 6500', '<= 7000', '<= 7500', '<= 8000',
                                      '<= 9000', '<= 10000', '<= 11000', '<= 12000', '<= 13000', '<= 14000', '<= 15000', '> 15000'],
                    'Age (years)': ['< 30', '< 40', '< 50', '< 60', '>= 60']}

In [2]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets. The seed is fixed so that a
# "Restart Kernel -> Run All" reproduces the same split, and therefore the same
# entropy and gain values derived from `train`.
RANDOM_STATE = 42
train, test = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)

test.head()

Unnamed: 0,Creditability,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,...,Duration in Current address,Most valuable available asset,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker
717,0,4,24,2,9,4591,4,3,2,3,...,3,2,54,3,2,3,4,1,2,1
959,0,2,24,2,3,3092,2,2,3,4,...,2,3,22,3,1,1,3,1,2,1
679,1,2,6,2,3,484,1,4,3,4,...,3,1,28,1,2,1,2,1,1,1
791,0,3,15,2,3,2327,1,2,2,2,...,3,1,25,3,2,1,2,1,1,1
268,1,4,12,4,2,1258,1,2,2,2,...,4,2,22,3,1,2,2,1,1,1


In [3]:
import math

# Module-level slot in which get_entropy keeps the dataset entropy H(S);
# None means it has not been computed yet.
entropy = None

# Query expression that selects the rows belonging to the positive class.
positive_creditability = 'Creditability == 1'

def get_entropy_term(p):
    """Return a single summand of the Shannon entropy, ``-p * log2(p)``.

    A probability that ``math.isclose`` considers equal to zero contributes
    ``0``, which avoids evaluating ``log2(0)``.
    """
    if math.isclose(p, 0):
        return 0
    return -p * math.log2(p)

def get_entropy(data, cond): # H(S)
    """Return the binary entropy H(S) of `data` with respect to `cond`.

    Parameters:
        data: DataFrame holding the sample set S.
        cond: query expression selecting the positive rows of `data`.

    Returns:
        The entropy in bits; 0 for an empty `data`.

    The value is recomputed on every call. Returning a previously stored
    result regardless of the arguments would hand back the entropy of whatever
    dataset was evaluated first (for example after the data has been re-split
    or when a subset is passed in). The most recent result is still written to
    the module-level `entropy` so code reading that variable keeps working.
    """
    global entropy

    total = len(data)
    if total == 0:
        # An empty set carries no uncertainty and must not divide by zero.
        entropy = 0
        return entropy

    positive_freq = len(data.query(cond)) / total
    negative_freq = 1 - positive_freq
    entropy = get_entropy_term(positive_freq) + get_entropy_term(negative_freq)

    return entropy

def get_attr_entropy(data, cond): # H(Sv)
    """Return the binary entropy H(Sv) of the subset `data` with respect to `cond`.

    Parameters:
        data: DataFrame holding the rows of one attribute value / bin.
        cond: query expression selecting the positive rows of `data`.

    Returns:
        The entropy in bits; 0 for an empty subset.
    """
    total = len(data)
    if total == 0:
        # A bin or value with no rows contributes nothing; without this guard
        # the frequency computation below divides by zero.
        return 0

    positive_freq = len(data.query(cond)) / total
    negative_freq = 1 - positive_freq
    return get_entropy_term(positive_freq) + get_entropy_term(negative_freq)

def get_rows_by_attr_cond(data, attr, cond, cond_index):
    """Return the rows of `data` that fall in one bin of a continuous attribute.

    Parameters:
        data: DataFrame to filter.
        attr: column name; it is back-quoted inside the query expression.
        cond: comparison for this bin, e.g. ``'<= 24'``.
        cond_index: position of `cond` in ``continuous_attrs[attr]``.

    The first bin is selected by `cond` alone. Every later bin additionally
    excludes the rows that satisfy the preceding entry of
    ``continuous_attrs[attr]``.
    """
    expr = f'`{attr}` {cond}'

    if cond_index != 0:
        preceding = continuous_attrs[attr][cond_index - 1]
        expr = f'not(`{attr}` {preceding}) and `{attr}` {cond}'

    return data.query(expr)
    
def gain(data, attr, class_cond):
    """Return the information gain obtained by splitting `data` on `attr`.

    Parameters:
        data: DataFrame holding the sample set S.
        attr: column to split on. Columns listed in `continuous_attrs` are
            split into their configured bins; any other column is split by
            its distinct values.
        class_cond: query expression selecting the positive rows.

    Returns:
        H(S) minus the size-weighted sum of H(Sv) over the subsets Sv;
        0 for an empty `data`.
    """
    total = len(data)
    if total == 0:
        return 0

    # The baseline is the entropy of the set that was passed in, under the
    # class condition that was passed in, not of a module-level frame.
    ret = get_entropy(data, class_cond)

    if attr in continuous_attrs:
        subsets = (get_rows_by_attr_cond(data, attr, cond, i)
                   for i, cond in enumerate(continuous_attrs[attr]))
    else:
        # Distinct values are taken from `data` itself, so values absent from
        # this set never produce empty subsets. A boolean mask is used rather
        # than a formatted query string so non-numeric values work as well.
        subsets = (data[data[attr] == value] for value in data[attr].unique())

    for rows in subsets:
        if len(rows) == 0:
            # Unpopulated bins have zero weight; skipping them also avoids
            # computing the entropy of an empty subset.
            continue
        ret -= len(rows) / total * get_attr_entropy(rows, class_cond)

    return ret

# Information gain of every column of the training set with respect to the
# positive class condition, keyed by column name.
# NOTE(review): `train.columns` also yields the class column `Creditability`
# itself — confirm whether it should be skipped before the gains are ranked.
gains = {}
for attr in train.columns:
    gains[attr] = gain(train, attr, positive_creditability)
    
#df['Credit Amount'].hist(grid=True, bins=30, rwidth=0.9, color='#607c8e')
#print(get_entropy(train, train.Creditability == 1))