In [6]:
import pandas as pd
import numpy as np

# import file
df = pd.read_csv('../../res/tp2/german_credit.csv', sep=',') # columns: admit, gre, gpa, rank; 399 rows

continuous_attrs = {'Duration of Credit (month)': ['<= 12', '<= 24', '<= 36', '<= 48', '<= 60', '> 60'],
                    'Credit Amount': ['<= 500', '<= 1000', '<= 1500', '<= 2000', '<= 2500', '<= 3000', '<= 3500', '<= 4000', 
                                      '<= 4500', '<= 5000', '<= 5500', '<= 6000', '<= 6500', '<= 7000', '<= 7500', '<= 8000',
                                      '<= 9000', '<= 10000', '<= 11000', '<= 12000', '<= 13000', '<= 14000', '<= 15000', '> 15000'],
                    'Age (years)': ['< 30', '< 40', '< 50', '< 60', '>= 60']}

In [7]:
from sklearn.model_selection import train_test_split

# Separar los datos en datos de entrenamiento y testeo
train, test = train_test_split(df, test_size=0.2)

test.head()

Unnamed: 0,Creditability,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,...,Duration in Current address,Most valuable available asset,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker
436,1,4,15,4,6,1532,2,3,4,2,...,3,3,31,3,2,1,3,1,1,1
421,1,2,11,4,0,1322,4,3,4,2,...,4,3,40,3,2,2,3,1,1,1
627,1,4,18,4,3,1582,4,5,4,3,...,4,3,46,3,2,2,3,1,1,1
860,0,4,48,2,9,3914,5,3,4,1,...,2,1,38,1,2,1,3,1,1,1
700,0,1,36,2,1,8335,5,5,3,3,...,4,4,47,3,3,1,3,1,1,1


In [8]:
import math

global entropy
entropy = None
positive_creditability = 'Creditability == 1'

def get_entropy_term(p):
    return - p * math.log2(p) if not math.isclose(p, 0) else 0

def get_entropy(data, cond): # H(S)
    global entropy
    if entropy is not None:
        return entropy
    
    positive_freq = len(data.query(cond)) / len(data)
    negative_freq = 1 - positive_freq
    entropy = get_entropy_term(positive_freq) + get_entropy_term(negative_freq)
    
    return entropy

def get_attr_entropy(data, cond): # H(Sv)
    positive_freq = len(data.query(cond)) / len(data)
    negative_freq = 1 - positive_freq
    return get_entropy_term(positive_freq) + get_entropy_term(negative_freq)

def get_rows_by_attr_cond(data, attr, cond, cond_index):
    rows = None
    
    if cond_index == 0:
        rows = data.query(f'`{attr}` {cond}')
    else:
        prev_cond = continuous_attrs[attr][cond_index-1]
        rows = data.query(f'not(`{attr}` {prev_cond}) and `{attr}` {cond}')
        
    return rows
    
def gain(data, attr, class_cond):
    ret = get_entropy(train, positive_creditability)
    
    if attr in continuous_attrs:
        for i, cond in enumerate(continuous_attrs[attr]):
            rows = get_rows_by_attr_cond(data, attr, cond, i)
            ret -= len(rows) / len(data) * get_attr_entropy(rows, class_cond)
    else:
        for value in df[attr].unique(): # TODO FIX: PREGUNTAR SI VALOR DE ATRIBUTO NO APARECE EN TRAIN, FALLA
            rows = data.query(f'`{attr}` == {value}')
            ret -= len(rows) / len(data) * get_attr_entropy(rows, class_cond)
    
    return ret

gains = {}
for attr in train.columns:
    if attr == 'Creditability':
        continue
    gains[attr] = gain(train, attr, positive_creditability)

print(gains)

#df['Credit Amount'].hist(grid=True, bins=30, rwidth=0.9, color='#607c8e')
#print(get_entropy(train, train.Creditability == 1))

{'Account Balance': 0.08730152382963671, 'Duration of Credit (month)': 0.025758669384047556, 'Payment Status of Previous Credit': 0.05280189988536468, 'Purpose': 0.03288866495124507, 'Credit Amount': 0.03821498316991855, 'Value Savings/Stocks': 0.02398174737342457, 'Length of current employment': 0.014582072235789473, 'Instalment per cent': 0.004215909328400144, 'Sex & Marital Status': 0.007205673719162517, 'Guarantors': 0.003863802720333577, 'Duration in Current address': 0.0003671371317126859, 'Most valuable available asset': 0.015025279876438702, 'Age (years)': 0.012884195090870361, 'Concurrent Credits': 0.011105546588770464, 'Type of apartment': 0.010376004142437278, 'No of Credits at this Bank': 0.002530635156699405, 'Occupation': 0.0009510512938736559, 'No of dependents': 2.3662221944692474e-05, 'Telephone': 0.0017299382244533312, 'Foreign Worker': 0.005279726986266116}
