In [25]:
import pandas as pd
import numpy as np

# Load the German credit CSV and keep only the target column ('Creditability')
# plus the four attributes this notebook works with.
df = pd.read_csv('../../res/tp2/german_credit.csv', sep=',')[
    ['Creditability', 'Account Balance', 'Duration of Credit (month)', 'Credit Amount', 'Age (years)']
]

In [26]:
# Every column except the target is a candidate attribute for the tree.
all_attrs = set(df.columns)
all_attrs.remove('Creditability')

# Continuous attributes are discretised into ordered bins. Each entry is the
# right-hand side of a pandas query comparison; a bin implicitly excludes the
# rows already matched by the bin before it.
continuous_attrs = {
    # 12-month steps up to 60, then an open-ended last bin.
    'Duration of Credit (month)': [f'<= {months}' for months in range(12, 61, 12)] + ['> 60'],
    # 500-wide steps up to 8000, 1000-wide steps up to 15000, then open-ended.
    'Credit Amount': (
        [f'<= {amount}' for amount in range(500, 8001, 500)]
        + [f'<= {amount}' for amount in range(9000, 15001, 1000)]
        + ['> 15000']
    ),
    'Age (years)': ['< 30', '< 60', '>= 60'],
}

# Discrete attributes: map each remaining attribute to the distinct values it
# takes in the whole dataset.
discrete_attrs = {
    attr_name: list(df[attr_name].unique())
    for attr_name in all_attrs
    if attr_name not in continuous_attrs
}
print(all_attrs)
print(list(discrete_attrs.keys()))

{'Account Balance', 'Credit Amount', 'Duration of Credit (month)', 'Age (years)'}
['Account Balance']


In [27]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets (80% / 20%).
# A fixed random_state makes the split -- and therefore everything computed
# from `train` and `test` afterwards -- reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.2, random_state=42)

test.head()

Unnamed: 0,Creditability,Account Balance,Duration of Credit (month),Credit Amount,Age (years)
974,0,1,6,448,23
859,0,1,12,727,33
405,1,2,48,5381,40
340,1,4,30,3077,40
680,1,2,10,1048,23


In [28]:
import math

# Module-level placeholder, kept so the name stays defined for any cell that reads it.
entropy = None
# pandas query expression selecting the rows whose Creditability is 1.
positive_creditability = 'Creditability == 1'

def format_query_string(attr, value, index):
    """Build the ``DataFrame.query`` expression selecting one branch of ``attr``.

    Parameters
    ----------
    attr : str
        Column name. It is wrapped in backticks, so it may contain spaces.
    value
        For a continuous attribute, the bin condition (e.g. ``'<= 24'``);
        for a discrete attribute, the value to compare with ``==``.
    index : int
        Position of ``value`` in ``continuous_attrs[attr]``. Ignored for
        discrete attributes.

    Returns
    -------
    str
        The query expression.
    """
    if attr not in continuous_attrs:
        return f'`{attr}` == {value}'

    if index == 0:
        # The first bin has no predecessor to exclude. Without this guard,
        # index - 1 == -1 would silently pick the LAST bin as "previous".
        return f'`{attr}` {value}'

    # Later bins exclude the rows already covered by the preceding bin.
    return f'not (`{attr}` {continuous_attrs[attr][index-1]}) and `{attr}` {value}'

def get_entropy_term(p):
    """Return one term ``-p * log2(p)`` of the entropy sum.

    A frequency of zero contributes 0, which also avoids evaluating
    ``log2(0)``.
    """
    if math.isclose(p, 0):
        return 0
    return -p * math.log2(p)

def get_entropy(data, cond=positive_creditability):
    """Binary entropy H(S) of ``data`` with respect to ``cond``.

    ``cond`` is a pandas query expression; the rows matching it form the
    positive class and all other rows the negative class. An empty ``data``
    is treated as having a positive frequency of 0.
    """
    total = len(data)
    positive_freq = 0 if total == 0 else len(data.query(cond)) / total
    return get_entropy_term(positive_freq) + get_entropy_term(1 - positive_freq)

def get_attr_entropy(data, cond):
    """Binary entropy H(Sv) of the subset ``data`` with respect to ``cond``.

    ``cond`` is a pandas query expression identifying the positive class.
    """
    total = len(data)
    if total != 0:
        p_positive = len(data.query(cond)) / total
    else:
        # TODO: confirm the desired behaviour for an empty subset; for now it
        # is treated as having no positive rows.
        p_positive = 0
    p_negative = 1 - p_positive
    return get_entropy_term(p_positive) + get_entropy_term(p_negative)

def get_rows_by_attr_cond(data, attr, cond, cond_index):
    """Return the rows of ``data`` that fall in one bin of a continuous attribute.

    ``cond`` is the bin condition at position ``cond_index`` of
    ``continuous_attrs[attr]``. Every bin except the first also excludes the
    rows matched by the bin immediately before it.
    """
    expr = f'`{attr}` {cond}'
    if cond_index != 0:
        lower_bin = continuous_attrs[attr][cond_index-1]
        expr = f'not(`{attr}` {lower_bin}) and {expr}'
    return data.query(expr)
    
def gain(data, attr, class_cond=positive_creditability):
    """Information gain obtained by splitting ``data`` on ``attr``.

    Computes ``H(S) - sum(|Sv| / |S| * H(Sv))`` over every bin (continuous
    attribute) or value (discrete attribute) of ``attr``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate.
    attr : str
        A key of ``continuous_attrs`` or ``discrete_attrs``.
    class_cond : str
        pandas query expression identifying the positive class.

    Returns
    -------
    float
        The information gain; ``0.0`` when ``data`` is empty.
    """
    total = len(data)
    if total == 0:
        # Nothing to split; also avoids dividing by zero below.
        return 0.0

    # H(S) must use the same class condition as every H(Sv) term.
    ret = get_entropy(data, class_cond)

    if attr in continuous_attrs:
        for i, cond in enumerate(continuous_attrs[attr]):
            rows = get_rows_by_attr_cond(data, attr, cond, i)
            ret -= len(rows) / total * get_attr_entropy(rows, class_cond)
    else:
        # TODO: confirm how values that never appear in `data` should be
        # handled; such a value yields an empty subset here.
        for value in discrete_attrs[attr]:
            rows = data.query(f'`{attr}` == {value}')
            ret -= len(rows) / total * get_attr_entropy(rows, class_cond)

    return ret

# Information gain of every candidate attribute (all columns except the
# target) on the full training set.
gains = {}
for attr in train.columns:
    if attr != 'Creditability':
        gains[attr] = gain(train, attr)

print(gains)

#df['Credit Amount'].hist(grid=True, bins=30, rwidth=0.9, color='#607c8e')
#print(get_entropy(train, train.Creditability == 1))

{'Account Balance': 0.09281664976303972, 'Duration of Credit (month)': 0.02612336805265416, 'Credit Amount': 0.04563660474374813, 'Age (years)': 0.00898832247051077}


In [29]:
# Most frequent Creditability value in the training set (first one if several tie).
train['Creditability'].mode()[0]

1

In [30]:
from tree import Tree

def get_max_gain(gains):
    """Return ``(attribute, gain)`` for the entry of ``gains`` with the largest gain.

    Only strictly positive gains qualify, and on ties the first entry in
    iteration order wins. If no entry qualifies (including an empty mapping),
    ``(None, 0)`` is returned.
    """
    best = (None, 0)
    for candidate in gains.items():
        if candidate[1] > best[1]:
            best = candidate
    return best


# Attributes already used on the path from the root to the node being expanded.
used_attrs = set()
# The tree is rooted at the attribute with the highest gain on the training set.
root_attr, _ = get_max_gain(gains)
def add_level(data, current_level_attr, node):
    """Recursively grow the decision tree below ``node``.

    Adds a child for ``current_level_attr`` under ``node``, then one child per
    bin/value of that attribute. Each of those value nodes receives either a
    leaf or a further attribute level built from the matching rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows that reach ``node``.
    current_level_attr : str
        Attribute to split on at this level.
    node
        Tree node exposing ``add_child(label)``; the call is presumed to
        return the newly created child node (verify against the ``tree`` module).

    Leaves
    ------
    * ``'NULL'`` when no row matches the branch;
    * the single class when all matching rows share it;
    * the most frequent class when no attribute is left, or when none of the
      remaining attributes has a positive gain.
    """
    # used_attrs holds the attributes on the current root-to-node path.
    used_attrs.add(current_level_attr)
    try:
        current_level_attr_node = node.add_child(current_level_attr)

        if current_level_attr in continuous_attrs:
            values = continuous_attrs[current_level_attr]
        else:
            values = discrete_attrs[current_level_attr]

        for i, val in enumerate(values):
            if current_level_attr == root_attr:
                # Progress output: one line per branch of the root attribute.
                print(val)
            value_node = current_level_attr_node.add_child(val)
            rows = data.query(format_query_string(current_level_attr, val, i))
            if len(rows) == 0:
                value_node.add_child('NULL')
                continue

            classes = rows['Creditability'].unique()
            if len(classes) == 1:  # Only positive or negative examples left (leaf)
                value_node.add_child(classes[0])
                continue

            # The most frequent value for Creditability among the matching rows.
            majority_class = rows['Creditability'].mode()[0]

            unprocessed_attrs = all_attrs - used_attrs
            if len(unprocessed_attrs) == 0:  # No more attributes to split on (leaf)
                value_node.add_child(majority_class)
                continue

            current_gains = {attr: gain(rows, attr) for attr in unprocessed_attrs}
            next_level_attr, _ = get_max_gain(current_gains)
            if next_level_attr is None:
                print(f'GAIN NEGATIVO: {current_gains}')
                # No remaining attribute improves the split: close the branch
                # with the majority class instead of leaving it without a leaf.
                value_node.add_child(majority_class)
            else:
                add_level(rows, next_level_attr, value_node)
    finally:
        # Always restore the path state, even if building a subtree fails.
        used_attrs.remove(current_level_attr)


# Build the decision tree from the training set, starting at the attribute with
# the highest gain; add_level fills in the lower levels recursively.
# NOTE(review): Tree(root_attr) labels the root and add_level then adds a child
# with the same label under tr.root, so the root attribute appears twice in the
# printed tree -- confirm this is intended.
tr = Tree(root_attr)
add_level(train, root_attr, tr.root)
# Visual separator between the progress lines printed by add_level and the tree dump.
print('\n\n\n\n\n**********************************\n\n\n')
print(tr)

1
GAIN NEGATIVO: {'Duration of Credit (month)': 0.0}
GAIN NEGATIVO: {'Age (years)': 0.0}
GAIN NEGATIVO: {'Age (years)': 0.0}
GAIN NEGATIVO: {'Age (years)': 0.0}
GAIN NEGATIVO: {'Age (years)': 0.0}
GAIN NEGATIVO: {'Age (years)': 0.0}
GAIN NEGATIVO: {'Age (years)': 0.0}
2
GAIN NEGATIVO: {'Duration of Credit (month)': 0.0}
GAIN NEGATIVO: {'Duration of Credit (month)': 0.0}
GAIN NEGATIVO: {'Age (years)': 0.0}
GAIN NEGATIVO: {'Duration of Credit (month)': 0.0}
GAIN NEGATIVO: {'Age (years)': 0.0}
GAIN NEGATIVO: {'Age (years)': 0.0}
GAIN NEGATIVO: {'Age (years)': 0.0}
4
GAIN NEGATIVO: {'Duration of Credit (month)': 0.0}
GAIN NEGATIVO: {'Age (years)': 0.0}
GAIN NEGATIVO: {'Age (years)': 0.0}
3
GAIN NEGATIVO: {'Duration of Credit (month)': 0.0}
GAIN NEGATIVO: {'Age (years)': 0.0}
GAIN NEGATIVO: {'Duration of Credit (month)': 0.0}





**********************************




Account Balance

Account Balance

1

Credit Amount

<= 500

1

<= 1000

Age (years)

< 30

Duration of Credit (month)

<= 1