In [1]:
import pandas as pd
import numpy as np

# import file
df = pd.read_csv('../../res/tp2/german_credit.csv', sep=',') # columns: admit, gre, gpa, rank; 399 rows
df = df[['Creditability', 'Account Balance', 'Duration of Credit (month)', 'Credit Amount', 'Age (years)']]

In [2]:
all_attrs = set(df)
all_attrs.remove('Creditability')

continuous_attrs = {'Duration of Credit (month)': ['<= 12', '<= 24', '<= 36', '<= 48', '<= 60', '> 60'],
                    'Credit Amount': ['<= 500', '<= 1000', '<= 1500', '<= 2000', '<= 2500', '<= 3000', '<= 3500', '<= 4000', 
                                      '<= 4500', '<= 5000', '<= 5500', '<= 6000', '<= 6500', '<= 7000', '<= 7500', '<= 8000',
                                      '<= 9000', '<= 10000', '<= 11000', '<= 12000', '<= 13000', '<= 14000', '<= 15000', '> 15000'],
                    'Age (years)': ['< 30', '< 60', '>= 60']}
discrete_attrs = dict(map(lambda attr_name: (attr_name, list(df[attr_name].unique())),
                          filter(lambda attr_name: attr_name not in continuous_attrs, all_attrs)
                         ))
print(all_attrs)
print(list(discrete_attrs.keys()))

{'Age (years)', 'Account Balance', 'Credit Amount', 'Duration of Credit (month)'}
['Account Balance']


In [3]:
from sklearn.model_selection import train_test_split

# Separar los datos en datos de entrenamiento y testeo
train, test = train_test_split(df, test_size=0.2)

test.head()

Unnamed: 0,Creditability,Account Balance,Duration of Credit (month),Credit Amount,Age (years)
300,1,1,12,339,45
757,0,1,12,1082,48
411,1,4,24,2978,32
93,1,1,9,1364,59
285,1,4,6,672,54


In [4]:
import math

global entropy
entropy = None
positive_creditability = 'Creditability == 1'

def format_query_string(attr, value, index=-1):
    if attr in continuous_attrs:
        return f'not (`{attr}` {continuous_attrs[attr][index-1]}) and `{attr}` {value}' if index > 0 else f'`{attr}` {value}'
    
    return f'`{attr}` == {value}'

def get_entropy_term(p):
    return - p * math.log2(p) if not math.isclose(p, 0) else 0

def get_entropy(data, cond=positive_creditability): # H(S)
    positive_freq = len(data.query(cond)) / len(data) if len(data) != 0 else 0
    negative_freq = 1 - positive_freq
    entropy = get_entropy_term(positive_freq) + get_entropy_term(negative_freq)
    
    return entropy

def get_attr_entropy(data, cond): # H(Sv)
    positive_freq = len(data.query(cond)) / len(data) if len(data) != 0 else 0 # TODO: check what to do if len(data) == 0
    negative_freq = 1 - positive_freq
    return get_entropy_term(positive_freq) + get_entropy_term(negative_freq)

def get_rows_by_attr_cond(data, attr, cond, cond_index):
    rows = None
    
    if cond_index == 0:
        rows = data.query(f'`{attr}` {cond}')
    else:
        prev_cond = continuous_attrs[attr][cond_index-1]
        rows = data.query(f'not(`{attr}` {prev_cond}) and `{attr}` {cond}')
        
    return rows
    
def gain(data, attr, class_cond=positive_creditability):
    ret = get_entropy(data)
    
    if attr in continuous_attrs:
        for i, cond in enumerate(continuous_attrs[attr]):
            rows = get_rows_by_attr_cond(data, attr, cond, i)
            ret -= len(rows) / len(data) * get_attr_entropy(rows, class_cond)
            # print(f'{attr}: {ret}')
    else:
        for value in discrete_attrs[attr]: # TODO FIX: PREGUNTAR SI VALOR DE ATRIBUTO NO APARECE EN TRAIN, FALLA
            rows = data.query(f'`{attr}` == {value}')
            ret -= len(rows) / len(data) * get_attr_entropy(rows, class_cond)
            # print(f'{attr},{value}: {ret}')
    
    return ret

gains = {}
for attr in train.columns:
    if attr == 'Creditability':
        continue
    gains[attr] = gain(train, attr)

print(gains)

#df['Credit Amount'].hist(grid=True, bins=30, rwidth=0.9, color='#607c8e')
#print(get_entropy(train, train.Creditability == 1))

{'Account Balance': 0.08588779185909069, 'Duration of Credit (month)': 0.02698709047140118, 'Credit Amount': 0.05058689375350858, 'Age (years)': 0.012210269140525136}


In [5]:
train['Creditability'].mode()[0]

1

In [12]:
from tree import Tree, NodeType

def get_max_gain(gains):
    max_attr = None
    max_gain = 0
    for attr, gain in gains.items():
        if gain > max_gain:
            max_gain = gain
            max_attr = attr
    return max_attr, max_gain

used_attrs = set({})
root_attr, _ = get_max_gain(gains)
def add_level(data, current_level_attr, node=None):
    global used_attrs
    
    tree = None
    current_level_attr_node = None
    if node is None:
        tree = Tree()
        tree.root = current_level_attr_node = Tree.Node(current_level_attr)
    else:
        current_level_attr_node = node.add_child(NodeType.attr, current_level_attr)
    
    used_attrs.add(current_level_attr)
    
    
    values = continuous_attrs[current_level_attr] if current_level_attr in continuous_attrs else discrete_attrs[current_level_attr]
    for i, val in enumerate(values):
        if current_level_attr == root_attr:
            print(val)
        current_level_attr_value_node = current_level_attr_node.add_child(NodeType.val, val)
        rows = data.query(format_query_string(current_level_attr, val, i))
        # print(f"{format_query_string(current_level_attr, val, i)}: {len(rows)}")
        if (len(rows) == 0):
            current_level_attr_value_node.add_child(NodeType.leaf, 'NULL')
            continue
            
        classes = rows['Creditability'].unique()
        if len(classes) == 1:  # Only positive or negative examples left (leaf)
            current_level_attr_value_node.add_child(NodeType.leaf, classes[0])
            continue
        else:
            unprocessed_attrs = all_attrs - used_attrs
            if len(unprocessed_attrs) == 0:  # No more attributes to calculate gain for (leaf)
                current_level_attr_value_node.add_child(NodeType.leaf, rows['Creditability'].mode()[0])  # The most frequent value for Creditability TODO ASK
            else:
                current_gains = {}
                for attr in unprocessed_attrs:
                    current_gains[attr] = gain(rows, attr)
                next_level_attr, _ = get_max_gain(current_gains)
                # print(current_gains)
                if (next_level_attr is None):
                    #print(f'GAIN NEGATIVO: {current_gains}')
                    # print(rows)
                    a = 1
                else:
                    add_level(rows, next_level_attr, current_level_attr_value_node)
    
    used_attrs.remove(current_level_attr)
    
    if tree:
        return tree


#tr = Tree(root_attr)
#add_level(train, root_attr, tr.root)
#print('\n\n\n\n\n**********************************\n\n\n')
#print(tr)

In [13]:
def evaluate_condition(attr, asked_value, tree_cond): # attr = Age, asked_value = 28, tree_cond = '<= 30'
    if attr in continuous_attrs:
        index = continuous_attrs[attr].index(tree_cond)
        return eval(f'not (`{asked_value}` {continuous_attrs[attr][index-1]}) and `{asked_value}` {tree_cond}' if index > 0 else f'`{asked_value}` {tree_cond}')
        
    return asked_value == tree_cond

def classify(row, tree):
    current_node = tree.root
    current_node_type = NodeType.attr
    
    while(current_node.children):
        current_node_val = current_node.value
        current_node_type = current_node.node_type
        print(f'Node value: {current_node_val} - Type: {current_node_type}')
        
        if current_node_type == NodeType.attr:
            row_val = row[current_node_val]
            found_value = False
            for child in current_node.children:
                print(f'Attr {current_node_val} Child Value: {child.value}')
                if evaluate_condition(current_node_val, row_val, child.value):
                    current_node = child
                    found_value = True
                    break
            if not found_value:
                raise Exception('Could not find attribute value')
        elif current_node_type == NodeType.val:
            current_node = current_node.children[0]
        else:
            raise Exception('Invalid State')
            
    return current_node.value

tr = Tree(root_attr)
add_level(train, root_attr, tr.root)
print(tr)
print(df.iloc[0])
classify(df.iloc[0], tr)

1
2
4
3

Account Balance

Account Balance

1

Credit Amount

<= 500

Age (years)

< 30

< 60

1

>= 60

NULL

<= 1000

Age (years)

< 30

Duration of Credit (month)

<= 12

0

<= 24

0

<= 36

NULL

<= 48

NULL

<= 60

NULL

> 60

NULL

< 60

Duration of Credit (month)

<= 12

0

<= 24

0

<= 36

NULL

<= 48

NULL

<= 60

NULL

> 60

NULL

>= 60

1

<= 1500

Duration of Credit (month)

<= 12

Age (years)

< 30

1

< 60

1

>= 60

1

<= 24

Age (years)

< 30

0

< 60

0

>= 60

0

<= 36

NULL

<= 48

NULL

<= 60

NULL

> 60

NULL

<= 2000

Duration of Credit (month)

<= 12

1

<= 24

Age (years)

< 30

0

< 60

1

>= 60

NULL

<= 36

0

<= 48

0

<= 60

NULL

> 60

NULL

<= 2500

Duration of Credit (month)

<= 12

Age (years)

< 30

1

< 60

1

>= 60

0

<= 24

Age (years)

< 30

0

< 60

0

>= 60

1

<= 36

Age (years)

< 30

0

< 60

0

>= 60

NULL

<= 48

NULL

<= 60

NULL

> 60

NULL

<= 3000

Duration of Credit (month)

<= 12

1

<= 24

Age (years)

< 30

1

< 60

1

>= 60

1

<= 3

Exception: Could not find attribute value