### Gini Impurity: 

$$ I_G(M) = \sum_{i=1}^{M} \hat{p}_{mk} (1-\hat{p}_{mk} ) = \sum_{i=1}^{M} (\hat{p}_{mk}  - \hat{p}_{mk} ^2) = \sum_{i=1}^{M} \hat{p}_{mk}  - \sum_{i=1}^{M} \hat{p}_{mk} ^2 = 1 - \sum_{i=1}^{M} \hat{p}_{mk}^2 = \sum_{i = 1, i \neq k}^{M} \hat{p}_{mi}  \hat{p}_{mk}  $$

$$ \hat{p}_{mk} = \dfrac{1}{N_m}\sum_{x_i \in R_m} I(y_i = k)$$

### Entropy: 
$$ I_H(E) = -\sum_{k=1}^{c}\hat{p}_{mk} \log \hat{p}_{mk}  $$

In [13]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn as sk

In [14]:
zoo = pd.read_csv('zoo.csv')

In [15]:
zoo.head()

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [94]:
def p(node, current_instance, y_attribute):
    s = 0.0
    for index, example in node.iterrows(): 
        if example[y_attribute] == current_instance[y_attribute]:
            s += 1
    return s / node.shape[0]

# Gets the Gini Impurity of a single node
def split_cost_for_node(node, y_attribute):
    impurity = 0.0
    for index, example in node.iterrows(): 
        # using p() here in this way is slow 
        pr = p(node, example, y_attribute)
        impurity += pr * (1 - pr)
    return impurity

def test_split_for_finding_optimal(from_region, f, s):
    left = pd.DataFrame()
    right = pd .DataFrame()
    for index, instance in from_region.iterrows():
        if instance[f] >= s:
            right = right.append(instance)
        else:
            left = left.append(instance)
    return left, right

# Get the best splitting criterion
def get_split_criteria(from_region, y_attribute):
    features = list(from_region)
    min_cost = 9999999999
    min_split = {}
    for f in features:
        uniq = from_region[f].unique()
        for s in uniq:
            # get left and right node from split on f feature w/ s as value
            left, right = test_split_for_finding_optimal(from_region, f, s)
            # The cost of the left and right node together 
            cost = sum([split_cost_for_node(left, y_attribute), split_cost_for_node(right, y_attribute)])
            if cost < min_cost:
                min_split = {'left': left, 'right': right, 'split': s, 'cost': cost, 'feature': f}
            # print('feature', f, 'split', s, '\nleft', left, '\nright', right, '\n')
    return min_split

def split_node(node, y_attribute, min_samples_split=2):
    
    # process left node samples, note left is a DataFrame
    left, right = node['left'], node['right']
    if left.empty or right.empty:
        node['left'] = node['right'] = pd.concat([left, right])
        return 
    
    left_samples = left.shape[0]
    if left_samples >= min_samples_split:
        node['left'] = get_split_criteria(left, y_attribute)
        split_node(node['left'], y_attribute, min_samples_split)
    
    # process right node samples, right is also a DataFrame
    right = node['right']
    right_samples = right.shape[0]
    if right_samples >= min_samples_split:
        # Since this isn't gonna be a terminal node, get rid of right instances
        node['right'] = get_split_criteria(right, y_attribute)
        split_node(node['right'], y_attribute, min_samples_split)
    
# Split up region
def build_tree(region, y_attribute, min_split_samples=2):
    root = get_split_criteria(region, y_attribute)
    split_node(root, y_attribute, min_split_samples)
    return root

df = pd.DataFrame(np.arange(96).reshape(24,4),
                      columns=['A', 'B', 'C', 'D'])

In [95]:
tree = build_tree(df, 'D', 4)
tree

{'cost': 0.9565217391304344,
 'feature': 'D',
 'left': {'cost': 0.9545454545454549,
  'feature': 'D',
  'left': {'cost': 0.9523809523809526,
   'feature': 'D',
   'left': {'cost': 0.9499999999999998,
    'feature': 'D',
    'left': {'cost': 0.9473684210526314,
     'feature': 'D',
     'left': {'cost': 0.9444444444444445,
      'feature': 'D',
      'left': {'cost': 0.9411764705882351,
       'feature': 'D',
       'left': {'cost': 0.9375,
        'feature': 'D',
        'left': {'cost': 0.9333333333333331,
         'feature': 'D',
         'left': {'cost': 0.9285714285714287,
          'feature': 'D',
          'left': {'cost': 0.9230769230769234,
           'feature': 'D',
           'left': {'cost': 0.9166666666666664,
            'feature': 'D',
            'left': {'cost': 0.909090909090909,
             'feature': 'D',
             'left': {'cost': 0.8999999999999999,
              'feature': 'D',
              'left': {'cost': 0.8888888888888888,
               'feature': 'D',
 

In [97]:
def tree_total_cost(tree, y_attribute, s=0):
    if isinstance(tree['left'], pd.DataFrame):
        s += split_cost_for_node(tree['left'], y_attribute)
    else:
        s = tree_cost(tree['left'], y_attribute)
    
    if isinstance(tree['right'], pd.DataFrame):
        s += split_cost_for_node(tree['right'], y_attribute)
    else:
        s = tree_cost(tree['right'], y_attribute)
    return s

print('before',  split_cost_for_node(df, 'D'), 'after', tree_cost(tree, 'D'))

before 0.9583333333333337 after 0.6666666666666667
