If a target is a classification outcome taking on values $0,1,…,K-1$ for node $m$, representing a region $R_m$ with $N_m$ observations, let

$$ p_{mk} = 1 / N_m \sum_{x_i \in R_m} I(y_i = k) $$

be the proportion of class $k$ observations in node $m$

### Gini Impurity: 

$$ H(X_m) = \sum_{k} p_{mk}(1 - p_{mk}) $$

### Entropy: 
$$ H(X_m) = -\sum_{k} p_{mk}\log p_{mk} $$

### Misclassification: 
$$ H(X_m) = 1 - max(p_{mk} $$

In [3]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn as sk

In [4]:
zoo = pd.read_csv('zoo.csv')

In [5]:
zoo.head()

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [171]:
def get_split(from_region, f, s):
    left = pd.DataFrame()
    right = pd .DataFrame()
    for index, instance in from_region.iterrows():
        if instance[f] >= s:
            right = right.append(instance)
        else:
            left = left.append(instance)
    return left, right

def gini(p):
    # prp is proportion of instances w/ label
    impurity= 0
    for label, prp in p.items():
        impurity += prp * (1 - prp)
    return impurity

def entropy(p):
    raise NotImplementedError('entropy() not implemented')
    
def misclassification(p):
    raise NotImplementedError('misclassification() not implemented')
    

def loss(pairs, loss='gini'):
    if not pairs.size:
        return 0.0
    p = {}
    num_pairs = len(pairs)
    for fvalue, label in pairs:
        if label not in p:
            p[label] = 0
        p[label] += ( 1 / num_pairs )
    if loss == 'gini':
        return gini(p)
    if loss == 'entropy':
        return entropy(p)
    if loss == 'misclassification':
        return misclassification(p)
    raise ValueError("loss should be either gini, misclassification, or entropy")
        
def get_best_split(examples, labels, features=[]):
    best_feature, best_value, best_loss_reduction = \
        0, 0.0, 0.0
    for feature in range(examples.shape[1]):
        # print('feature', feature)
        pairs = np.array(sorted(
            [(examples[i][feature], labels[i]) for i in range(len(examples))]))
        for index, (value, label) in enumerate(pairs):
            # print('\tindex', index)
            left, right = pairs[:index], pairs[index:]
            # Calculates loss "as is" i.e. w/ no splitting
            # and subtracts that from the loss from splitting w/ given 
            # left and right. We take the split and feature that 
            # creates the largest reduction in loss (largest (in AV) negative value)
            current_loss_reduction = \
                loss(left) + loss(right) - loss(pairs)
            # print('\t\tLOSS REDUCTION', current_loss_reduction)
            if current_loss_reduction < best_loss_reduction:
                best_feature = feature
                best_value = value
                best_loss_reduction = current_loss_reduction

    #return (best_feature, best_value, best_loss_reduction)
    return {'left': left, 'right': right, 
            'best_split': best_value, 'cost_reduction': best_loss_reduction, 
            'feature': best_feature
           }

def split_node_examples(examples, labels, min_samples_split=2):
    # process left node samples, note left is a DataFrame
    left, right = examples['left'], examples['right']
    if not left.size or not right.size:
        # no split occured so ....
        print('in empty left or right')
        examples['left'] = examples['right'] = np.concatenate([left, right])
        return 
    
    if left.size >= min_samples_split:
        best_left_split = get_best_split(left, labels)
        if abs(best_left_split['cost_reduction']) > 0:
            examples['left'] = best_left_split
            labels = best_left_split['left']
            split_node_examples(examples['left'], labels, min_samples_split)
    
    # process right node samples, right is also a DataFrame
    if right.size >= min_samples_split:
        # Since this isn't gonna be a terminal node, get rid of right instances
        best_right_split = get_best_split(left, labels)
        if abs(best_right_split['cost_reduction']) > 0:
            examples['right'] = best_right_split
            split_node_examples(examples['right'], labels, min_samples_split)
    
# Split up region
def build_tree(examples, labels, min_split_samples=2):
    if isinstance(examples, pd.DataFrame):
        examples = examples.values
    if isinstance(examples, list):
        examples = np.array(examples)
        
    labels = np.array(labels)
    
    root = get_best_split(examples, labels)
    split_node_examples(root, labels, min_split_samples)
    return root

df = pd.DataFrame(np.arange(24).reshape(6,4),
                      columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19
5,20,21,22,23


In [172]:
from collections import OrderedDict
def sortOD(od):
    res = OrderedDict()
    for k, v in sorted(od.items()):
        if isinstance(v, dict):
            res[k] = sortOD(v)
        else:
            res[k] = v
    return res

y = df['D']
train = df.drop('D', axis=1)
tree = build_tree(train, y)
import pprint
pprint.pprint(tree)

{'left': array([[ 3,  3],
       [ 7,  7],
       [11, 11],
       [15, 15]]), 'right': array([[19, 19]]), 'best_split': 6, 'cost_reduction': -0.050000000000000155, 'feature': 0}
{'left': array([[ 3,  3],
       [ 7,  7],
       [11, 11]]), 'right': array([[15, 15]]), 'best_split': 7, 'cost_reduction': -0.08333333333333326, 'feature': 0}
{'left': array([[3, 3],
       [7, 7]]), 'right': array([[11, 11]]), 'best_split': 7, 'cost_reduction': -0.16666666666666674, 'feature': 0}
{'left': array([[3, 3]]), 'right': array([[7, 7]]), 'best_split': 7, 'cost_reduction': -0.5, 'feature': 0}
{'left': array([[3, 3]]), 'right': array([[7, 7]]), 'best_split': 7, 'cost_reduction': -0.5, 'feature': 0}
{'left': array([[3, 3],
       [7, 7]]), 'right': array([[11, 11]]), 'best_split': 7, 'cost_reduction': -0.16666666666666674, 'feature': 0}
{'left': array([[3, 3]]), 'right': array([[7, 7]]), 'best_split': 7, 'cost_reduction': -0.5, 'feature': 0}
{'left': array([[3, 3]]), 'right': array([[7, 7]]), 'best_s

In [9]:
def tree_total_cost(tree, s=0):
    if isinstance(tree['left'], pd.DataFrame):
        s += split_cost_for_node(tree['left'])
    else:
        s = tree_total_cost(tree['left'])
    
    if isinstance(tree['right'], pd.DataFrame):
        s += split_cost_for_node(tree['right'])
    else:
        s = tree_total_cost(tree['right'])
    return s

print('before',  split_cost_for_node(df), 'after', tree_total_cost(tree))

before 0.8333333333333333 after 0.6666666666666667


In [24]:
get_best_split(df.drop(['D'], axis=1), df.drop(['D'],axis=1).columns.values,  df['D'])

feature A
pairs [(0, 3), (4, 7), (8, 11), (12, 15), (16, 19), (20, 23)]
	index 0
		right [3, 7, 11, 15, 19, 23]
		average label 13.0
		loss 280.0
		pairs [3, 7, 11, 15, 19, 23]
		average label 13.0
		loss 280.0
		LOSS REDUCTION 0.0
	index 1
		left [3]
		average label 3.0
		loss 0.0
		right [7, 11, 15, 19, 23]
		average label 15.0
		loss 160.0
		pairs [3, 7, 11, 15, 19, 23]
		average label 13.0
		loss 280.0
		LOSS REDUCTION -120.0
	index 2
		left [3, 7]
		average label 5.0
		loss 8.0
		right [11, 15, 19, 23]
		average label 17.0
		loss 80.0
		pairs [3, 7, 11, 15, 19, 23]
		average label 13.0
		loss 280.0
		LOSS REDUCTION -192.0
	index 3
		left [3, 7, 11]
		average label 7.0
		loss 32.0
		right [15, 19, 23]
		average label 19.0
		loss 32.0
		pairs [3, 7, 11, 15, 19, 23]
		average label 13.0
		loss 280.0
		LOSS REDUCTION -216.0
	index 4
		left [3, 7, 11, 15]
		average label 9.0
		loss 80.0
		right [19, 23]
		average label 21.0
		loss 8.0
		pairs [3, 7, 11, 15, 19, 23]
		average label 13.0

('A', 12, -216.0)