### Imports

##### References
[Decision Trees from Scratch](https://medium.com/@penggongting/implementing-decision-tree-from-scratch-in-python-c732e7c69aea)

In [47]:
import numpy as np
import math
from collections import Counter
from itertools import starmap

### Entropy Formulation
Part of building a decision tree deciding how the tree branches and breaks off into individual leaves. **Entropy** is how we characterize if a split is good or not. 
$$Entropy = -\sum_X P(X = Class) * \log_2 P(X = Class)$$
If a split is good its got very low Entropy, meaning most of the data on either side of the split is the same class. Roughly speaking that algorithm looks something like this

```
Divide all the data on a node into 2 parts, left and right
> Compute the Entropy of Left and Right
> return len(Left)/len(Total) * Entropy Left + len(Right)/len(Right) * Entropy Right
```

In [48]:
def _best_split(features, data):
    split = starmap(_split, zip(features, data))
    split = sorted(list(split), key= lambda x : x[1]) # sort by entropy
    return split[0] # return feature with the lowest entropy

def _split(feature, data):
    '''
    feature - the thing we are splitting on
    data - the thing which we are breaking up using that feature
    '''
    best_entropy, best_val = 1e6, None
    for val in feature:
        sel = val < data
        left, right = data[sel], data[~sel]
        curr_entropy = _entropy(left, right)

        if curr_entropy < best_entropy:
            best_entropy = curr_entropy
            best_val = val
    return feature, best_entropy, best_val

def _get_entropy(side):
    '''
    Compute the entropy for a given side
    '''
    data_counts = Counter(side)
    n = len(side)
    total = 0
    for c in data_counts.keys():
        prob = data_counts[c]/n
        prob *= math.log2(prob)
        total += prob
    return total

def _entropy(left, right):
    '''
    Every split is characterized by breaking some dataset into 2 sides
    the right side and the left side (left, right)

    The net entropy is thus the combined entropy of those 2 sides
    '''
    left_entropy, right_entropy = _get_entropy(left), _get_entropy(right)
    n = len(left) + len(right)
    return len(left)/n * left_entropy + len(right)/n * right_entropy

### Decision Tree Class

In [43]:
class DecisionTree():
    def __init__(self, max_depth):
        self.current_depth = 0
        self.max_depth = max_depth
        self.tree = None
    
    def fit(self, X, y, feature_names, parent={}, depth=0):
        if parent is None:
            return None
        elif len(y) == 0:
            return None
        elif all([_tmp == y[0] for _tmp in y]):
            return {'val':y[0]}
        elif depth >= self.max_depth: 
            return None
        
        # recursively generate trees
        else:
            feature, best_entropy, cutoff = _best_split(X, y)
            
            # cut data into its components
            left_sel = X[: , feature] < cutoff
            right_sel = X[: , feature] >= cutoff
            left_X, left_y = X[left_sel], y[left_sel]
            right_X, right_y = X[right_sel], y[right_sel]
            
            parent = {
                'feature' : feature,
                'cutoff' : cutoff,
            }
            parent['left'] = self.fit(X, y, feature_names, {}, depth+1)
            parent['right'] = self.fit(X, y, feature_names, {}, depth+1)
            self.depth += 1
            self.tree = parent
            return parent # when we recurse all the way back up this will be the root
        
    def __repr__(self):
        return self.tree

### Training

In [38]:
from sklearn.datasets import load_iris

In [39]:
data = load_iris()
X, y, feature_names = data['data'], data['target'], data['feature_names']
assert type(X) == np.ndarray and type(y) == np.ndarray

In [40]:
dT = DecisionTree(10)

In [49]:
tree = dT.fit(X, y, feature_names)

IndexError: arrays used as indices must be of integer (or boolean) type