In [1]:
import numpy as np
import pandas as pd


class DecisionTreeNode:
    """A single node of the decision tree.

    Attributes:
        value: Payload supplied by the caller. The tree builder in this file
            stores the split column on internal nodes and the predicted
            label on leaves.
        edges: Mapping from an edge key to the child node reached through it.
        is_leaf: True until the first child is attached.
        depth: Depth of the node as supplied by the caller.
        accuracy: Accuracy figure supplied by the caller.
    """

    def __init__(self, value, depth, accuracy):
        self.value = value
        self.edges = dict()
        self.is_leaf = True
        self.depth, self.accuracy = depth, accuracy

    def add_edge(self, key, value):
        """Attach child ``value`` under ``key``; the node stops being a leaf."""
        self.edges[key] = value
        self.is_leaf = False

    def info(self):
        """Print a five-line summary of the node to stdout."""
        # Same text as slicing the brackets off str(list(...)): each key's
        # repr, separated by ', '.
        edge_keys = ', '.join(repr(key) for key in self.edges)
        rows = (
            ('Value: ', self.value),
            ('Edges: ', edge_keys),
            ('Is leaf: ', self.is_leaf),
            ('Depth: ', self.depth),
            ('Accuracy: ', self.accuracy),
        )
        for caption, shown in rows:
            print(caption + str(shown))

        
def entropies(data, col, label):
    """Entropy (base 2) of ``label`` overall and within each value of ``col``.

    Args:
        data: DataFrame holding both columns.
        col: Name of the column whose values partition the rows.
        label: Name of the target column.

    Returns:
        pd.Series with a default integer index. Position 0 is the entropy of
        ``label`` over all rows; the remaining positions follow the order of
        ``data[col].unique()``, one entry per distinct value.
    """
    def label_entropy(frame):
        # value_counts() only lists labels that occur, so log2 never sees 0.
        shares = frame[label].value_counts() / len(frame)
        return -sum(shares * np.log2(shares))

    column = data[col]
    per_value = [
        label_entropy(data.loc[column == level])
        for level in column.unique()
    ]
    return pd.Series([label_entropy(data)] + per_value)

def gain(data, col, label):
    """Information gain obtained by splitting ``data`` on ``col``.

    Computed as the entropy of ``label`` over all rows minus the weighted
    sum of the per-value entropies, each weighted by the share of rows that
    carry that value.

    Args:
        data: DataFrame holding both columns.
        col: Name of the candidate split column.
        label: Name of the target column.

    Returns:
        The information gain as a float.
    """
    column = data[col]
    ents = entropies(data, col, label)
    # entropies() returns the parent entropy first and then one entry per
    # value in column.unique() order. The weights are built in that same
    # order: value_counts() is sorted by frequency, so pairing it
    # positionally with the entropies attaches weights to the wrong values.
    weights = [(column == level).mean() for level in column.unique()]
    children = sum(w * e for w, e in zip(weights, ents.iloc[1:]))
    return ents.iloc[0] - children
        
def find_split(data, label):
    """Return the feature column with the highest information gain.

    Args:
        data: DataFrame containing the feature columns and ``label``.
        label: Name of the target column; it is never a split candidate.

    Returns:
        The name of the best column (the first one on ties), or ``None``
        when ``data`` has no column other than ``label``.
    """
    # Keyword form: the positional axis argument to drop() was removed in
    # pandas 2.0.
    features = data.drop(columns=label).columns
    if len(features) == 0:
        return None
    scores = [gain(data, col, label) for col in features]
    # Index into the feature list the scores were computed over, not into
    # data.columns, which still contains the label and is therefore shifted
    # whenever the label is not the last column.
    return features[scores.index(max(scores))]

def percolate_predictions(observation, node):
    """Walk ``observation`` down the tree from ``node`` and return the leaf value.

    At every non-leaf node, ``node.value`` is used as the key into
    ``observation``, and the result selects the outgoing edge to follow.
    A ``KeyError`` propagates if the node has no edge for that value.
    """
    current = node
    while not current.is_leaf:
        branch_key = observation[current.value]
        current = current.edges[branch_key]
    return current.value


class DecisionTree:
    """Decision tree classifier built by recursively splitting on the column
    chosen by ``find_split``.

    Attributes:
        root: Root ``DecisionTreeNode`` of the fitted tree, ``None`` before fitting.
        max_depth: A node at this depth is never split further.
        max_node_accuracy: A node whose majority-class share reaches this
            value is never split further.
        depth: Depth of the deepest leaf created so far.
    """

    def __init__(self, max_depth = 10, max_node_accuracy = 1):
        self.root = None
        self.max_depth = max_depth
        self.depth = 0
        self.max_node_accuracy = max_node_accuracy

    def make_tree(self, data, label, depth = 0):
        """Recursively build the subtree for ``data`` and return its top node.

        Args:
            data: DataFrame with the remaining feature columns plus ``label``.
            label: Name of the target column.
            depth: Depth assigned to the node created by this call.

        Returns:
            The ``DecisionTreeNode`` for ``data``. If the tree has no root
            yet, that node also becomes ``self.root``.
        """
        counts = data[label].value_counts()
        accuracy = counts.max() / len(data)
        can_split = (
            data.shape[1] > 1
            and data[label].nunique() != 1
            and depth < self.max_depth
            and accuracy < self.max_node_accuracy
        )
        # Choose a split only when the node may actually be split: once the
        # label is the only column left there is nothing to score.
        split = find_split(data, label) if can_split else None
        node = DecisionTreeNode(split, depth, accuracy)
        if self.root is None:
            self.root = node
        if split is None:
            # Leaf: predict the majority label and record the reached depth.
            node.value = counts.idxmax()
            self.depth = max(self.depth, depth)
            return node
        for val in data[split].unique():
            # Keyword form: the positional axis argument to drop() was
            # removed in pandas 2.0.
            subset = data.loc[data[split] == val].drop(columns=split)
            if subset.empty:
                # A NaN split value compares unequal to itself; there are no
                # rows to build a child from.
                continue
            node.add_edge(val, self.make_tree(subset, label, depth + 1))
        return node

    def fit(self, data, label):
        """Build the tree from ``data``, discarding any previously fitted tree."""
        # Reset first so that fitting again does not keep the old root/depth.
        self.root = None
        self.depth = 0
        self.root = self.make_tree(data, label)

    def predict(self, data):
        """Predict a label for every row of ``data``.

        Returns:
            pd.Series of predictions in row order. It carries a fresh default
            index, not the index of ``data``.
        """
        predictions = [
            percolate_predictions(data.iloc[i], self.root)
            for i in range(len(data))
        ]
        return pd.Series(predictions)

In [2]:
# Load the tennis data. 'day' is removed before fitting so it is not offered
# to the tree as a split candidate. drop(columns=...) replaces the positional
# axis argument (removed in pandas 2.0) and avoids mutating the frame in place.
df = pd.read_csv('tennis.csv').drop(columns='day')

tree = DecisionTree(max_depth=10, max_node_accuracy=.9)

tree.fit(df, 'play')

In [9]:
# Print the summary of the child node reached from the root through its
# 'Rain' edge. To look further down, chain another lookup on that node,
# e.g. .edges[<value>].info().
tree.root.edges['Rain'].info()

Value: wind
Edges: 'Weak', 'Strong'
Is leaf: False
Depth: 1
Accuracy: 0.6


In [43]:
from sklearn.model_selection import train_test_split

# Hold out 2 of the 14 rows. The seed is fixed so that a fresh kernel selects
# the same rows and the comparison below is reproducible.
train, test = train_test_split(df, test_size = 2/14, random_state = 0)

tennis = DecisionTree(max_depth=10, max_node_accuracy=1)

tennis.fit(train, 'play')

# predict() returns a Series with a fresh 0..n-1 index, so compare it with the
# true labels below by position rather than by index label.
print(tennis.predict(test))
test['play']

0     No
1    Yes
dtype: object


0     No
3    Yes
Name: play, dtype: object

In [61]:
# Load the pizza data and remove the columns that must not be offered to the
# tree as split candidates. drop(columns=...) replaces the positional axis
# argument (removed in pandas 2.0) and avoids mutating the frame in place.
pizza = pd.read_csv('pizza_data.csv').drop(columns=['Pizza_ID', 'Category', 'Charlie_eats'])

# Fixed seed so that a fresh kernel selects the same train/test rows.
train, test = train_test_split(pizza, test_size = 0.2, random_state = 0)

ptree = DecisionTree(max_depth=10, max_node_accuracy=.8)

ptree.fit(train, 'Nikhil_eats')

# predict() returns a Series with a fresh 0..n-1 index, so compare it with the
# true labels below by position rather than by index label.
print(ptree.predict(test))
test['Nikhil_eats']

0    0
1    1
2    1
3    1
4    0
dtype: int64


21    0
6     1
22    1
1     1
11    0
Name: Nikhil_eats, dtype: int64