C4.5 Binary Decision Tree Implementation

Usage: 

1. Read csv file in; will be stored as a 2 Dimensional list. (See fread())
2. Train a classifier (i.e. train(list))
3. Prune the decision tree (i.e. prune_tree(tree, 0.5))
4. Predict the result (i.e. predict([.....], classifier))

The function assumes that the last column of your data is populated by labels.

In [46]:
import c45
import timeit
from random import shuffle
import importlib
import csv

In [47]:
importlib.reload(c45)

<module 'c45' from '/Users/maximillian/Documents/GitHub/CDS/Insights-FakeNews/c45.py'>

In [13]:
"""
Helper functions: type_conversion, fread
"""
def type_conversion(val):
        val = val.strip()

        try:
            if '.' in val:
                return float(val)
            else:
                return int(val)

        except ValueError:
            #For other types, return
            return val

def fread(f, col_labels = False):
    """
    takes a filepath, f, and a boolean argument, col_labels.
    By default, col_labels is False, implying that the columns do not have labels. If set to true,
    fread will remove the row containing column labels at the end.
    """
    data = csv.reader(open(f, 'rt'))
    lst = [[type_conversion(i) for i in r] for r in data]
    if col_labels:
        lst.pop(0)
    return lst

In [14]:
df = fread("./Iris.csv", True)
#n * d data, n rows d cols
df = [i[1:] for i in df]
shuffle(df)
df, df_test = df[:-50],df[-50:]

#these are for performance testing estimates
df2 = df[:int(len(df)/2)] #n/2
df3 = [i[:2] for i in df] #d/2
df4 = [i[:-1]+i for i in df] #2d
df5 = df+df # 2n
df6 = df5+df5 #4n
df7 = [i[:-1]+i for i in df4] #4d

In [15]:
"""
TODO:
Optimize
Testing
"""

data = fread("./test_val_dump.csv", True)
drop_first_col = [x[1:] for x in data]
val_data = drop_first_col

The base cases are the following:

•  All the examples from the training set belong to the same class ( a tree leaf labeled with that class is returned ).

•  The training set is empty ( returns a tree leaf called failure ).

•  The attribute list is empty ( returns a leaf labeled with the most frequent class or the disjuction of all the classes).
https://octaviansima.wordpress.com/2011/03/25/decision-trees-c4-5/

In [194]:
def test_times(df):
    times = []
    tree = c45.DecisionTree()
    for i in range(10):
        start = timeit.default_timer()
        tree.fit(df)
        stop = timeit.default_timer()
        times.append(stop-start)
    print(sum(times)/len(times)) 

In [195]:
#n d
test_times(df)
#n 2d
test_times(df4)
#2n d
test_times(df5)
test_times(df6)

0.012557539594126865
0.020265244401525707
0.01848158090142533
0.03079748079762794


In [203]:
test_times(df2)

0.006191417694208212


In [205]:
def test_accuracy(tr, test):
    tree = c45.DecisionTree(max_depth = 25, min_samples_leaf = 3, min_samples_split = 10)
    tree.fit(tr)
    acc = [(tree.classify(x[:-1]) == x[-1]) for x in test]
    print(Counter(acc))

In [206]:
test_accuracy(df, df_test)

Counter({True: 48, False: 2})


In [207]:
test_accuracy(val_data[:-1000], val_data[-1000:])

Counter({True: 966, False: 34})


In [208]:
test_times(val_data)

15.544272611194174


In [48]:
import numpy as np
def count_nodes(tree):
    """
    "tree" is a classifier, i.e. the underlying Tree object within a DecisionTree object.
    """
    if((tree.right_branch == None) and (tree.left_branch == None)):
        return 1
    else:
        if(not tree.right_branch == None):
            return 1 + count_nodes(tree.right_branch)
        if(not tree.left_branch == None): 
            return 1 + count_nodes(tree.left_branch)

def c45_tree_to_dict(model, feature_names):
    """
    Adaptation from utils.tree_to_dict
    """
    tree_ = model.classifier
    n_nodes = count_nodes(model.classifier)
    lb = model.classifier.left_branch
    rb = model.classifier.right_branch
    feature = model.classifier.feature
    threshold = model.classifier.value
    
    # traverse the tree
    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1

        if (lb[node_id] != rb[node_id]):
            stack.append((lb[node_id], parent_depth + 1))
            stack.append((rb[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    #aggregate results
    nodes = {}
    for i in range(n_nodes):
        nodes[i] = {
            "is_leaf": 1 if is_leaves[i] else 0,
            "depth": int(node_depth[i]),
            "id": int(i),
            "children_l": int(lb[i]),
            "children_r": int(rb[i]),
            "feature": feature_names[feature[i]],
            "threshold": threshold[i]
        }
    return {
        'n_nodes':int(n_nodes), 
        "nodes": nodes}#, 
        #"feature_importances": list(model.feature_importances_)}

In [52]:
tree = c45.DecisionTree()
tree.fit(df)
count_nodes(tree.classifier)

3

In [56]:
(tree.classifier.left_branch.gain)

0.0

In [40]:
c45_tree_to_dict(tree, ["1", "2", "3", "4"])

TypeError: 'Tree' object does not support indexing

In [34]:
print(df)

[[5.8, 2.7, 4.1, 1.0, 'Iris-versicolor'], [6.3, 2.5, 5.0, 1.9, 'Iris-virginica'], [6.8, 2.8, 4.8, 1.4, 'Iris-versicolor'], [6.8, 3.0, 5.5, 2.1, 'Iris-virginica'], [6.4, 2.7, 5.3, 1.9, 'Iris-virginica'], [6.6, 2.9, 4.6, 1.3, 'Iris-versicolor'], [5.1, 3.5, 1.4, 0.3, 'Iris-setosa'], [5.8, 2.7, 5.1, 1.9, 'Iris-virginica'], [5.7, 3.8, 1.7, 0.3, 'Iris-setosa'], [5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'], [7.4, 2.8, 6.1, 1.9, 'Iris-virginica'], [6.7, 3.0, 5.2, 2.3, 'Iris-virginica'], [6.5, 3.0, 5.5, 1.8, 'Iris-virginica'], [6.2, 3.4, 5.4, 2.3, 'Iris-virginica'], [5.4, 3.9, 1.7, 0.4, 'Iris-setosa'], [4.9, 2.4, 3.3, 1.0, 'Iris-versicolor'], [6.1, 2.9, 4.7, 1.4, 'Iris-versicolor'], [4.6, 3.6, 1.0, 0.2, 'Iris-setosa'], [6.5, 3.0, 5.2, 2.0, 'Iris-virginica'], [6.6, 3.0, 4.4, 1.4, 'Iris-versicolor'], [5.1, 3.4, 1.5, 0.2, 'Iris-setosa'], [4.6, 3.1, 1.5, 0.2, 'Iris-setosa'], [4.5, 2.3, 1.3, 0.3, 'Iris-setosa'], [5.0, 2.0, 3.5, 1.0, 'Iris-versicolor'], [6.1, 3.0, 4.9, 1.8, 'Iris-virginica'], [6.1, 2.8, 4