C4.5 Binary Decision Tree Implementation

Usage: 

1. Read csv file in; will be stored as a 2 Dimensional list. (See fread())
2. Train a classifier (i.e. train(list))
3. Prune the decision tree (i.e. prune_tree(tree, 0.5))
4. Predict the result (i.e. predict([.....], classifier))

The function assumes that the last column of your data is populated by labels.

In [244]:
import c45
import timeit
from random import shuffle
import importlib
import csv
from random import randrange

In [245]:
importlib.reload(c45)

<module 'c45' from '/Users/maximillian/Documents/GitHub/CDS/Insights-FakeNews/c45.py'>

In [13]:
"""
Helper functions: type_conversion, fread
"""
def type_conversion(val):
        val = val.strip()

        try:
            if '.' in val:
                return float(val)
            else:
                return int(val)

        except ValueError:
            #For other types, return
            return val

def fread(f, col_labels = False):
    """
    takes a filepath, f, and a boolean argument, col_labels.
    By default, col_labels is False, implying that the columns do not have labels. If set to true,
    fread will remove the row containing column labels at the end.
    """
    data = csv.reader(open(f, 'rt'))
    lst = [[type_conversion(i) for i in r] for r in data]
    if col_labels:
        lst.pop(0)
    return lst

In [14]:
df = fread("./Iris.csv", True)
#n * d data, n rows d cols
df = [i[1:] for i in df]
shuffle(df)
df, df_test = df[:-50],df[-50:]

#these are for performance testing estimates
df2 = df[:int(len(df)/2)] #n/2
df3 = [i[:2] for i in df] #d/2
df4 = [i[:-1]+i for i in df] #2d
df5 = df+df # 2n
df6 = df5+df5 #4n
df7 = [i[:-1]+i for i in df4] #4d

In [15]:
"""
TODO:
Optimize
Testing
"""

data = fread("./test_val_dump.csv", True)
drop_first_col = [x[1:] for x in data]
val_data = drop_first_col

The base cases are the following:

•  All the examples from the training set belong to the same class ( a tree leaf labeled with that class is returned ).

•  The training set is empty ( returns a tree leaf called failure ).

•  The attribute list is empty ( returns a leaf labeled with the most frequent class or the disjuction of all the classes).
https://octaviansima.wordpress.com/2011/03/25/decision-trees-c4-5/

In [194]:
def test_times(df):
    times = []
    tree = c45.DecisionTree()
    for i in range(10):
        start = timeit.default_timer()
        tree.fit(df)
        stop = timeit.default_timer()
        times.append(stop-start)
    print(sum(times)/len(times)) 

In [195]:
#n d
test_times(df)
#n 2d
test_times(df4)
#2n d
test_times(df5)
test_times(df6)

0.012557539594126865
0.020265244401525707
0.01848158090142533
0.03079748079762794


In [203]:
test_times(df2)

0.006191417694208212


In [205]:
def test_accuracy(tr, test):
    tree = c45.DecisionTree(max_depth = 25, min_samples_leaf = 3, min_samples_split = 10)
    tree.fit(tr)
    acc = [(tree.classify(x[:-1]) == x[-1]) for x in test]
    print(Counter(acc))

In [206]:
test_accuracy(df, df_test)

Counter({True: 48, False: 2})


In [207]:
test_accuracy(val_data[:-1000], val_data[-1000:])

Counter({True: 966, False: 34})


In [208]:
test_times(val_data)

15.544272611194174


In [86]:
import numpy as np
def count_nodes(tree):
    """
    "tree" is a classifier, i.e. the underlying Tree object within a DecisionTree object.
    """
    if((tree.right_branch == None) and (tree.left_branch == None)):
        return 1
    else:
        if(not tree.right_branch == None):
            return 1 + count_nodes(tree.right_branch)
        if(not tree.left_branch == None): 
            return 1 + count_nodes(tree.left_branch)

def c45_tree_to_dict(model, feature_names):
    """
    Adaptation from utils.tree_to_dict
    """
    tree_ = model.classifier
    n_nodes = count_nodes(model.classifier)
    lb = []#model.classifier.left_branch
    rb = []#model.classifier.right_branch
    feature = model.classifier.feature
    threshold = model.classifier.value
    
    def flatten_branch(branch, acc):
        if((branch.right_branch == None) and (branch.left_branch == None)):
            return acc.append(branch)
        else:
            if(not tree.right_branch == None):
                return flatten_branch(tree.right_branch, acc.append(branch))
            if(not tree.left_branch == None): 
                return flatten_branch(tree.left_branch, acc.append(branch))
    lbtemp = flatten_branch(model.classifier.left_branch, [])
    rbtemp = flatten_branch(model.classifier.right_branch, [])
    i = 0
    for tree in lbtemp:
        lb.append(tree)
        i += 1
    i = len(rb)
    for tree in rbtemp:
        rb.append(tree)
        i += 1
    
    # traverse the tree
    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1

        if (lb[node_id] != rb[node_id]):
            stack.append((lb[node_id], parent_depth + 1))
            stack.append((rb[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True

    #aggregate results
    nodes = {}
    for i in range(n_nodes):
        nodes[i] = {
            "is_leaf": 1 if is_leaves[i] else 0,
            "depth": int(node_depth[i]),
            "id": int(i),
            "children_l": int(lb[i]),
            "children_r": int(rb[i]),
            "feature": feature_names[feature[i]],
            "threshold": threshold[i]
        }
    return {
        'n_nodes':int(n_nodes), 
        "nodes": nodes,
        "feature_importances": list(model.classifier.gain)}

In [87]:
tree = c45.DecisionTree()
tree.fit(df)
count_nodes(tree.classifier)

3

In [90]:
def test_forest(df, ss, ntrees):
    times = []
    forest = c45.RandomForest(subsample_ratio = ss, n_trees = ntrees)
    for i in range(10):
        start = timeit.default_timer()
        forest.fit(df)
        stop = timeit.default_timer()
        times.append(stop-start)
    print(sum(times)/len(times)) 


In [93]:
test_forest(val_data, .1, 5)

2.152095545598422


In [94]:
test_forest(val_data, .2, 5)

6.315452301497862


In [96]:
test_forest(val_data, .4, 5)

16.441954580102173


In [98]:
test_forest(val_data, .1, 50)

20.760438473096293


In [160]:
test = c45.RandomForest(subsample_ratio = .1, n_trees = 5)

In [161]:
test.fit(val_data)

In [150]:
(tree.classifier.left_branch.gain)

0.0

In [213]:
c45.evaluate_forest(val_data, 6, subsample_ratio = .1, ntrees = 5)

[96.66666666666667,
 96.21468926553672,
 96.10169491525423,
 95.76271186440678,
 96.55367231638418,
 96.4406779661017]

In [214]:
importlib.reload(c45)
#rf = c45.RandomForest(subsample_ratio = .1, n_trees = 5)
#rf.fit(val_data)
#rf.score(val_data, 6)
c45.mean_accuracy(val_data, 5, .1, ntrees = 5)

96.6949152542373

In [215]:
c45.mean_accuracy(val_data, 5, .5, ntrees = 25)

96.98681732580037

In [241]:
c45.mean_tree_accuracy(val_data, 5)

96.46892655367233

In [258]:
def time_classifier(df, classifier):
    """
    timing function that should also have the side effect of actually training a forest
    """
    times = []
    for i in range(10):
        start = timeit.default_timer()
        classifier.fit(df)
        stop = timeit.default_timer()
        times.append(stop-start)
    print(sum(times)/len(times))
    
def time_scoring(df, n_folds, subsamp_ratio, ntrees, forest = False):
    times = []
    if forest:
        for i in range(10):
            start = timeit.default_timer()
            meanacc = c45.mean_accuracy(df, 5, subsamp_ratio, ntrees)
            stop = timeit.default_timer()
            times.append(stop-start)
        return (meanacc, (sum(times)/len(times)))
    else:
        for i in range(10):
            start = timeit.default_timer()
            meanacc = c45.mean_tree_accuracy(df, 5)
            stop = timeit.default_timer()
            times.append(stop-start)
        return (meanacc, (sum(times)/len(times)))
    


In [246]:
forest1 = c45.RandomForest(subsample_ratio = .5, n_trees = 25)

In [247]:
time_classifier(val_data, forest1)

118.45014807390108


In [249]:
time_scoring(val_data, 5, .5, 25, forest = True)

(97.0056497175141, 412.03126984929696)

In [248]:
forest1.forest

[<c45.DecisionTree at 0x118deada0>,
 <c45.DecisionTree at 0x118e12550>,
 <c45.DecisionTree at 0x1190481d0>,
 <c45.DecisionTree at 0x11914bc18>,
 <c45.DecisionTree at 0x118fd4588>,
 <c45.DecisionTree at 0x118e47a58>,
 <c45.DecisionTree at 0x119146358>,
 <c45.DecisionTree at 0x119036e10>,
 <c45.DecisionTree at 0x119135c50>,
 <c45.DecisionTree at 0x118e82518>,
 <c45.DecisionTree at 0x119142a20>,
 <c45.DecisionTree at 0x118ff9710>,
 <c45.DecisionTree at 0x118e70eb8>,
 <c45.DecisionTree at 0x119029438>,
 <c45.DecisionTree at 0x1190498d0>,
 <c45.DecisionTree at 0x118e5e320>,
 <c45.DecisionTree at 0x118e87b70>,
 <c45.DecisionTree at 0x1191fa588>,
 <c45.DecisionTree at 0x11900dac8>,
 <c45.DecisionTree at 0x118e7e668>,
 <c45.DecisionTree at 0x119052a20>,
 <c45.DecisionTree at 0x11908d438>,
 <c45.DecisionTree at 0x119139e48>,
 <c45.DecisionTree at 0x11902e0b8>,
 <c45.DecisionTree at 0x118fff400>]

In [256]:
count_nodes(forest1.forest[4].classifier)

5

In [259]:
time_scoring(val_data, 5, None, None, forest = False)

(96.32768361581921, 59.30279422180028)