In [23]:
import pandas
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from collections import OrderedDict
from math import log

In [24]:
# Load the test data set from ./test.csv (relative to the working directory).
df = pandas.read_csv("./test.csv")


In [25]:
def n_distinct_dict(rows):
    """
    Tally how many rows carry each label.

    The label of a row is its last element (row[-1]).

    rows : iterable of indexable rows
    returns : dict mapping label -> number of rows with that label,
              keyed in order of first appearance
    """
    tally = {}
    for row in rows:
        label = row[-1]
        # Unseen labels start from zero before being incremented.
        tally[label] = tally.get(label, 0) + 1
    return tally

def entropy(X):
    """
    Calculate the base-2 Shannon entropy of the labels in X (as per Octavian).

    X : rows whose last element is the class label
    returns : float, -sum(p_i * log2(p_i)) over the distinct labels;
              0.0 when X contains no rows
    """
    label_counts = n_distinct_dict(X)
    result = 0.0

    for occurrences in label_counts.values():
        # P(C_i): share of the rows that carry this label
        prob = float(occurrences) / len(X)
        # log2(p) expressed through the natural logarithm
        result -= prob * (log(prob) / log(2))
    return result

def gini(X):
    """
    Calculate the Gini index of the labels in X.

    The value is the sum of p_i * p_j over every ordered pair of distinct
    labels (i != j), where p_i is the share of rows carrying label i.

    X : rows whose last element is the class label
    returns : float impurity; 0.0 when there are fewer than two distinct labels
    """
    n_rows = len(X)
    freq = n_distinct_dict(X)
    impurity = 0.0

    for label_a, count_a in freq.items():
        prob_a = float(count_a) / n_rows
        for label_b, count_b in freq.items():
            # Only pairs of different labels contribute to the impurity.
            if label_a != label_b:
                impurity += prob_a * (float(count_b) / n_rows)
    return impurity

def var(X):
    """
    Calculate the population variance of the last element of every row.

    X : rows whose last element can be converted with float()
    returns : mean squared deviation from the mean (divides by N, not N-1);
              0 when X is empty
    """
    n_rows = len(X)
    if n_rows == 0:
        return 0

    targets = [float(row[-1]) for row in X]
    centre = sum(targets) / n_rows
    squared_dev = [(t - centre) ** 2 for t in targets]
    return sum(squared_dev) / n_rows

In [26]:
def df_to_dict(x):
    """
    Build a plain dict from a column-indexed container.

    Iterating x must yield its keys (column names) and x[key] must return
    the matching column.

    returns : dict mapping each key to x[key], in iteration order
    """
    return {name: x[name] for name in x}
# Convert the loaded data into a {column name: column} dictionary.
test = df_to_dict(df)
# Display the dictionary's keys as a list (fromkeys keeps each key once, in order).
list(OrderedDict.fromkeys(test))

['Test1', 'Test2']

In [27]:
# Inspect the column dictionary built from the CSV.
print(test)

{'Test1': 0    1
1    2
2    3
3    1
Name: Test1, dtype: int64, 'Test2': 0    1
1    2
2    3
3    1
Name: Test2, dtype: int64}


In [34]:
class Tree:
    """
    Decision Tree class

    A single node of a binary decision tree.

    col          : index of the column this node splits on (-1 by default)
    value        : value the column is compared against at this node
    right_branch : child Tree for one side of the split
    left_branch  : child Tree for the other side of the split
    results      : payload stored on the node; None by default
    """
    def __init__(self, col=-1, value=None, right_branch=None, left_branch=None, results=None):
        # Split definition
        self.col, self.value = col, value
        # Children of the split
        self.right_branch, self.left_branch = right_branch, left_branch
        # Payload
        self.results = results


def prune_tree(tree, least_gain, eval_fun = entropy):
    """
    Prune a decision tree in place, bottom-up.

    Whenever both children of a node are leaves and the split between them
    reduces impurity by less than least_gain, the children are dropped and
    the node becomes a leaf holding their combined label counts.

    tree : type Tree
    least_gain : float, minimum impurity reduction a split must achieve to be kept
    eval_fun : entropy(X) or gini(X)
    """
    # A node missing a child (a leaf, or an empty node) has nothing to prune.
    if tree.right_branch is None or tree.left_branch is None:
        return

    # Prune the subtrees first so that freshly merged leaves can be merged again here.
    if tree.right_branch.results is None: #if the right branch is a node
        prune_tree(tree.right_branch, least_gain, eval_fun)
    if tree.left_branch.results is None: #if the left branch is a node
        prune_tree(tree.left_branch, least_gain, eval_fun)

    if tree.right_branch.results is None or tree.left_branch.results is None:
        return

    # Rebuild one single-column row per counted label so the impurity
    # functions (which read the last element of each row) can be reused.
    right, left = [], []
    for v, c in tree.right_branch.results.items():
        right += [[v]] * c
    for v, c in tree.left_branch.results.items():
        left += [[v]] * c

    merged = left + right
    if not merged:
        # Two empty leaves: no rows to weigh, so leave the node untouched.
        return

    p = float(len(right)) / len(merged)
    diff_entropy = eval_fun(merged) - p*eval_fun(right) - (1-p)*eval_fun(left)
    if diff_entropy < least_gain:
        tree.right_branch, tree.left_branch = None, None
        tree.results = n_distinct_dict(merged)

In [3]:
"""
Helper functions
"""
def remove_duplicates():
    """
    Placeholder helper - not implemented yet.

    TODO: the intended behaviour is not specified anywhere in this notebook;
    fill in the body (and parameters) once it is decided.
    """
    # An explicit stub keeps the cell syntactically valid (a def needs a body).
    raise NotImplementedError("remove_duplicates is not implemented yet")


def C45(X, Y):
    """
    Placeholder entry point - not implemented yet.

    X, Y : NOTE(review): presumably the feature rows and their labels for a
           C4.5-style decision tree - confirm before implementing.
    """
    # An explicit stub keeps the cell syntactically valid (a def needs a body).
    raise NotImplementedError("C45 is not implemented yet")
    

'\nHelper functions\n'

In [29]:
def partition(r, c, val):
    """
    Split rows into two lists according to the value held in column c.

    Numeric val (int or float): a row matches when row[c] >= val.
    Any other val             : a row matches when row[c] == val.

    r : rows to split
    c : column index to test
    val : split value
    returns : (matching rows, non-matching rows), each in original order
    """
    is_numeric = isinstance(val, (float, int))

    def belongs_to_first(row):
        # Threshold test for numbers, equality test for everything else.
        if is_numeric:
            return row[c] >= val
        return row[c] == val

    matching = [row for row in r if belongs_to_first(row)]
    remaining = [row for row in r if not belongs_to_first(row)]
    return (matching, remaining)



def construct_tree(df, criteria = entropy):
    """
    Decision tree construction - by default, the entropy function is used to calculate the criteria for splitting.

    Every value found in every feature column is tried as a split point; the
    split with the largest impurity reduction is kept and both halves are
    built recursively with the same criteria. When no split improves on the
    current impurity, a leaf holding the label counts is returned.

    df : rows with the last element of each row reserved for the label.
         NOTE(review): the code indexes df[0] and iterates row by row, so it
         appears to expect a list of rows rather than a pandas DataFrame - confirm.
    criteria : entropy or gini calculation function
    returns : Tree (an empty Tree() when df has no rows)
    """
    #Base Case: Empty Set
    if len(df) == 0:
        return Tree()

    #Impurity of the current rows; a split is only useful if it lowers it
    score = criteria(df)
    Attribute_best = None
    Set_best = None
    Gain_best = 0.0

    num_col = len(df[0]) - 1  # last column of each row is the label
    for col in range(num_col):
        for value in [row[col] for row in df]:
            #Split dataset
            (set1, set2) = partition(df, col, value)

            # A split that leaves one side empty separates nothing; skip it
            # before spending time on the impurity calculation.
            if len(set1) == 0 or len(set2) == 0:
                continue

            # Calculate Gain
            p = float(len(set1)) / len(df)
            gain = score - p*criteria(set1) - (1-p)*criteria(set2)
            if gain > Gain_best:
                Gain_best = gain
                Attribute_best = (col, value)
                Set_best = (set1, set2)

    if Gain_best > 0:
        #Recursive Call on partitioned Sets, keeping the caller's criteria
        right_branch = construct_tree(Set_best[0], criteria)
        left_branch = construct_tree(Set_best[1], criteria)
        return Tree(col=Attribute_best[0], value=Attribute_best[1], right_branch=right_branch, left_branch=left_branch)

    #No split helps: make a leaf with the label counts
    return Tree(results=n_distinct_dict(df))



In [None]:
"""
TODO:

Pruning
Apply Decision Tree
"""

In [None]:
"""
The base cases are the following:

•  All the examples from the training set belong to the same class ( a tree leaf labeled with that class is returned ).

•  The training set is empty ( returns a tree leaf called failure ).

•  The attribute list is empty ( returns a leaf labeled with the most frequent class or the disjunction of all the classes).
https://octaviansima.wordpress.com/2011/03/25/decision-trees-c4-5/
"""