In [83]:
import pandas as pd
import numpy as np
from random import randrange
from math import log2

In [81]:
def gini_index_numerical(above_split, below_split, attribute):
    #UPDATING NOW INPUTS ARE DATAFRAMES
    total_above = len(above_split)
    total_below = len(below_split)
    total = total_above + total_below
    #for each class calculate the individual index
    above_index = 0
    below_index = 0
    #probability point is above
    #TODO OPTIMIZE
    labels = ['0','1']
    above_count = above_split['class'].value_counts()
    below_count = below_split['class'].value_counts()
    for i in range(0,len(labels)):
        #check for empty frames
        try:
            p_above = 0 if above_split.empty else above_count[i] / total_above
        except KeyError:
            p_above = 0
        try:
            p_below = 0 if below_split.empty else below_count[i] / total_below
        except KeyError:
            p_below = 0
        above_index += (p_above * p_above)
        below_index += (p_below * p_below)
    above_index = 1 - above_index
    below_index = 1 - below_index
    return (above_index * (total_above / total)) + (below_index * (total_below / total))

gini_index_numerical takes in two n-d numpy arrays where n represents the number of potential classes. The above_split array represents all of the points located numerically above the split point and the below_split array represents the points with a lesser value.

In [91]:
def entropy_calculation(splits, labels, total_in_set):
    #calculate the total entropy by calculating the probabilities
    entropy_before_split = 0
    for label in range(0, len(labels)):
        entropy_before_split += labels[label] / total_in_set
    entropy = 0
    for split in splits:
        #for each split calculate the entropy for the given label set
        split_entropy = 0
        total_in_split = len(split)
        label_count = split['class'].value_counts()
        for label in range(0,len(labels)):
            #calculating entropy
            #need to get the values for each label within the split
            #i.e. if split A has 15 values, 4 are class 0: 5 are class 1 and 6 are class 2
            #class 0 would have 4/15 * log2(4/15)
            try:
                label_dist = 0 if split.empty else label_count[label] / total_in_split
            except KeyError:
                label_dist = 0
            split_entropy = split_entropy if label_dist == 0 else (split_entropy + (label_dist * log2(label_dist)))
            #split_entropy += (label_dist * log2(label_dist))
        entropy += split_entropy
    entropy = (-1)*(entropy)
    return entropy
        

In [82]:
def gini_index_multiclass(splits, labels, total_in_set):
    #calculate for each potential class
    g_index = 0
    for split in splits:
        split_index = 0
        label_count = split['class'].value_counts()
        total_in_split = len(split)
        #for label in labels:
        for label in range(0,len(labels)):
            try:
                p_split = 0 if split.empty else (label_count[label] / total_in_split)
            except KeyError:
                p_split = 0
            split_index += (p_split * p_split)
        split_index = 1-split_index
        g_index += (split_index * (total_in_split / total_in_set))
    return g_index    

The gini_index_multiclass function takes in the splits, the attribute the set was split on, and the potential labels. The potential features should be defined as [0,1,...n-1] where n is the total number of features within the set. Possible feature values could depend on dataset (is pixel on or off, position of pixel, 

In [3]:
class Node:
    def __init__(self, splitting_attribute, splitting_point):
        self.splitting_attribute = splitting_attribute
        self.splitting_point = splitting_point
        #TODO change to k for k classes?
        self.children = np.zeros(2)
    
    def new_child_node(self, child_index, child):
        self.children[child_index] = child        
    
    def predict(self, point):
        val = point[self.splitting_attribute]
        self.splitting_point
        if val >= self.splitting_point:
            return self.children[0]
        else:
            return self.children[1]

The Node class represents each node within the tree. The splitting attribute and point determines the axis and value the node splits on. The above and below values represent the next node or label based on the splitting point.

In [4]:
def split(value, attribute, examples):
    #split where item in row > or < value
    above_values = examples.loc[examples[attribute] >= value]
    below_values = examples.loc[examples[attribute] < value]
    return above_values, below_values
    

In [86]:
def find_best_split(examples, calculate_purity):
    #given a dataframe -> examples find the best splitting value within based
    #on the given purity calculation
    #for row in examples
        #for attribute in row
            #value = row[attribute]
            #split = split(value, examples)
            #purity = calculate_purity(split)
            #if purity better than best purity is best
    #TODO OPTIMIZE THIS
    attributes = ['0','1']
    best_purity = 2
    best_node = None
    best_splits = []
    for index, row in examples.iterrows():
        for val in attributes:
            #split dataframe into 2 dataframes
            above_split, below_split = split(row[val], val, examples)
            #purity = calculate_purity(above_split, below_split, val)
            purity = calculate_purity([above_split, below_split], examples['class'].value_counts(), len(examples))
            if purity < best_purity:
                best_purity = purity
                #create new node based on best attribute and value
                best_node = Node(val, row[val])
                best_splits = [above_split, below_split]
    return best_node, best_splits
        

In [6]:
def most_common_class(examples):
    classes = examples['class'].value_counts()
    return classes.idxmax()

In [7]:
def number_of_classes_in_samples(examples):
        return len(examples['class'].value_counts())

In [8]:
def learn_tree(examples, parent_examples, depth, max_depth, calculate_purity):
    #work with ending conditions, if no more examples left then return most common class in parent examples
    if examples.empty:
        return most_common_class(parent_examples)
    elif (number_of_classes_in_samples(examples) == 1) or (depth >= max_depth):
        return most_common_class(examples)
    else:
        #find the best split possible within the current split points
        #splits[] = [above, below]
        root_node, splits = find_best_split(examples, calculate_purity)
        for i in range(0,len(splits)):
            #above and below calculations
            #split should consist of a subset of examples
            new_node = learn_tree(splits[i],examples,depth-1,max_depth, calculate_purity)
            root_node.new_child_node(i,new_node)
        return root_node

In [18]:
def tree_prediction(point, root_node):
    prediction = root_node.predict(point)
    while isinstance(prediction, Node):
        prediction = root_node.predict(point)
    return prediction

In [9]:
def get_subset(data_frame, subset_length):
    #randomly sample rows from the dataframe up to length n
    #where n is the specified subset length
    subset = data_frame.sample(n=subset_length, random_state=1)    
    return subset

In [10]:
def random_forest(data_frame, subset_length, max_depth, num_of_trees, purity_calculation):
    trees = []
    for i in range(0, num_of_trees):
        #generate a random subsample
        subset = get_subset(data_frame, subset_length)
        #create the tree and append to list
        tree = learn_tree(subset, None, 0, max_depth, purity_calculation)
        trees.append(tree)
    return trees

In [43]:
def predict_data_with_trees(testing_data, trees, num_of_trees):
    predicted_labels = []
    for index, row in testing_data.iterrows():
        predictions = np.zeros(num_of_trees)
        for i in range(0,num_of_trees):
            #casting to int because bincount does not like floating point
            predictions[i] = tree_prediction(row, trees[i])
        #select maximum class, voting method
        most_voted_class = np.bincount(predictions.astype(np.int64)).argmax()
        predicted_labels.append(most_voted_class)
    return predicted_labels
        
        

In [11]:
test_data = {'id': [1,2,3,4,5,6,7,8,9,10], '0': [1,12,4.3,6,14,2,9,14,5,4],'1':[4.5,7,1,3,4.8,15,17,7,1.9,10],'class':[0,1,0,0,1,0,1,1,0,0]}

In [12]:
df = pd.DataFrame(data=test_data)

In [13]:
max_depth = 4

In [14]:
n = learn_tree(df,None,0,max_depth,gini_index_numerical)

In [27]:
testing = {'id': [1,2,3], '0': [1,19,11],'1':[4,12,11],'class':[0,1,1]}
tdf = pd.DataFrame(data=testing)

for index, point in tdf.iterrows():
    prediction = n.predict(point)
    while isinstance(prediction,Node):
        prediction = n.predict(point)
    print(f"prediction for point {point['0']}, {point['1']} is {type(prediction)}")

prediction for point 1, 4 is <class 'numpy.float64'>
prediction for point 19, 12 is <class 'numpy.float64'>
prediction for point 11, 11 is <class 'numpy.float64'>


In [90]:
testing = {'id': [1,2,3], '0': [1,19,11],'1':[4,12,11],'class':[0,1,1]}
tdf = pd.DataFrame(data=testing)
tree_nums = [10, 20, 30]
subset_length = 5
max_depth = 7
predictions = []
for num_of_trees in tree_nums:
    trained_trees = random_forest(df,subset_length,max_depth,num_of_trees,entropy_calculation)
    predictions = predict_data_with_trees(tdf,trained_trees,num_of_trees)
print(predictions)

ValueError: math domain error

In [45]:
newdf = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",names=["0","1","2","3","class"])

In [52]:
labels = newdf['class'].value_counts()
labels

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: class, dtype: int64