In [1]:
import numpy as np
from math import log

def split_data(data, n_tr=5, n_val=1, n_test=1):
    """
    Randomly split the samples of ``data`` into training, validation and test sets.

    The sample indices are shuffled with ``np.random.permutation`` and then cut
    in the proportion ``n_tr : n_val : n_test``.  Both cut points are truncated
    towards zero, so the test set absorbs whatever remains after rounding.

    Parameters
    ----------
    data : numpy.ndarray
        Array whose first axis indexes the samples.  Only the first axis is
        indexed, so 1-D arrays work as well as 2-D ones.
    n_tr, n_val, n_test : int or float
        Non-negative relative weights of the three subsets.  ``n_test``
        defaults to 1, which reproduces the original two-argument behaviour.

    Returns
    -------
    tuple of numpy.ndarray
        ``(training, validation, test)``.

    Raises
    ------
    ValueError
        If any weight is negative or all weights are zero.
    """
    if min(n_tr, n_val, n_test) < 0:
        raise ValueError('split weights must be non-negative')
    m = n_tr + n_val + n_test
    if m == 0:
        raise ValueError('at least one split weight must be positive')

    N = data.shape[0]
    indices = np.random.permutation(N)

    # Same float arithmetic as before so existing split sizes do not change.
    first_cut = int(n_tr * N / m)
    second_cut = int((n_tr + n_val) * N / m)

    training_idx = indices[:first_cut]
    val_idx = indices[first_cut:second_cut]
    test_idx = indices[second_cut:]

    # Index the first axis only; for 2-D input this equals data[idx, :].
    return data[training_idx], data[val_idx], data[test_idx]



In [33]:
# TEST: split_data must cut 100 random rows into the expected subset sizes.

data = np.random.rand(100, 5)

# Equal weights 1:1:1 -> the test set absorbs the rounding remainder.
assert list(map(len, split_data(data, 1, 1))) == [33, 33, 34]

# Default weights 5:1:1.
assert list(map(len, split_data(data))) == [71, 14, 15]

In [134]:
# Toy fruit dataset used by the Partition / DecisionTree cells below.
# Each row lists the feature values first and the class label last
# (the classes below read the label through index -1).
# NOTE(review): the meaning of the two numeric columns is not stated anywhere
# in this notebook -- TODO document them.
# The cells below wrap this list in np.array(...); because the rows mix str
# and int values, NumPy stores every entry as a string (3 becomes '3').
training_data = [
    ['Green', 3, 0, 'Apple'],
    ['Yellow', 3, 1, 'Apple'],
    ['Red', 1, 2,'Grape'],
    ['Red', 1, 1, 'Grape'],
    ['Yellow', 2, 0, 'Lemon'],
    ['Yellow', 1, 2, 'Lemon']
]

In [118]:
class Partition(object):
    """
    Groups the rows of a dataset by the value found in one feature column.

    Attributes:
        feature_idx: index of the column the rows are grouped by.
        possible_classes: class labels taken into account by ``score``.
        possible_feature_values: distinct values of the column (``np.unique``).
        decision_map: maps each distinct value to the list of rows holding it.
        N: total number of rows.
    """

    def __init__(self, X, feature_idx, possible_classes):
        """
        Group the rows of the 2-D array ``X`` by column ``feature_idx``.

        The last entry of every row is treated as its class label.
        """
        self.feature_idx = feature_idx  # kept for debugging output
        self.possible_classes = possible_classes
        self.possible_feature_values = np.unique(X[:, feature_idx])
        self.N = len(X)

        # One empty bucket per distinct value, then drop every row into its bucket.
        self.decision_map = {}
        for value in self.possible_feature_values:
            self.decision_map[value] = []
        for row in X:
            self.decision_map[row[feature_idx]].append(row)

    def score(self):
        """
        Return the weighted entropy (natural log) of the class labels.

        For each bucket the entropy over ``possible_classes`` is computed and
        weighted by the share of rows that fall into the bucket; 0 means every
        bucket holds a single class.
        """
        total = 0
        for rows in self.decision_map.values():
            group_size = len(rows)
            entropy = 0
            for cls in self.possible_classes:
                p_class = sum(row[-1] == cls for row in rows) / group_size
                if not p_class:
                    # classes absent from the bucket contribute nothing
                    continue
                entropy += -p_class * log(p_class)
            total += entropy * (group_size / self.N)
        return total

    def print_(self):
        """Print the feature index followed by the rows of every bucket."""
        print('Feature index: %s' % self.feature_idx)
        for value in self.decision_map:
            print('Elements for value %s:' % value)
            for row in self.decision_map[value]:
                print('    ' + str(row))
        
        
        
        

In [115]:
# Weighted entropy of the class labels after grouping the toy data by column 0.
# NOTE(review): for the training_data shown in this notebook this evaluates to
# about 0.3183; a recorded output of 0.2773 presumably comes from an earlier
# version of the data -- re-run the notebook top to bottom to refresh it.
p = Partition(
    X=np.array(training_data),
    feature_idx=0,
    possible_classes=['Apple', 'Grape', 'Lemon'],
)
print(p.score())

0.2772588722239781


In [243]:
class DecisionTree(object):
    """
    Entropy-based decision tree over discrete feature values.

    Every node picks the feature whose ``Partition`` has the lowest weighted
    entropy.  If that partition is impure and really separates the rows, one
    child tree is built per feature value; otherwise the node becomes a leaf
    that maps each feature value to the most frequent label of its rows.

    Attributes:
        nodes: feature value -> child ``DecisionTree`` (empty for leaves).
        classes_map: feature value -> predicted label (filled for leaves).
        classes: distinct labels present in the rows of this node.
        classified_class: the single label of a pure node, otherwise ``None``.
        partition / feature_idx: the chosen ``Partition`` and its column index.
        possible_classes: labels handed to ``Partition`` for scoring.
    """

    def __init__(self, X, possible_classes):
        """
        Build the (sub)tree for the rows of the 2-D array ``X``.

        The last column of ``X`` holds the class label; every other column is
        a feature.  ``possible_classes`` is forwarded to ``Partition``.
        """
        self.nodes = {}
        self.classes_map = {}
        self.possible_classes = possible_classes
        self.classes = np.unique(X[:, -1])

        # Set only for pure nodes; classify() then answers without looking at
        # any feature value.
        self.classified_class = None
        if len(self.classes) == 1:
            print('class', self.classes[0])
            self.classified_class = self.classes[0]

        min_value = self.find_best_partition(X)

        # Recurse only while the split is impure AND separates the rows into
        # more than one group.  A one-group split would hand the very same
        # rows to the child and recurse forever (e.g. identical feature rows
        # carrying different labels).
        if min_value != 0 and len(self.partition.decision_map) > 1:
            self._build_node()
        else:
            for value, rows in self.partition.decision_map.items():
                self.classes_map[value] = self._majority_label(rows)

    @staticmethod
    def _majority_label(rows):
        """Return the most frequent label (last entry) among ``rows``."""
        labels, counts = np.unique([row[-1] for row in rows], return_counts=True)
        return labels[np.argmax(counts)]

    def find_best_partition(self, X):
        """
        Select the feature column with the lowest ``Partition.score()``.

        Stores the winner in ``self.partition`` / ``self.feature_idx`` and
        returns its score.  Raises ``ValueError`` when ``X`` has no feature
        column besides the label column.
        """
        n_features = len(X[0]) - 1
        if n_features < 1:
            raise ValueError('X needs at least one feature column besides the label column')

        best_partition = None
        min_value = float('inf')
        for feature_idx in range(n_features):
            part = Partition(X, feature_idx, self.possible_classes)
            score = part.score()
            # Strict '<' keeps the lowest-index feature when scores tie.
            if score < min_value:
                min_value = score
                best_partition = part
        self.partition = best_partition
        self.feature_idx = best_partition.feature_idx
        return min_value

    def _build_node(self):
        """Create one child tree per value of the chosen feature."""
        for value, rows in self.partition.decision_map.items():
            self.nodes[value] = DecisionTree(np.array(rows), self.possible_classes)

    def classify(self, x):
        """
        Predict the label for the feature sequence ``x``.

        Pure nodes return their single label directly.  Otherwise the value
        of the chosen feature is looked up first as given and then as
        ``str(value)``, so a query such as ``['Yellow', 3, 0]`` also matches
        trees built from string-coerced arrays.  Raises ``KeyError`` when the
        value was not seen while building this node.
        """
        if self.classified_class is not None:
            return self.classified_class

        value = x[self.feature_idx]
        table = self.nodes if self.nodes else self.classes_map
        key = value if value in table else str(value)
        outcome = table[key]
        return outcome.classify(x) if self.nodes else outcome

    def print_(self):
        """Print this node's partition followed by all child subtrees."""
        self.partition.print_()
        print('Children-------------------------------------\n')
        for child in self.nodes.values():
            child.print_()
        print('\n\n\n')

In [244]:
# Fit a tree on the toy fruit data and classify one unlabeled sample.
d = DecisionTree(
    X=np.array(training_data),
    possible_classes=['Apple', 'Grape', 'Lemon'],
)
print(d.classify(x=['Yellow', 3, 0]))

class Apple
class Grape
Apple
