# K Nearest Neighbours

In [2]:
import numpy as np
import scipy as sp

%run basic_model.ipynb

## BallTree

In [17]:
import heapq
from collections import namedtuple

class BallTree(BasicModel):
    """Ball tree over the rows of ``X`` for k-nearest-neighbour queries.

    Every internal node stores a pivot row and the radius of the ball
    (centred on that pivot) that contains all rows of the node; leaves only
    store the indexes of their rows.

    Parameters
    ----------
    X : array-like
        Observations, one per row. Passed through
        ``BasicModel.check_and_transform_X``; the result is indexed as a
        2-D array (assumed to be a ``numpy.ndarray`` -- TODO confirm).
    leaf_size : int
        Nodes with at most this many rows are not split further.
    metric : str
        Metric name forwarded to ``scipy.spatial.distance.pdist``.
    """

    class Node:
        """A tree node: a leaf (no pivot) or a ball with two children."""

        def __init__(
            self,
            observation_indexes,
            pivot_index=None,
            radius=None,
            left=None,
            right=None
        ):
            self.observation_indexes = observation_indexes
            self.pivot_index = pivot_index
            self.radius = radius
            self.left = left
            self.right = right

        def is_leaf(self):
            """Return True when the node has no pivot."""
            # Row 0 is a legitimate pivot, so compare against None instead
            # of relying on truthiness (``not 0`` is True).
            return self.pivot_index is None

    def __init__(
        self,
        X,
        leaf_size=40,
        metric='minkowski'
    ):
        self.X = super().check_and_transform_X(X)

        # NOTE(review): these helpers presumably store the value as
        # ``self.leaf_size`` / ``self.metric`` -- both are read below.
        super().check_value_type_and_set(
            'leaf_size',
            leaf_size,
            int
        )

        super().check_value_type_and_set(
            'metric',
            metric,
            str
        )

        # Count rows on the validated matrix: the raw argument may be a
        # list or another type without ``.shape``.
        observation_indexes = np.arange(self.X.shape[0])

        self.root = self.__construct_ball(
            observation_indexes
        )

    @staticmethod
    def most_spreaded_dimensionality(X):
        """Return the index of the column with the largest standard deviation.

        Ties resolve to the lowest column index.
        """
        return int(np.argmax(X.std(axis=0)))

    @staticmethod
    def arg_median(array):
        """Return the position(s) of the median element(s) of a 1-D array.

        For an odd length the result is a single index. For an even length
        it is a tuple ``(lower, upper)`` with the first positions holding
        the two middle order statistics.

        Raises
        ------
        ValueError
            If ``array`` is empty.
        """
        array = np.asarray(array)
        size = len(array)

        if size == 0:
            raise ValueError("arg_median of an empty array is undefined")

        if size % 2 == 1:
            return np.where(array == np.median(array))[0][0]

        l, r = size // 2 - 1, size // 2

        # A single partition places both middle order statistics.
        partitioned = np.partition(array, [l, r])
        left, right = partitioned[l], partitioned[r]

        return (
            np.where(array == left)[0][0],
            np.where(array == right)[0][0]
        )

    def dist(self, x, y):
        """Return the distance between points ``x`` and ``y`` under ``self.metric``."""
        return sp.spatial.distance.pdist(
            [x, y],
            self.metric
        )[0]

    def __construct_ball(self, observation_indexes):
        """Recursively build the subtree covering ``observation_indexes``."""
        observation_indexes = np.asarray(observation_indexes, dtype=int)

        # ``max(..., 1)`` guarantees termination even for leaf_size < 1:
        # a single row can never be split into two non-empty halves.
        if len(observation_indexes) <= max(self.leaf_size, 1):
            return BallTree.Node(
                observation_indexes=observation_indexes
            )

        node_sample = self.X[observation_indexes, :]

        most_spreaded_dim = BallTree.most_spreaded_dimensionality(node_sample)
        split_values = node_sample[:, most_spreaded_dim]

        # find pivot: the (lower) median along the most spread dimension
        median_sample_index = BallTree.arg_median(split_values)

        if isinstance(median_sample_index, tuple):
            median_sample_index = median_sample_index[0]

        # calculate ball radius: the farthest row from the pivot
        median_point = node_sample[median_sample_index, :]

        radius = max(
            self.dist(point, median_point)
            for point in node_sample
        )

        # split observations by pivot
        median_value = split_values[median_sample_index]
        goes_left = split_values <= median_value

        left_split_indexes = observation_indexes[goes_left]
        right_split_indexes = observation_indexes[~goes_left]

        # With many duplicated values every row can land on one side, which
        # would recurse forever on an unchanged set. Halve by position
        # instead; the ball above still covers both halves, so search
        # pruning stays valid.
        if len(left_split_indexes) == 0 or len(right_split_indexes) == 0:
            half = len(observation_indexes) // 2
            left_split_indexes = observation_indexes[:half]
            right_split_indexes = observation_indexes[half:]

        return BallTree.Node(
            observation_indexes=observation_indexes,
            pivot_index=observation_indexes[median_sample_index],
            radius=radius,
            left=self.__construct_ball(left_split_indexes),
            right=self.__construct_ball(right_split_indexes)
        )

    # Heap entry. ``dist_to_target`` holds the NEGATED distance so that
    # ``heapq`` (a min-heap) keeps the farthest candidate at ``heap[0]``.
    PrioritizedPointIndex = namedtuple(
        'PrioritizedPointIndex',
        ['dist_to_target', 'point_index']
    )

    def __process_leaf(
        self,
        k,
        target_point,
        node,
        heap
    ):
        """Offer every row of a leaf to the bounded heap of best candidates."""
        if k <= 0:
            return

        for observation_index in node.observation_indexes:
            from_target_to_current_observation = self.dist(
                target_point,
                self.X[observation_index, :]
            )

            candidate = BallTree.PrioritizedPointIndex(
                dist_to_target=-from_target_to_current_observation,
                point_index=observation_index
            )

            if len(heap) < k:
                # Still collecting the first k candidates.
                heapq.heappush(heap, candidate)
            elif from_target_to_current_observation < -heap[0].dist_to_target:
                # Closer than the current worst: swap it in, size stays k.
                heapq.heapreplace(heap, candidate)

    def k_nearest_neighbours_search(
        self,
        k,
        target_point,
        node=None,
        heap=None
    ):
        """Collect the ``k`` rows of ``X`` nearest to ``target_point``.

        Parameters
        ----------
        k : int
            Number of neighbours to keep; ``k <= 0`` leaves ``heap`` untouched.
        target_point : array-like
            Query point, comparable with the rows of ``X`` via ``self.dist``.
        node : BallTree.Node, optional
            Subtree to search; defaults to the tree root.
        heap : list, optional
            Heap of candidates gathered so far; it is updated in place.
            A fresh list is used when omitted.

        Returns
        -------
        list
            The heap of ``PrioritizedPointIndex`` entries. Each entry's
            ``dist_to_target`` is the negated distance, so ``heap[0]`` is
            the farthest of the kept neighbours.
        """
        # ``heap or []`` would replace an empty-but-shared list during
        # recursion and silently drop results, so test for None explicitly.
        if heap is None:
            heap = []

        if node is None:
            node = self.root

        if k <= 0:
            return heap

        # Leaves have neither pivot nor radius, so handle them before any
        # ball arithmetic.
        if node.is_leaf():
            self.__process_leaf(k, target_point, node, heap)
            return heap

        # Pruning is only sound once k candidates exist: until then any
        # row may still belong to the answer.
        if len(heap) >= k:
            from_target_to_node_center = self.dist(
                target_point,
                self.X[node.pivot_index, :]
            )

            from_target_to_heap_worst = -heap[0].dist_to_target

            # Even the closest possible row of this ball is no better than
            # the current worst candidate.
            if from_target_to_node_center - node.radius >= from_target_to_heap_worst:
                return heap

        self.k_nearest_neighbours_search(
            k,
            target_point,
            node.left,
            heap
        )

        self.k_nearest_neighbours_search(
            k,
            target_point,
            node.right,
            heap
        )

        return heap

## BallTree Testing

In [21]:
from sklearn.datasets import make_classification
import unittest
import time

# Random classification fixture shared by the BallTree tests:
# 100 observations with 20 features, labels kept as a column vector.
cl_X, cl_y = make_classification(n_samples=100, n_features=20)
cl_y = cl_y.reshape(-1, 1)

class TestBallTree(unittest.TestCase):
    """Structural checks on the tree built by ``BallTree`` from ``cl_X``."""

    def test_samples_num_in_tree(self):
        """Every node holds exactly the observations of its two children."""
        def count_samples(node):
            node_count_samples = len(node.observation_indexes)

            if node.is_leaf():
                return node_count_samples

            children_count_samples = (
                count_samples(node.left) + count_samples(node.right)
            )

            self.assertEqual(
                node_count_samples,
                children_count_samples
            )

            return children_count_samples

        root = BallTree(cl_X).root

        self.assertEqual(
            count_samples(root),
            cl_X.shape[0]
        )

    def test_leaf_size(self):
        """No leaf anywhere in the tree exceeds the requested ``leaf_size``."""
        leaf_size = 20

        def check_leaf_size(node):
            if node.is_leaf():
                self.assertLessEqual(
                    len(node.observation_indexes),
                    leaf_size
                )
                return

            # Walk down to the leaves: the root of a 100-row tree is not a
            # leaf, so checking it alone would assert nothing.
            check_leaf_size(node.left)
            check_leaf_size(node.right)

        root = BallTree(cl_X, leaf_size=leaf_size).root

        check_leaf_size(root)

In [22]:
# Run the TestBallTree suite, sending the runner's report to the file 'tmp'.
with open('tmp', "w") as f:
    # The open file is handed to TextTestRunner as its output stream.
    runner = unittest.TextTestRunner(f)
    # argv[0] is only a placeholder program name; exit=False keeps
    # unittest.main from calling sys.exit() when the run finishes.
    obj = unittest.main(
        argv=['first-arg-is-ignored', '--verbose', 'TestBallTree'], 
        testRunner=runner,
        exit=False
    )

! cat tmp
! rm -r tmp

..
----------------------------------------------------------------------
Ran 2 tests in 0.022s

OK


In [None]:
from sklearn.datasets import make_classification, make_regression
from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
import unittest
import time

# Random fixtures for the forest tests: 100 observations with 20 features
# each, targets kept as column vectors.
cl_X, cl_y = make_classification(n_samples=100, n_features=20)
cl_y = cl_y.reshape(-1, 1)

regr_X, regr_y = make_regression(n_samples=100, n_features=20)
regr_y = regr_y.reshape(-1, 1)

def time_fit_predict(
    X,
    y,
    score_names=('ROC AUC',),
    score_funcs=(roc_auc_score,),
    *args,
    **kwargs
):
    """Fit a ``RandomForest`` on ``(X, y)``, print its scores and the elapsed time.

    Parameters
    ----------
    X, y
        Training data; the fitted model is also scored on this same data.
    score_names : sequence of str
        Labels printed next to each score.
    score_funcs : sequence of callables
        Metrics called as ``score_func(y, predictions)``, paired with
        ``score_names`` by position.
    *args, **kwargs
        Forwarded to the ``RandomForest`` constructor. ``criterion`` (if
        given) is echoed in the printed report.
    """
    start = time.time()

    rf = RandomForest(*args, **kwargs)
    rf.fit(X, y)

    # Score on the data that was passed in, not on the module-level
    # classification fixture, and predict once for all metrics.
    predictions = rf.predict(X)

    # The report must not crash when the caller relies on the model's
    # default criterion.
    criterion_label = str(kwargs.get('criterion', 'default')).capitalize()

    for score_name, score_func in zip(score_names, score_funcs):
        score = score_func(y, predictions)

        print("{} criterion {} score: {}".format(
            criterion_label,
            score_name,
            score
        ))
    print("Time: {}\n\n".format(time.time() - start))

class TestRandomForest(unittest.TestCase):
    """Smoke and hyper-parameter tests for ``RandomForest``."""

    def test_gini(self):
        time_fit_predict(cl_X, cl_y, criterion='gini')

    def test_entropy(self):
        time_fit_predict(cl_X, cl_y, criterion='entropy')

    def test_gain_ratio(self):
        time_fit_predict(cl_X, cl_y, criterion='gain_ratio')

    def test_mse(self):
        time_fit_predict(
            regr_X,
            regr_y,
            ['MSE', 'MAE'],
            [mean_squared_error, mean_absolute_error],
            criterion='mse'
        )

    def test_mae(self):
        time_fit_predict(
            regr_X,
            regr_y,
            ['MSE', 'MAE'],
            [mean_squared_error, mean_absolute_error],
            criterion='mae'
        )

    def test_n_estimators(self):
        """The ensemble holds exactly ``n_estimators`` members."""
        n_estimators = 25

        rf = RandomForest(n_estimators=n_estimators)
        rf.fit(cl_X, cl_y)

        self.assertEqual(
            len(rf.ensemble),
            n_estimators
        )

    def test_max_features(self):
        """Each tree sees the number of features implied by ``max_features``."""
        max_features_values = ['log2', 6, 0.3, 'auto']
        # Expected feature counts for the 20-feature fixture. The builtin
        # ``int`` replaces ``np.int``, which NumPy 1.24 removed.
        numeric_max_features_values = [
            int(np.log2(20)),
            6,
            int(20 * 0.3),
            int(np.sqrt(20))
        ]

        for max_features, numeric_max_features in zip(
            max_features_values,
            numeric_max_features_values
        ):
            rf = RandomForest(
                max_features=max_features,
                delete_tree_datasets=False
            )
            rf.fit(cl_X, cl_y)

            for decision_tree, selected_features in rf.ensemble:
                self.assertEqual(
                    selected_features.shape[0],
                    numeric_max_features
                )

                self.assertEqual(
                    decision_tree.X.shape[1],
                    numeric_max_features
                )

    def test_bootstrap(self):
        """Without bootstrap each tree trains on the unmodified rows."""
        bootstraps = [True, False]

        for bootstrap in bootstraps:
            rf = RandomForest(
                bootstrap=bootstrap,
                delete_tree_datasets=False
            )
            rf.fit(cl_X, cl_y)

            for decision_tree, selected_features in rf.ensemble:
                self.assertEqual(
                    np.all(decision_tree.X == rf.X[:, selected_features]),
                    not bootstrap
                )

    def test_max_depth(self):
        """No leaf of any tree lies deeper than ``max_depth``."""
        max_depth = 5

        # NOTE(review): depth is counted in nodes with the root at 1, as in
        # the original helper -- confirm this matches RandomForest's own
        # definition of max_depth.
        def check_depth(node, depth=1):
            if not node.is_leaf():
                return check_depth(node.left, depth + 1) and \
                       check_depth(node.right, depth + 1)

            # Report the violation instead of discarding it; the previous
            # helper returned True unconditionally.
            return depth <= max_depth

        rf = RandomForest(max_depth=max_depth)
        rf.fit(cl_X, cl_y)

        for decision_tree, _ in rf.ensemble:
            self.assertTrue(check_depth(decision_tree.root))

    def test_min_samples_split(self):
        """Every split node holds at least ``min_samples_split`` observations."""
        min_samples_split = 10

        def check_samples_split(node):
            if node.is_leaf():
                return True

            is_satisfied = \
                len(node.observation_indexes) >= min_samples_split

            return is_satisfied and \
                   check_samples_split(node.left) and \
                   check_samples_split(node.right)

        rf = RandomForest(min_samples_split=min_samples_split)
        rf.fit(cl_X, cl_y)

        for decision_tree, _ in rf.ensemble:
            self.assertTrue(check_samples_split(decision_tree.root))