# K Nearest Neighbours

In [1]:
import numpy as np
import scipy as sp

%run basic_model.ipynb

## BallTree

In [11]:
import heapq
from collections import namedtuple

class BasicTree(BasicModel):
    class Node:
        def __init__(
            self,
            observation_indexes,
            pivot_index,
            left,
            right
        ):
            self.observation_indexes = observation_indexes
            self.pivot_index = pivot_index
            self.left = left
            self.right = right
        
        def is_leaf(self):
            return not self.pivot_index
    
    def __init__(
        self, 
        X, 
        leaf_size=40, 
        metric='minkowski',
        p=2
    ):
        self.X = super().check_and_transform_X(X)
        
        super().check_value_type_and_set(
            'leaf_size',
            leaf_size,
            int
        )
        
        super().check_value_type_and_set(
            'metric',
            metric,
            str
        )
        
        super().check_value_type_and_set(
            'p',
            p,
            int
        )
    
    def dist(self, x, y):
        return sp.spatial.distance.pdist(
            [x, y],
            self.metric,
            p=self.p
        )[0]
    
    @staticmethod
    def arg_median(array):
        if len(array) % 2 == 1:
            return np.where(array == np.median(array))[0][0]
        else:
            l = len(array) // 2 - 1
            
            left = np.partition(array, l)[l]
            
            return np.where(array == left)[0][0]
    
    @staticmethod
    def split_by_pivot(
        node_sample,
        observation_indexes,
        median_sample_index,
        split_dimension
    ):
        left_split_indexes = []
        right_split_indexes = []
        
        median_value = node_sample[
            median_sample_index,
            split_dimension
        ]
        
        for i in range(node_sample.shape[0]):                
            observation_index = observation_indexes[i]
            
            if node_sample[i, split_dimension] <= median_value:
                left_split_indexes.append(observation_index)
            else:
                right_split_indexes.append(observation_index)
                
        return left_split_indexes, right_split_indexes
    
    PrioritizedPointIndex = namedtuple(
        'PrioritizedPointIndex',
        ['dist_to_target', 'point_index']
    )
    
    def process_leaf(
        self,
        k,
        target_point,
        node,
        heap
    ):
        if len(heap) == 0:
            heap.append(BasicTree.PrioritizedPointIndex(
                dist_to_target=-self.dist(
                    self.X[node.observation_indexes[0], :],
                    target_point
                )
            ))

        for observation_index in node.observation_indexes:
            from_target_to_current_observation = self.dist(
                target_point,
                self.X[observation_index, :]
            )

            from_target_to_heap_worst = -heap[0].dist_to_target

            if from_target_to_current_observation < from_target_to_heap_worst:
                heapq.heappush(heap, BasicTree.PrioritizedPointIndex(
                    dist_to_target=-from_target_to_current_observation,
                    point_index=observation_index
                ))
            
                if len(heap) > k:
                    heapq.heappop(heap)

class BallTree(BasicTree):
    class Node(BasicTree.Node):
        def __init__(
            self, 
            observation_indexes,
            pivot_index=None,
            radius=None,
            left=None,
            right=None
        ):
            super().__init__(
                observation_indexes,
                pivot_index,
                left,
                right
            )
            
            self.radius = radius
    
    def __init__(
        self, 
        X, 
        leaf_size=40, 
        metric='minkowski',
        p=2
    ):
        super().__init__(
            X, 
            leaf_size, 
            metric,
            p
        )
        
        observation_indexes = np.arange(X.shape[0])
        
        self.root = self.__construct_ball(
            observation_indexes
        )
    
    @staticmethod
    def most_spreaded_dimensionality(X):
        max_std = X[:, 0].std()
        most_spreaded_dimensionality = 0
        
        for i in range(1, X.shape[1]):
            curr_std = X[:, i].std()
            
            if curr_std > max_std:
                max_std = curr_std
                most_spreaded_dimensionality = i
        
        return most_spreaded_dimensionality
    
    def __construct_ball(self, observation_indexes):
        if len(observation_indexes) <= self.leaf_size:
            return BallTree.Node(
                observation_indexes=observation_indexes
            )
        
        node_sample = self.X[observation_indexes, :]
        
        most_spreaded_dim = BallTree.most_spreaded_dimensionality(node_sample)
        
        # find pivot
        median_sample_index = BasicTree.arg_median(
            node_sample[:, most_spreaded_dim]
        )
        
        # calculate ball radius
        radius = -1
        
        median_point = node_sample[median_sample_index, :]
        
        for i in range(node_sample.shape[0]):            
            radius = max(
                radius,
                self.dist(
                    node_sample[i, :], 
                    median_point
                )
            )
        
        # split observations by pivot
        left_split_indexes, right_split_indexes = BasicTree.split_by_pivot(
            node_sample,
            observation_indexes,
            median_sample_index,
            most_spreaded_dim
        )
        
        return BallTree.Node(
            observation_indexes=None,
            pivot_index=observation_indexes[median_sample_index],
            radius=radius,
            left=self.__construct_ball(left_split_indexes),
            right=self.__construct_ball(right_split_indexes)
        )
    
    def k_nearest_neighbours(
        self,
        k,
        target_point,
        node,
        heap=None
    ):
        heap = heap or []
        
        from_target_to_node_center = self.dist(
            target_point,
            self.X[node.pivot_index, :]
        )
        
        from_target_to_heap_worst = -heap[0].dist_to_target
        
        if len(heap) != 0 and \
           from_target_to_node_center - node.radius >= from_target_to_heap_worst:
            return
        
        if node.is_leaf():
            self.__process_leaf(k, target_point, node, heap)
        else:
            self.k_nearest_neighbours(
                k, 
                target_point, 
                node.left, 
                heap
            )
            
            self.k_nearest_neighbours(
                k, 
                target_point, 
                node.right, 
                heap
            )
        
        return heap

## BallTree Testing

In [12]:
from sklearn.datasets import make_classification
import scipy as sp
import unittest
import time

cl_X, cl_y = make_classification(100, 20)
cl_y = cl_y.reshape((100, 1))

class TestBallTree(unittest.TestCase):    
    def test_samples_num_in_tree(self):
        def count_samples(node):
            if node.is_leaf():
                node_count_samples = len(node.observation_indexes)
                
                return node_count_samples
            
            left_count_samples = count_samples(node.left)
            right_count_samples = count_samples(node.right)
            
            return left_count_samples + right_count_samples
        
        root = BallTree(cl_X).root
        
        self.assertEqual(
            count_samples(root),
            cl_X.shape[0]
        )
    
    def test_leaf_size(self):
        leaf_size = 20
        
        def check_leaf_size(node):            
            if node.is_leaf():
                node_size = len(node.observation_indexes)
                
                self.assertLessEqual(
                    node_size,
                    leaf_size
                )
        
        bt = BallTree(cl_X, leaf_size=leaf_size)
        
        check_leaf_size(bt.root)
        
        self.assertEqual(
            bt.leaf_size,
            leaf_size
        )
        
    def test_metric(self):
        metric = 'chebyshev'
        
        bt = BallTree(cl_X, metric=metric)
        
        self.assertEqual(
            bt.metric,
            metric
        )
    
    def test_dist(self):
        p = 3
        
        bt = BallTree(cl_X, p=p)
        
        self.assertEqual(
            sp.spatial.distance.pdist(
                [[1, 2], [45, 3]],
                bt.metric,
                p=p
            )[0],
            bt.dist([1, 2], [45, 3])
        )

In [14]:
with open('tmp', "w") as f:
    runner = unittest.TextTestRunner(f)
    obj = unittest.main(
        argv=['first-arg-is-ignored', '--verbose', 'TestBallTree'], 
        testRunner=runner,
        exit=False
    )

! cat tmp
! rm -r tmp

....
----------------------------------------------------------------------
Ran 4 tests in 0.034s

OK


## KDTree

In [39]:
class KDTree(BasicTree):
    class Node(BasicTree.Node):
        def __init__(
            self,
            observation_indexes,
            pivot_index,
            left,
            right
        ):
            super().__init__(
                observation_indexes,
                pivot_index,
                left,
                right
            )
    
    def __init__(
        self, 
        X, 
        leaf_size=40, 
        metric='minkowski',
        p=2
    ):
        super().__init__(
            X, 
            leaf_size, 
            metric,
            p
        )
        
        observation_indexes = np.arange(X.shape[0])
        
        self.root = self.__construct_tree(
            observation_indexes
        )
    
    def __construct_tree(self, observation_indexes, depth=1):
        if len(observation_indexes) <= self.leaf_size:
            return KDTree.Node(
                observation_indexes,
                None,
                None,
                None
            )
        
        split_dimension = depth % self.X.shape[1]
        
        node_sample = self.X[observation_indexes, :]
        
        median_sample_index = BasicTree.arg_median(
            node_sample[:, split_dimension]
        )
        
        left_split_indexes, right_split_indexes = BasicTree.split_by_pivot(
            node_sample,
            observation_indexes,
            median_sample_index,
            split_dimension
        )
        
        return KDTree.Node(
            observation_indexes=None,
            pivot_index=observation_indexes[median_sample_index],
            left=self.__construct_tree(left_split_indexes, depth+1),
            right=self.__construct_tree(right_split_indexes, depth+1)
        )
    
    def __calc_dists_in_suitable_leaf(
        self,
        k,
        target_point,
        node,
        heap=None,
        depth=1
    ):
        heap = heap or []
        
        if node.is_leaf():
            self.process_leaf(
                k,
                target_point,
                node,
                heap
            )
        else:
            split_dimension = depth % self.X.shape[1]
            split_value = self.X[node.pivot_index, split_dimension]

            if target_point[split_dimension] <= split_value:
                __calc_dists_in_suitable_leaf(
                    k,
                    target_point,
                    node.left,
                    heap,
                    depth+1
                )
            else:
                __calc_dists_in_suitable_leaf(
                    k,
                    target_point,
                    node.right,
                    heap,
                    depth+1
                )
                
    def __collect_leaf_nodes_by_mask(
        self,
        node,
        target_point,
        spherical_mask_radius,
        collected_nodes
    ):        
        if node.left.is_leaf() or node.right.is_leaf():
            from_split_node_to_target = self.dist(
                self.X[node.pivot_index, :],
                target_point
            )
            
            if from_split_node_to_target < spherical_mask_radius:
                if node.left.is_leaf():
                    collected_nodes.append(node.left)
                if node.right.is_leaf():
                    collected_nodes.append(node.right)
        
        if not node.left.is_leaf():
            self.__collect_leaf_nodes_by_mask(
                node.left,
                target_point,
                spherical_mask_radius,
                collected_nodes
            )
        
        if not node.right.is_leaf():
            self.__collect_leaf_nodes_by_mask(
                node.right,
                target_point,
                spherical_mask_radius,
                collected_nodes
            )
    
    def k_nearest_neighbours(
        self,
        k,
        target_point
    ):
        heap = []
        
        self.__calc_dists_in_suitable_leaf(
            k,
            target_point,
            self.root,
            heap=heap,
            depth=1
        )
        
        spherical_mask_radius = -heap[0].dist_to_target
        
        intersected_by_mask_nodes = []
        
        self.__collect_leaf_nodes_by_mask(
            self,
            self.root,
            target_point,
            spherical_mask_radius,
            intersected_by_mask_nodes
        )
        
        for node in intersected_by_mask_nodes:
            self.process_leaf(
                k,
                target_point,
                node,
                heap
            )
        
        return heap

## KDTree testing

In [40]:
from sklearn.datasets import make_classification
import scipy as sp
import unittest
import time

cl_X, cl_y = make_classification(100, 20)
cl_y = cl_y.reshape((100, 1))

class TestBallTree(unittest.TestCase):    
    def test_samples_num_in_tree(self):
        def count_samples(node):
            if node.is_leaf():
                node_count_samples = len(node.observation_indexes)
                
                return node_count_samples
            
            left_count_samples = count_samples(node.left)
            right_count_samples = count_samples(node.right)
            
            return left_count_samples + right_count_samples
        
        root = BallTree(cl_X).root
        
        self.assertEqual(
            count_samples(root),
            cl_X.shape[0]
        )
    
    def test_leaf_size(self):
        leaf_size = 20
        
        def check_leaf_size(node):            
            if node.is_leaf():
                node_size = len(node.observation_indexes)
                
                self.assertLessEqual(
                    node_size,
                    leaf_size
                )
        
        bt = BallTree(cl_X, leaf_size=leaf_size)
        
        check_leaf_size(bt.root)
        
        self.assertEqual(
            bt.leaf_size,
            leaf_size
        )
        
    def test_metric(self):
        metric = 'chebyshev'
        
        bt = BallTree(cl_X, metric=metric)
        
        self.assertEqual(
            bt.metric,
            metric
        )
    
    def test_dist(self):
        p = 3
        
        bt = BallTree(cl_X, p=p)
        
        self.assertEqual(
            sp.spatial.distance.pdist(
                [[1, 2], [45, 3]],
                bt.metric,
                p=p
            )[0],
            bt.dist([1, 2], [45, 3])
        )

In [41]:
with open('tmp', "w") as f:
    runner = unittest.TextTestRunner(f)
    obj = unittest.main(
        argv=['first-arg-is-ignored', '--verbose', 'TestKDTree'], 
        testRunner=runner,
        exit=False
    )

! cat tmp
! rm -r tmp

..
----------------------------------------------------------------------
Ran 2 tests in 0.023s

OK


# K Nearest Neighbours

In [None]:
class KNearestNeighbours(BasicModel):
    

# Testing

In [None]:
from sklearn.datasets import make_classification, make_regression
from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
import unittest
import time

cl_X, cl_y = make_classification(100, 20)
cl_y = cl_y.reshape((100, 1))

regr_X, regr_y = make_regression(100, 20)
regr_y = regr_y.reshape((100, 1))

def time_fit_predict(
    X, 
    y,
    score_names=['ROC AUC'],
    score_funcs=[roc_auc_score],
    *args,
    **kwargs
):
    start = time.time()
    
    rf = RandomForest(*args, **kwargs)
    rf.fit(X, y)
    
    for score_name, score_func in zip(score_names, score_funcs):
        score = score_func(cl_y, rf.predict(cl_X))

        print("{} criterion {} score: {}".format(
            kwargs['criterion'].capitalize(), 
            score_name,
            score
        ))
    print("Time: {}\n\n".format(time.time() - start))

class TestRandomForest(unittest.TestCase):    
    def test_gini(self):
        time_fit_predict(cl_X, cl_y, criterion='gini')
        
    def test_entropy(self):
        time_fit_predict(cl_X, cl_y, criterion='entropy')
    
    def test_gain_ratio(self):
        time_fit_predict(cl_X, cl_y, criterion='gain_ratio')
        
    def test_mse(self):
        time_fit_predict(
            regr_X, 
            regr_y,
            ['MSE', 'MAE'],
            [mean_squared_error, mean_absolute_error],
            criterion='mse'
        )
    
    def test_mae(self):
        time_fit_predict(
            regr_X, 
            regr_y,
            ['MSE', 'MAE'],
            [mean_squared_error, mean_absolute_error],
            criterion='mae'
        )
        
    def test_n_estimators(self):
        n_estimators = 25
        
        rf = RandomForest(n_estimators=n_estimators)
        rf.fit(cl_X, cl_y)
        
        self.assertEqual(
            len(rf.ensemble),
            n_estimators
        )
        
    def test_max_features(self):
        max_features_values = ['log2', 6, 0.3, 'auto']
        numeric_max_features_values = [
            np.int(np.log2(20)), 
            6,
            np.int(20 * 0.3),
            np.int(np.sqrt(20))
        ]
        
        for max_features, numeric_max_features in zip(
            max_features_values,
            numeric_max_features_values
        ):            
            rf = RandomForest(
                max_features=max_features,
                delete_tree_datasets=False
            )
            rf.fit(cl_X, cl_y)

            for decision_tree, selected_features in rf.ensemble:
                self.assertEqual(
                    selected_features.shape[0],
                    numeric_max_features
                )
                
                self.assertEqual(
                    decision_tree.X.shape[1],
                    numeric_max_features
                )
    
    def test_bootstrap(self):
        bootstraps = [True, False]
        
        for bootstrap in bootstraps:
            rf = RandomForest(
                bootstrap=bootstrap,
                delete_tree_datasets=False
            )
            rf.fit(cl_X, cl_y)
            
            for decision_tree, selected_features in rf.ensemble:
                self.assertEqual(
                    np.all(decision_tree.X == rf.X[:, selected_features]),
                    not bootstrap
                )
    
    def test_max_depth(self):
        max_depth = 5
        
        def check_depth(node, depth=1):
            if not node.is_leaf():
                return check_depth(node.left, depth+1) and \
                       check_depth(node.right, depth+1)
            
            if depth > max_depth:
                result = False
            
            return True
        
        rf = RandomForest(max_depth=max_depth)
        rf.fit(cl_X, cl_y)
        
        for decision_tree, _ in rf.ensemble:
            self.assertEqual(
                check_depth(decision_tree.root),
                True
            )
    
    def test_min_samples_split(self):
        min_samples_split = 10
        
        def check_samples_split(node):
            if node.is_leaf():
                return True
            
            is_satisfied = \
                node.observation_indexes.shape[0] >= min_samples_split
            
            return is_satisfied and \
                   check_samples_split(node.left) and \
                   check_samples_split(node.right)
        
        rf = RandomForest(min_samples_split=min_samples_split)
        rf.fit(cl_X, cl_y)
        
        for decision_tree, _ in rf.ensemble:
            self.assertEqual(
                check_samples_split(decision_tree.root),
                True
            )