<a href="https://colab.research.google.com/github/AALivanova/BI_Stat_2021/blob/ML_hw1/knn_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import statistics
from statistics import mode, StatisticsError


class KNNClassifier:
    """
    K-nearest-neighbor classifier using the L1 (Manhattan) distance.

    The model simply memorizes the training set in ``fit``. ``predict``
    computes the L1 distance from every query sample to every training
    sample and lets the ``k`` closest training samples vote on the label.
    """

    def __init__(self, k=1):
        """
        Arguments:
        k, int - number of nearest neighbors that vote for each prediction
        """
        self.k = k

    def fit(self, X, y):
        """
        Stores the training data; no computation happens here.

        Arguments:
        X, np array (num_train_samples, num_features) - training samples
        y, array-like (num_train_samples) - label of each training sample
        """
        self.train_X = X
        self.train_y = y

    def predict(self, X, n_loops=0):
        """
        Uses the KNN model to predict classes for the data samples provided

        Arguments:
        X, np array (num_samples, num_features) - samples to run
           through the model
        n_loops, int - which distance implementation to use:
           0 - fully vectorized, 1 - one Python loop,
           any other value - two Python loops
        Returns:
        predictions, np array (num_samples) - predicted class
           for each sample (see predict_labels_binary and
           predict_labels_multiclass for the dtype)
        """
        if n_loops == 0:
            distances = self.compute_distances_no_loops(X)
        elif n_loops == 1:
            distances = self.compute_distances_one_loop(X)
        else:
            distances = self.compute_distances_two_loops(X)

        # Exactly two distinct training labels selects the binary voting rule.
        if len(np.unique(self.train_y)) == 2:
            return self.predict_labels_binary(distances)
        return self.predict_labels_multiclass(distances)

    def compute_distances_two_loops(self, X):
        """
        Computes L1 distance from every sample of X to every training sample
        Uses simplest implementation with 2 Python loops

        Arguments:
        X, np array (num_test_samples, num_features) - samples to run

        Returns:
        distances, np array (num_test_samples, num_train_samples) - array
           with distances between each test and each train sample
        """
        num_test_samples = len(X)
        num_train_samples = len(self.train_X)
        distances = np.zeros(shape=(num_test_samples, num_train_samples))
        for i in range(num_test_samples):
            for j in range(num_train_samples):
                distances[i, j] = np.sum(np.abs(X[i] - self.train_X[j]))
        return distances

    def compute_distances_one_loop(self, X):
        """
        Computes L1 distance from every sample of X to every training sample
        Vectorizes some of the calculations, so only 1 loop is used

        Arguments:
        X, np array (num_test_samples, num_features) - samples to run

        Returns:
        distances, np array (num_test_samples, num_train_samples) - array
           with distances between each test and each train sample
        """
        num_test_samples = len(X)
        num_train_samples = len(self.train_X)
        distances = np.zeros(shape=(num_test_samples, num_train_samples))
        for i in range(num_test_samples):
            # Broadcasting: one test row against the whole training matrix.
            distances[i] = np.sum(np.abs(X[i] - self.train_X), axis=-1)
        return distances

    def compute_distances_no_loops(self, X):
        """
        Computes L1 distance from every sample of X to every training sample
        Fully vectorizes the calculations using numpy

        Arguments:
        X, np array (num_test_samples, num_features) - samples to run

        Returns:
        distances, np array (num_test_samples, num_train_samples) - array
           with distances between each test and each train sample
        """
        # X[:, np.newaxis] has shape (num_test, 1, num_features); subtracting
        # train_X broadcasts to (num_test, num_train, num_features), so this
        # trades memory for speed.
        return np.sum(np.abs(X[:, np.newaxis] - self.train_X), axis=-1)

    def _k_nearest_labels(self, distances):
        """
        Looks up the training labels of the k nearest neighbors of each sample

        Arguments:
        distances, np array (num_test_samples, num_train_samples) - array
           with distances between each test and each train sample
        Returns:
        labels, np array (num_test_samples, min(k, num_train_samples)) -
           neighbor labels ordered from the nearest to the farthest neighbor
        Raises:
        ValueError - if k < 1 or there are no training samples
        """
        if self.k < 1:
            raise ValueError(
                "k must be a positive integer, got {!r}".format(self.k)
            )
        if distances.shape[1] == 0:
            raise ValueError("cannot predict without training samples")
        # argsort yields indices of distinct training samples, so equally
        # distant neighbors are each counted once. The stable sort breaks
        # distance ties in favor of the lower training index.
        nearest = np.argsort(distances, axis=1, kind="stable")[:, :self.k]
        return np.asarray(self.train_y)[nearest]

    def predict_labels_binary(self, distances):
        """
        Returns model predictions for binary classification case

        Arguments:
        distances, np array (num_test_samples, num_train_samples) - array
           with distances between each test and each train sample
        Returns:
        pred, np array of float (num_test_samples) - the winning label
           value for every test sample; on a tied vote the label of the
           nearest neighbor wins
        """
        neighbor_labels = self._k_nearest_labels(distances)
        prediction = np.zeros(distances.shape[0])
        for i, labels in enumerate(neighbor_labels.tolist()):
            try:
                # Python >= 3.8 returns the first mode encountered, which is
                # the nearest neighbor's label because labels are ordered by
                # distance.
                prediction[i] = statistics.mode(labels)
            except StatisticsError:
                # Python < 3.8 raises on a tie; resolve it the same way.
                prediction[i] = labels[0]
        return prediction

    def predict_labels_multiclass(self, distances):
        """
        Returns model predictions for multi-class classification case

        Training labels must be non-negative integers (np.bincount is used
        for voting).

        Arguments:
        distances, np array (num_test_samples, num_train_samples) - array
           with distances between each test and each train sample
        Returns:
        pred, np array of int (num_test_samples) - predicted class index
           for every test sample; on a tied vote the smallest class index
           wins
        """
        neighbor_labels = self._k_nearest_labels(distances)
        # The builtin int replaces the np.int alias removed in NumPy 1.24.
        prediction = np.zeros(distances.shape[0], dtype=int)
        for i, labels in enumerate(neighbor_labels):
            prediction[i] = np.argmax(np.bincount(labels))
        return prediction