<a href="https://colab.research.google.com/github/Aniket-Mahindrakar/ML_algorithms/blob/main/K%20Nearest%20Neighbour.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# K - Nearest Neighbour

In [22]:
import math
from collections import Counter

In [23]:
class KNN:
  def init(self):
    self.x = None
    self.y = None
    self.verbose = False
    self.algorithm_type = "regression"

  def fit(self, x, y, verbose = False, algorithm_type = "regression"):
    self.x = x
    self.y = y
    self.verbose = verbose
    self.algorithm_type = algorithm_type

    if(self.verbose):
      print("Train Inputs = ", (self.x, self.y))

  def predict(self, x, k):
    distance_label = [
        (self.distance(x, train_point), train_label)
        for train_point, train_label in
        zip(self.x, self.y)
    ]

    if(self.verbose):
      print("Neighbour distance labels = ", distance_label)

    neighbours = sorted(distance_label)[:k]

    if(self.verbose):
      print(k, "closest neighbour distances = ", [item for item, _ in neighbours])

    if(self.algorithm_type == "regression"):
      # Average of neighbour classes
      return sum(label for _, label in neighbours) / k

    elif(self.algorithm_type == "classification"):
      # Majority of neighbour classes
      neighbour_labels = [label for dist, label in neighbours]
      return Counter(neighbour_labels).most_common()[0][0]

  def distance(self, x_test, x_train, algorithm_type = "regression"):
    if(self.algorithm_type == "regression"):
      # Euclidean distance
      distance = [(xte - xtr)**2 for xte, xtr in zip(x_test, x_train)]
      distance = math.sqrt(sum(distance))

    if(self.algorithm_type == "classification"):
      # Cosine Similarity
      sum_xTest_xTest, sum_xTest_xTrain, sum_xTrain_xTrain = 0, 0, 0
      for i in range(len(x_test)):
          x = x_test[i]; y = x_train[i]
          sum_xTest_xTest += x*x
          sum_xTrain_xTrain += y*y
          sum_xTest_xTrain += x*y

      distance = sum_xTest_xTrain / math.sqrt(sum_xTest_xTest * sum_xTrain_xTrain)

    return distance

In [24]:
# Regression
# Train Inputs
x_train = [
    [2, 5, 9],
    [1, 6, 8],
    [3, 4, 7]
]

y_train = [
    0.5,
    2,
    4
]

verbose = True
algorithm_type = "regression"

# Test Inputs
x_test = [1, 4, 9]
k = 2

# Model
model = KNN()
model.fit(x_train, y_train, verbose, algorithm_type)
prediction = model.predict(x_test, k)

print("\nAverage prediction =", prediction)

Train Inputs =  ([[2, 5, 9], [1, 6, 8], [3, 4, 7]], [0.5, 2, 4])
Neighbour distance labels =  [(1.4142135623730951, 0.5), (2.23606797749979, 2), (2.8284271247461903, 4)]
2 closest neighbour distances =  [1.4142135623730951, 2.23606797749979]

Average prediction = 1.25


In [27]:
# Classification
# Train Inputs
x_train = [
    [2, 5, 9],
    [1, 6, 8],
    [3, 4, 7]
]

y_train = [
    2,
    2,
    4
]

verbose = True
algorithm_type = "classification"

# Test Inputs
x_test = [1, 4, 9]
k = 3

# Model
model = KNN()
model.fit(x_train, y_train, verbose, algorithm_type)
prediction = model.predict(x_test, k)

print("\nMost frequent prediction =", prediction)

Train Inputs =  ([[2, 5, 9], [1, 6, 8], [3, 4, 7]], [2, 2, 4])
Neighbour distance labels =  [(0.9920369404816276, 2), (0.9749851691262617, 2), (0.9629083542170928, 4)]
3 closest neighbour distances =  [0.9629083542170928, 0.9749851691262617, 0.9920369404816276]

Most frequent prediction = 2
