--------------------------------------------------------------------------------------------------------------------------------------------

# K Nearest Neighbour:

--------------------------------------------------------------------------------------------------------------------------------------------

## The Dataset:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
training_data_2 = pd.read_csv('data2_train.csv')
testing_data_2 = pd.read_csv('data2_test.csv')
training_data_2

Unnamed: 0,Feature_1,Feature_2,Target
0,8.160646,88.799326,0
1,31.149536,102.335826,0
2,13.103383,92.902908,0
3,15.950445,77.412565,0
4,35.856965,94.441550,0
...,...,...,...
795,35.142860,19.722994,1
796,38.306612,36.989602,1
797,28.649138,30.474118,1
798,31.949594,23.900828,1


In [3]:
training_data_1 = pd.read_csv('data1_train.csv')
testing_data_1 = pd.read_csv('data1_test.csv')
training_data_1

Unnamed: 0,Feature_1,Feature_2,Feature_3,Target
0,94.870985,88.239326,101.497093,0
1,97.684482,84.837474,90.892151,0
2,94.648343,77.467282,87.646104,0
3,94.635471,85.327735,99.851568,0
4,104.397011,84.097116,98.211326,0
...,...,...,...,...
795,95.893150,67.746229,82.932956,2
796,95.981618,80.343127,80.384783,2
797,102.469297,69.065428,77.440412,2
798,93.365763,77.225642,79.969821,2


In [4]:
testing_data_1

Unnamed: 0,Feature_1,Feature_2,Feature_3,Target
0,97.533783,88.329103,98.191966,0
1,102.838058,78.783627,90.867559,0
2,95.155724,86.679155,94.861119,0
3,102.838797,87.504581,99.622361,0
4,99.238078,86.704614,91.685225,0
...,...,...,...,...
195,95.944852,73.432026,77.215081,2
196,98.884384,75.145090,76.521510,2
197,94.958002,81.372707,77.798078,2
198,103.293445,75.642798,75.084644,2


In [5]:
# Number of distinct classes present in the Target column of the second dataset
k = training_data_2['Target'].nunique()
k

2

- Here `k` is the number of distinct classes, i.e. the number of unique values in the `Target` column.

## Making and Training the model:

In [6]:
class KNearestNeighbour():
    """Nearest-centroid classifier.

    Despite its name, this class does not vote among the k nearest training
    points. ``fit`` stores one centre (the mean feature vector) per distinct
    target value, and ``predict`` gives every sample the target whose centre
    is closest, measured by squared Euclidean distance.

    Attributes set by ``fit``:
        X, y: the training inputs exactly as they were passed in.
        y_list: the distinct target values, in order of first appearance.
        centre: list of 1-D arrays; ``centre[i]`` is the centre of ``y_list[i]``.
    """

    @staticmethod
    def _label_array(y):
        """Return the targets as a flat 1-D numpy array (positional, index-free)."""
        if isinstance(y, pd.DataFrame):
            return y['Target'].to_numpy()
        return np.asarray(y).ravel()

    def fit(self, X, y):
        """Compute the centre of the training points of every target value.

        Args:
            X: 2-D numeric features (DataFrame or array-like), one row per sample.
            y: targets, one per row of ``X``: either a DataFrame with a
                ``'Target'`` column, or a Series / array-like of labels.

        Raises:
            ValueError: if ``X`` is not 2-D, if there are no samples, or if
                ``X`` and ``y`` contain a different number of samples.
        """
        self.X = X
        self.y = y
        features = np.asarray(X, dtype=float)
        # Work on plain arrays so rows are matched by position; a label-based
        # lookup breaks as soon as the index is not 0..n-1.
        labels = self._label_array(y)

        if features.ndim != 2:
            raise ValueError("X must be 2-dimensional (samples x features)")
        if len(features) != len(labels):
            raise ValueError("X and y must contain the same number of samples")
        if len(labels) == 0:
            raise ValueError("cannot fit on an empty training set")

        # pd.unique keeps the order of first appearance, so predictions map
        # back to labels through the same positions as the centres below.
        self.y_list = pd.unique(labels).tolist()
        # Centre of a class = mean of the rows carrying that label.
        self.centre = [features[labels == label].mean(axis=0) for label in self.y_list]

    def predict(self, X):
        """Predict the target of each row of ``X``.

        Args:
            X: 2-D numeric features with the same number of columns used in ``fit``.

        Returns:
            A single-column DataFrame (column ``0``, default integer index)
            holding one predicted target per row of ``X``.

        Raises:
            ValueError: if ``X`` is not 2-D or its column count differs from
                the training data.
        """
        samples = np.asarray(X, dtype=float)
        centres = np.asarray(self.centre)

        if samples.ndim != 2:
            raise ValueError("X must be 2-dimensional (samples x features)")
        if samples.shape[1] != centres.shape[1]:
            raise ValueError("X has a different number of features than the training data")

        # Squared distance of every sample to every centre, shape
        # (n_samples, n_classes); the square root is skipped because it does
        # not change which centre is nearest.
        squared_dist = ((samples[:, None, :] - centres[None, :, :]) ** 2).sum(axis=2)
        # argmin returns the first minimum, so ties go to the earlier class.
        nearest = squared_dist.argmin(axis=1)

        y_pred = [self.y_list[i] for i in nearest]
        return pd.DataFrame(y_pred)

    def accuracy(self, y_pred, y):
        """Return the fraction of predictions equal to the true targets.

        Both arguments are flattened first, so any mix of 1-D arrays,
        column vectors, Series and single-column DataFrames is compared
        element by element instead of being broadcast against each other.

        Raises:
            ValueError: if the two inputs hold a different number of values.
        """
        predicted = np.asarray(y_pred).ravel()
        actual = np.asarray(y).ravel()
        if predicted.shape != actual.shape:
            raise ValueError("y_pred and y must contain the same number of values")
        return float(np.mean(predicted == actual))

The basic approach I used here is:
- During fitting, I compute the centre of each class: the sum of the coordinates of all training points that share a Target value, divided by the number of those points. Note that this makes the model a nearest-centroid rule rather than a vote among the k nearest training points.
- For prediction, I take each point in X, compute its distance to every class centre, and predict the Target of the centre with the minimum distance.
- For accuracy, I count the predictions that match the true values and divide by the total number of predictions.

### Training and predicting on the first dataset:

In [7]:
X_train = training_data_1.drop(columns=['Target'])
y_train = pd.DataFrame(training_data_1['Target'])

In [8]:
X_train

Unnamed: 0,Feature_1,Feature_2,Feature_3
0,94.870985,88.239326,101.497093
1,97.684482,84.837474,90.892151
2,94.648343,77.467282,87.646104
3,94.635471,85.327735,99.851568
4,104.397011,84.097116,98.211326
...,...,...,...
795,95.893150,67.746229,82.932956
796,95.981618,80.343127,80.384783
797,102.469297,69.065428,77.440412
798,93.365763,77.225642,79.969821


In [9]:
y_train

Unnamed: 0,Target
0,0
1,0
2,0
3,0
4,0
...,...
795,2
796,2
797,2
798,2


In [10]:
X_test = testing_data_1.drop(columns=['Target'])
y_test = pd.DataFrame(testing_data_1['Target'])

In [11]:
X_test

Unnamed: 0,Feature_1,Feature_2,Feature_3
0,97.533783,88.329103,98.191966
1,102.838058,78.783627,90.867559
2,95.155724,86.679155,94.861119
3,102.838797,87.504581,99.622361
4,99.238078,86.704614,91.685225
...,...,...,...
195,95.944852,73.432026,77.215081
196,98.884384,75.145090,76.521510
197,94.958002,81.372707,77.798078
198,103.293445,75.642798,75.084644


In [12]:
y_test

Unnamed: 0,Target
0,0
1,0
2,0
3,0
4,0
...,...
195,2
196,2
197,2
198,2


In [13]:
model = KNearestNeighbour()

In [14]:
model.fit(X_train,y_train)

In [15]:
y_train_pred = model.predict(X_train).to_numpy()

In [16]:
y_train_pred

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
    

In [17]:
model.accuracy(y_train_pred,y_train.to_numpy())

0.975

In [18]:
y_test_pred = model.predict(X_test)

In [19]:
y_test_pred

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
195,2
196,2
197,2
198,2


In [20]:
model.accuracy(y_test_pred,y_test.to_numpy())

0.975

#### Using Scikit-Learn:

In [21]:
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [22]:
model_sk = KNeighborsClassifier()
# scikit-learn expects a 1-D label vector; passing the single-column DataFrame
# y_train makes fit() emit a conversion warning, so pass the Target column itself.
model_sk.fit(X_train, y_train['Target'])

  return self._fit(X, y)


In [23]:
y_train_pred=model_sk.predict(X_train)
accuracy_score(y_train,y_train_pred)

0.97375

In [24]:
y_train_pred

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [25]:
y_test_pred=model_sk.predict(X_test)
accuracy_score(y_test,y_test_pred)

0.97

- So, using the scikit-learn library, I get a test accuracy of 97%, whereas my model gives a test accuracy of 97.5%.

### Training and predicting on the second dataset:

In [26]:
X_train = training_data_2.drop(columns=['Target'])
y_train = pd.DataFrame(training_data_2['Target'])

In [27]:
X_train

Unnamed: 0,Feature_1,Feature_2
0,8.160646,88.799326
1,31.149536,102.335826
2,13.103383,92.902908
3,15.950445,77.412565
4,35.856965,94.441550
...,...,...
795,35.142860,19.722994
796,38.306612,36.989602
797,28.649138,30.474118
798,31.949594,23.900828


In [28]:
y_train

Unnamed: 0,Target
0,0
1,0
2,0
3,0
4,0
...,...
795,1
796,1
797,1
798,1


In [29]:
X_test = testing_data_2.drop(columns=['Target'])
y_test = pd.DataFrame(testing_data_2['Target'])

In [None]:
X_test

In [None]:
y_test

In [None]:
model = KNearestNeighbour()

In [None]:
model.fit(X_train,y_train)

In [None]:
y_train_pred = model.predict(X_train).to_numpy()

In [None]:
y_train_pred

In [None]:
model.accuracy(y_train_pred,y_train.to_numpy())

In [None]:
y_test_pred = model.predict(X_test)

In [None]:
y_test_pred

In [None]:
model.accuracy(y_test_pred,y_test.to_numpy())

#### Using Scikit-Learn:

In [None]:
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [None]:
model_sk = KNeighborsClassifier()
# scikit-learn expects a 1-D label vector; passing the single-column DataFrame
# y_train makes fit() emit a conversion warning, so pass the Target column itself.
model_sk.fit(X_train, y_train['Target'])

In [None]:
y_train_pred=model_sk.predict(X_train)
accuracy_score(y_train,y_train_pred)

In [None]:
y_test_pred=model_sk.predict(X_test)
accuracy_score(y_test,y_test_pred)

- So here, using scikit-learn, I got an accuracy of 99%, and my model also gave an accuracy of 99%.