In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import datasets
from sklearn.model_selection import train_test_split , KFold
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

from collections import Counter

In [24]:
# Number of nearest neighbours; passed to both KNeighborsClassifier and KNN_from_scratch below.
K = 3

In [25]:
# Load Iris and put the four measurements plus the class label into one frame.
# Stacking features and labels into a single array makes the label column float.
iris = datasets.load_iris()
iris_df = pd.DataFrame(
    np.column_stack((iris['data'], iris['target'])),
    columns=[*iris['feature_names'], 'target'],
)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [26]:
# iris_df.describe()

In [27]:
# Separate the feature columns from the label column ('target' is the last column).
x = iris_df.drop(columns='target')
y = iris_df['target']

In [28]:
# Preview the first rows of the feature columns.
x.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [29]:
# Preview the first rows of the label column.
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: target, dtype: float64

In [30]:
# Hold out 20% of the rows for testing. Rows are shuffled before splitting to
# avoid ordering bias, and the fixed random_state keeps the split reproducible.
# Every part is converted to a plain NumPy array.
x_train, x_test, y_train, y_test = (
    np.asarray(part)
    for part in train_test_split(x, y, test_size=0.2, shuffle=True, random_state=0)
)

# Row counts of the train and test parts, features first and labels second.
print(len(x_train), ' and ', len(x_test))
print(len(y_train), ' and ', len(y_test))

120  and  30
120  and  30


In [31]:
# Show the first five rows of the training and test feature arrays.
print(x_train[0:5])
print('\n')
print(x_test[0:5])

[[6.4 3.1 5.5 1.8]
 [5.4 3.  4.5 1.5]
 [5.2 3.5 1.5 0.2]
 [6.1 3.  4.9 1.8]
 [6.4 2.8 5.6 2.2]]


[[5.8 2.8 5.1 2.4]
 [6.  2.2 4.  1. ]
 [5.5 4.2 1.4 0.2]
 [7.3 2.9 6.3 1.8]
 [5.  3.4 1.5 0.2]]


In [32]:
# Reference model: scikit-learn's KNN classifier with K neighbours,
# fitted on the training split and used to predict the test split.
knn = KNeighborsClassifier(n_neighbors=K).fit(x_train, y_train)
y_pred_sklearn = knn.predict(x_test)
print(y_pred_sklearn)

[2. 1. 0. 2. 0. 2. 0. 1. 1. 1. 2. 1. 1. 1. 2. 0. 1. 1. 0. 0. 2. 1. 0. 0.
 2. 0. 0. 1. 1. 0.]


In [33]:
# Accuracy of the scikit-learn predictions against the held-out test labels.
print(f'The accuracy of sklearn implementation is {accuracy_score(y_test, y_pred_sklearn)}')

The accuracy of sklearn implementation is 0.9666666666666667


In [62]:
def distance_ecu(x_train, x_test_point):
    """Compute the Euclidean distance from one query point to every training row.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, n_features)
        Training feature matrix.
    x_test_point : array-like of shape (n_features,)
        The single point distances are measured from.

    Returns
    -------
    pandas.DataFrame
        A frame with one column named 'dist'. Row ``i`` holds the distance
        between ``x_train[i]`` and ``x_test_point``, so the frame's index is
        the position of the training row.

    Raises
    ------
    ValueError
        If ``x_train`` is not two-dimensional or its number of features
        differs from the length of ``x_test_point``.
    """
    train = np.asarray(x_train, dtype=float)
    point = np.asarray(x_test_point, dtype=float)

    # No training rows: nothing to measure, but keep the expected column.
    if len(train) == 0:
        return pd.DataFrame({'dist': np.empty(0, dtype=float)})

    # Fail loudly on a shape mismatch instead of silently ignoring features
    # or letting NumPy broadcast a wrongly shaped point.
    if train.ndim != 2 or point.ndim != 1 or train.shape[1] != point.shape[0]:
        raise ValueError(
            'x_train must be 2-D and x_test_point 1-D with the same number of '
            f'features, got shapes {train.shape} and {point.shape}'
        )

    # Vectorized form of sqrt(sum((a - b) ** 2)) applied to every row at once.
    squared_diff = (train - point) ** 2
    dist = np.sqrt(squared_diff.sum(axis=1))

    return pd.DataFrame(data=dist, columns=['dist'])

In [72]:
def nearest_neighbors(distance_point, K):
    """Select the K rows with the smallest distance.

    Parameters
    ----------
    distance_point : pandas.DataFrame
        Frame with a 'dist' column.
    K : int
        Number of rows to keep.

    Returns
    -------
    pandas.DataFrame
        The first K rows of ``distance_point`` after sorting by 'dist' in
        ascending order; the original index labels are preserved. If K exceeds
        the number of rows, all rows are returned.
    """
    # mergesort is stable: rows with equal distance keep their original
    # relative order, so tie-breaking is deterministic (the default
    # quicksort gives no such guarantee).
    ranked = distance_point.sort_values(by='dist', kind='mergesort')

    # Positional slice: take the K closest rows regardless of index labels.
    return ranked.iloc[:K]

In [82]:
def voting(df_nearest, y_train):
    """Return the most frequent label among the selected neighbours.

    Parameters
    ----------
    df_nearest : pandas.DataFrame
        Frame whose index holds the positions of the chosen training rows.
    y_train : array-like
        Training labels; entry ``i`` is the label of training row ``i``.

    Returns
    -------
    The label with the highest count. When several labels share the highest
    count, the one that appears first in ``df_nearest`` order is returned.

    Raises
    ------
    IndexError
        If ``df_nearest`` has no rows, or a position is out of range.
    """
    # Convert to an ndarray so the lookup is always positional, even when
    # y_train is a pandas Series with a non-default index or a plain list.
    positions = df_nearest.index.to_numpy()
    neighbour_labels = np.asarray(y_train)[positions]

    counter_vote = Counter(neighbour_labels)
    # most_common() orders by count and keeps first-seen order for ties.
    y_pred = counter_vote.most_common()[0][0]
    return y_pred

In [85]:
def KNN_from_scratch(x_train, y_train, x_test, K):
    """Classify every test point with a hand-written K-nearest-neighbours rule.

    For each point in ``x_test`` this calls ``distance_ecu`` to measure the
    distance to the training rows, ``nearest_neighbors`` to keep K of them,
    and ``voting`` to pick the predicted label.

    Parameters
    ----------
    x_train : array-like
        Training features, one row per sample.
    y_train : array-like
        Training labels aligned with the rows of ``x_train``.
    x_test : iterable
        Points to classify; each item is passed to ``distance_ecu`` as the
        query point.
    K : int
        Number of neighbours that vote; must be at least 1.

    Returns
    -------
    list
        One predicted label per item of ``x_test``, in the same order.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1.
    """
    # A non-positive K would either select nothing or, through negative
    # slicing, silently drop rows from the end; reject it up front.
    if K < 1:
        raise ValueError(f'K must be at least 1, got {K}')

    y_pred = []

    for x_test_point in x_test:
        distance_point = distance_ecu(x_train, x_test_point)      # 1: distances
        df_nearest_point = nearest_neighbors(distance_point, K)   # 2: K closest
        y_pred_point = voting(df_nearest_point, y_train)          # 3: majority label
        y_pred.append(y_pred_point)

    return y_pred


In [86]:
# Predict with the from-scratch implementation, then print both prediction
# sets one after the other for a visual comparison.
y_pred_scratch= KNN_from_scratch(x_train, y_train, x_test, K)
print(y_pred_scratch)
print('\n')
print(y_pred_sklearn)


[2.0, 1.0, 0.0, 2.0, 0.0, 2.0, 0.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 2.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0]


[2. 1. 0. 2. 0. 2. 0. 1. 1. 1. 2. 1. 1. 1. 2. 0. 1. 1. 0. 0. 2. 1. 0. 0.
 2. 0. 0. 1. 1. 0.]


In [87]:
# Accuracy of the from-scratch predictions against the held-out test labels.
print(accuracy_score(y_test, y_pred_scratch))

0.9666666666666667


In [88]:
# Accuracy of the scikit-learn predictions, for comparison with the from-scratch score.
print(accuracy_score(y_test, y_pred_sklearn))

0.9666666666666667


In [89]:
# True only when both prediction sets have the same shape and equal elements.
print(np.array_equal(y_pred_sklearn, y_pred_scratch))

True
