In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix

from sklearn.metrics import classification_report

In [5]:
from sklearn import datasets

In [6]:
# Load the Iris dataset; later cells read .data, .feature_names, .target and .target_names from it
iris_data=datasets.load_iris()

In [7]:
# Put the feature matrix in a DataFrame, one column per feature name
iris_df=pd.DataFrame(iris_data.data, columns= iris_data.feature_names)

In [8]:
# Add the class labels as column 'Y' (it becomes the last column, which later slicing relies on)
iris_df['Y']= iris_data.target

In [9]:
# Column names, non-null counts and dtypes
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   Y                  150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB


In [10]:
# Number of observations in each class
iris_df['Y'].value_counts()

0    50
1    50
2    50
Name: Y, dtype: int64

In [11]:
# What are the classes of Y called?

iris_data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

### Step by step logic to run KNN from scratch



Steps to do k-Nearest Neighbors:

Step 1: Calculate Euclidean Distance

- Standardize predictors!

Step 2: Get Nearest Neighbors

Step 3: Make Predictions

1) Let's test our logic using the __last five observations from Iris as test data__ and __the first 145 obs for training__

2) Let's use K=10 to test our logic

__Note__:

Lasse Schmidt published the following article with an excellent explanation about the right way of standardizing the train and test data:

https://medium.com/analytics-vidhya/why-it-makes-a-difference-how-to-standardize-training-and-test-set-e95bf350bed3


__Splitting the data: first 145 observations for training and the last five for testing__

__Also, let's standardize the data__

In [2]:
from sklearn.preprocessing import StandardScaler

In [12]:
# Scaler that is fit on the training rows and then reused, unchanged, on the test rows
train_scaler= StandardScaler()

In [13]:
# Training features: drop the final five rows, then drop the last column (the label 'Y')
X_train = iris_df.iloc[:-5].iloc[:, :-1]

In [14]:
# Preview the training features
X_train

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
140,6.7,3.1,5.6,2.4
141,6.9,3.1,5.1,2.3
142,5.8,2.7,5.1,1.9
143,6.8,3.2,5.9,2.3


In [15]:
# Learn the scaling parameters from the training rows only, then apply them to those same rows
X_train_standardized = train_scaler.fit(X_train).transform(X_train)

In [16]:
# Training labels: last column ('Y') for every row except the final five
y_train = iris_df.iloc[:, -1].iloc[:-5]

In [17]:
# Preview the training labels
y_train

0      0
1      0
2      0
3      0
4      0
      ..
140    2
141    2
142    2
143    2
144    2
Name: Y, Length: 145, dtype: int32

In [18]:
# Test features: the final five rows, without the last column (the label 'Y')
X_test = iris_df.tail(5).iloc[:, :-1]

In [19]:
# Show the five test observations (unstandardized)
X_test

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
149,5.9,3.0,5.1,1.8


In [20]:
# Standardize the test rows with the scaler already fit on the training rows (transform only, no re-fit)
X_test_standardized= train_scaler.transform(X_test)

In [22]:
# Test labels: last column ('Y') of the final five rows
y_test = iris_df.iloc[:, -1].tail(5)

In [23]:
# Show the true classes of the five test observations
y_test

145    2
146    2
147    2
148    2
149    2
Name: Y, dtype: int32


### Step 1: Calculate the Euclidean Distance between the observations to classify (the five test obs in this example) and the training obs

- KNN requires that we compute the distance between each test obs and all training obs. We identify the K nearest neighbors to each test obs from these distances.

- Let's compute these distances in a progressive way (which is better to understand the logic behind KNN)

- First, let's compute the distance ONLY between the first test observation and ALL the training obs

In [24]:
# The following matrix contains the standardized data for the five test observations
# It has five rows, one for each obs, and one column per feature

X_test_standardized

array([[ 1.04808507, -0.13686183,  0.84284906,  1.49829829],
       [ 0.56791971, -1.2773771 ,  0.72979345,  0.96808901],
       [ 0.80800239, -0.13686183,  0.84284906,  1.10064133],
       [ 0.44787838,  0.77555038,  0.95590467,  1.49829829],
       [ 0.08775436, -0.13686183,  0.78632125,  0.83553669]])

In [25]:
# To retrieve ONLY the first test obs (row 0, all columns)

X_test_standardized[0,:]

array([ 1.04808507, -0.13686183,  0.84284906,  1.49829829])

In [26]:
# Euclidean distance between the first test observation and the first training observation:
# square the per-feature differences, add them up, take the square root

np.sqrt(np.square(X_test_standardized[0] - X_train_standardized[0]).sum())

4.165607307240199

In [27]:
import math

In [28]:
# Cross-check: the standard-library Euclidean distance should match the NumPy computation
math.dist (X_test_standardized[0,:], X_train_standardized[0,:])

4.165607307240199

Knowing how to get the distance between the first test observation and the first training observation allows us to
extend this logic and get the dist between the first test observation and all training observations

In [29]:
# Euclidean distance from the first test observation to every training observation,
# collected in a plain list that follows the training-row order

distances_to_first = [
    np.sqrt(np.sum(np.square(X_test_standardized[0, :] - train_obs)))
    for train_obs in X_train_standardized
]

In [30]:
# Show the distances just computed (one entry per training observation)
distances_to_first

[4.165607307240199,
 4.126909730613333,
 4.3105344596209365,
 4.304616298442756,
 4.289539921237245,
 4.111806989425734,
 4.339293278758049,
 4.137617456833658,
 4.476320940023881,
 4.195149047438195,
 4.13614682514539,
 4.2336897636702755,
 4.280194637095968,
 4.709087773311855,
 4.385356793395608,
 4.728252737672504,
 4.225256554418218,
 4.078223807014727,
 3.9640105389705957,
 4.2920233105242644,
 3.864007104644589,
 4.117737784944444,
 4.648976951947716,
 3.7100260699965952,
 4.154838472297955,
 4.006759101925164,
 3.934590849458684,
 4.082391722026684,
 4.054293723095103,
 4.226315650217812,
 4.140490787996915,
 3.739662825600934,
 4.734597505813117,
 4.682954435684832,
 4.104115066267568,
 4.1513421790842875,
 3.9980517868044037,
 4.4339615578548,
 4.497939905214015,
 4.07974885963016,
 4.165751086858694,
 4.6273276517142525,
 4.521016191535016,
 3.8320015294128713,
 4.1038420492929575,
 4.104165720379318,
 4.348399159541459,
 4.350341243798089,
 4.182915069919243,
 4.12253752520

In [31]:
# This list contains 145 elements. Why?
# (Because the loop produced one distance for each of the 145 training observations.)

len(distances_to_first)

145

Knowing how to get the distances between the first test observation and all training observations allows us to get the distance between each of the five test obs and all training obs

__Distance between each test obs and ALL training obs__

I am saving all these distances in a list

This list contains the distances between each of the five test obs and all training obs

Later, we need to figure out a way of retrieving (separating) the distances that correspond to each test observation

In [32]:
# Flat list of Euclidean distances: all training distances for the first test observation,
# then all of them for the second test observation, and so on
distances = [
    np.sqrt(np.sum(np.square(test_obs - train_obs)))
    for test_obs in X_test_standardized
    for train_obs in X_train_standardized
]

In [33]:
# Show the flat list (the block for the first test observation comes first)
distances

[4.165607307240199,
 4.126909730613333,
 4.3105344596209365,
 4.304616298442756,
 4.289539921237245,
 4.111806989425734,
 4.339293278758049,
 4.137617456833658,
 4.476320940023881,
 4.195149047438195,
 4.13614682514539,
 4.2336897636702755,
 4.280194637095968,
 4.709087773311855,
 4.385356793395608,
 4.728252737672504,
 4.225256554418218,
 4.078223807014727,
 3.9640105389705957,
 4.2920233105242644,
 3.864007104644589,
 4.117737784944444,
 4.648976951947716,
 3.7100260699965952,
 4.154838472297955,
 4.006759101925164,
 3.934590849458684,
 4.082391722026684,
 4.054293723095103,
 4.226315650217812,
 4.140490787996915,
 3.739662825600934,
 4.734597505813117,
 4.682954435684832,
 4.104115066267568,
 4.1513421790842875,
 3.9980517868044037,
 4.4339615578548,
 4.497939905214015,
 4.07974885963016,
 4.165751086858694,
 4.6273276517142525,
 4.521016191535016,
 3.8320015294128713,
 4.1038420492929575,
 4.104165720379318,
 4.348399159541459,
 4.350341243798089,
 4.182915069919243,
 4.12253752520

In [34]:
# Total number of distances stored in the flat list
len(distances)

725

In [35]:
# 5 test obs
# 145 training obs
# -> one distance per (test obs, training obs) pair
5*145

725

In [36]:
np.array(distances).shape

# This is a one-dimensional array with 725 elements: the shape (725,) has a single axis, not rows and columns

(725,)

How do we __"figure out a way of retrieving (separating) the distances that correspond to each test observation"__?

I found out a way of doing it using the __reshape method__

The __distances__ list can be converted into a NumPy array and then reshaped from its current one-dimensional shape, `(725,)`, to a new array with shape 145 by 5 (= 145 training obs and five test observations). After the reshaping, each column should contain the distances for one of the five test obs.

In [37]:
# I realized that the reshape() method does not give me DIRECTLY what I want because ...
# it reshapes by filling out the rows first.
# Each row therefore receives five CONSECUTIVE entries of the flat list, and consecutive entries
# belong to the same test observation, so the columns do not line up with the test observations.
# This is not what I want !!!

np.array(distances).reshape( X_train.shape[0], X_test.shape[0])

# X_train.shape[0]= 145
# X_test.shape[0]= 5

array([[4.16560731, 4.12690973, 4.31053446, 4.3046163 , 4.28953992],
       [4.11180699, 4.33929328, 4.13761746, 4.47632094, 4.19514905],
       [4.13614683, 4.23368976, 4.28019464, 4.70908777, 4.38535679],
       [4.72825274, 4.22525655, 4.07822381, 3.96401054, 4.29202331],
       [3.8640071 , 4.11773778, 4.64897695, 3.71002607, 4.15483847],
       [4.0067591 , 3.93459085, 4.08239172, 4.05429372, 4.22631565],
       [4.14049079, 3.73966283, 4.73459751, 4.68295444, 4.10411507],
       [4.15134218, 3.99805179, 4.43396156, 4.49793991, 4.07974886],
       [4.16575109, 4.62732765, 4.52101619, 3.83200153, 4.10384205],
       [4.10416572, 4.34839916, 4.35034124, 4.18291507, 4.12253753],
       [1.35678947, 1.27235022, 1.12379569, 2.61566366, 1.22690071],
       [1.8875122 , 1.28049146, 3.26570326, 1.39229552, 2.38195563],
       [3.64148135, 1.53826586, 2.73231712, 1.4400877 , 2.0906133 ],
       [1.29603845, 1.73915567, 2.23416095, 2.22963989, 2.4727503 ],
       [1.27308221, 1.71581746, 1.

In [38]:
# Lay the flat list out as one row per test observation (5 x 145), then transpose so that
# each column holds the distances for one test observation (145 x 5)
per_test_rows = np.asarray(distances).reshape((X_test.shape[0], X_train.shape[0]))
dist_matrix = np.transpose(per_test_rows)
print(dist_matrix.shape)
print(dist_matrix)

(145, 5)
[[4.16560731 4.06166494 3.79894526 3.8284046  3.33098376]
 [4.12690973 3.65295947 3.74109682 4.01607061 3.21145372]
 [4.31053446 3.96115084 3.9280847  4.07065518 3.37568512]
 [4.3046163  3.8768162  3.9142335  4.09461029 3.33563892]
 [4.28953992 4.23674654 3.92711267 3.89155535 3.45148701]
 [4.11180699 4.33570519 3.79083343 3.59726495 3.42233492]
 [4.33929328 4.12530702 3.96565118 3.97864494 3.40522276]
 [4.13761746 3.95498524 3.760577   3.83192419 3.26172942]
 [4.47632094 3.90556925 4.08823583 4.33942856 3.48808061]
 [4.19514905 3.78213368 3.80240402 4.03332185 3.27292166]
 [4.13614683 4.20140313 3.78950267 3.74154088 3.39837406]
 [4.23368976 4.02844658 3.85109216 3.8970391  3.31487599]
 [4.28019464 3.7916955  3.88862793 4.15612874 3.3460211 ]
 [4.70908777 4.197339   4.32303947 4.52180232 3.72549762]
 [4.38535679 4.66466878 4.0883411  3.93557739 3.8178073 ]
 [4.72825274 5.21244491 4.4713258  4.09467375 4.22382389]
 [4.22525655 4.43768514 3.91359864 3.73326635 3.55424386]
 [4.0

So far, we have created a matrix called 'dist_matrix' with the distances between each of the five test obs and the 145 training obs. Each column of dist_matrix contains the distances for each of the test observations.

### Step 2: Get the K nearest neighbors (10 nearest neighbors in this case) to each of the test obs that we want to classify

Let's use argsort() to get the indexes of the 10 nearest neighbors. Let's illustrate how argsort() works:

In [39]:
# Small example array used to illustrate sort() versus argsort()
x = np.asarray([6, 9, 1, 10])

In [40]:
# The values of x ordered from lowest to highest
np.sort(x)

array([ 1,  6,  9, 10])

In [41]:
# argsort() gives us the indexes of the data when sorted from lowest to highest
# (here index 2 comes first because x[2] = 1 is the smallest value)

x.argsort()

array([2, 0, 1, 3], dtype=int64)

In [42]:
# Get the index of the training observations that are the K nearest neighbors of each test observation

K = 10  # number of nearest neighbors used in this walkthrough

# Each column of dist_matrix belongs to one test observation. argsort() orders the training rows
# from smallest to largest distance, so the first K entries are the K nearest neighbors.
indexes = [dist_matrix[:, j].argsort()[:K] for j in range(dist_matrix.shape[1])]

In [43]:
# One array of neighbor positions per test observation, nearest neighbor first
indexes

[array([141, 112, 140, 104, 139, 115, 120, 143, 132, 110], dtype=int64),
 array([123, 111,  72, 108,  83, 126, 113, 101, 142, 134], dtype=int64),
 array([116, 137, 112, 104, 110,  77, 103, 128, 139, 127], dtype=int64),
 array([136, 100, 115, 124, 144, 110, 143, 140,  70, 120], dtype=int64),
 array([138, 127,  70,  78, 103, 126,  66,  61,  91, 121], dtype=int64)]

WE STOPPED HERE !!! CONTINUE ON 12-01 HERE !

In [44]:
# indexes is a plain Python list (holding one NumPy array per test observation)
type(indexes)

list

In [45]:
# Stacked into an array: one row per test observation, one column per neighbor
np.array(indexes).shape

(5, 10)

In [46]:
# np.array(indexes) already has one row per test observation and one column per neighbor,
# so a plain transpose gives one column per test observation; no reshape (and no repeated
# hard-coded neighbor count) is needed.
nei_index_matrix = np.array(indexes).T
nei_index_matrix

# The 'nei_index_matrix' matrix contains the index of each of the 10 nearest neighbors to each of the five 
# test observations

array([[141, 123, 116, 136, 138],
       [112, 111, 137, 100, 127],
       [140,  72, 112, 115,  70],
       [104, 108, 104, 124,  78],
       [139,  83, 110, 144, 103],
       [115, 126,  77, 110, 126],
       [120, 113, 103, 143,  66],
       [143, 101, 128, 140,  61],
       [132, 142, 139,  70,  91],
       [110, 134, 127, 120, 121]], dtype=int64)

### Step 3: Make the prediction based on the classes of the nearest neighbors (= based on a majority rule applied on the classes of the nearest neighbors)

In [47]:
# Indexes of the 10 nearest neighbors to the first test observation (nearest first)

nei_index_matrix[:,0]

array([141, 112, 140, 104, 139, 115, 120, 143, 132, 110], dtype=int64)

In [48]:
# Y value (= class) of the 10 nearest neighbors to the first test observation.
# The neighbor indexes come from argsort(), so they are row POSITIONS: select them with .iloc.
# Plain y_train[...] would treat them as index LABELS, which only coincide with positions
# while y_train keeps its default 0..n-1 index.

y_train.iloc[nei_index_matrix[:,0]]

141    2
112    2
140    2
104    2
139    2
115    2
120    2
143    2
132    2
110    2
Name: Y, dtype: int32

What is the most frequent class among the 10 nearest neighbors to the first test obs?

Let's obtain the mode!

In [49]:
import statistics as st

In [50]:
# Applying the majority rule to know how to classify the first test observation.
# .iloc is used because the neighbor indexes are row positions, not index labels.

st.mode(y_train.iloc[nei_index_matrix[:,0]])

2

In [51]:
# Alternative way of getting the mode: count each class among the neighbors and take the most frequent.
# .iloc is used because the neighbor indexes are row positions, not index labels.

y_train.iloc[nei_index_matrix[:,0]].value_counts().idxmax()

2

The classification of the first test obs is Y=2 (virginica)

<br>
Extend the logic not only to get the mode for the first test obs, but for all five test observations

Looping through the nei_index_matrix will give us the prediction of Y for all five test observations

In [52]:
# Majority vote for every test observation: column j of nei_index_matrix holds the row positions
# of the nearest neighbors of test observation j, so their classes are selected with .iloc
# (positional) rather than label-based [] indexing.
# NOTE: on a tied vote st.mode returns the first class it encounters (Python 3.8+), i.e. the
# class of the closest neighbor among the tied classes.
y_pred = [
    st.mode(y_train.iloc[nei_index_matrix[:, j]])
    for j in range(nei_index_matrix.shape[1])
]

In [53]:
# Predicted class for each of the five test observations
y_pred

[2, 2, 2, 2, 2]

In [54]:
# Reminder of the class names, to translate the numeric predictions
iris_data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [56]:
# Only class 2 appears in y_test and y_pred, so the matrix is 1 x 1: all five observations are correct
confusion_matrix(y_test, y_pred)

array([[5]], dtype=int64)

In [64]:
# Translate the numeric test labels into class names
iris_data.target_names[y_test]

array(['virginica', 'virginica', 'virginica', 'virginica', 'virginica'],
      dtype='<U10')

In [63]:
# Same confusion matrix, computed on class names instead of numeric codes
confusion_matrix(iris_data.target_names[y_test], iris_data.target_names[y_pred])

array([[5]], dtype=int64)

#### Checking that we applied KNN correctly

#### To do that, we are going to apply KNN with K=10 using KNeighborsClassifier() and we are going to compare the results

In [57]:
from sklearn.neighbors import KNeighborsClassifier

In [58]:
from sklearn.pipeline import make_pipeline

In [60]:
# Reference implementation: standardize inside the pipeline, then classify with a brute-force
# 10-nearest-neighbor vote in which every neighbor counts equally
pipe_knn = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='brute'),
)

In [61]:
# Fit on the raw (unstandardized) training data; the pipeline's own StandardScaler does the scaling
pipe_knn.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kneighborsclassifier',
                 KNeighborsClassifier(algorithm='brute', n_neighbors=10))])

In [62]:
# Predict the five raw test observations, to compare with the from-scratch y_pred
pipe_knn.predict(X_test)

array([2, 2, 2, 2, 1])

Apparently, there is a disagreement between our method from scratch and the scikit learn method but...

Not a big deal !!!

Among the 10 nearest neighbors of the fifth test observation, there is a tie between y=1 and y=2 (there are five of each)

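The two methods simply break the tie differently, and neither does so randomly. scikit-learn's tie-breaking is deterministic: for a tied vote it returns the tied class that comes first in the sorted class labels (here y=1). `statistics.mode` returns the first of the tied classes that it encounters in the data (here y=2, the class of the nearest neighbor).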

In [65]:
# Classes of the 10 nearest neighbors of the fifth test observation (column 4).
# .iloc is used because the neighbor indexes are row positions, not index labels.
y_train.iloc[nei_index_matrix[:,4]]

138    2
127    2
70     1
78     1
103    2
126    2
66     1
61     1
91     1
121    2
Name: Y, dtype: int32

A tie only happens for the fifth test observation as we can see next:

In [66]:
# Print the neighbor classes of every test observation, one column of nei_index_matrix at a time.
# .iloc is used because the neighbor indexes are row positions, not index labels.
for i in range(nei_index_matrix.shape[1]):
    print(y_train.iloc[nei_index_matrix[:, i]])

141    2
112    2
140    2
104    2
139    2
115    2
120    2
143    2
132    2
110    2
Name: Y, dtype: int32
123    2
111    2
72     1
108    2
83     1
126    2
113    2
101    2
142    2
134    2
Name: Y, dtype: int32
116    2
137    2
112    2
104    2
110    2
77     1
103    2
128    2
139    2
127    2
Name: Y, dtype: int32
136    2
100    2
115    2
124    2
144    2
110    2
143    2
140    2
70     1
120    2
Name: Y, dtype: int32
138    2
127    2
70     1
78     1
103    2
126    2
66     1
61     1
91     1
121    2
Name: Y, dtype: int32
