In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix

from sklearn.metrics import classification_report

In [3]:
from sklearn import datasets

In [4]:
iris_data=datasets.load_iris()

In [5]:
iris_df=pd.DataFrame(iris_data.data, columns= iris_data.feature_names)

In [6]:
iris_df['Y']= iris_data.target

In [7]:
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   Y                  150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB


In [8]:
iris_df['Y'].value_counts()

0    50
1    50
2    50
Name: Y, dtype: int64

In [9]:
# How are the classes of Y called?

iris_data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

### Step by step logic to run KNN from scratch



Steps to do k-Nearest Neighbors:

Step 1: Calculate Euclidean Distance

- Standardize predictors!

Step 2: Get Nearest Neighbors

Step 3: Make Predictions

1) Let's test our logic using the __last five observations from Iris as test data__ and __the first 145 obs for training__

2) Let's use K=10 to test our logic

In [56]:
from sklearn.preprocessing import scale

# 'scale()' allows us to standardize variables. Similar to 'StandardScaler()'. However, StandardScaler() requires an extra
# step compared to scale() when appplied outside of a pipeline application

In [12]:
iris_df.shape[0]

150

Splitting the data: first 145 for training and the last five for testing

In [13]:
# Get all rows but the last five rows

X_train= iris_df.iloc[:-5,:-1]

In [14]:
X_train

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
140,6.7,3.1,5.6,2.4
141,6.9,3.1,5.1,2.3
142,5.8,2.7,5.1,1.9
143,6.8,3.2,5.9,2.3


In [15]:
X_train_standardized= scale(X_train)

In [18]:
y_train= iris_df.iloc[:-5,-1]

In [19]:
y_train

0      0
1      0
2      0
3      0
4      0
      ..
140    2
141    2
142    2
143    2
144    2
Name: Y, Length: 145, dtype: int32

In [20]:
# Get the last five rows

X_test= iris_df.iloc[-5:,:-1]

In [21]:
X_test

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
149,5.9,3.0,5.1,1.8


In [22]:
X_test_standardized= scale(X_test)

In [23]:
y_test= iris_df.iloc[-5:,-1]

In [24]:
y_test

145    2
146    2
147    2
148    2
149    2
Name: Y, dtype: int32


### Step 1: Calculate the Euclidean Distance between the observations to classify (the test obs) and the training obs

- KNN requires that we compute the distance between each test obs and all training obs. We identify the K nearest neighbor to each test obs from these distances.

- Let's compute these distances in a progressive way (which is better to understand the logic behind KNN)

- First, let's compute the distance ONLY between the first test observation and ALL the training obs

In [25]:
# The following matrix contains the data for the five test observations
# It has five rows, one for each obs

X_test_standardized

array([[ 1.40069858,  0.070014  ,  0.15075567,  1.16554303],
       [-0.07372098, -1.6803361 , -1.35680105, -0.77702869],
       [ 0.6634888 ,  0.070014  ,  0.15075567, -0.29138576],
       [-0.44232587,  1.47029409,  1.6583124 ,  1.16554303],
       [-1.54814054,  0.070014  , -0.60302269, -1.26267162]])

In [26]:
# To retrieve ONLY the first test obs

X_test_standardized[0,:]

array([1.40069858, 0.070014  , 0.15075567, 1.16554303])

In [27]:
# Distance between the first test observation and the first training observation

np.sqrt(np.sum(np.square(X_test_standardized[0,:]- X_train_standardized[0,:])))

3.7637646047330278

Knowing how to get the distance between the first test observation and the first training observation allows us to
extend this logic and get the dist between the first test observation and all training observations

In [28]:
# Distance between first test obs and ALL training obs

distances_to_first=[]
for i in np.arange(X_train_standardized.shape[0]):
    distances_to_first.append(np.sqrt(np.sum(np.square(X_test_standardized[0,:]- X_train_standardized[i,:]))))

In [29]:
distances_to_first

[3.7637646047330278,
 3.8060775847496964,
 3.9922251561594533,
 4.027704103038035,
 3.899239781521412,
 3.6742727513355735,
 4.031002466685466,
 3.7670674850212587,
 4.2438133580144175,
 3.866493274637933,
 3.681965525309738,
 3.904178026770895,
 3.971315930758127,
 4.451442307038944,
 3.8501972691722317,
 4.250570328185753,
 3.759406379670419,
 3.6788264629347465,
 3.4725922527870123,
 3.8882425079157708,
 3.4379259174986556,
 3.7195864220043315,
 4.294479044979116,
 3.356496573397705,
 3.849142098916274,
 3.6852129977589,
 3.57863093017818,
 3.6705790519006785,
 3.641518773620929,
 3.931110970478573,
 3.8399576919837797,
 3.3005759506141765,
 4.309194688292124,
 4.212413830554003,
 3.7792155472190614,
 3.7760537640622203,
 3.51869112405242,
 4.057138729253711,
 4.246332887460031,
 3.6919660858029513,
 3.776472984079167,
 4.458278311742238,
 4.2486128281648785,
 3.4773383223412915,
 3.7335881383017817,
 3.8041937747652925,
 3.9491228177175404,
 4.055308217995737,
 3.7457435108547967,


In [30]:
# This list contains 145 elements. Why?

len(distances_to_first)

145

Knowing how to get the distances between the first test observation and all training observations allows us to get the distance between each of the five test obs and all training obs

__Distance between each test obs and ALL training obs__

I am saving all these distances in a list

This list contains the distances between each of the five test obs and all training obs

Later, we need to figure out a way of retrieving (separating) the distances that correspond to each test observation

In [31]:
distances=[]
for j in np.arange(X_test_standardized.shape[0]):
    for i in np.arange(X_train_standardized.shape[0]):
        distances.append(np.sqrt(np.sum(np.square(X_test_standardized[j,:] - X_train_standardized[i,:]))))

In [32]:
distances

[3.7637646047330278,
 3.8060775847496964,
 3.9922251561594533,
 4.027704103038035,
 3.899239781521412,
 3.6742727513355735,
 4.031002466685466,
 3.7670674850212587,
 4.2438133580144175,
 3.866493274637933,
 3.681965525309738,
 3.904178026770895,
 3.971315930758127,
 4.451442307038944,
 3.8501972691722317,
 4.250570328185753,
 3.759406379670419,
 3.6788264629347465,
 3.4725922527870123,
 3.8882425079157708,
 3.4379259174986556,
 3.7195864220043315,
 4.294479044979116,
 3.356496573397705,
 3.849142098916274,
 3.6852129977589,
 3.57863093017818,
 3.6705790519006785,
 3.641518773620929,
 3.931110970478573,
 3.8399576919837797,
 3.3005759506141765,
 4.309194688292124,
 4.212413830554003,
 3.7792155472190614,
 3.7760537640622203,
 3.51869112405242,
 4.057138729253711,
 4.246332887460031,
 3.6919660858029513,
 3.776472984079167,
 4.458278311742238,
 4.2486128281648785,
 3.4773383223412915,
 3.7335881383017817,
 3.8041937747652925,
 3.9491228177175404,
 4.055308217995737,
 3.7457435108547967,


In [33]:
len(distances)

725

In [34]:
# 5 test obs
# 145 training obs
5*145

725

In [35]:
np.array(distances).shape

# This is like a one column array with 725 elements (or rows)

(725,)

In [36]:
np.array(distances).ndim

1

How to do__"figure out a way of retrieving (separating) the distances that correspond to each test observation"?__

I found out a way of doing it using the __reshape method__

The __distances__ list can be converted into a Numpy array and then reshaped from its current shape, which is 725 by 1, to a new array with shape 145 by 5 (= 145 training obs and five test observations). After the reshaping, each column should contain the distances for each of the five test obs.

In [37]:
# I realized that the reshape() method does not give me DIRECTLY what I want because ...
# it reshapes by filling out the rows first. 
# This is not what I want !!!

np.array(distances).reshape( X_train.shape[0], X_test.shape[0])

# X_train.shape[0]= 145
# X_test.shape[0]= 5

array([[3.7637646 , 3.80607758, 3.99222516, 4.0277041 , 3.89923978],
       [3.67427275, 4.03100247, 3.76706749, 4.24381336, 3.86649327],
       [3.68196553, 3.90417803, 3.97131593, 4.45144231, 3.85019727],
       [4.25057033, 3.75940638, 3.67882646, 3.47259225, 3.88824251],
       [3.43792592, 3.71958642, 4.29447904, 3.35649657, 3.8491421 ],
       [3.685213  , 3.57863093, 3.67057905, 3.64151877, 3.93111097],
       [3.83995769, 3.30057595, 4.30919469, 4.21241383, 3.77921555],
       [3.77605376, 3.51869112, 4.05713873, 4.24633289, 3.69196609],
       [3.77647298, 4.45827831, 4.24861283, 3.47733832, 3.73358814],
       [3.80419377, 3.94912282, 4.05530822, 3.74574351, 3.75264811],
       [0.98480749, 1.08972771, 0.90311923, 2.73019824, 1.2020888 ],
       [1.98116396, 1.20136305, 3.29822741, 1.23399943, 2.48443669],
       [3.73138291, 1.52059073, 2.73574378, 1.49928727, 2.00475568],
       [0.96036237, 1.85991964, 2.18766544, 2.37760771, 2.48971739],
       [1.45328307, 1.60514394, 1.

In [38]:
dist_matrix= np.array(distances).reshape(X_test.shape[0], X_train.shape[0]).T
print(dist_matrix.shape)
print(dist_matrix)

(145, 5)
[[3.7637646  2.84657194 2.51771204 3.89768558 1.34968335]
 [3.80607758 1.92943182 2.51101452 4.22153771 0.85206651]
 [3.99222516 2.42754612 2.72081723 4.1570259  0.82249815]
 [4.0277041  2.31643935 2.74050976 4.17383104 0.65077984]
 [3.89923978 3.09607114 2.68328707 3.8921466  1.46691064]
 [3.67427275 3.63794451 2.6413408  3.57591462 2.1961365 ]
 [4.03100247 2.85176789 2.81482022 3.96250361 1.0042943 ]
 [3.76706749 2.67315918 2.48731905 3.90419347 1.10624241]
 [4.24381336 2.16286823 2.99064586 4.44651897 0.84256549]
 [3.86649327 2.15411796 2.52636082 4.18039115 0.79438587]
 [3.68196553 3.21299615 2.50215786 3.80292826 1.84978532]
 [3.90417803 2.76768155 2.62384872 3.90373425 0.97204392]
 [3.97131593 2.03441968 2.65086546 4.32018819 0.81206208]
 [4.45144231 2.42932616 3.19157008 4.58333246 0.95286443]
 [3.85019727 3.85884329 2.86982718 4.01175641 2.6953773 ]
 [4.25057033 4.74503767 3.47487021 3.97849994 3.36801741]
 [3.75940638 3.63121817 2.75853898 3.75523267 2.26163572]
 [3.6

So far, we have created a matrix called 'dist_matrix' with the distances between each of the five test obs and the 145 training obs. Each column of dist_matrix contains the distances for each of the test observations.

### Step 2: Get the K nearest neighbors (10 nearest neighbors in this case) to each of the test obs that we want to classify

Let's use argsort() to get the indexes of the 10 nearest neighbors

In [39]:
x=np.array([6,9,1,10])

In [40]:
np.sort(x)

array([ 1,  6,  9, 10])

In [41]:
# argsort() gives us the indexes of the data when sorted from lowest to highest

x.argsort()

array([2, 0, 1, 3], dtype=int64)

In [42]:
# Get the index of the training observations that represent the 10 nearest neighbors to each of the five test observations

indexes=[]
for i in np.arange(dist_matrix.shape[1]):
    indexes.append(dist_matrix[:, i].argsort()[:10]) # the number 10 here means we are using 10 nearest neighbors

In [43]:
indexes

[array([141, 139,  77,  52, 110,  86, 112,  65,  50,  75], dtype=int64),
 array([98, 79, 81, 93, 41, 80, 57, 62, 69, 60], dtype=int64),
 array([74, 97, 75, 65, 58, 91, 51, 71, 63, 73], dtype=int64),
 array([136, 100,  85,  70, 115, 124,  56, 110, 137, 144], dtype=int64),
 array([ 3, 30, 29, 47, 34,  9, 38, 45, 12, 42], dtype=int64)]

In [44]:
type(indexes)

list

In [45]:
np.array(indexes).shape

(5, 10)

In [46]:
nei_index_matrix= np.array(indexes).reshape(dist_matrix.shape[1], 10).T
nei_index_matrix

# The 'nei_index_matrix' matrix contains the index of each of the 10 nearest neighbors to each of the five 
# test observations

array([[141,  98,  74, 136,   3],
       [139,  79,  97, 100,  30],
       [ 77,  81,  75,  85,  29],
       [ 52,  93,  65,  70,  47],
       [110,  41,  58, 115,  34],
       [ 86,  80,  91, 124,   9],
       [112,  57,  51,  56,  38],
       [ 65,  62,  71, 110,  45],
       [ 50,  69,  63, 137,  12],
       [ 75,  60,  73, 144,  42]], dtype=int64)

### Step 3: Make the prediction based on the classes of the nearest neighbors (= based on a majority rule applied on the classes of the nearest neighbors)

In [47]:
# Indexes of the 10 nearest neighbors to the first test observation

nei_index_matrix[:,0]

array([141, 139,  77,  52, 110,  86, 112,  65,  50,  75], dtype=int64)

In [48]:
# Y value (=class) the 10 nearest neighbors to the first test observation

y_train [nei_index_matrix[:,0]]

141    2
139    2
77     1
52     1
110    2
86     1
112    2
65     1
50     1
75     1
Name: Y, dtype: int32

What is the most frequent class among the 10 nearest neighbors to the first test obs?

Let's obtain the mode!

In [None]:
import statistics as st

In [None]:
# Applying the majority rule to know how to classify the first test observation

st.mode(y_train [nei_index_matrix[:,0]])

In [None]:
# Alternative way of getting the mode

y_train [nei_index_matrix[:,0]].value_counts().idxmax()

The classification of the first test obs is Y=1 (versicolor)

CONTINUE FROM THIS POINT AND ON !!!!!

In [None]:
for i in np.arange(nei_index_matrix.shape[1]):
    print(y_train [nei_index_matrix[:,i]])

In [None]:
# Extend the logic not only to get the mode for the first test obs, but for all five test observations
# Looping through the nei_index_matrix will give us the prediction of Y for all five test observations

y_pred=[]
for i in np.arange(nei_index_matrix.shape[1]):
    y_pred.append(st.mode(y_train [nei_index_matrix[:,i]]))

In [None]:
y_pred

In [None]:
iris_data.target_names

In [None]:
iris_data.target_names[y_pred]

In [None]:
confusion_matrix(y_test, y_pred)

#### Checking that we applied KNN correctly

#### To do that, we are going to apply KNN with K=10 using KNeighborsClassifier() and we are going to compare the results

In [49]:
from sklearn.neighbors import KNeighborsClassifier

In [50]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [51]:
scaler = StandardScaler()

In [52]:
X_train_standardized_v2= scaler.fit_transform(X_train)

In [53]:
X_train_standardized_v2

array([[-0.87257635,  1.00365344, -1.30520751, -1.28530043],
       [-1.11265903, -0.13686183, -1.30520751, -1.28530043],
       [-1.35274171,  0.31934428, -1.36173532, -1.28530043],
       [-1.47278305,  0.09124122, -1.24867971, -1.28530043],
       [-0.99261769,  1.23175649, -1.30520751, -1.28530043],
       [-0.51245234,  1.91606565, -1.1356241 , -1.02019579],
       [-1.47278305,  0.77555038, -1.30520751, -1.15274811],
       [-0.99261769,  0.77555038, -1.24867971, -1.28530043],
       [-1.71286573, -0.36496489, -1.30520751, -1.28530043],
       [-1.11265903,  0.09124122, -1.24867971, -1.41785275],
       [-0.51245234,  1.45985955, -1.24867971, -1.28530043],
       [-1.23270037,  0.77555038, -1.1921519 , -1.28530043],
       [-1.23270037, -0.13686183, -1.30520751, -1.41785275],
       [-1.83290707, -0.13686183, -1.47479093, -1.41785275],
       [-0.03228698,  2.14416871, -1.41826312, -1.28530043],
       [-0.15232832,  3.05658093, -1.24867971, -1.02019579],
       [-0.51245234,  1.

In [54]:
X_train_standardized

array([[-0.87257635,  1.00365344, -1.30520751, -1.28530043],
       [-1.11265903, -0.13686183, -1.30520751, -1.28530043],
       [-1.35274171,  0.31934428, -1.36173532, -1.28530043],
       [-1.47278305,  0.09124122, -1.24867971, -1.28530043],
       [-0.99261769,  1.23175649, -1.30520751, -1.28530043],
       [-0.51245234,  1.91606565, -1.1356241 , -1.02019579],
       [-1.47278305,  0.77555038, -1.30520751, -1.15274811],
       [-0.99261769,  0.77555038, -1.24867971, -1.28530043],
       [-1.71286573, -0.36496489, -1.30520751, -1.28530043],
       [-1.11265903,  0.09124122, -1.24867971, -1.41785275],
       [-0.51245234,  1.45985955, -1.24867971, -1.28530043],
       [-1.23270037,  0.77555038, -1.1921519 , -1.28530043],
       [-1.23270037, -0.13686183, -1.30520751, -1.41785275],
       [-1.83290707, -0.13686183, -1.47479093, -1.41785275],
       [-0.03228698,  2.14416871, -1.41826312, -1.28530043],
       [-0.15232832,  3.05658093, -1.24867971, -1.02019579],
       [-0.51245234,  1.

In [55]:
np.all(X_train_standardized_v2==X_train_standardized)

True

In [58]:
X_test_standardized_v2= scaler.fit_transform(X_test)

In [59]:
X_test_standardized_v2

array([[ 1.40069858,  0.070014  ,  0.15075567,  1.16554303],
       [-0.07372098, -1.6803361 , -1.35680105, -0.77702869],
       [ 0.6634888 ,  0.070014  ,  0.15075567, -0.29138576],
       [-0.44232587,  1.47029409,  1.6583124 ,  1.16554303],
       [-1.54814054,  0.070014  , -0.60302269, -1.26267162]])

In [60]:
X_test_standardized

array([[ 1.40069858,  0.070014  ,  0.15075567,  1.16554303],
       [-0.07372098, -1.6803361 , -1.35680105, -0.77702869],
       [ 0.6634888 ,  0.070014  ,  0.15075567, -0.29138576],
       [-0.44232587,  1.47029409,  1.6583124 ,  1.16554303],
       [-1.54814054,  0.070014  , -0.60302269, -1.26267162]])

In [61]:
pipe_knn= make_pipeline(StandardScaler(), KNeighborsClassifier(algorithm='brute', n_neighbors=10, weights= 'uniform'))

In [62]:
pipe_knn.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kneighborsclassifier',
                 KNeighborsClassifier(algorithm='brute', n_neighbors=10))])

In [63]:
pipe_knn.predict(X_test)

array([2, 2, 2, 2, 1])

In [57]:
# https://medium.com/analytics-vidhya/why-it-makes-a-difference-how-to-standardize-training-and-test-set-e95bf350bed3