# k Nearest Neighbors

***

## Machine Learning

***

https://www.ibm.com/topics/machine-learning

https://www.ibm.com/topics/artificial-intelligence

https://www.ibm.com/topics/supervised-learning

https://www.ibm.com/topics/unsupervised-learning

In [10]:
# Machine Learning
# NOTE: `import sklearn` on its own does not load sklearn's submodules, so the
# ones reached through the `sk.` alias in this notebook are imported explicitly.
import sklearn as sk
import sklearn.model_selection
import sklearn.neighbors

# Data frames
import pandas as pd

## scikit-learn

https://scikit-learn.org/stable/getting_started.html

***

In [11]:
# Integer literals: a number written without a decimal point
i = 1

# Show type
type(i)

int

In [12]:
# Float literals: the decimal point makes this a float rather than an int
f = 1.0

# Show type
type(f)

float

In [13]:
# Minimal example taken from the scikit-learn getting-started guide:
# https://scikit-learn.org/stable/getting_started.html

from sklearn.ensemble import RandomForestClassifier

# Training inputs: 2 samples, 3 features each
X = [
    [1, 2, 3],
    [11, 12, 13],
]

# Class of each sample
y = [0, 1]

# Build the classifier with a fixed seed, then train it on the samples
clf = RandomForestClassifier(random_state=0)
clf.fit(X, y)
RandomForestClassifier(random_state=0)

In [14]:
# Predict the classes of the two training samples
clf.predict(X)

array([0, 1])

In [15]:
# Predict the classes of two new samples that were not in the training data
clf.predict([[4, 5, 6], [14, 15, 16]])

array([0, 1])

## kNN in scikit-learn

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

https://github.com/mwaskom/seaborn-data/blob/master/penguins.csv

***

In [54]:
# Load penguins data set from a CSV file
# (the path is relative, so it is resolved against the current working directory)
penguins_csv = 'penguins.csv'
df = pd.read_csv(penguins_csv)

# Show
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


In [55]:
# Create a new instance of a classifier
# (no arguments are passed, so scikit-learn's default settings are used)
clf = sk.neighbors.KNeighborsClassifier()

In [56]:
# The X values: the four numeric measurement columns
feature_columns = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
X = df[feature_columns]

# Show
X

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,39.1,18.7,181.0,3750.0
1,39.5,17.4,186.0,3800.0
2,40.3,18.0,195.0,3250.0
3,,,,
4,36.7,19.3,193.0,3450.0
...,...,...,...,...
339,,,,
340,46.8,14.3,215.0,4850.0
341,50.4,15.7,222.0,5750.0
342,45.2,14.8,212.0,5200.0


In [57]:
# X is a data frame: selecting with a list of column names keeps it two-dimensional
type(X)

pandas.core.frame.DataFrame

In [58]:
# Turn X into a numpy array
# (the result is only displayed, not assigned, so X itself is still a data frame)
X.to_numpy()

array([[  39.1,   18.7,  181. , 3750. ],
       [  39.5,   17.4,  186. , 3800. ],
       [  40.3,   18. ,  195. , 3250. ],
       ...,
       [  50.4,   15.7,  222. , 5750. ],
       [  45.2,   14.8,  212. , 5200. ],
       [  49.9,   16.1,  213. , 5400. ]])

In [59]:
# y values: the column the classifier will be asked to predict
y = df['sex']

# Show
y

0        MALE
1      FEMALE
2      FEMALE
3         NaN
4      FEMALE
        ...  
339       NaN
340    FEMALE
341      MALE
342    FEMALE
343      MALE
Name: sex, Length: 344, dtype: object

In [60]:
# Fit the data
# This fails because X and y still contain NaNs. The error is caught and
# printed so that it is shown without stopping a top-to-bottom run.
try:
    clf.fit(X, y)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

### NaN

https://pandas.pydata.org/docs/user_guide/missing_data.html

https://docs.python.org/3/library/math.html

https://en.wikipedia.org/wiki/NaN

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isna.html

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html#pandas.DataFrame.dropna

In [61]:
# Look for NaNs: True marks a missing value in that cell of the table
df.isna()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,True,True,True,True,True
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
339,False,False,True,True,True,True,True
340,False,False,False,False,False,False,False
341,False,False,False,False,False,False,False
342,False,False,False,False,False,False,False


In [62]:
# Drop rows with NaNs (anywhere)
# The result is stored under a new name, so df itself is not overwritten
df_nona = df.dropna()

# Show
df_nona

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,FEMALE
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


In [63]:
# The X values: the four numeric measurement columns, from the NaN-free data
feature_columns = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
X = df_nona[feature_columns]

# Show
X

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,39.1,18.7,181.0,3750.0
1,39.5,17.4,186.0,3800.0
2,40.3,18.0,195.0,3250.0
4,36.7,19.3,193.0,3450.0
5,39.3,20.6,190.0,3650.0
...,...,...,...,...
338,47.2,13.7,214.0,4925.0
340,46.8,14.3,215.0,4850.0
341,50.4,15.7,222.0,5750.0
342,45.2,14.8,212.0,5200.0


In [64]:
# y values, taken from the same NaN-free data frame as X so the rows line up
y = df_nona['sex']

# Show
y

0        MALE
1      FEMALE
2      FEMALE
4      FEMALE
5        MALE
        ...  
338    FEMALE
340    FEMALE
341      MALE
342    FEMALE
343      MALE
Name: sex, Length: 333, dtype: object

In [65]:
# Create a new instance of a classifier
clf = sk.neighbors.KNeighborsClassifier()

# Fit the data (X and y come from df_nona, so they contain no NaNs)
clf.fit(X, y)

In [66]:
# The first row of the NaN-free data, all columns
df_nona.iloc[0]

species                 Adelie
island               Torgersen
bill_length_mm            39.1
bill_depth_mm             18.7
flipper_length_mm        181.0
body_mass_g             3750.0
sex                       MALE
Name: 0, dtype: object

In [67]:
# The first row of X (the feature columns only)
X.iloc[0]

bill_length_mm         39.1
bill_depth_mm          18.7
flipper_length_mm     181.0
body_mass_g          3750.0
Name: 0, dtype: float64

In [68]:
# Quick check that the classifier correctly predicts first X sample
# The slice [:1] keeps the row as a one-row data frame rather than a Series
clf.predict(X.iloc[:1])

array(['MALE'], dtype=object)

In [69]:
# The actual output for the first sample, to compare against the prediction
y.iloc[0]

'MALE'

## What is kNN?

NB: The Minkowski distance formula on the following page contains a mistake:

https://www.ibm.com/topics/knn

***

In [70]:
# Point a: the first sample in X
a = X.iloc[0]

# Show a
a

bill_length_mm         39.1
bill_depth_mm          18.7
flipper_length_mm     181.0
body_mass_g          3750.0
Name: 0, dtype: float64

In [71]:
# Point b: the second sample in X
b = X.iloc[1]

# Show b
b

bill_length_mm         39.5
bill_depth_mm          17.4
flipper_length_mm     186.0
body_mass_g          3800.0
Name: 1, dtype: float64

In [72]:
# Differences - straightforward element-wise subtraction
a - b

bill_length_mm       -0.4
bill_depth_mm         1.3
flipper_length_mm    -5.0
body_mass_g         -50.0
dtype: float64

In [73]:
# Total difference
# (the signed differences are summed, so the result can be negative)
(a - b).sum()

-54.099999999999994

In [74]:
# Differences - subtracting the other way round flips every sign
b - a

bill_length_mm        0.4
bill_depth_mm        -1.3
flipper_length_mm     5.0
body_mass_g          50.0
dtype: float64

In [75]:
# Total difference - the sign depends on the order of subtraction
(b - a).sum()

54.099999999999994

In [76]:
# Differences - absolute values, so the order of subtraction no longer matters
(b - a).abs()

bill_length_mm        0.4
bill_depth_mm         1.3
flipper_length_mm     5.0
body_mass_g          50.0
dtype: float64

In [77]:
# Total difference - absolute values (the Manhattan distance between a and b)
(b - a).abs().sum()

56.7

In [78]:
# A more common distance: the Euclidean distance
# (square root of the sum of the squared differences)
((b - a)**2).sum()**0.5

50.26778292306117

## Evaluation

***

In [79]:
# Ask the classifier to predict the outputs for the training set inputs
# (Usually a bad idea: the classifier was fitted on exactly these samples)
clf.predict(X)

array(['MALE', 'FEMALE', 'FEMALE', 'FEMALE', 'FEMALE', 'FEMALE', 'MALE',
       'FEMALE', 'MALE', 'MALE', 'FEMALE', 'FEMALE', 'MALE', 'FEMALE',
       'FEMALE', 'FEMALE', 'FEMALE', 'FEMALE', 'MALE', 'FEMALE', 'FEMALE',
       'FEMALE', 'FEMALE', 'FEMALE', 'MALE', 'FEMALE', 'MALE', 'FEMALE',
       'MALE', 'FEMALE', 'MALE', 'MALE', 'FEMALE', 'FEMALE', 'FEMALE',
       'FEMALE', 'MALE', 'FEMALE', 'MALE', 'FEMALE', 'FEMALE', 'FEMALE',
       'FEMALE', 'MALE', 'FEMALE', 'MALE', 'FEMALE', 'MALE', 'FEMALE',
       'FEMALE', 'FEMALE', 'MALE', 'FEMALE', 'MALE', 'FEMALE', 'MALE',
       'FEMALE', 'MALE', 'FEMALE', 'MALE', 'FEMALE', 'MALE', 'FEMALE',
       'MALE', 'FEMALE', 'MALE', 'FEMALE', 'MALE', 'FEMALE', 'MALE',
       'FEMALE', 'MALE', 'FEMALE', 'MALE', 'FEMALE', 'FEMALE', 'FEMALE',
       'FEMALE', 'FEMALE', 'FEMALE', 'FEMALE', 'FEMALE', 'MALE', 'FEMALE',
       'FEMALE', 'MALE', 'FEMALE', 'MALE', 'FEMALE', 'MALE', 'FEMALE',
       'MALE', 'FEMALE', 'MALE', 'MALE', 'FEMALE', 'FEMALE', 'M

In [80]:
# Compare to the expected outputs (True where the prediction matches)
clf.predict(X) == y

0       True
1       True
2       True
4       True
5      False
       ...  
338     True
340     True
341     True
342    False
343     True
Name: sex, Length: 333, dtype: bool

In [81]:
# Count the values that are correct (each True counts as 1 in the sum)
(clf.predict(X) == y).sum()

276

In [82]:
# Total number of cases (the number of rows in X)
X.shape[0]

333

In [83]:
# Correct proportion: the mean of a boolean Series is the fraction of True values
(clf.predict(X) == y).mean()

0.8288288288288288

In [84]:
# Keep some samples back for testing, train on the others
# A fixed random_state makes the shuffle, and therefore the split, repeatable
# from one run of the notebook to the next.
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(
    X, y, random_state=0
)

In [85]:
# Training inputs (the part of X kept for fitting)
X_train

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
250,47.3,15.3,222.0,5250.0
253,59.6,17.0,230.0,6050.0
239,48.7,15.1,222.0,5350.0
290,47.7,15.0,216.0,4750.0
35,39.2,21.1,196.0,4150.0
...,...,...,...,...
91,41.1,18.1,205.0,4300.0
261,49.6,16.0,225.0,5700.0
107,38.2,20.0,190.0,3900.0
152,46.5,17.9,192.0,3500.0


In [86]:
# Training outputs (the part of y kept for fitting)
y_train

250      MALE
253      MALE
239      MALE
290    FEMALE
35       MALE
        ...  
91       MALE
261      MALE
107      MALE
152    FEMALE
100    FEMALE
Name: sex, Length: 249, dtype: object

In [87]:
# Create a new instance of a classifier
clf = sk.neighbors.KNeighborsClassifier()

# Fit the data, using the training portion only
clf.fit(X_train, y_train)

In [88]:
# Predict based on test set (samples that were held back from fitting)
clf.predict(X_test)

array(['MALE', 'FEMALE', 'MALE', 'MALE', 'FEMALE', 'MALE', 'MALE',
       'FEMALE', 'FEMALE', 'MALE', 'FEMALE', 'MALE', 'MALE', 'MALE',
       'MALE', 'MALE', 'FEMALE', 'MALE', 'MALE', 'FEMALE', 'FEMALE',
       'MALE', 'MALE', 'FEMALE', 'FEMALE', 'MALE', 'FEMALE', 'FEMALE',
       'FEMALE', 'MALE', 'FEMALE', 'MALE', 'MALE', 'MALE', 'FEMALE',
       'FEMALE', 'FEMALE', 'MALE', 'MALE', 'FEMALE', 'FEMALE', 'FEMALE',
       'FEMALE', 'MALE', 'FEMALE', 'MALE', 'FEMALE', 'MALE', 'MALE',
       'FEMALE', 'MALE', 'FEMALE', 'MALE', 'MALE', 'FEMALE', 'FEMALE',
       'FEMALE', 'FEMALE', 'FEMALE', 'MALE', 'FEMALE', 'MALE', 'MALE',
       'MALE', 'FEMALE', 'FEMALE', 'MALE', 'FEMALE', 'FEMALE', 'MALE',
       'FEMALE', 'FEMALE', 'FEMALE', 'FEMALE', 'FEMALE', 'FEMALE',
       'FEMALE', 'FEMALE', 'MALE', 'MALE', 'FEMALE', 'MALE', 'FEMALE',
       'FEMALE'], dtype=object)

In [89]:
# Proportion of correct classifications on test set
# (the mean of a boolean Series is the fraction of True values)
(clf.predict(X_test) == y_test).mean()

0.8452380952380952

### Cross-validation

https://scikit-learn.org/stable/modules/cross_validation.html

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
***

In [90]:
# Start from a fresh, unfitted classifier; cross-validation fits it once per fold
clf = sk.neighbors.KNeighborsClassifier()

# Run cross-validation with five folds (one score is returned per fold)
sk.model_selection.cross_val_score(clf, X, y, cv=5)

array([0.76119403, 0.7761194 , 0.71641791, 0.74242424, 0.72727273])

***

## End