In [None]:
import numpy as np

# Distance Metrics

A way to describe the "closeness" of data points $\rightarrow$ proxy for similarity

## Manhattan Distance

Imagine a city grid where you can only travel along the grid lines (no diagonal shortcuts)

> Does it matter what path we take along the grid?

<img src='images/manhattan-distance.png' width='50%'/>

### Formula

$$dist(A,B) = \sum_{k=1}^{N} |a_k - b_k| $$

### Code distance

Can use a for-loop but vectorization is usually very quick

In [None]:
# Two example points in 3-D space used for the Manhattan walkthrough
a, b = np.array([2, 3, 5]), np.array([1, -1, 3])

# Show both vectors, one after the other
display(a, b)

In [None]:
# Step 1: element-wise difference between the two points
print('A - B')
diffs = np.subtract(a, b)
display(diffs)

In [None]:
# Step 2: drop the sign of every component difference
abs_diff = np.absolute(diffs)
print('|A - B|')
display(abs_diff)

In [None]:
# Step 3: the Manhattan distance is the sum of the absolute differences
print('sum(|A-B|)')
dist = abs_diff.sum()
display(dist)

## Euclidean Distance (Pythagorean Distance)

The straight-line distance between two points, well known from the Pythagorean theorem

<img src='images/euclidean-distance.png' width='50%'/>

### Formula

$$dist(A,B) = \sqrt{ \sum_{k=1}^{N} (a_k - b_k)^2 } $$

### Code distance

In [None]:
# Same two example points as before, redefined so this section runs on its own
a = np.array([2, 3, 5])
b = np.array([1, -1, 3])

display(a, b)

In [None]:
# Step 1: element-wise difference between the two points
print('A - B')
diffs = np.subtract(a, b)
display(diffs)

In [None]:
# Step 2: square every component difference
sq_diffs = np.square(diffs)
print('(A - B)^2')
display(sq_diffs)

In [None]:
# Step 3: add up the squared differences
sq_sum = sq_diffs.sum()
print('sum[(A - B)^2]')
display(sq_sum)

In [None]:
# Step 4: the Euclidean distance is the square root of that sum
print('√sum[(A - B)^2]')
dist = np.sqrt(sq_sum)
display(dist)

## Minkowski Distance

A generalization of the distances above, defined on a normed vector space

The Manhattan ($c=1$) and Euclidean ($c=2$) distances are special cases of the Minkowski distance

### Formula

$$dist(A,B) = (\sum_{k=1}^{N} |a_k - b_k|^c )^\frac{1}{c} $$

### Code distance

In [None]:
def minkowski(A,B,c=2):
    """Return the Minkowski distance of order ``c`` between two points.

    Computes ``(sum(|a_k - b_k| ** c)) ** (1 / c)``. ``c=1`` gives the
    Manhattan distance, ``c=2`` (the default) the Euclidean distance, and
    ``c=inf`` the limiting case ``max(|a_k - b_k|)`` (Chebyshev distance).

    Parameters
    ----------
    A, B : array-like
        Coordinates of the two points; anything ``np.asarray`` accepts
        (arrays, lists, tuples). The two must have broadcast-compatible shapes.
    c : int or float, default 2
        Order of the distance. Must be greater than 0.

    Returns
    -------
    numpy.float64
        The distance between ``A`` and ``B``.

    Raises
    ------
    ValueError
        If ``c`` is not greater than 0 (this also rejects NaN).
    """
    if not c > 0:
        raise ValueError(f'c must be greater than 0, got {c!r}')

    # Work in floating point: integer inputs would otherwise keep the power in
    # integer arithmetic, which overflows silently for larger values of c.
    abs_diffs = np.abs(np.asarray(A, dtype=float) - np.asarray(B, dtype=float))

    if np.isinf(c):
        # Limit of the formula as c grows: only the largest difference remains.
        # `initial` keeps empty input at distance 0, matching the finite case.
        return np.max(abs_diffs, initial=0.0)

    pow_diffs = np.power(abs_diffs, c)
    sum_diff = np.sum(pow_diffs)
    dist = np.power(sum_diff, 1/c)
    return dist

In [None]:
# Example points for the Minkowski demo (same values as in the sections above)
a = np.array([2, 3, 5])
b = np.array([1, -1, 3])

display(a, b)

In [None]:
# Order c=2 (the function's default) is the Euclidean distance
minkowski(a, b, c=2)

# K-Nearest Neighbors

Classification / Supervised Learning

## Summary

Use the training data to "learn" and then predict a test point

<img src='https://upload.wikimedia.org/wikipedia/commons/thumb/e/e7/KnnClassification.svg/440px-KnnClassification.svg.png'/>

<img src='https://res.cloudinary.com/dyd911kmh/image/upload/f_auto,q_auto:best/v1531424125/KNN_final1_ibdm8a.png'/>

> From Datacamp: https://www.datacamp.com/community/tutorials/k-nearest-neighbor-classification-scikit-learn

## Advantages

- lazy learning (no training phase)
- easy to interpret

## Disadvantages

- the whole training set has to be kept in memory (works best on small data with few features)
- not robust; doesn't generalize well
- soft boundaries are troublesome
- curse of dimensionality
    + PCA can reduce the number of dimensions (covered later)
    + in high dimensions, consider cosine similarity instead of a distance

## Determining K

How many neighbors ($k$) are used to determine our point to classify?

<img src='https://raw.githubusercontent.com/learn-co-students/dsc-3-27-09-finding-the-best-value-for-k-online-ds-sp-000/master/best_k.png'/>

### Overfitting & Underfitting

<img src='https://raw.githubusercontent.com/learn-co-students/dsc-3-27-09-finding-the-best-value-for-k-online-ds-sp-000/master/fit.png'/>

Plot the test error against $k$ (an elbow plot) and pick the $k$ where the error stops improving

The best $k$ is usually between 1 and 19

## Implementing via sklearn

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the '3d' projection on older Matplotlib
from sklearn import datasets

# Load the iris data set and keep only the first three features so that
# every sample can be drawn as a point in a 3-D scatter plot
iris = datasets.load_iris()
X = iris.data[:, :3]
y = iris.target

# Create the 3-D axes through the figure. Constructing Axes3D(fig, ...)
# directly no longer attaches the axes to the figure in recent Matplotlib
# releases (deprecated since 3.4), which leaves the plot blank.
fig = plt.figure(1, figsize=(8, 6))
ax = fig.add_subplot(projection='3d')
ax.view_init(elev=-150, azim=110)

# One marker per sample, coloured by its class label
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.Set1, s=40)

ax.set_xlabel("1st")
ax.set_ylabel("2nd")
ax.set_zlabel("3rd ")

plt.show()

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Classifier that looks at the 3 nearest neighbours, measured by Euclidean distance
neigh = KNeighborsClassifier(
    metric='euclidean',
    n_neighbors=3,
)

# Fit on the three iris features (X) and class labels (y) prepared earlier
neigh.fit(X, y)

In [None]:
# Hand-picked query points, one per row, with the same three features as X
pred_pts = np.array([
    [7, 3, 7], [8, 4, 7], [7, 3, 6],
    [7, 4, 6], [4, 4, 1], [5, 4, 3],
    [5, 4, 5], [4, 4, 5], [3, 3, 3],
])

# Predicted class label for every query point
pred_y = neigh.predict(pred_pts)
print(pred_y)

In [None]:
# Show each predicted label next to the per-class probabilities
# that predict_proba returns for the same query point
class_probs = neigh.predict_proba(pred_pts)
for p, prob in zip(pred_y, class_probs):
    print(f'{p}: {prob}')

In [None]:
# Create the 3-D axes through the figure. Constructing Axes3D(fig, ...)
# directly no longer attaches the axes to the figure in recent Matplotlib
# releases (deprecated since 3.4), which leaves the plot blank.
fig = plt.figure(1, figsize=(8, 6))
ax = fig.add_subplot(projection='3d')
ax.view_init(elev=-150, azim=110)

# Pin the colour scale to the range of the training labels. Each scatter call
# otherwise autoscales the colormap to its own labels, so a predicted point
# could get a different colour than training samples of the same class
# whenever the predictions do not contain every class.
class_min, class_max = y.min(), y.max()

# Training samples (small markers)
ax.scatter(
    X[:, 0],
    X[:, 1],
    X[:, 2],
    c=y,
    cmap=plt.cm.Set1,
    vmin=class_min,
    vmax=class_max,
    s=40
)

# Query points (large markers), coloured by their predicted class
ax.scatter(
    pred_pts[:, 0],
    pred_pts[:, 1],
    pred_pts[:, 2],
    c=pred_y,
    cmap=plt.cm.Set1,
    vmin=class_min,
    vmax=class_max,
    s=400
)

ax.set_xlabel("1st")
ax.set_ylabel("2nd")
ax.set_zlabel("3rd ")

plt.show()