# Similarity & Distances

1. cosine similarity
2. Minkowski distance
3. Mahalanobis distance


In [24]:
import pandas as pd
import numpy as np
from numpy import linalg as la

cars = pd.read_csv("https://gist.githubusercontent.com/noamross/e5d3e859aa0c794be10b/raw/b999fb4425b54c63cab088c0ce2c0d6ce961a563/cars.csv")
cars.head()

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


## Cosine Similarity

In [4]:
def cosine_similarity(x, y):
  return np.dot(x, y) / (la.norm(x) * la.norm(y))

In [27]:
disp = cars["disp"]
hp = cars["hp"]

print("(disp, disp):", cosine_similarity(disp, disp))
print("(disp, hp):", cosine_similarity(disp, hp))

(disp, disp): 1.0
(disp, hp): 0.9576399993708385


## Minkowski Distance

In [28]:
def minkowski_distance(x, y, r):
  sum = 0
  for i in range(len(x)):
    sum += np.power(np.abs(x[i] - y[i]), r)
  return np.power(sum, 1/r)

In [35]:
print("(disp, disp, r=2):", minkowski_distance(disp, disp, 2))
print("(disp, hp, r=2):", minkowski_distance(disp, hp, 2))
print("(disp, hp, r=3):", minkowski_distance(disp, hp, 3))
print("(disp, hp, r=100):", minkowski_distance(disp, hp, 100))

(disp, disp, r=2): 0.0
(disp, hp, r=2): 656.6404419467323
(disp, hp, r=3): 433.2535606783471
(disp, hp, r=100): 267.0004920859905


## Mahalanobis Distance

In [36]:
def mahalanobis_distance(x, y, cov):
  return np.power(np.transpose(x - y) @ cov @ (x-y), 0.5)

In [41]:
np.cov(disp, disp)

array([[15360.79982863,  6721.15866935],
       [ 6721.15866935,  4700.86693548]])

In [47]:
cov = np.random.rand(len(disp), len(disp))

print("(disp, disp):", mahalanobis_distance(disp, disp, cov))
print("(disp, hp):", mahalanobis_distance(disp, hp, cov))

(disp, disp): 0.0
(disp, hp): 2001.9557827192898
