# Similarity & Distances

1. cosine similarity
2. Minkowski distance
3. Mahalanobis distance
4. Simple Matching Coefficients
5. Jaccard Coefficients
6. Correlation
7. Entropy
8. Mutual Information

In [1]:
import pandas as pd
import numpy as np
from numpy import linalg as la

cars = pd.read_csv("https://gist.githubusercontent.com/noamross/e5d3e859aa0c794be10b/raw/b999fb4425b54c63cab088c0ce2c0d6ce961a563/cars.csv")
cars.head()

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


## Cosine Similarity

$$
\cos(\mathbf{x}, \mathbf{y}) = \frac{\mathbf{x} \cdot \mathbf{y}}{\left\| \mathbf{x} \right\| \left\| \mathbf{y} \right\|}
$$

In [2]:
def cosine_similarity(x, y):
  return np.dot(x, y) / (la.norm(x) * la.norm(y))

In [3]:
disp = cars["disp"]
hp = cars["hp"]

print("(disp, disp):", cosine_similarity(disp, disp))
print("(disp, hp):", cosine_similarity(disp, hp))

(disp, disp): 1.0
(disp, hp): 0.9576399993708385


## Minkowski Distance

$$
d(\mathbf{x}, \mathbf{y}) = \left( \displaystyle \sum^n_{k=1} \left| x_k - y_k \right|^r \right)^{1/r}
$$

In [4]:
def minkowski_distance(x, y, r):
  sum = 0
  for i in range(len(x)):
    sum += np.power(np.abs(x[i] - y[i]), r)
  return np.power(sum, 1/r)

In [5]:
print("(disp, disp, r=2):", minkowski_distance(disp, disp, 2))
print("(disp, hp, r=2):", minkowski_distance(disp, hp, 2))
print("(disp, hp, r=3):", minkowski_distance(disp, hp, 3))
print("(disp, hp, r=100):", minkowski_distance(disp, hp, 100))

(disp, disp, r=2): 0.0
(disp, hp, r=2): 656.6404419467323
(disp, hp, r=3): 433.2535606783471
(disp, hp, r=100): 267.0004920859905


## Mahalanobis Distance

$$
d(\mathbf{x}, \mathbf{y}) = \left( (\mathbf{x} - \mathbf{y})^T \Sigma^{-1} (\mathbf{x} - \mathbf{y})\right)^{1/2}
$$

In [6]:
def mahalanobis_distance(x, y, cov):
  return np.power(np.transpose(x - y) @ cov @ (x-y), 0.5)

In [7]:
np.cov(disp, disp)

array([[15360.79982863, 15360.79982863],
       [15360.79982863, 15360.79982863]])

In [8]:
cov = np.random.rand(len(disp), len(disp))

print("(disp, disp):", mahalanobis_distance(disp, disp, cov))
print("(disp, hp):", mahalanobis_distance(disp, hp, cov))

(disp, disp): 0.0
(disp, hp): 1888.37286086607


### SMC; Simple Matching Coefficients

$$
\text{SMC} = \frac{f_{11} + f_{00}}{f_{01} + f_{10} + f_{11} + f_{00}}
$$

In [9]:
def SMC(x, y):
  return np.sum(x == y) / len(x)

In [10]:
x = np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
y = np.array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1])

SMC(x, y)

0.7

## Jaccard Coefficients

$$
J = \frac{f_{11}}{f_{01} + f_{10} + f_{11}}
$$

In [11]:
def Jaccard(x, y):
  f_11 = 0
  f_00 = 0
  for i in range(len(x)):
    if x[i] == y[i] == 1:
      f_11 += 1
    if x[i] == 0 and y[i] == 0:
      f_00 += 1
  return f_11 / (len(x) - f_00)

In [12]:
x = np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
y = np.array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1])

Jaccard(x, y)

0.0

## Correlation

$$
\text{Corr}(x, y) = \frac{\text{Cov}(x, y)}{\sigma_x \cdot \sigma_y}
$$

In [13]:
def Correlation(x, y):
  n = len(x)
  x_m, y_m = np.mean(x), np.mean(y)
  cov = np.sum((x-x_m) * (y-y_m)) / n
  return cov / (np.std(x) * np.std(y))

In [14]:
disp = cars["disp"]
hp = cars["hp"]

print("(disp, disp):", Correlation(disp, disp))
print("(disp, hp):", Correlation(disp, hp))

(disp, disp): 1.0
(disp, hp): 0.7909485863698065


## Entropy

$$
H(X) = - \sum^n_{i=1} p_i \log_2 p_i
$$

이때, entropy의 단위는 'bit'이다. 엔트로피가 클수록 불확실성이 크다는 것을 의미한다. 👉 ['Skywalk'님의 포스트](https://hyunw.kim/blog/2017/10/14/Entropy.html)를 참고하길 바란다.

In [15]:
def Entropy(x):
  Hx, n = 0, len(x)
  x = pd.value_counts(x.values, sort=False)
  for xi in x:
    pi = xi / n
    Hx -= pi * np.log2(pi)
  return Hx

In [16]:
gear = cars["gear"]

Entropy(gear)

1.4614828307119232

## Mutual Information

$$
I(X, Y) = H(X) + H(Y) - H(X, Y)
$$

where $H(X, Y)$ is the joint entropy of $X$ and $Y$,

$$
H(X, Y) = - \sum_i \sum_j p_{ij} \log_2 p_{ij}
$$

In [17]:
def Mutual_Info(x, y, xy):
  Hx = Entropy(x)
  Hy = Entropy(y)
  Hxy = Entropy(xy.value_counts())

  return Hx + Hy - Hxy


In [18]:
gear = cars["gear"]
carb = cars["carb"]
gear_carb = cars[["gear", "carb"]]

Mutual_Info(gear, carb, gear_carb)

1.5045004384149032