- https://stackoverflow.com/questions/34144632/using-cosine-distance-with-scikit-learn-kneighborsclassifier
You can get the same ordering as the cosine distance by normalizing your data and then using the Euclidean distance. As long as you use the uniform weights option, the results will be identical to those obtained with a true cosine distance.


- https://github.com/stanfordnlp/GloVe/blob/765074642a6544e47849bb85d8dc2e11e44c2922/eval/python/evaluate.py#L75-L76
```
#cosine similarity if input W has been normalized
dist = np.dot(W, pred_vec.T)
```

In [31]:
import pandas as pd

import numpy as np
from sklearn import preprocessing
from importlib import reload
from sklearn.metrics.pairwise import cosine_similarity

### normalize in sklearn

In [25]:
# Small 3x3 example matrix (3 samples, 3 features).
X = [[ 1., -1.,  2.],
     [ 2.,  0.,  0.],
     [ 0.,  1., -1.]]
# L2-normalize with the default axis; the result shown below has unit-length
# rows (e.g. the second row becomes [1, 0, 0]).
X_normalized = preprocessing.normalize(X, norm='l2')

X_normalized                                      

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [26]:
# Explicit axis=1 scales each row (sample) to unit L2 norm; the output is the
# same as the call that omits `axis`.
X_normalized = preprocessing.normalize(X, norm='l2', axis=1)

X_normalized                                      


array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [27]:
# axis=0 scales each column (feature) to unit L2 norm instead of each row.
# NOTE: this overwrites X_normalized with the column-normalized matrix.
X_normalized = preprocessing.normalize(X, norm='l2', axis=0)

X_normalized                                      


array([[ 0.4472136 , -0.70710678,  0.89442719],
       [ 0.89442719,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.4472136 ]])

### cosine similarity in sklearn

In [139]:
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

import numpy as np

In [140]:
# `scipy.spatial.distance.cosine` returns a *distance* (1 - similarity) and is
# specified for 1-D vectors, so flatten the row vectors before calling it.
# sklearn's `cosine_similarity` takes the 2-D (1, n_features) arrays as-is and
# returns a (1, 1) matrix, hence the [0][0].
x = np.array([2, 2]).reshape(1, -1)
y = np.array([2, 0]).reshape(1, -1)
1 - cosine(x.ravel(), y.ravel()), cosine_similarity(x, y)[0][0]

(0.7071067811865475, 0.7071067811865475)

In [141]:
# Same comparison with y pointing the opposite way along the first axis, so
# the similarity comes out negative.
# `scipy.spatial.distance.cosine` is specified for 1-D vectors, so flatten the
# row vectors for it; sklearn's `cosine_similarity` takes the 2-D arrays.
x = np.array([2, 2]).reshape(1, -1)
y = np.array([-2, 0]).reshape(1, -1)
1 - cosine(x.ravel(), y.ravel()), cosine_similarity(x, y)[0][0]

(-0.7071067811865475, -0.7071067811865475)

### normalize vs. cosine

In [185]:
# Two random (n_samples, n_features) matrices. X is drawn from uniform(0, 2)
# and Y from uniform(-1, 3), so Y may contain negative entries.
n_samples, n_features = 10, 5
X = np.random.uniform(low=0, high=2, size=(n_samples, n_features))
Y = np.random.uniform(low=-1, high=3, size=(n_samples, n_features))
# Pairwise cosine similarity between every row of X and every row of Y.
cosine_similarity(X, Y)

array([[ 0.68771139,  0.93969998,  0.65501864,  0.44625405,  0.51352883,
         0.51187231,  0.67474548,  0.65198421,  0.70943913,  0.17337859],
       [ 0.68523307,  0.51517104,  0.94933607,  0.58906468,  0.66645032,
         0.81807049,  0.81605257,  0.6692951 ,  0.47534416,  0.07102817],
       [ 0.81552982,  0.62614858,  0.80474978,  0.72622434,  0.79282841,
         0.68712266,  0.93384894,  0.88353535,  0.48636079, -0.12897101],
       [ 0.81456569,  0.62393981,  0.62419184,  0.61199863,  0.62328067,
         0.45731574,  0.7890675 ,  0.84924692,  0.30407625,  0.0299158 ],
       [ 0.71150164,  0.76256653,  0.6463674 ,  0.56068448,  0.61845933,
         0.76510445,  0.68586243,  0.40712186,  0.89894567,  0.44975326],
       [ 0.71119122,  0.73698909,  0.81992195,  0.59347563,  0.6811928 ,
         0.7560881 ,  0.80738022,  0.65442807,  0.71344781,  0.09604695],
       [ 0.8811539 ,  0.81300764,  0.62274431,  0.7332077 ,  0.77853851,
         0.71260213,  0.86048995,  0.67477185

In [186]:
# A raw dot product only equals cosine similarity when the rows are unit-norm;
# X and Y are not normalized here, so this is expected to be False.
# Compare against cosine_similarity(X, Y) directly so the cell does not depend
# on X_normalized / Y_normalized, which are only computed in a later cell.
np.allclose(np.dot(X, Y.T), cosine_similarity(X, Y))

False

In [187]:
# L2-normalize X and Y, then check the per-row norms of the normalized X
# (they should all be 1).
X_normalized, Y_normalized = (
    preprocessing.normalize(matrix, norm='l2') for matrix in (X, Y)
)

np.linalg.norm(X_normalized, axis=1)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [188]:
# Cosine similarity of the L2-normalized matrices; the values match the
# similarity computed from the un-normalized X and Y.
cosine_similarity(X_normalized, Y_normalized)

array([[ 0.68771139,  0.93969998,  0.65501864,  0.44625405,  0.51352883,
         0.51187231,  0.67474548,  0.65198421,  0.70943913,  0.17337859],
       [ 0.68523307,  0.51517104,  0.94933607,  0.58906468,  0.66645032,
         0.81807049,  0.81605257,  0.6692951 ,  0.47534416,  0.07102817],
       [ 0.81552982,  0.62614858,  0.80474978,  0.72622434,  0.79282841,
         0.68712266,  0.93384894,  0.88353535,  0.48636079, -0.12897101],
       [ 0.81456569,  0.62393981,  0.62419184,  0.61199863,  0.62328067,
         0.45731574,  0.7890675 ,  0.84924692,  0.30407625,  0.0299158 ],
       [ 0.71150164,  0.76256653,  0.6463674 ,  0.56068448,  0.61845933,
         0.76510445,  0.68586243,  0.40712186,  0.89894567,  0.44975326],
       [ 0.71119122,  0.73698909,  0.81992195,  0.59347563,  0.6811928 ,
         0.7560881 ,  0.80738022,  0.65442807,  0.71344781,  0.09604695],
       [ 0.8811539 ,  0.81300764,  0.62274431,  0.7332077 ,  0.77853851,
         0.71260213,  0.86048995,  0.67477185

In [189]:
# Cosine similarity is scale-invariant: normalizing the rows does not change it.
np.allclose(cosine_similarity(X, Y), cosine_similarity(X_normalized, Y_normalized))

True

In [190]:
# For unit-norm rows, the plain dot product equals the cosine similarity.
np.allclose(np.dot(X_normalized, Y_normalized.T), cosine_similarity(X_normalized, Y_normalized))

True

For unit-norm vectors, ||x − y||² = ||x||² + ||y||² − 2 x^T y = 2 − 2 x^T y, so the Euclidean distance reduces to sqrt(2 − 2 x^T y), i.e., sqrt(2 - 2*cosine_sim).

In [195]:
# Euclidean distance between the first normalized rows of X and Y, computed
# by hand (sqrt of summed squared differences) and via np.linalg.norm.
np.sqrt(sum((X_normalized[0] - Y_normalized[0])**2)), np.linalg.norm(X_normalized[0] - Y_normalized[0])

(0.7903019787878622, 0.7903019787878622)

In [196]:
# Same distance via sqrt(2 - 2 * x.y), which holds because both vectors are
# unit-norm; the two values agree up to floating-point rounding.
np.sqrt(2 - 2 * np.dot(X_normalized[0], Y_normalized[0])), np.linalg.norm(X_normalized[0] - Y_normalized[0])

(0.7903019787878621, 0.7903019787878622)

In [197]:
# Same identity using cosine_similarity; each vector is reshaped to (1, -1)
# to form a 2-D single-row input, and the result is a (1, 1) array.
np.sqrt(2 - 2 * cosine_similarity(X_normalized[0].reshape(1, -1), Y_normalized[0].reshape(1, -1)))

array([[0.79030198]])