# Code Info

- Contributor：datamonday
- Github Repo：https://github.com/datamonday/BigDataAnalysis

In [1]:
import numpy as np
from scipy.spatial.distance import pdist

`numpy.ones(shape, dtype=None, order='C', *, like=None)`

返回给定形状和类型的新数组，并用1填充。

In [34]:
x_ones1 = np.ones([5,2])
x_ones1

array([[1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.]])

In [25]:
y_ones2 = np.ones([5,2])
y_ones2

array([[1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.]])

`numpy.eye(N, M=None, k=0, dtype=<class 'float'>, order='C', *, like=None)`

返回一个二维数组。

- N: 行数
- M: 列数
- k: 对角线索引

In [36]:
x_eye = np.eye(5,5,0)
x_eye

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

In [85]:
x1 = np.random.random(3)
x1

array([0.90275773, 0.10568311, 0.90440239])

In [86]:
y1 = np.random.random(3)
y1

array([0.80022806, 0.92789639, 0.6041757 ])

`numpy.std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=<no value>, *, where=<no value>)`

计算沿指定轴的标准差。

`numpy.sum(a, axis=None, dtype=None, out=None, keepdims=<no value>, initial=<no value>, where=<no value>)`

给定轴上的数组元素的总和。

In [87]:
x2 = np.random.random([3, 2])
x2

array([[0.7382364 , 0.82985292],
       [0.37193289, 0.63203355],
       [0.39068034, 0.70997788]])

In [88]:
y2 = np.random.random([3, 2])
y2

array([[0.72555411, 0.59649138],
       [0.33190826, 0.22485379],
       [0.25404031, 0.50062771]])

In [89]:
x2 - y2

array([[0.01268229, 0.23336154],
       [0.04002463, 0.40717976],
       [0.13664003, 0.20935017]])

In [90]:
np.square(x2 - y2)

array([[1.60840441e-04, 5.44576072e-02],
       [1.60197094e-03, 1.65795354e-01],
       [1.86704989e-02, 4.38274923e-02]])

In [91]:
np.sum(np.square(x2 - y2), axis=0)

array([0.02043331, 0.26408045])

In [92]:
np.sum([[0, 1], [0, 5]], axis=0)

array([0, 6])

In [93]:
np.sum([[0, 1], [0, 5]], axis=1)

array([1, 5])

In [94]:
np.std(x2, axis=0)

array([0.16843229, 0.08136191])

In [95]:
np.square(x1 - y1)

array([0.01051233, 0.67603468, 0.09013606])

In [96]:
np.sum(np.square(x1 - y1))

0.7766830821234479

In [97]:
np.sqrt(np.sum(np.square(x1 - y1)))

0.881296251054915

In [98]:
np.vstack([x2, y2])

array([[0.7382364 , 0.82985292],
       [0.37193289, 0.63203355],
       [0.39068034, 0.70997788],
       [0.72555411, 0.59649138],
       [0.33190826, 0.22485379],
       [0.25404031, 0.50062771]])

In [99]:
np.hstack([x2, y2])

array([[0.7382364 , 0.82985292, 0.72555411, 0.59649138],
       [0.37193289, 0.63203355, 0.33190826, 0.22485379],
       [0.39068034, 0.70997788, 0.25404031, 0.50062771]])

# 1. 闵可夫斯基距离（Minkowski Distance）

In [110]:
x = np.random.random(5)
x

array([0.75173729, 0.34763686, 0.71927609, 0.24151473, 0.22294162])

In [111]:
y = np.random.random(5)
y

array([0.98036113, 0.45482745, 0.87472311, 0.92923963, 0.62922737])

In [180]:
# p = 2 ——> 欧氏距离
pdist(xy, metric="minkowski", p=2)

array([0.85203058])

# 2. 欧氏距离（Euclidean Distance）

- scipy pdist:https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
- sklearn:https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise

种类：
- 基本欧式距离
- 加权欧式距离
- 标准化欧氏距离

In [112]:
# 根据公式求解
np.sqrt(np.sum(np.square(x - y) ) )

0.8520305805970781

In [113]:
# 根据scipy库求解
xy = np.vstack([x, y])
pdist(xy, metric="euclidean")

array([0.85203058])

# 3. 曼哈顿距离（Manhattan/City Block Distance）

In [114]:
np.sum(np.abs(x - y))

1.585272101374208

In [115]:
pdist(xy, metric="cityblock")

array([1.5852721])

# 4. 切比雪夫距离（Chebyshev Distance）

In [117]:
np.max(np.abs(x - y))

0.6877248997688814

In [116]:
pdist(xy, metric="chebyshev")

array([0.6877249])

# 5. 余弦相似度（Cosine Similarity）

Numpy linear algebra：https://numpy.org/doc/stable/reference/routines.linalg.html

linalg.norm:

![image.png](attachment:image.png)

In [118]:
np.dot(x, y) / ( np.linalg.norm(x) * np.linalg.norm(y) )

0.9232011981703329

In [120]:
1 - pdist(xy, metric="cosine")

array([0.9232012])

# 6. 汉明距离（Hamming Distance）

In [121]:
np.mean( x != y )

1.0

In [122]:
pdist(xy, metric="hamming")

array([1.])

In [134]:
# 以字符串格式返回输入数据的二进制编码
bin_x = np.binary_repr(145)
bin_x

'10010001'

In [133]:
bin_y = np.binary_repr(150)
bin_y

'10010110'

In [135]:
np.mean(bin_x != bin_y)

1.0

# 7. 杰卡德距离（Jaccard Distance）

In [125]:
molecular = np.double( (x != y).sum() )
denominator = np.double(np.bitwise_or( x != 0, y != 0).sum() )

molecular / denominator

1.0

In [136]:
pdist(xy, metric="jaccard")

array([1.])

# 8.S$\Phi$rensen-Dice

In [152]:
pdist(xy, metric="dice")

array([0.])

# 9. 半正矢距离（Haversine Distance）

- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.haversine_distances.html

In [138]:
"""
计算Ezeiza机场（阿根廷布宜诺斯艾利斯）和戴高乐机场（法国巴黎）之间的距离。
"""

from sklearn.metrics.pairwise import haversine_distances
from math import radians

bsas = [-34.83333, -58.5166646]
paris = [49.0083899664, 2.53844117956]

bsas_in_radians = [radians(_) for _ in bsas]
paris_in_radians = [radians(_) for _ in paris]

result = haversine_distances([bsas_in_radians, paris_in_radians])
# multiply by Earth radius to get kilometers
result * 6371000/1000

array([[    0.        , 11099.54035582],
       [11099.54035582,     0.        ]])

# 10. 斜交空间距离（Oblique Space Distance）

# 11. 兰氏距离（Canberra Distance）

In [153]:
np.sum( np.true_divide( np.abs(x - y), np.abs(x) + np.abs(y) ) )

1.4272762731136441

In [154]:
pdist(xy, metric="canberra")

array([1.42727627])

# 12. 马氏距离（Mahalanobis Distance）

马氏距离要求样本个数＞维数，此处重新生成样本集：10个样本，2个属性

马氏距离计算两两样本之间的距离，故结果包含：$C^{2}_{10} = 45$ 个距离

In [171]:
data = np.random.random([10, 2])
data

array([[0.16057991, 0.03173777],
       [0.04984203, 0.63608966],
       [0.0965663 , 0.54125706],
       [0.14562222, 0.50749436],
       [0.12384608, 0.66895134],
       [0.38362246, 0.96750912],
       [0.66204458, 0.34832719],
       [0.62169272, 0.76812896],
       [0.55320254, 0.59736334],
       [0.53135375, 0.97430267]])

In [172]:
data.shape

(10, 2)

In [173]:
# 求解个维度之间协方差矩阵
S = np.cov(data.T)
# 计算协方差矩阵的逆矩阵
ST = np.linalg.inv(S)
ST

array([[18.39262731, -4.22549979],
       [-4.22549979, 13.68987876]])

In [174]:
ST.shape

(2, 2)

In [175]:
n = data.shape[0]
d1 = []

for i in range(0, n):
    for j in range(i + 1, n):
        delta = data[i] - data[j]
        d = np.sqrt( np.dot( np.dot(delta, ST), delta.T) )
        d1.append(d)

d1

[2.4064983868149823,
 1.9761163000756812,
 1.778448926503528,
 2.404430793536302,
 3.3375019493285927,
 2.1576814382238196,
 2.9094250412104405,
 2.3104822379986585,
 3.426008151540264,
 0.4480137866843753,
 0.7065459737678685,
 0.30815685501580464,
 1.618002146140689,
 3.0847744520164553,
 2.3696411013587313,
 2.2012364723722557,
 2.1104688855720037,
 0.2717792884385083,
 0.4554926318973598,
 1.7230353296945067,
 2.7042335514201556,
 2.183968292942155,
 1.9135693479813816,
 2.1102154148029593,
 0.6287347650129346,
 1.735958615801837,
 2.438573435905017,
 2.012440681515558,
 1.6900976084983395,
 2.048918972209157,
 1.3438862451280977,
 2.862373485815439,
 2.067854534269353,
 1.928870122984677,
 1.8108503250774675,
 2.851523901145603,
 1.409891022070067,
 1.7131869461579778,
 0.6273442013634126,
 1.608018006961265,
 1.1384164544766362,
 2.5238527095532692,
 0.6218088758251554,
 0.94309743685501,
 1.422491582300723]

In [178]:
len(d1)  # 10个里选两个计算距离，相当于组合问题，共计算45个

45

In [179]:
pdist(data, metric="mahalanobis")

array([2.40649839, 1.9761163 , 1.77844893, 2.40443079, 3.33750195,
       2.15768144, 2.90942504, 2.31048224, 3.42600815, 0.44801379,
       0.70654597, 0.30815686, 1.61800215, 3.08477445, 2.3696411 ,
       2.20123647, 2.11046889, 0.27177929, 0.45549263, 1.72303533,
       2.70423355, 2.18396829, 1.91356935, 2.11021541, 0.62873477,
       1.73595862, 2.43857344, 2.01244068, 1.69009761, 2.04891897,
       1.34388625, 2.86237349, 2.06785453, 1.92887012, 1.81085033,
       2.8515239 , 1.40989102, 1.71318695, 0.6273442 , 1.60801801,
       1.13841645, 2.52385271, 0.62180888, 0.94309744, 1.42249158])