# **PyDistances**

## **Requirements**

In [None]:
pip install PyDistances --upgrade

In [63]:
import polars as pl
import numpy as np
from PyMachineLearning.preprocessing import encoder
from PyDistances.quantitative import (Euclidean_dist, Euclidean_dist_matrix,
                                      Minkowski_dist, Minkowski_dist_matrix,
                                      Canberra_dist, Canberra_dist_matrix,
                                      Pearson_dist_matrix,
                                      Mahalanobis_dist, Mahalanobis_dist_matrix)
from PyDistances.binary import (Sokal_dist, Sokal_dist_matrix,
                                Jaccard_dist, Jaccard_dist_matrix)

## **Data processing**

In [34]:
# Load the listings and drop the columns this walkthrough does not use.
columns_to_exclude = ['id','sq_mt_allotment','floor', 'neighborhood', 'district']
madrid_houses_df = (
    pl.read_csv('madrid_houses.csv')
      .select(pl.exclude(columns_to_exclude))
)

# Variable groups: binary flags, multi-class categoricals, and the remainder (quantitative).
binary_cols = ['is_renewal_needed', 'has_lift', 'is_exterior', 'has_parking']
multi_cols = ['energy_certificate', 'house_type']
cat_cols = binary_cols + multi_cols
# NOTE(review): every column that is neither excluded nor categorical lands in
# `quant_cols` -- including the CSV's unnamed leading column, which looks like a
# saved row index. It then takes part in every quantitative distance computed
# below; confirm that this is intended or drop that column.
quant_cols = [name for name in madrid_houses_df.columns if name not in cat_cols]

# Encode the categorical columns with the project's encoder (method='ordinal')
# and store the resulting codes as Int64 under the original column names.
encoder_ = encoder(method='ordinal')
encoded_arr = encoder_.fit_transform(madrid_houses_df.select(cat_cols))
cat_df = pl.DataFrame(encoded_arr)
cat_df.columns = cat_cols
cat_df = cat_df.with_columns(pl.all().cast(pl.Int64))

quant_df = madrid_houses_df.select(quant_cols)

# Final frame: quantitative columns first, followed by the encoded categorical ones.
madrid_houses_df = pl.concat([quant_df, cat_df], how='horizontal')

In [35]:
# Preview the first five rows of the processed frame.
madrid_houses_df.head(5)

Unnamed: 0_level_0,sq_mt_built,n_rooms,n_bathrooms,n_floors,buy_price,is_renewal_needed,has_lift,is_exterior,has_parking,energy_certificate,house_type
i64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
0,64.0,2,1,1,85000,0,0,1,0,4,0
1,70.0,3,1,1,129900,1,1,1,0,0,0
2,94.0,2,2,1,144247,0,1,1,0,0,0
3,64.0,2,1,1,109900,0,1,1,0,0,0
4,108.0,2,2,1,260000,0,1,1,1,0,0


In [37]:
# (number of rows, number of columns) of the processed frame.
(madrid_houses_df.height, madrid_houses_df.width)

(21739, 12)

## **Classic distances**

### **Quantitative data**

#### **Euclidean**  



##### **Using a data set as input**


In [38]:
# Euclidean distance matrix computed over the quantitative columns.
# NOTE(review): with 21,739 rows, a dense float64 n-by-n result would take
# roughly 3.8 GB -- consider demonstrating on a sample if memory is a concern.
Euclidean_dist_matrix(X=madrid_houses_df.select(quant_cols))

array([[     0.        ,  44900.00042316,  59247.00763752, ...,
        595397.00742194, 610387.31821443, 339696.43970168],
       [ 44900.00042316,      0.        ,  14347.02017842, ...,
        550529.34678453, 565518.02985493, 294902.42048176],
       [ 59247.00763752,  14347.02017842,      0.        , ...,
        536193.79028762, 551181.86262521, 280596.37491244],
       ...,
       [595397.00742194, 550529.34678453, 536193.79028762, ...,
             0.        ,  15000.43339374, 256000.02073633],
       [610387.31821443, 565518.02985493, 551181.86262521, ...,
         15000.43339374,      0.        , 271000.08689851],
       [339696.43970168, 294902.42048176, 280596.37491244, ...,
        256000.02073633, 271000.08689851,      0.        ]])


##### **Using a pair of observations as input**

In [46]:
# Take rows 0 and 2 of the quantitative columns as flat 1-D NumPy vectors.
# The names are 1-based observation labels: x1 is row 0, x3 is row 2.
quant_rows = madrid_houses_df.select(quant_cols)
x1, x3 = (quant_rows[row, :].to_numpy().flatten() for row in (0, 2))

In [47]:
# Euclidean distance between x1 and x3 (rows 0 and 2 of the quantitative columns).
Euclidean_dist(x1, x3)

59247.00763751702

#### **Minkowski**  



##### **Using a data set as input**


In [55]:
# Minkowski distance matrix over the quantitative columns with q=1.
Minkowski_dist_matrix(X=madrid_houses_df.select(quant_cols), q=1)

array([[     0.,  44908.,  59280., ..., 616853., 631971., 360750.],
       [ 44908.,      0.,  14374., ..., 571945., 587063., 315844.],
       [ 59280.,  14374.,      0., ..., 557573., 572691., 301514.],
       ...,
       [616853., 571945., 557573., ...,      0.,  15118., 256107.],
       [631971., 587063., 572691., ...,  15118.,      0., 271223.],
       [360750., 315844., 301514., ..., 256107., 271223.,      0.]])

In [56]:
# Minkowski distance matrix over the quantitative columns with q=3.
Minkowski_dist_matrix(X=madrid_houses_df.select(quant_cols), q=3)

array([[     0.        ,  44900.00000004,  59247.00000256, ...,
        595009.67289557, 610009.20431817, 339029.80435414],
       [ 44900.00000004,      0.        ,  14347.00002239, ...,
        550111.31475944, 565110.72356086, 294139.59212861],
       [ 59247.00000256,  14347.00002239,      0.        , ...,
        535764.92720523, 550764.28795236, 279796.75019685],
       ...,
       [595009.67289557, 550111.31475944, 535764.92720523, ...,
             0.        ,  15000.00219489, 256000.00000556],
       [610009.20431817, 565110.72356086, 550764.28795236, ...,
         15000.00219489,      0.        , 271000.00004638],
       [339029.80435414, 294139.59212861, 279796.75019685, ...,
        256000.00000556, 271000.00004638,      0.        ]])


##### **Using a pair of observations as input**

In [49]:
# Minkowski distance between x1 and x3 with q=1 (the Manhattan / city-block case).
Minkowski_dist(x1, x3, q=1)

59280.0

In [51]:
# Minkowski distance between x1 and x3 with q=2 (the Euclidean case).
Minkowski_dist(x1, x3, q=2)

59247.00763751702

In [50]:
# Minkowski distance between x1 and x3 with q=3.
Minkowski_dist(x1, x3, q=3)

59247.00000256477

#### **Canberra**  



##### **Using a data set as input**


In [57]:
# Canberra distance matrix computed over the quantitative columns.
Canberra_dist_matrix(X=madrid_houses_df.select(quant_cols))

array([[0.        , 1.45371051, 1.78164852, ..., 2.90887959, 3.75277838,
        2.05816865],
       [1.45371051, 0.        , 1.06534137, ..., 2.58388957, 3.43784574,
        2.07828812],
       [1.78164852, 1.06534137, 0.        , ..., 2.28425543, 3.19853435,
        1.62465496],
       ...,
       [2.90887959, 2.58388957, 2.28425543, ..., 0.        , 0.95662175,
        0.98226744],
       [3.75277838, 3.43784574, 3.19853435, ..., 0.95662175, 0.        ,
        1.87664488],
       [2.05816865, 2.07828812, 1.62465496, ..., 0.98226744, 1.87664488,
        0.        ]])


##### **Using a pair of observations as input**

In [58]:
# Canberra distance between x1 and x3 (rows 0 and 2 of the quantitative columns).
Canberra_dist(x1, x3)

1.7816485191041616

#### **Pearson**  



##### **Using a data set as input**


In [59]:
# Pearson distance matrix computed over the quantitative columns.
Pearson_dist_matrix(X=madrid_houses_df.select(quant_cols))

array([[0.        , 0.66557807, 0.73750844, ..., 3.90977907, 5.11755758,
        3.56262794],
       [0.66557807, 0.        , 0.98231537, ..., 3.71998761, 4.96877334,
        3.61675061],
       [0.73750844, 0.98231537, 0.        , ..., 3.80227609, 4.9086971 ,
        3.48434077],
       ...,
       [3.90977907, 3.71998761, 3.80227609, ..., 0.        , 2.98830223,
        1.50055628],
       [5.11755758, 4.96877334, 4.9086971 , ..., 2.98830223, 0.        ,
        3.4727102 ],
       [3.56262794, 3.61675061, 3.48434077, ..., 1.50055628, 3.4727102 ,
        0.        ]])

#### **Mahalanobis**  



##### **Using a data set as input**


In [60]:
# Mahalanobis distance matrix over the quantitative columns; only the data is
# passed here (no covariance argument).
Mahalanobis_dist_matrix(X=madrid_houses_df.select(quant_cols))

array([[0.        , 0.98311741, 1.35353199, ..., 3.77794957, 4.77402826,
        3.668157  ],
       [0.98311741, 0.        , 2.03440221, ..., 3.57947183, 4.79390382,
        3.93719195],
       [1.35353199, 2.03440221, 0.        , ..., 4.06604364, 5.0011229 ,
        3.49628828],
       ...,
       [3.77794957, 3.57947183, 4.06604364, ..., 0.        , 3.29825661,
        2.08532502],
       [4.77402826, 4.79390382, 5.0011229 , ..., 3.29825661, 0.        ,
        3.32955171],
       [3.668157  , 3.93719195, 3.49628828, ..., 2.08532502, 3.32955171,
        0.        ]])


##### **Using a pair of observations as input**

In [64]:
# Covariance matrix of the quantitative columns; rowvar=False tells NumPy that
# each column is a variable and each row an observation.
S = np.cov(madrid_houses_df.select(quant_cols), rowvar=False)

In [65]:
# Mahalanobis distance between x1 and x3, passing the covariance matrix S.
# NOTE(review): the value displayed for this cell (~4.6e10) does not agree with
# entry [0, 2] of the Mahalanobis distance matrix shown earlier (~1.35). Check
# what `S` is expected to be (covariance vs. its inverse) and the installed
# PyDistances version before relying on this result.
Mahalanobis_dist(x1, x3, S=S)

46329153762.43836

### **Binary data**

#### **Sokal**  



##### **Using a data set as input**



##### **Using a pair of observations as input**

#### **Jaccard**  



##### **Using a data set as input**



##### **Using a pair of observations as input**

### **Multiclass data**

#### **Matching**  



##### **Using a data set as input**



##### **Using a pair of observations as input**

## **New proposals**

### **Quantitative data**

#### **Robust Mahalanobis**  



##### **Using a data set as input**



##### **Using a pair of observations as input**

### **Mixed data**

#### **Generalized Gower**  

##### **Using a data set as input**

##### **Using a pair of observations as input**

#### **Related Metric Scaling**  

##### **Using a data set as input**

##### **Using a pair of observations as input**