# **PyDistances**

## **Requirements**

In [None]:
pip install PyDistances --upgrade

In [2]:
import polars as pl
import numpy as np
from PyMachineLearning.preprocessing import encoder
from PyDistances.quantitative import (Euclidean_dist, Euclidean_dist_matrix,
                                      Minkowski_dist, Minkowski_dist_matrix,
                                      Canberra_dist, Canberra_dist_matrix,
                                      Pearson_dist_matrix,
                                      Mahalanobis_dist, Mahalanobis_dist_matrix,
                                      Robust_Maha_dist, Robust_Maha_dist_matrix)
from PyDistances.binary import (Sokal_dist, Sokal_dist_matrix,
                                Jaccard_dist, Jaccard_dist_matrix)
from PyDistances.multiclass import (Matching_dist, Matching_dist_matrix)
from PyDistances.mixed import (GG_dist, GG_dist_matrix, RelMS_dist_matrix, S_robust)

## **Data processing**

In [3]:
madrid_houses_df = pl.read_csv('madrid_houses.csv')
columns_to_exclude = ['', 'id','sq_mt_allotment','floor', 'neighborhood', 'district'] 
madrid_houses_df = madrid_houses_df.select(pl.exclude(columns_to_exclude))

binary_cols = ['is_renewal_needed', 'has_lift', 'is_exterior', 'has_parking']
multi_cols = ['energy_certificate', 'house_type']
quant_cols = [x for x in madrid_houses_df.columns if x not in binary_cols + multi_cols]

encoder_ = encoder(method='ordinal')
encoded_arr = encoder_.fit_transform(madrid_houses_df[binary_cols + multi_cols])
cat_df = pl.DataFrame(encoded_arr)
cat_df.columns =  binary_cols + multi_cols
cat_df = cat_df.with_columns([pl.col(col).cast(pl.Int64) for col in cat_df.columns])
quant_df = madrid_houses_df[quant_cols]

madrid_houses_df = pl.concat([quant_df, cat_df], how='horizontal')

In [4]:
madrid_houses_df.head()

sq_mt_built,n_rooms,n_bathrooms,n_floors,buy_price,is_renewal_needed,has_lift,is_exterior,has_parking,energy_certificate,house_type
f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
64.0,2,1,1,85000,0,0,1,0,4,0
70.0,3,1,1,129900,1,1,1,0,0,0
94.0,2,2,1,144247,0,1,1,0,0,0
64.0,2,1,1,109900,0,1,1,0,0,0
108.0,2,2,1,260000,0,1,1,1,0,0


In [5]:
madrid_houses_df.shape

(21739, 11)

## **Classic distances**

### **Quantitative data**

#### **Euclidean**  



##### **Using data-set as input**


In [None]:
def Euclidean_dist_matrix(X):
    
    Calculates the Euclidean distance matrix for a data matrix using SciPy.

    Parameters (inputs)
    ----------
    X: a Pandas or Polars DataFrame or a NumPy array. It represents a data matrix.

    Returns (outputs)
    ----------
    M: the Euclidean distance matrix between the rows of 'X'.

In [6]:
Euclidean_dist_matrix(X=madrid_houses_df[quant_cols])

array([[     0.        ,  44900.00041203,  59247.00760376, ...,
        595000.01035798, 610000.04150574, 339000.00009587],
       [ 44900.00041203,      0.        ,  14347.02014357, ...,
        550100.01002272, 565100.04244381, 294100.0000102 ],
       [ 59247.00760376,  14347.02014357,      0.        , ...,
        535753.00612689, 550753.03452909, 279753.00086505],
       ...,
       [595000.01035798, 550100.01002272, 535753.00612689, ...,
             0.        ,  15000.43336041, 256000.02072851],
       [610000.04150574, 565100.04244381, 550753.03452909, ...,
         15000.43336041,      0.        , 271000.08689667],
       [339000.00009587, 294100.0000102 , 279753.00086505, ...,
        256000.02072851, 271000.08689667,      0.        ]])


##### **Using couple of observations as input**  

In [7]:
x1 = madrid_houses_df[quant_cols][0,:]
x3 = madrid_houses_df[quant_cols][2,:]

In [8]:
Euclidean_dist(x1, x3)

59247.00760376004

#### **Minkowski**  



##### **Using data-set as input**


In [None]:
def Minkowski_dist_matrix(X, q):
    
    Calculates the Minkowski distance matrix for a data matrix using SciPy.

    Parameters (inputs)
    ----------
    X: a Pandas or Polars DataFrame or a NumPy array. It represents a data matrix.
    q: the parameters that defines the Minkowski form. Some particular cases: q=1 := Manhattan, q=2 := Euclidean.

    Returns (outputs)
    ----------
    M: the Minkowski distance matrix between the rows of 'X'.

In [9]:
Minkowski_dist_matrix(X=madrid_houses_df[quant_cols], q=1)

array([[     0.,  44907.,  59278., ..., 595114., 610231., 339009.],
       [ 44907.,      0.,  14373., ..., 550207., 565324., 294104.],
       [ 59278.,  14373.,      0., ..., 535836., 550953., 279775.],
       ...,
       [595114., 550207., 535836., ...,      0.,  15117., 256105.],
       [610231., 565324., 550953., ...,  15117.,      0., 271222.],
       [339009., 294104., 279775., ..., 256105., 271222.,      0.]])

In [10]:
Minkowski_dist_matrix(X=madrid_houses_df[quant_cols], q=3)

array([[     0.        ,  44900.00000004,  59247.00000256, ...,
        595000.00000129, 610000.0000102 , 339000.        ],
       [ 44900.00000004,      0.        ,  14347.00002239, ...,
        550100.00000127, 565100.00001096, 294100.        ],
       [ 59247.00000256,  14347.00002239,      0.        , ...,
        535753.00000062, 550753.00000815, 279753.00000005],
       ...,
       [595000.00000129, 550100.00000127, 535753.00000062, ...,
             0.        ,  15000.00219489, 256000.00000556],
       [610000.0000102 , 565100.00001096, 550753.00000815, ...,
         15000.00219489,      0.        , 271000.00004638],
       [339000.        , 294100.        , 279753.00000005, ...,
        256000.00000556, 271000.00004638,      0.        ]])


##### **Using couple of observations as input**  

In [11]:
Minkowski_dist(x1, x3, q=1)

59278.0

In [12]:
Minkowski_dist(x1, x3, q=2)

59247.00760376004

In [13]:
Minkowski_dist(x1, x3, q=3)

59247.00000256401

#### **Canberra**  



##### **Using data-set as input**


In [None]:
def Canberra_dist_matrix(X):
    
    Calculates the Canberra distance matrix for a data matrix using SciPy.

    Parameters (inputs)
    ----------
    X: a Pandas or Polars DataFrame or a NumPy array. It represents a data matrix.

    Returns (outputs)
    ----------
    M: the Canberra distance matrix between the rows of 'X'.

In [14]:
Canberra_dist_matrix(X=madrid_houses_df[quant_cols])

array([[0.        , 0.45371051, 0.78164852, ..., 1.90887959, 2.75277838,
        1.05816865],
       [0.45371051, 0.        , 0.73200803, ..., 1.58398156, 2.43793773,
        1.07838011],
       [0.78164852, 0.73200803, 0.        , ..., 1.28443942, 2.19871833,
        0.62483892],
       ...,
       [1.90887959, 1.58398156, 1.28443942, ..., 0.        , 0.95659875,
        0.98222144],
       [2.75277838, 2.43793773, 2.19871833, ..., 0.95659875, 0.        ,
        1.87662188],
       [1.05816865, 1.07838011, 0.62483892, ..., 0.98222144, 1.87662188,
        0.        ]])


##### **Using couple of observations as input**  

In [15]:
Canberra_dist(x1, x3)

0.7816485191041616

#### **Pearson**  



##### **Using data-set as input**


In [None]:
def Pearson_dist_matrix(X):
    
    Calculates the Pearson distance matrix for a data matrix using SciPy.

    Parameters (inputs)
    ----------
    X: a Pandas or Polars DataFrame or a NumPy array. It represents a data matrix.

    Returns (outputs)
    ----------
    M: the Pearson distance matrix between the rows of 'X'.

In [16]:
Pearson_dist_matrix(X=madrid_houses_df[quant_cols])

array([[0.        , 0.66557805, 0.73750837, ..., 1.81426832, 3.76742474,
        0.83385122],
       [0.66557805, 0.        , 0.98231536, ..., 1.35816365, 3.56285077,
        1.04162306],
       [0.73750837, 0.98231536, 0.        , ..., 1.56993872, 3.47873654,
        0.38187408],
       ...,
       [1.81426832, 1.35816365, 1.56993872, ..., 0.        , 2.98830222,
        1.50055625],
       [3.76742474, 3.56285077, 3.47873654, ..., 2.98830222, 0.        ,
        3.4727102 ],
       [0.83385122, 1.04162306, 0.38187408, ..., 1.50055625, 3.4727102 ,
        0.        ]])

#### **Mahalanobis**  



##### **Using data-set as input**


In [None]:
def Mahalanobis_dist_matrix(X):
    
    Calculates the Mahalanobis distance matrix for a data matrix using SciPy.

    Parameters (inputs)
    ----------
    X: a Pandas or Polars DataFrame or a NumPy array. It represents a data matrix.

    Returns (outputs)
    ----------
    M: the Mahalanobis distance matrix between the rows of 'X'.

In [17]:
Mahalanobis_dist_matrix(X=madrid_houses_df[quant_cols])

array([[0.        , 0.98305366, 1.35352819, ..., 1.51225082, 2.9059013 ,
        1.39140298],
       [0.98305366, 0.        , 2.03435122, ..., 0.95079695, 2.95281549,
        2.01444275],
       [1.35352819, 2.03435122, 0.        , ..., 2.12708355, 3.26187536,
        0.82635883],
       ...,
       [1.51225082, 0.95079695, 2.12708355, ..., 0.        , 3.28213849,
        2.08421407],
       [2.9059013 , 2.95281549, 3.26187536, ..., 3.28213849, 0.        ,
        3.30618935],
       [1.39140298, 2.01444275, 0.82635883, ..., 2.08421407, 3.30618935,
        0.        ]])


##### **Using couple of observations as input**  

In [18]:
S = np.cov(madrid_houses_df[quant_cols], rowvar=False)

In [19]:
Mahalanobis_dist(x1, x3, S=S)

46329151687.146484

### **Binary data**

#### **Sokal**  



##### **Using data-set as input**


In [None]:
def Sokal_dist_matrix(X):
    
    Calculates the Sokal distance matrix for a data matrix using SciPy.

    Parameters (inputs)
    ----------
    X: a Pandas or Polars DataFrame or a NumPy array. It represents a data matrix.

    Returns (outputs)
    ----------
    M: the Sokal distance matrix between the rows of 'X'.

In [20]:
Sokal_dist_matrix(X=madrid_houses_df[quant_cols])

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])


##### **Using couple of observations as input**  

In [21]:
x1 = madrid_houses_df[binary_cols][0,:]
x3 = madrid_houses_df[binary_cols][2,:]

In [22]:
Sokal_dist(x1, x3)

0.4

#### **Jaccard**  



##### **Using data-set as input**


In [None]:
def Jaccard_dist_matrix(X):
    
    Calculates the Jaccard distance matrix for a data matrix using SciPy.

    Parameters (inputs)
    ----------
    X: a Pandas or Polars DataFrame or a NumPy array. It represents a data matrix.

    Returns (outputs)
    ----------
    M: the Jaccard distance matrix between the rows of 'X'.

In [23]:
Jaccard_dist_matrix(X=madrid_houses_df[quant_cols])

array([[0. , 0.6, 0.6, ..., 0.8, 1. , 0.6],
       [0.6, 0. , 0.8, ..., 0.8, 1. , 0.8],
       [0.6, 0.8, 0. , ..., 0.6, 1. , 0.4],
       ...,
       [0.8, 0.8, 0.6, ..., 0. , 0.8, 0.6],
       [1. , 1. , 1. , ..., 0.8, 0. , 1. ],
       [0.6, 0.8, 0.4, ..., 0.6, 1. , 0. ]])


##### **Using couple of observations as input**  

In [24]:
Jaccard_dist(x1, x3)

0.5

### **Multiclass data**

#### **Matching**  



##### **Using data-set as input**


def Matching_dist_matrix(X):
    
    Calculates the Matching distance matrix for a data matrix using SciPy.

    Parameters (inputs)
    ----------
    X: a Pandas or Polars DataFrame or a NumPy array. It represents a data matrix.

    Returns (outputs)
    ----------
    M: the Matching distance matrix between the rows of 'X'.

In [25]:
Matching_dist_matrix(X=madrid_houses_df[quant_cols])

array([[0. , 0.6, 0.6, ..., 0.8, 1. , 0.6],
       [0.6, 0. , 0.8, ..., 0.8, 1. , 0.8],
       [0.6, 0.8, 0. , ..., 0.6, 1. , 0.4],
       ...,
       [0.8, 0.8, 0.6, ..., 0. , 0.8, 0.6],
       [1. , 1. , 1. , ..., 0.8, 0. , 1. ],
       [0.6, 0.8, 0.4, ..., 0.6, 1. , 0. ]])


##### **Using couple of observations as input**  

In [26]:
x1 = madrid_houses_df[multi_cols][0,:]
x3 = madrid_houses_df[multi_cols][2,:]

In [27]:
Matching_dist(x1, x3)

0.5

## **New proposals**

### **Quantitative data**

#### **Robust Mahalanobis**  



##### **Using data-set as input**


In [None]:
def Robust_Maha_dist_matrix(X, S_robust):
    
    Calculates the Robust Mahalanobis distance matrix for a data matrix using SciPy.

    Parameters (inputs)
    ----------
    X: a Pandas or Polars DataFrame or a NumPy array. It represents a data matrix.
    S_robust: the robust covariance matrix of 'X'.

    Returns (outputs)
    ----------
    M: the Robust Mahalanobis distance matrix between the rows of 'X'.

In [28]:
S_robust_ = S_robust(X=madrid_houses_df, method='MAD', epsilon=0.05, n_iters=20)
Robust_Maha_dist_matrix(madrid_houses_df, S_robust=S_robust_)

array([[ 0.        ,  6.47092419,  7.01983235, ...,  4.96377088,
         5.69177645,  3.68021705],
       [ 6.47092419,  0.        ,  3.03471006, ..., 10.43356417,
        10.12781147,  5.95613137],
       [ 7.01983235,  3.03471006,  0.        , ..., 11.35024985,
        10.9171085 ,  6.21243845],
       ...,
       [ 4.96377088, 10.43356417, 11.35024985, ...,  0.        ,
         3.65216542,  7.11373136],
       [ 5.69177645, 10.12781147, 10.9171085 , ...,  3.65216542,
         0.        ,  7.86440327],
       [ 3.68021705,  5.95613137,  6.21243845, ...,  7.11373136,
         7.86440327,  0.        ]])

In [29]:
S_robust_ = S_robust(X=madrid_houses_df, method='trimmed', epsilon=0.05, alpha=0.05, n_iters=20)
Robust_Maha_dist_matrix(madrid_houses_df, S_robust=S_robust_)

array([[0.        , 4.42394645, 3.66053524, ..., 2.98676383, 4.16089955,
        3.94457764],
       [4.42394645, 0.        , 3.46180479, ..., 5.40273973, 4.18033284,
        4.93110307],
       [3.66053524, 3.46180479, 0.        , ..., 5.11553776, 4.01350136,
        4.9769136 ],
       ...,
       [2.98676383, 5.40273973, 5.11553776, ..., 0.        , 4.4747319 ,
        4.5077568 ],
       [4.16089955, 4.18033284, 4.01350136, ..., 4.4747319 , 0.        ,
        4.92403875],
       [3.94457764, 4.93110307, 4.9769136 , ..., 4.5077568 , 4.92403875,
        0.        ]])

In [30]:
S_robust_ = S_robust(X=madrid_houses_df, method='winsorized', epsilon=0.05, alpha=0.05, n_iters=20)
Robust_Maha_dist_matrix(madrid_houses_df, S_robust=S_robust_)

array([[0.        , 4.23123309, 3.68407669, ..., 2.86519574, 4.13746138,
        3.4793851 ],
       [4.23123309, 0.        , 3.30110368, ..., 5.08940515, 4.20698026,
        4.92423713],
       [3.68407669, 3.30110368, 0.        , ..., 4.98171308, 4.09016333,
        4.6741273 ],
       ...,
       [2.86519574, 5.08940515, 4.98171308, ..., 0.        , 4.23919057,
        4.00516043],
       [4.13746138, 4.20698026, 4.09016333, ..., 4.23919057, 0.        ,
        4.89968753],
       [3.4793851 , 4.92423713, 4.6741273 , ..., 4.00516043, 4.89968753,
        0.        ]])


##### **Using couple of observations as input**  

In [31]:
Robust_Maha_dist(x1, x3, S_robust=S_robust_)

0.082572808774296

### **Mixed data**

#### **Generalized Gower**  



##### **Using data-set as input**


In [None]:
class GG_dist_matrix: 
    """
    Calculates the Generalized Gower matrix for a data matrix.
    """
    def __init__(self, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching', q=1, method='trimmed', epsilon=0.05, n_iters=20, alpha=0.05,
                 fast_VG=False, VG_sample_size=300, VG_n_samples=5, random_state=123, weights=None):
        """
        Constructor method.
        
        Parameters:
            p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be a non negative integer.
            d1: name of the distance to be computed for quantitative variables. Must be an string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis']. 
            d2: name of the distance to be computed for binary variables. Must be an string in ['sokal', 'jaccard'].
            d3: name of the distance to be computed for multi-class variables. Must be an string in ['matching'].
            q: the parameter that defines the Minkowski distance. Must be a positive integer.
            method: the method to be used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
            epsilon : parameter used by the Delvin transformation. epsilon=0.05 is recommended. Only needed when d1 = 'robust_mahalanobis'.
            n_iter : maximum number of iterations run by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
            alpha : a real number in [0,1] that is used if `method` is 'trimmed' or 'winsorized'. Only needed when d1 = 'robust_mahalanobis'.
            fast_VG: whether the geometric variability estimation will be full (False) or fast (True).
            VG_sample_size: sample size to be used to make the estimation of the geometric variability.
            VG_n_samples: number of samples to be used to make the estimation of the geometric variability.
            random_state: the random seed used for the (random) sample elements.
            weights: the sample weights.
        """
    def compute(self, X):
        """
        Compute method.
        
        Parameters:
            X: a Pandas or Polars data-frame or a NumPy array. Represents a data matrix.
            
        Returns:
            D: the Generalized Gower matrix for the data matrix `X`.
        """

- With full VG estimation

In [32]:
GG_init = GG_dist_matrix(p1=5, p2=4, p3=2, d1='robust_mahalanobis', d2='jaccard', d3='matching', 
                         method='trimmed', alpha=0.05, epsilon=0.05, n_iters=20, fast_VG=False)
D_GG = GG_init.compute(X=madrid_houses_df)
D_GG

array([[0.        , 2.21885363, 1.93318704, ..., 1.93891555, 3.12986955,
        2.26834878],
       [2.21885363, 0.        , 1.22309875, ..., 2.38689136, 2.63841547,
        2.01262089],
       [1.93318704, 1.22309875, 0.        , ..., 2.3585878 , 2.50589448,
        1.63422771],
       ...,
       [1.93891555, 2.38689136, 2.3585878 , ..., 0.        , 2.89514966,
        1.7665964 ],
       [3.12986955, 2.63841547, 2.50589448, ..., 2.89514966, 0.        ,
        3.02408907],
       [2.26834878, 2.01262089, 1.63422771, ..., 1.7665964 , 3.02408907,
        0.        ]])

- With fast VG estimation


In [33]:
GG_init = GG_dist_matrix(p1=5, p2=4, p3=2, d1='robust_mahalanobis', d2='jaccard', d3='matching', 
                         method='trimmed', alpha=0.05, epsilon=0.05, n_iters=20, fast_VG=True,
                         VG_n_samples=5, VG_sample_size=50)
D_GG = GG_init.compute(X=madrid_houses_df)
D_GG

array([[0.        , 2.21079365, 1.94153407, ..., 1.94710971, 3.19388939,
        2.25935837],
       [2.21079365, 0.        , 1.19943989, ..., 2.3704477 , 2.61811657,
        2.01889193],
       [1.94153407, 1.19943989, 0.        , ..., 2.34794054, 2.49263984,
        1.66029709],
       ...,
       [1.94710971, 2.3704477 , 2.34794054, ..., 0.        , 2.97854468,
        1.78790612],
       [3.19388939, 2.61811657, 2.49263984, ..., 2.97854468, 0.        ,
        3.09744146],
       [2.25935837, 2.01889193, 1.66029709, ..., 1.78790612, 3.09744146,
        0.        ]])

- With weights

In [34]:
n = madrid_houses_df.shape[0]
w = np.random.normal(loc=100, scale=20, size=n)

GG_init = GG_dist_matrix(p1=5, p2=4, p3=2, d1='robust_mahalanobis', d2='jaccard', d3='matching', 
                         method='trimmed', alpha=0.05, epsilon=0.05, n_iters=20, fast_VG=False,
                         weights=w)
D_GG = GG_init.compute(X=madrid_houses_df)
D_GG

array([[0.        , 2.19522219, 1.88018652, ..., 1.96158612, 3.03894716,
        2.23162851],
       [2.19522219, 0.        , 1.01470463, ..., 2.43908123, 2.57176396,
        1.940635  ],
       [1.88018652, 1.01470463, 0.        , ..., 2.28839255, 2.37552921,
        1.6091117 ],
       ...,
       [1.96158612, 2.43908123, 2.28839255, ..., 0.        , 2.84937993,
        1.67755672],
       [3.03894716, 2.57176396, 2.37552921, ..., 2.84937993, 0.        ,
        2.93702838],
       [2.23162851, 1.940635  , 1.6091117 , ..., 1.67755672, 2.93702838,
        0.        ]])


##### **Using couple of observations as input**  

In [None]:
class GG_dist: 
    """
    Calculates the Generalized Gower distance for a pair of data observations.
    """

    def __init__(self, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching', q=1, method='trimmed',  epsilon=0.05, n_iters=20, alpha=0.05,
                       VG_sample_size=300, VG_n_samples=5, random_state=123, weights=None):
        """
        Constructor method.
        
        Parameters:
            p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be a non negative integer.
            d1: name of the distance to be computed for quantitative variables. Must be an string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis']. 
            d2: name of the distance to be computed for binary variables. Must be an string in ['sokal', 'jaccard'].
            d3: name of the distance to be computed for multi-class variables. Must be an string in ['matching'].
            q: the parameter that defines the Minkowski distance. Must be a positive integer.
            method: the method to be used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
            epsilon: parameter used by the Delvin algorithm that is used when computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
            n_iter: maximum number of iterations used by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
            alpha : a real number in [0,1] that is used if `method` is 'trimmed' or 'winsorized'. Only needed when d1 = 'robust_mahalanobis'.
            VG_sample_size: sample size to be used to make the estimation of the geometric variability.
            VG_n_samples: number of samples to be used to make the estimation of the geometric variability.
            random_state: the random seed used for the (random) sample elements.
            weights: the sample weights.
        """
    def fit(self, X) :
        """
        Fit method that computes the geometric variability and (robust) covariance matrix to be used in the Compute method if needed.
        
        Parameters:
            X: a pandas/polars data-frame or a numpy array. Represents a data matrix.
            
        Returns:
            D: the Generalized Gower matrix for the data matrix `X`.
        """
    def compute(self, xi, xr):
        """
        Compute method.
        
        Parameters:
            xi, xr: a pair of quantitative vectors. They represent a couple of statistical observations.
            
        Returns:
            dist: the Generalized Gower distance between the observations `xi` and `xr`.
        """

In [35]:
x1 = madrid_houses_df[0,:]
x3 = madrid_houses_df[2,:]

In [36]:
GG_dist_ = GG_dist(p1=5, p2=4, p3=2,
                   d1='robust_mahalanobis', d2='jaccard', d3='matching',
                   method='trimmed', epsilon=0.05, alpha=0.05, n_iters=20,
                   VG_sample_size=1000, VG_n_samples=5, random_state=123) 
GG_dist_.fit(X=madrid_houses_df)   
GG_dist_.compute(x1, x3) 

1.928211130154049

#### **Related Metric Scaling**  



##### **Using data-set as input**


In [None]:
class RelMS_dist_matrix: 
    """
    Calculates the Related Metric Scaling matrix for a data matrix.
    """

    def __init__(self, p1, p2, p3, d1='euclidean', d2='sokal', d3='matching', q=1, method='trimmed', 
                       epsilon=0.05, alpha=0.05, n_iters=20, weights=None, fast_VG=False):
        """
        Constructor method.
        
        Parameters:
            p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be a non negative integer.
            d1: name of the distance to be computed for quantitative variables. Must be an string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis']. 
            d2: name of the distance to be computed for binary variables. Must be an string in ['sokal', 'jaccard'].
            d3: name of the distance to be computed for multi-class variables. Must be an string in ['matching'].
            q: the parameter that defines the Minkowski distance. Must be a positive integer.
            method: the method to be used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
            epsilon: parameter used by the Delvin algorithm that is used when computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
            n_iter: maximum number of iterations used by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
            alpha : a real number in [0,1] that is used if `method` is 'trimmed' or 'winsorized'. Only needed when d1 = 'robust_mahalanobis'.
            weights: the sample weights.
        """
    def compute(self, X, tol=0.009, Gs_PSD_trans=True, d=2):
        """
        Compute method.
        
        Parameters:
            X: a pandas/polars data-frame or a numpy array. Represents a data matrix.
            tol: a tolerance value to round the close-to-zero eigenvalues of the Gramm matrices.
            Gs_PSD_trans: controls if a transformation is applied to enforce positive semi-definite Gramm matrices.
            d: a parameter that controls the omega definition involved in the transformation mentioned above.
            
        Returns:
            D: the Related Metric Scaling matrix for the data matrix `X`.
        """

In [37]:
RelMS_init = RelMS_dist_matrix(p1=5, p2=4, p3=2, d1='robust_mahalanobis', d2='jaccard', d3='matching', 
                               method='winsorized', epsilon=0.05, alpha=0.05, n_iters=20)
D_RelMS = RelMS_init.compute(X=madrid_houses_df.head(1000), Gs_PSD_trans=True)
D_RelMS

array([[ 0.        , 10.29131982, 10.20148541, ..., 10.25180831,
        10.1963865 , 10.23458336],
       [10.29131994,  0.        , 10.13419898, ..., 10.10288394,
        10.08333674, 10.0731428 ],
       [10.20148539, 10.134199  ,  0.        , ..., 10.14619431,
        10.03394396, 10.11537897],
       ...,
       [10.25180831, 10.10288394, 10.14619431, ...,  0.        ,
        10.05982432, 10.00909222],
       [10.1963865 , 10.08333674, 10.03394396, ..., 10.05982432,
         0.        , 10.03008924],
       [10.23458336, 10.0731428 , 10.11537897, ..., 10.00909221,
        10.03008923,  0.        ]])