# `mixed`

In [1]:
pip show PyDistances

Name: PyDistancesNote: you may need to restart the kernel to use updated packages.

Version: 0.0.34
Summary: PyDistances is a Python package for computing classic statistical distances as well as new proposals suitable for mixed multivariate data, even with outliers.
Home-page: https://github.com/FabioScielzoOrtiz/PyDistances-package
Author: Fabio Scielzo Ortiz
Author-email: fabioscielzo98@gmail.com
License: 
Location: c:\Users\fscielzo\Documents\Proyectos\PyDistances\PyDistances-demo\.venv\Lib\site-packages
Requires: 
Required-by: 


In [4]:
import polars as pl
import pandas as pd
import numpy as np

## Data

In [5]:
# Raw GitHub URL of the processed Madrid houses CSV read by the loading cell below.
data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/PyDistances-demo/refs/heads/main/data/madrid_houses_processed.csv"

In [6]:
# Variable names grouped by type: quantitative, binary and multiclass.
quant_cols = [
    'sq_mt_built',
    'n_rooms',
    'n_bathrooms',
    'n_floors',
    'buy_price',
]
binary_cols = [
    'is_renewal_needed',
    'has_lift',
    'is_exterior',
    'has_parking',
]
multiclass_cols = [
    'energy_certificate',
    'house_type',
]

# Number of variables in each group, passed later to the distance constructors.
p1, p2, p3 = (len(group) for group in (quant_cols, binary_cols, multiclass_cols))

In [7]:
# The distance objects below receive only the counts p1, p2 and p3, so the
# column position is presumably what tells them which variables are
# quantitative, binary and multiclass (confirm in the PyDistances docs).
# Enforce that layout explicitly instead of relying on the CSV's own column
# order; this is a no-op when the file is already laid out this way.
ordered_cols = quant_cols + binary_cols + multiclass_cols

# The same data in three containers: Polars, pandas and NumPy.
data_pl = pl.read_csv(data_url).select(ordered_cols)
data_pd = pd.read_csv(data_url)[ordered_cols]
data_np = data_pd.to_numpy()

# Rows 2 and 10 of each container, used in the single-pair distance examples.
xi_pl = data_pl[2, :]
xr_pl = data_pl[10, :]

xi_pd = data_pd.iloc[2, :]
xr_pd = data_pd.iloc[10, :]

xi_np = data_np[2, :]
xr_np = data_np[10, :]

## `GGowerDistMatrix`

In [20]:
from PyDistances.mixed import GGowerDistMatrix

In [22]:
# Configure the GGower distance matrix. d1, d2 and d3 presumably apply to the
# quantitative, binary and multiclass groups in that order (confirm in the
# PyDistances docs).
ggower_dist_matrix = GGowerDistMatrix(
    p1=p1,
    p2=p2,
    p3=p3,
    d1="robust_mahalanobis",
    d2="jaccard",
    d3="hamming",
    robust_method="trimmed",
    alpha=0.07,
    epsilon=0.05,
    n_iters=20,
    weights=None,
)

In [23]:
# Distance matrix computed from the Polars DataFrame.
ggower_dist_matrix.compute(X=data_pl)

array([[0.        , 2.21871457, 1.93429293, ..., 1.94305438, 3.1223396 ,
        2.26768279],
       [2.21871457, 0.        , 1.22327246, ..., 2.38753004, 2.64304949,
        2.00865696],
       [1.93429293, 1.22327246, 0.        , ..., 2.36077974, 2.50019632,
        1.63811682],
       ...,
       [1.94305438, 2.38753004, 2.36077974, ..., 0.        , 2.9036275 ,
        1.75869492],
       [3.1223396 , 2.64304949, 2.50019632, ..., 2.9036275 , 0.        ,
        3.03987403],
       [2.26768279, 2.00865696, 1.63811682, ..., 1.75869492, 3.03987403,
        0.        ]])

In [24]:
# Same computation with the pandas DataFrame as input.
ggower_dist_matrix.compute(X=data_pd)

array([[0.        , 2.21871457, 1.93429293, ..., 1.94305438, 3.1223396 ,
        2.26768279],
       [2.21871457, 0.        , 1.22327246, ..., 2.38753004, 2.64304949,
        2.00865696],
       [1.93429293, 1.22327246, 0.        , ..., 2.36077974, 2.50019632,
        1.63811682],
       ...,
       [1.94305438, 2.38753004, 2.36077974, ..., 0.        , 2.9036275 ,
        1.75869492],
       [3.1223396 , 2.64304949, 2.50019632, ..., 2.9036275 , 0.        ,
        3.03987403],
       [2.26768279, 2.00865696, 1.63811682, ..., 1.75869492, 3.03987403,
        0.        ]])

In [25]:
# Same computation with the NumPy array as input.
ggower_dist_matrix.compute(X=data_np)

array([[0.        , 2.21871457, 1.93429293, ..., 1.94305438, 3.1223396 ,
        2.26768279],
       [2.21871457, 0.        , 1.22327246, ..., 2.38753004, 2.64304949,
        2.00865696],
       [1.93429293, 1.22327246, 0.        , ..., 2.36077974, 2.50019632,
        1.63811682],
       ...,
       [1.94305438, 2.38753004, 2.36077974, ..., 0.        , 2.9036275 ,
        1.75869492],
       [3.1223396 , 2.64304949, 2.50019632, ..., 2.9036275 , 0.        ,
        3.03987403],
       [2.26768279, 2.00865696, 1.63811682, ..., 1.75869492, 3.03987403,
        0.        ]])

## `GGowerDist`

In [36]:
from PyDistances.mixed import GGowerDist

In [43]:
# Configure the GGower distance between a single pair of observations.
# d1, d2 and d3 presumably apply to the quantitative, binary and multiclass
# groups in that order (confirm in the PyDistances docs).
ggower_dist = GGowerDist(
    p1=p1,
    p2=p2,
    p3=p3,
    d1="robust_mahalanobis",
    d2="jaccard",
    d3="hamming",
    robust_method="trimmed",
    alpha=0.07,
    epsilon=0.05,
    n_iters=20,
    weights=None,
)

In [44]:
# Fit on the Polars DataFrame before computing single-pair distances.
ggower_dist.fit(X=data_pl)

In [45]:
# Distance between rows 2 and 10, passed as Polars row slices.
ggower_dist.compute(xi=xi_pl, xr=xr_pl)

np.float64(1.7385809635103606)

In [46]:
# Refit on the pandas DataFrame.
ggower_dist.fit(X=data_pd)

In [47]:
# Distance between rows 2 and 10, passed as pandas row selections.
ggower_dist.compute(xi=xi_pd, xr=xr_pd)

np.float64(1.7385809635103606)

In [48]:
# Refit on the NumPy array.
ggower_dist.fit(X=data_np)

In [49]:
# Distance between rows 2 and 10, passed as NumPy row slices.
ggower_dist.compute(xi=xi_np, xr=xr_np)

np.float64(1.7385809635103606)

## `RelMSDistMatrix`

In [2]:
from PyDistances.mixed import RelMSDistMatrix

In [8]:
# Configure the RelMS distance matrix with the same settings used for GGower.
relms_dist_matrix = RelMSDistMatrix(
    p1=p1,
    p2=p2,
    p3=p3,
    d1="robust_mahalanobis",
    d2="jaccard",
    d3="hamming",
    robust_method="trimmed",
    alpha=0.07,
    epsilon=0.05,
    n_iters=20,
    weights=None,
)

In [10]:
# Restricted to the first 2000 rows: on the whole sample this call takes
# more than 23 minutes.
relms_dist_matrix.compute(X=data_pd.iloc[:2000])

array([[ 0.        , 15.46117954, 15.40756806, ..., 15.32605925,
        15.44377892, 15.52438765],
       [15.46117867,  0.        , 15.3698612 , ..., 15.40913592,
        15.39775349, 15.49629569],
       [15.40756786, 15.36986068,  0.        , ..., 15.37663682,
        15.30916165, 15.4579883 ],
       ...,
       [15.32605924, 15.40913592, 15.37663682, ...,  0.        ,
        15.40950905, 15.37371301],
       [15.44377893, 15.39775347, 15.30916165, ..., 15.40950905,
         0.        , 15.52255903],
       [15.5243876 , 15.49629553, 15.45798823, ..., 15.37371302,
        15.52255903,  0.        ]])