# Quick Start

## Installation

In [80]:
# pip install robust-mixed-dist

To see the available versions of the package go to the release history at PyPi: https://pypi.org/project/robust_mixed_dist/#history

## Data

| Variable | Description | Variable Type | Possible Categories / Range |
|:---|:---|:---|:---|
| **latitude** | Latitude of the house | Quantitative | 24.86 - 25.27 |
| **longitude** | Longitude of the house | Quantitative | 55.06 - 55.44 |
| **price** | Market price of the house | Quantitative | 220000 - 35000000 |
| **price_per_sqft** | Price per square foot | Quantitative | 361.87 - 4805.87 |
| **size in sqft** | Size in square feet | Quantitative | 294 - 9576 |
| **no of bedrooms** | Number of bedrooms in the house | Multiclass | 0, 1, 2, 3, 4, 5 |
| **no of bathrooms** | Number of bathrooms in the house | Multiclass | 0, 1, 2, 3, 4, 5, 6 |
| **quality** | Quality level of the house *(response variable)* | Binary | Low (0), Medium-High-UltraHigh (1) |
| **balcony** | Indicates if the house has a balcony | Binary | true (1), false (0) |
| **barbecue area** | Indicates if the house has a barbecue area | Binary | true (1), false (0) |
| **private pool** | Indicates if the house has a private pool | Binary | true (1), false (0) |
| **private garden** | Indicates if the house has a private garden | Binary | true (1), false (0) |

In [81]:
import polars as pl

data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/robust_mixed_dist-docu/refs/heads/main/data/dubai_houses_processed.csv"
df = pl.read_csv(data_url)

quant_variables = ['latitude', 'longitude', 'price', 'size_in_sqft', 'price_per_sqft']
binary_variables = ['quality', 'balcony', 'barbecue_area', 'private_pool', 'private_garden']
multiclass_variables = ['no_of_bedrooms', 'no_of_bathrooms']

X = df[quant_variables + binary_variables + multiclass_variables]

In [82]:
X.head()

latitude,longitude,price,size_in_sqft,price_per_sqft,quality,balcony,barbecue_area,private_pool,private_garden,no_of_bedrooms,no_of_bathrooms
f64,f64,i64,i64,f64,i64,i64,i64,i64,i64,i64,i64
25.113208,55.138932,2700000,1079,2502.32,1,1,1,0,0,1,2
25.106809,55.151201,2850000,1582,1801.52,1,1,0,0,0,2,2
25.063302,55.137728,1150000,1951,589.44,1,1,0,0,0,3,5
25.227295,55.341761,2850000,2020,1410.89,0,1,0,0,0,2,3
25.114275,55.139764,1729200,507,3410.65,1,0,0,0,0,0,1


## `robust_mixed_dist.mixed`

### `GGowerDistMatrix`

In [83]:
from robust_mixed_dist.mixed import GGowerDistMatrix

In [84]:
p1 = len(quant_variables)
p2 = len(binary_variables)
p3 = len(multiclass_variables)

In [85]:
ggower_dist_matrix = GGowerDistMatrix(
    p1=p1, 
    p2=p2, 
    p3=p3,
    d1="robust_mahalanobis", 
    d2="sokal", 
    d3="hamming",
    robust_method="trimmed", 
    alpha=0.05, 
    epsilon=0.05, 
    n_iters=20, 
    weights=None)

ggower_dist_matrix.compute(X=X)

array([[0.        , 1.64946389, 2.79008972, ..., 2.36223596, 2.87529178,
        2.06208371],
       [1.64946389, 0.        , 2.04187428, ..., 1.78060429, 2.02786949,
        1.95803076],
       [2.79008972, 2.04187428, 0.        , ..., 0.77870059, 2.3599971 ,
        2.32341031],
       ...,
       [2.36223596, 1.78060429, 0.77870059, ..., 0.        , 2.50576595,
        2.45118981],
       [2.87529178, 2.02786949, 2.3599971 , ..., 2.50576595, 0.        ,
        1.90657364],
       [2.06208371, 1.95803076, 2.32341031, ..., 2.45118981, 1.90657364,
        0.        ]], shape=(1905, 1905))

### `GGowerDist`

In [86]:
from robust_mixed_dist.mixed import GGowerDist

In [87]:
xi = X[2,:]
xr = X[10,:]

ggower_dist = GGowerDist(
    p1=p1, 
    p2=p2, 
    p3=p3,
    d1="robust_mahalanobis", 
    d2="sokal", 
    d3="hamming",
    robust_method="trimmed", 
    alpha=0.05, 
    epsilon=0.05, 
    n_iters=20, 
    weights=None)

ggower_dist.fit(X=X)

ggower_dist.compute(xi=xi, xr=xr)

np.float64(1.7471954488718715)

### `FastGGowerDistMatrix`

In [88]:
from robust_mixed_dist.mixed import FastGGowerDistMatrix

In [89]:
fastGGower = FastGGowerDistMatrix(
    frac_sample_size=0.1, 
    random_state=123, 
    p1=p1, 
    p2=p2, 
    p3=p3, 
    d1='robust_mahalanobis',
    d2='sokal', 
    d3='hamming', 
    robust_method='trimmed', 
    alpha=0.05, 
    epsilon=0.05
)

fastGGower.compute(X=X)

fastGGower.D_GGower

array([[0.        , 3.68529726, 3.68475948, ..., 4.22094877, 3.66144667,
        3.52869984],
       [3.68529726, 0.        , 2.52875815, ..., 2.66387506, 2.52254662,
        2.62950908],
       [3.68475948, 2.52875815, 0.        , ..., 2.24613845, 0.873628  ,
        0.96966438],
       ...,
       [4.22094877, 2.66387506, 2.24613845, ..., 0.        , 2.28228857,
        2.66848323],
       [3.66144667, 2.52254662, 0.873628  , ..., 2.28228857, 0.        ,
        1.21708063],
       [3.52869984, 2.62950908, 0.96966438, ..., 2.66848323, 1.21708063,
        0.        ]], shape=(190, 190))

### `RelMSDistMatrix`

In [90]:
from robust_mixed_dist.mixed import RelMSDistMatrix

In [91]:
relms_dist_matrix = RelMSDistMatrix(
    p1=p1, 
    p2=p2, 
    p3=p3, 
    d1="robust_mahalanobis", 
    d2="jaccard", 
    d3="hamming", 
    robust_method="trimmed", 
    alpha=0.05, 
    epsilon=0.05, 
    n_iters=20, 
    weights=None
)

relms_dist_matrix.compute(X=X)

G2 is not PSD, a transformation to force it will be applied.
G3 is not PSD, a transformation to force it will be applied.


array([[ 0.        , 18.85971943, 18.93892147, ..., 18.91363393,
        18.90754052, 18.88350166],
       [18.85971936,  0.        , 18.9048603 , ..., 18.89392648,
        18.87910279, 18.87505502],
       [18.93892141, 18.90486003,  0.        , ..., 18.82896998,
        18.90545703, 18.90382089],
       ...,
       [18.91363392, 18.89392648, 18.82896997, ...,  0.        ,
        18.91909055, 18.91526868],
       [18.90754053, 18.8791028 , 18.905457  , ..., 18.91909055,
         0.        , 18.83899926],
       [18.88350169, 18.875055  , 18.90382089, ..., 18.9152687 ,
        18.83899926,  0.        ]], shape=(1905, 1905))

## `robust_mixed_dist.quantitative`

### `euclidean_dist_matrix`

In [92]:
from robust_mixed_dist.quantitative import euclidean_dist_matrix

In [93]:
euclidean_dist_matrix(X=X[quant_variables])

array([[      0.        ,  150002.48041163, 1550001.42564254, ...,
         200004.30910639, 2025000.65272331, 1939113.64052303],
       [ 150002.48041163,       0.        , 1700000.47214668, ...,
          50002.10458763, 2175000.34481037, 2089113.31944683],
       [1550001.42564254, 1700000.47214668,       0.        , ...,
        1750000.23836684,  475001.65333313,  389114.87041128],
       ...,
       [ 200004.30910639,   50002.10458763, 1750000.23836684, ...,
              0.        , 2225000.39656347, 2139113.36955273],
       [2025000.65272331, 2175000.34481037,  475001.65333313, ...,
        2225000.39656347,       0.        ,   85887.02978977],
       [1939113.64052303, 2089113.31944683,  389114.87041128, ...,
        2139113.36955273,   85887.02978977,       0.        ]],
      shape=(1905, 1905))

### `euclidean_dist`

In [94]:
from robust_mixed_dist.quantitative import euclidean_dist

In [95]:
xi = X[quant_variables][2,:]
xr = X[quant_variables][10,:]

euclidean_dist(xi=xi, xr=xr)

2400000.3317385474

### `minkowski_dist_matrix`

In [96]:
from robust_mixed_dist.quantitative import minkowski_dist_matrix

In [97]:
minkowski_dist_matrix(X=X[quant_variables], q=1)


array([[      0.      ,  151203.818668, 1552784.93111 , ...,
         201851.029416, 2026929.290262, 1940943.259859],
       [ 151203.818668,       0.      , 1701581.13698 , ...,
          50647.223546, 2176731.471594, 2090745.441191],
       [1552784.93111 , 1701581.13698 ,       0.      , ...,
        1750934.500526,  476533.822672,  390625.702813],
       ...,
       [ 201851.029416,   50647.223546, 1750934.500526, ...,
              0.      , 2226780.677854, 2140794.797713],
       [2026929.290262, 2176731.471594,  476533.822672, ...,
        2226780.677854,       0.      ,   85986.030403],
       [1940943.259859, 2090745.441191,  390625.702813, ...,
        2140794.797713,   85986.030403,       0.      ]],
      shape=(1905, 1905))

### `minkowski_dist`

In [98]:
from robust_mixed_dist.quantitative import minkowski_dist

In [99]:
xi = X[quant_variables][2,:]
xr = X[quant_variables][10,:]

minkowski_dist(xi=xi, xr=xr, q=1)

np.float64(2401294.7191080004)

### `canberra_dist_matrix`

In [100]:
from robust_mixed_dist.quantitative import canberra_dist_matrix

In [101]:
canberra_dist_matrix(X=X[quant_variables])

array([[0.        , 0.3791237 , 1.31009432, ..., 0.57187821, 1.25368465,
        1.15877842],
       [0.3791237 , 0.        , 1.03737598, ..., 0.20199247, 1.30874091,
        1.21638296],
       [1.31009432, 1.03737598, 0.        , ..., 0.87872481, 0.92625515,
        0.85725929],
       ...,
       [0.57187821, 0.20199247, 0.87872481, ..., 0.        , 1.31626027,
        1.22724486],
       [1.25368465, 1.30874091, 0.92625515, ..., 1.31626027, 0.        ,
        0.12046549],
       [1.15877842, 1.21638296, 0.85725929, ..., 1.22724486, 0.12046549,
        0.        ]], shape=(1905, 1905))

### `canberra_dist`

In [102]:
from robust_mixed_dist.quantitative import canberra_dist

In [103]:
xi = X[quant_variables][2,:]
xr = X[quant_variables][10,:]

canberra_dist(xi=xi, xr=xr)

np.float64(1.0399913488310089)

### `pearson_dist_matrix`

In [104]:
from robust_mixed_dist.quantitative import pearson_dist_matrix

In [105]:
pearson_dist_matrix(X=X[quant_variables])


array([[0.        , 1.20925465, 3.17227178, ..., 3.78782432, 2.9029884 ,
        2.50486894],
       [1.20925465, 0.        , 2.0785152 , ..., 3.2901998 , 2.19157364,
        1.76066423],
       [3.17227178, 2.0785152 , 0.        , ..., 4.06630612, 1.98147384,
        1.44992911],
       ...,
       [3.78782432, 3.2901998 , 4.06630612, ..., 0.        , 3.23000242,
        3.82301831],
       [2.9029884 , 2.19157364, 1.98147384, ..., 3.23000242, 0.        ,
        1.09935355],
       [2.50486894, 1.76066423, 1.44992911, ..., 3.82301831, 1.09935355,
        0.        ]], shape=(1905, 1905))

### `mahalanobis_dist_matrix`

In [106]:
from robust_mixed_dist.quantitative import mahalanobis_dist_matrix

In [107]:
mahalanobis_dist_matrix(X=X[quant_variables])


array([[0.        , 1.53300007, 3.59602077, ..., 2.2183579 , 3.06832405,
        2.96311246],
       [1.53300007, 0.        , 2.1252418 , ..., 0.73353908, 1.96608988,
        1.83655561],
       [3.59602077, 2.1252418 , 0.        , ..., 1.46900905, 2.11887678,
        2.0285113 ],
       ...,
       [2.2183579 , 0.73353908, 1.46900905, ..., 0.        , 1.98371456,
        1.85102134],
       [3.06832405, 1.96608988, 2.11887678, ..., 1.98371456, 0.        ,
        0.13626976],
       [2.96311246, 1.83655561, 2.0285113 , ..., 1.85102134, 0.13626976,
        0.        ]], shape=(1905, 1905))

### `mahalanobis_dist`

In [108]:
from robust_mixed_dist.quantitative import mahalanobis_dist
import numpy as np

In [109]:
xi = X[quant_variables][2,:]
xr = X[quant_variables][10,:]

S = np.cov(X[quant_variables], rowvar=False)

mahalanobis_dist(xi=xi, xr=xr, S=S)

np.float64(2.7759092403090455)

### `robust_maha_dist_matrix`

In [110]:
from robust_mixed_dist.quantitative import robust_maha_dist_matrix, S_robust

In [111]:
S_robust_trimmed = S_robust(X=X[quant_variables], method="trimmed",
                               epsilon=0.05, n_iters=20, alpha=0.07, 
                               weights=None)

robust_maha_dist_matrix(X=X[quant_variables],   
                        S_robust=S_robust_trimmed)

array([[0.        , 1.81864545, 3.91631623, ..., 2.35363057, 4.98194325,
        4.75458207],
       [1.81864545, 0.        , 2.16959868, ..., 0.6471128 , 3.43994713,
        3.20084918],
       [3.91631623, 2.16959868, 0.        , ..., 1.64792972, 2.60025908,
        2.39706372],
       ...,
       [2.35363057, 0.6471128 , 1.64792972, ..., 0.        , 3.35699379,
        3.11692523],
       [4.98194325, 3.43994713, 2.60025908, ..., 3.35699379, 0.        ,
        0.24221216],
       [4.75458207, 3.20084918, 2.39706372, ..., 3.11692523, 0.24221216,
        0.        ]], shape=(1905, 1905))

In [112]:
S_robust_winsorized = S_robust(X=X[quant_variables],  
                               method="winsorized", epsilon=0.05, 
                               n_iters=20, alpha=0.07, weights=None)


robust_maha_dist_matrix(X=X[quant_variables],                          
                        S_robust=S_robust_winsorized)

array([[0.        , 1.63032279, 3.65293571, ..., 2.22371162, 3.90611218,
        3.74688562],
       [1.63032279, 0.        , 2.08532543, ..., 0.66249147, 2.59842998,
        2.42433088],
       [3.65293571, 2.08532543, 0.        , ..., 1.52111172, 2.10402449,
        1.9736514 ],
       ...,
       [2.22371162, 0.66249147, 1.52111172, ..., 0.        , 2.51448355,
        2.33923374],
       [3.90611218, 2.59842998, 2.10402449, ..., 2.51448355, 0.        ,
        0.17812079],
       [3.74688562, 2.42433088, 1.9736514 , ..., 2.33923374, 0.17812079,
        0.        ]], shape=(1905, 1905))

In [114]:
S_robust_MAD = S_robust(X=X[quant_variables], method="MAD", epsilon=0.05, 
                               n_iters=20, alpha=None, weights=None)

robust_maha_dist_matrix(X=X[quant_variables], S_robust=S_robust_MAD)

array([[0.        , 1.86757181, 3.95871434, ..., 2.36822465, 5.39713175,
        5.13977515],
       [1.86757181, 0.        , 2.17155508, ..., 0.64240264, 3.80101401,
        3.5324399 ],
       [3.95871434, 2.17155508, 0.        , ..., 1.67434971, 2.92445535,
        2.68698538],
       ...,
       [2.36822465, 0.64240264, 1.67434971, ..., 0.        , 3.75496147,
        3.48534267],
       [5.39713175, 3.80101401, 2.92445535, ..., 3.75496147, 0.        ,
        0.27150497],
       [5.13977515, 3.5324399 , 2.68698538, ..., 3.48534267, 0.27150497,
        0.        ]], shape=(1905, 1905))

### `robust_maha_dist`

In [115]:
from robust_mixed_dist.quantitative import robust_maha_dist, S_robust

In [116]:
xi = X[quant_variables][2,:]
xr = X[quant_variables][10,:]

In [118]:
S_robust_trimmed = S_robust(X=X[quant_variables], method="trimmed", 
                               epsilon=0.05, n_iters=20, alpha=0.07, 
                               weights=None)

robust_maha_dist(xi=xi, xr=xr, S_robust=S_robust_trimmed)

np.float64(2.284177358842924)

In [119]:
S_robust_winsorized = S_robust(X=X[quant_variables],                   
                               method="winsorized", epsilon=0.05, 
                               n_iters=20, alpha=0.07, weights=None)

robust_maha_dist(xi=xi, xr=xr, S_robust=S_robust_winsorized)

np.float64(2.106209364933163)

In [120]:
S_robust_MAD = S_robust(X=X[quant_variables], method="MAD",
                               epsilon=0.05, n_iters=20, alpha=None,
                               weights=None)

robust_maha_dist(xi=xi, xr=xr, S_robust=S_robust_MAD)

np.float64(2.321626389691806)

## `robust_mixed_dist.binary`

### `sokal_dist_matrix`

In [121]:
from robust_mixed_dist.binary import sokal_dist_matrix

In [122]:
sokal_dist_matrix(X=X[binary_variables])

array([[0.        , 0.33333333, 0.33333333, ..., 0.33333333, 0.57142857,
        0.        ],
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.33333333,
        0.33333333],
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.33333333,
        0.33333333],
       ...,
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.33333333,
        0.33333333],
       [0.57142857, 0.33333333, 0.33333333, ..., 0.33333333, 0.        ,
        0.57142857],
       [0.        , 0.33333333, 0.33333333, ..., 0.33333333, 0.57142857,
        0.        ]], shape=(1905, 1905))

### `sokal_dist`

In [123]:
from robust_mixed_dist.binary import sokal_dist

In [124]:
xi = X[quant_variables][2,:]
xr = X[quant_variables][10,:]

sokal_dist(xi=xi, xr=xr)

2.000000000001225

### `jaccard_dist_matrix`

In [125]:
from robust_mixed_dist.binary import jaccard_dist_matrix

In [126]:
jaccard_dist_matrix(X=X[binary_variables])

array([[0.        , 0.33333333, 0.33333333, ..., 0.33333333, 0.5       ,
        0.        ],
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.33333333,
        0.33333333],
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.33333333,
        0.33333333],
       ...,
       [0.33333333, 0.        , 0.        , ..., 0.        , 0.33333333,
        0.33333333],
       [0.5       , 0.33333333, 0.33333333, ..., 0.33333333, 0.        ,
        0.5       ],
       [0.        , 0.33333333, 0.33333333, ..., 0.33333333, 0.5       ,
        0.        ]], shape=(1905, 1905))

### `jaccard_dist`

In [127]:
from robust_mixed_dist.binary import jaccard_dist

In [128]:
xi = X[quant_variables][2,:]
xr = X[quant_variables][10,:]

jaccard_dist(xi=xi, xr=xr)

np.float64(0.0)

## `robust_mixed_dist.multiclass`

### `hamming_dist_matrix`

In [129]:
from robust_mixed_dist.multiclass import hamming_dist_matrix

In [130]:
hamming_dist_matrix(X=X[multiclass_variables])

array([[0. , 0.5, 1. , ..., 1. , 0. , 0. ],
       [0.5, 0. , 1. , ..., 1. , 0.5, 0.5],
       [1. , 1. , 0. , ..., 0. , 1. , 1. ],
       ...,
       [1. , 1. , 0. , ..., 0. , 1. , 1. ],
       [0. , 0.5, 1. , ..., 1. , 0. , 0. ],
       [0. , 0.5, 1. , ..., 1. , 0. , 0. ]], shape=(1905, 1905))

### `hamming_dist`

In [131]:
from robust_mixed_dist.multiclass import hamming_dist

In [132]:
xi = X[quant_variables][2,:]
xr = X[quant_variables][10,:]

hamming_dist(xi=xi, xr=xr)

np.float64(1.0)