# Quick Start

## Installation

In [1]:
# pip install robust-mixed-dist

To see the available versions of the package, go to the release history on PyPI: https://pypi.org/project/robust_mixed_dist/#history

## Data

| Variable | Description | Variable Type | Possible Categories / Range |
|:---|:---|:---|:---|
| **latitude** | Latitude of the house | Quantitative | 24.86 - 25.27 |
| **longitude** | Longitude of the house | Quantitative | 55.06 - 55.44 |
| **price** | Market price of the house | Quantitative | 220000 - 35000000 |
| **price_per_sqft** | Price per square foot | Quantitative | 361.87 - 4805.87 |
| **size_in_sqft** | Size in square feet | Quantitative | 294 - 9576 |
| **no_of_bedrooms** | Number of bedrooms in the house | Multiclass | 0, 1, 2, 3, 4, 5 |
| **no_of_bathrooms** | Number of bathrooms in the house | Multiclass | 0, 1, 2, 3, 4, 5, 6 |
| **quality** | Quality level of the house *(response variable)* | Binary | Low (0), Medium-High-UltraHigh (1) |
| **balcony** | Indicates if the house has a balcony | Binary | true (1), false (0) |
| **barbecue_area** | Indicates if the house has a barbecue area | Binary | true (1), false (0) |
| **private_pool** | Indicates if the house has a private pool | Binary | true (1), false (0) |
| **private_garden** | Indicates if the house has a private garden | Binary | true (1), false (0) |

In [2]:
import polars as pl

# Processed Dubai housing data set, read straight from its raw GitHub URL.
# NOTE(review): the output recorded for this cell is an HTTP 404 -- confirm
# that the URL still resolves.
data_url = "https://raw.githubusercontent.com/FabioScielzoOrtiz/robust_mixed_dist-docu/refs/heads/main/data/dubai_houses_processed.csv"
df = pl.read_csv(data_url)

# Column names grouped by variable type.
quant_variables = ['latitude', 'longitude', 'price', 'size_in_sqft', 'price_per_sqft']
binary_variables = ['quality', 'balcony', 'barbecue_area', 'private_pool', 'private_garden']
multiclass_variables = ['no_of_bedrooms', 'no_of_bathrooms']

# X keeps the columns in the order quantitative -> binary -> multiclass.
X = df.select(quant_variables + binary_variables + multiclass_variables)

HTTPError: HTTP Error 404: Not Found

In [None]:
# Preview the first rows of X.
X.head()

latitude,longitude,price,size_in_sqft,price_per_sqft,quality,balcony,barbecue_area,private_pool,private_garden,no_of_bedrooms,no_of_bathrooms
f64,f64,i64,i64,f64,i64,i64,i64,i64,i64,i64,i64
25.113208,55.138932,2700000,1079,2502.32,1,1,1,0,0,1,2
25.106809,55.151201,2850000,1582,1801.52,1,1,0,0,0,2,2
25.063302,55.137728,1150000,1951,589.44,1,1,0,0,0,3,5
25.227295,55.341761,2850000,2020,1410.89,0,1,0,0,0,2,3
25.114275,55.139764,1729200,507,3410.65,1,0,0,0,0,0,1


## `robust_mixed_dist.mixed`

### `GGowerDistMatrix`

In [8]:
from robust_mixed_dist.mixed import GGowerDistMatrix

In [9]:
# Number of quantitative, binary and multiclass columns, in that order.
p1, p2, p3 = (
    len(columns)
    for columns in (quant_variables, binary_variables, multiclass_variables)
)

In [None]:
# p1/p2/p3 are the numbers of quantitative, binary and multiclass columns of X.
# d1/d2/d3 presumably select the distance used for each of those groups.
ggower_dist_matrix = GGowerDistMatrix(
    p1=p1, p2=p2, p3=p3,
    d1="robust_mahalanobis", d2="sokal", d3="hamming",
    robust_method="trimmed", alpha=0.05,
    epsilon=0.05, n_iters=20,
    weights=None,
)

# Returns the pairwise distance matrix between the rows of X.
ggower_dist_matrix.compute(X=X)

array([[0.        , 1.39571564, 2.57840496, ..., 2.16616395, 2.47550121,
        2.12431338],
       [1.39571564, 0.        , 2.00113583, ..., 1.7743948 , 1.91048904,
        1.82565206],
       [2.57840496, 2.00113583, 0.        , ..., 0.73628326, 2.22174724,
        2.17565075],
       ...,
       [2.16616395, 1.7743948 , 0.73628326, ..., 0.        , 2.41579742,
        2.35070732],
       [2.47550121, 1.91048904, 2.22174724, ..., 2.41579742, 0.        ,
        1.08866947],
       [2.12431338, 1.82565206, 2.17565075, ..., 2.35070732, 1.08866947,
        0.        ]], shape=(1905, 1905))

### `GGowerDist`

In [None]:
from robust_mixed_dist.mixed import GGowerDist

# Two observations (rows 2 and 10 of X) as 1-D NumPy vectors.
xi = X.to_numpy()[2, :]
xr = X.to_numpy()[10, :]

ggower_dist = GGowerDist(
    p1=p1,
    p2=p2,
    p3=p3,
    d1="robust_mahalanobis",
    d2="sokal",
    d3="hamming",
    robust_method="trimmed",
    alpha=0.05,
    epsilon=0.05,
    n_iters=20,
    weights=None)

# The estimator is fitted on the whole data set before a single pair of
# observations is compared.
ggower_dist.fit(X=X)

ggower_dist.compute(xi=xi, xr=xr)

### `FastGGowerDistMatrix`

In [None]:
from robust_mixed_dist.mixed import FastGGowerDistMatrix

# X is the polars DataFrame built in the Data section; p1/p2/p3 are its
# numbers of quantitative, binary and multiclass columns.
fastGGower = FastGGowerDistMatrix(frac_sample_size=0.03, random_state=123, p1=p1, p2=p2, p3=p3,
                                  d1='robust_mahalanobis', d2='jaccard', d3='hamming',
                                  robust_method='trimmed', alpha=0.07, epsilon=0.05)

fastGGower.compute(X)

# The result is read from the D_GGower attribute after compute().
fastGGower.D_GGower

### `RelMSDistMatrix`

In [None]:
from robust_mixed_dist.mixed import RelMSDistMatrix

# X is the polars DataFrame built in the Data section; p1/p2/p3 are its
# numbers of quantitative, binary and multiclass columns.
relms_dist_matrix = RelMSDistMatrix(p1=p1, p2=p2, p3=p3,
                                    d1="robust_mahalanobis", d2="jaccard", d3="hamming",
                                    robust_method="trimmed", alpha=0.07, epsilon=0.05,
                                    n_iters=20, weights=None)

# Only (at most) the first 2000 rows are used.
relms_dist_matrix.compute(X=X[0:2000])

## `robust_mixed_dist.quantitative`

### `euclidean_dist_matrix`

In [None]:
from robust_mixed_dist.quantitative import euclidean_dist_matrix

# Pairwise Euclidean distances over the quantitative columns of X.
euclidean_dist_matrix(X=X[quant_variables])


### `euclidean_dist`

In [None]:
from robust_mixed_dist.quantitative import euclidean_dist

# Rows 2 and 10 of the quantitative columns, as 1-D NumPy vectors.
xi = X[quant_variables].to_numpy()[2, :]
xr = X[quant_variables].to_numpy()[10, :]

euclidean_dist(xi=xi, xr=xr)

### `minkowski_dist_matrix`

In [None]:
from robust_mixed_dist.quantitative import minkowski_dist_matrix

# Pairwise Minkowski distances (q=1) over the quantitative columns of X.
minkowski_dist_matrix(X=X[quant_variables], q=1)


### `minkowski_dist`

In [None]:
from robust_mixed_dist.quantitative import minkowski_dist

# Rows 2 and 10 of the quantitative columns, as 1-D NumPy vectors.
xi = X[quant_variables].to_numpy()[2, :]
xr = X[quant_variables].to_numpy()[10, :]

minkowski_dist(xi=xi, xr=xr, q=1)

### `canberra_dist_matrix`

In [None]:
from robust_mixed_dist.quantitative import canberra_dist_matrix

# Pairwise Canberra distances over the quantitative columns of X.
canberra_dist_matrix(X=X[quant_variables])


### `canberra_dist`

In [None]:
from robust_mixed_dist.quantitative import canberra_dist

# Rows 2 and 10 of the quantitative columns, as 1-D NumPy vectors.
xi = X[quant_variables].to_numpy()[2, :]
xr = X[quant_variables].to_numpy()[10, :]

canberra_dist(xi=xi, xr=xr)

### `pearson_dist_matrix`

In [None]:
from robust_mixed_dist.quantitative import pearson_dist_matrix

# Pairwise Pearson distances over the quantitative columns of X.
pearson_dist_matrix(X=X[quant_variables])


### `mahalanobis_dist_matrix`

In [None]:
from robust_mixed_dist.quantitative import mahalanobis_dist_matrix

# Pairwise Mahalanobis distances over the quantitative columns of X.
mahalanobis_dist_matrix(X=X[quant_variables])


### `mahalanobis_dist`

In [None]:
import numpy as np
from robust_mixed_dist.quantitative import mahalanobis_dist

# Rows 2 and 10 of the quantitative columns, as 1-D NumPy vectors.
xi = X[quant_variables].to_numpy()[2, :]
xr = X[quant_variables].to_numpy()[10, :]

# Sample covariance of the quantitative columns; rowvar=False treats each
# column as a variable and each row as an observation.
S = np.cov(X[quant_variables].to_numpy(), rowvar=False)

mahalanobis_dist(xi=xi, xr=xr, S=S)

### `robust_maha_dist_matrix`

In [None]:
# NOTE(review): S_robust is assumed to live in robust_mixed_dist.quantitative
# alongside robust_maha_dist_matrix -- confirm against the package.
from robust_mixed_dist.quantitative import S_robust, robust_maha_dist_matrix

# Robust covariance estimate (method="trimmed") of the quantitative columns.
S_robust_estimation = S_robust(X=X[quant_variables], method="trimmed",
                               epsilon=0.05, n_iters=20, alpha=0.07,
                               weights=None)

robust_maha_dist_matrix(X=X[quant_variables],
                        S_robust=S_robust_estimation)

In [None]:
# NOTE(review): S_robust is assumed to live in robust_mixed_dist.quantitative
# alongside robust_maha_dist_matrix -- confirm against the package.
from robust_mixed_dist.quantitative import S_robust, robust_maha_dist_matrix

# Robust covariance estimate (method="winsorized") of the quantitative columns.
S_robust_estimation = S_robust(X=X[quant_variables],
                               method="winsorized", epsilon=0.05,
                               n_iters=20, alpha=0.07, weights=None)

robust_maha_dist_matrix(X=X[quant_variables],
                        S_robust=S_robust_estimation)

In [None]:
# NOTE(review): S_robust is assumed to live in robust_mixed_dist.quantitative
# alongside robust_maha_dist_matrix -- confirm against the package.
from robust_mixed_dist.quantitative import S_robust, robust_maha_dist_matrix

# Robust covariance estimate (method="MAD", alpha=None) of the quantitative columns.
S_robust_estimation = S_robust(X=X[quant_variables], method="MAD", epsilon=0.05,
                               n_iters=20, alpha=None, weights=None)

robust_maha_dist_matrix(X=X[quant_variables], S_robust=S_robust_estimation)

### `robust_maha_dist`

In [None]:
# Rows 2 and 10 of the quantitative columns, as 1-D NumPy vectors; they are
# reused by the robust_maha_dist examples that follow.
xi = X[quant_variables].to_numpy()[2, :]
xr = X[quant_variables].to_numpy()[10, :]

In [None]:
# NOTE(review): S_robust is assumed to live in robust_mixed_dist.quantitative
# alongside robust_maha_dist -- confirm against the package.
from robust_mixed_dist.quantitative import S_robust, robust_maha_dist

# Robust covariance estimate (method="trimmed") of the quantitative columns.
S_robust_estimation = S_robust(X=X[quant_variables], method="trimmed",
                               epsilon=0.05, n_iters=20, alpha=0.07,
                               weights=None)

# xi and xr are the two observations defined in the previous cell.
robust_maha_dist(xi=xi, xr=xr, S_robust=S_robust_estimation)

In [None]:
# NOTE(review): S_robust is assumed to live in robust_mixed_dist.quantitative
# alongside robust_maha_dist -- confirm against the package.
from robust_mixed_dist.quantitative import S_robust, robust_maha_dist

# Robust covariance estimate (method="winsorized") of the quantitative columns.
S_robust_estimation = S_robust(X=X[quant_variables],
                               method="winsorized", epsilon=0.05,
                               n_iters=20, alpha=0.07, weights=None)

# xi and xr are the two observations defined earlier in this section.
robust_maha_dist(xi=xi, xr=xr, S_robust=S_robust_estimation)

In [None]:
# NOTE(review): S_robust is assumed to live in robust_mixed_dist.quantitative
# alongside robust_maha_dist -- confirm against the package.
from robust_mixed_dist.quantitative import S_robust, robust_maha_dist

# Robust covariance estimate (method="MAD", alpha=None) of the quantitative columns.
S_robust_estimation = S_robust(X=X[quant_variables], method="MAD",
                               epsilon=0.05, n_iters=20, alpha=None,
                               weights=None)

# xi and xr are the two observations defined earlier in this section.
robust_maha_dist(xi=xi, xr=xr, S_robust=S_robust_estimation)

## `robust_mixed_dist.binary`

### `sokal_dist_matrix`

In [None]:
from robust_mixed_dist.binary import sokal_dist_matrix

# Pairwise Sokal distances over the binary columns of X.
sokal_dist_matrix(X=X[binary_variables])

### `sokal_dist`

In [None]:
from robust_mixed_dist.binary import sokal_dist

# Rows 2 and 10 of the binary columns, as 1-D NumPy vectors.
xi = X[binary_variables].to_numpy()[2, :]
xr = X[binary_variables].to_numpy()[10, :]

sokal_dist(xi=xi, xr=xr)

### `jaccard_dist_matrix`

In [None]:
from robust_mixed_dist.binary import jaccard_dist_matrix

# Pairwise Jaccard distances over the binary columns of X.
jaccard_dist_matrix(X=X[binary_variables])


### `jaccard_dist`

In [None]:
from robust_mixed_dist.binary import jaccard_dist

# Rows 2 and 10 of the binary columns, as 1-D NumPy vectors.
xi = X[binary_variables].to_numpy()[2, :]
xr = X[binary_variables].to_numpy()[10, :]

jaccard_dist(xi=xi, xr=xr)

## `robust_mixed_dist.multiclass`

### `hamming_dist_matrix`

In [None]:
from robust_mixed_dist.multiclass import hamming_dist_matrix

# Pairwise Hamming distances over the multiclass columns of X.
hamming_dist_matrix(X=X[multiclass_variables])

### `hamming_dist`

In [None]:
from robust_mixed_dist.multiclass import hamming_dist

# Rows 2 and 10 of the multiclass columns, as 1-D NumPy vectors.
xi = X[multiclass_variables].to_numpy()[2, :]
xr = X[multiclass_variables].to_numpy()[10, :]

hamming_dist(xi=xi, xr=xr)