# **FastKmedoids**

## **Requirements**

In [1]:
pip install FastKmedoids --upgrade


Collecting FastKmedoids
  Downloading FastKmedoids-0.0.18-py3-none-any.whl.metadata (895 bytes)
Downloading FastKmedoids-0.0.18-py3-none-any.whl (12 kB)
Installing collected packages: FastKmedoids
  Attempting uninstall: FastKmedoids
    Found existing installation: FastKmedoids 0.0.17
    Uninstalling FastKmedoids-0.0.17:
      Successfully uninstalled FastKmedoids-0.0.17
Successfully installed FastKmedoids-0.0.18


In [2]:
pip show FastKmedoids

Name: FastKmedoids
Version: 0.0.18
Summary: This is a package to implement a fast and powerful version of the Kmedoids clustering algorithm, which is built on the Generalised Gower distance, already available in the PyDistances package
Home-page: https://github.com/FabioScielzoOrtiz/FastKmedoids_Package
Author: Fabio Scielzo Ortiz
Author-email: fabioscielzo98@gmail.com
License: 
Location: c:\Users\fscielzo\anaconda3\Lib\site-packages
Requires: numpy, polars
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [3]:
import polars as pl
from PyMachineLearning.preprocessing import encoder
from FastKmedoids.models import FastKmedoidsGG, KFoldFastKmedoidsGG

## **Data processing**

In [4]:
# Load the listings and drop the columns that are not used for clustering.
# NOTE(review): '' is presumably an unnamed index column in the CSV — confirm.
columns_to_exclude = ['', 'id','sq_mt_allotment','floor', 'neighborhood', 'district']
madrid_houses_df = pl.read_csv('madrid_houses.csv').select(pl.exclude(columns_to_exclude))

# Column roles: the categorical columns are listed explicitly; every remaining
# column is treated as quantitative.
binary_cols = ['is_renewal_needed', 'has_lift', 'is_exterior', 'has_parking']
multi_cols = ['energy_certificate', 'house_type']
cat_cols = binary_cols + multi_cols
quant_cols = [name for name in madrid_houses_df.columns if name not in cat_cols]

# Ordinal-encode the categorical columns, then rebuild a polars frame that
# carries the original column names with an Int64 dtype.
encoder_ = encoder(method='ordinal')
encoded_arr = encoder_.fit_transform(madrid_houses_df.select(cat_cols))
cat_df = pl.DataFrame(encoded_arr)
cat_df.columns = cat_cols
cat_df = cat_df.with_columns(pl.all().cast(pl.Int64))
quant_df = madrid_houses_df.select(quant_cols)

# Final frame layout: quantitative columns first, then binary, then multi-class.
madrid_houses_df = pl.concat([quant_df, cat_df], how='horizontal')

In [4]:
# Preview the first rows of the processed frame (quantitative columns followed by
# the encoded categorical columns).
madrid_houses_df.head()

sq_mt_built,n_rooms,n_bathrooms,n_floors,buy_price,is_renewal_needed,has_lift,is_exterior,has_parking,energy_certificate,house_type
f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
64.0,2,1,1,85000,0,0,1,0,4,0
70.0,3,1,1,129900,1,1,1,0,0,0
94.0,2,2,1,144247,0,1,1,0,0,0
64.0,2,1,1,109900,0,1,1,0,0,0
108.0,2,2,1,260000,0,1,1,1,0,0


In [5]:
# (rows, columns) of the processed frame that is passed to the clustering models.
madrid_houses_df.shape

(21739, 11)

## **FastKmedoids**

In [5]:
# K-medoids settings. With frac_sample_size=0.01 the fit reports a 217 x 217
# distance matrix for the 21739-row frame.
fk_clustering_args = dict(
    n_clusters=3,
    method='pam',
    init='heuristic',
    max_iter=100,
    random_state=123,
    frac_sample_size=0.01,
)

# Generalised Gower distance settings.
# NOTE(review): p1/p2/p3 match the 5 quantitative, 4 binary and 2 multi-class
# columns of madrid_houses_df (in that column order), and d1/d2/d3 look like the
# distance used for each group — confirm against the FastKmedoids documentation.
fk_distance_args = dict(
    p1=5,
    p2=4,
    p3=2,
    d1='robust_mahalanobis',
    d2='jaccard',
    d3='matching',
    robust_maha_method='trimmed',
    alpha=0.05,
    epsilon=0.05,
    n_iters=20,
)

fast_kmedoids = FastKmedoidsGG(**fk_clustering_args, **fk_distance_args)
fast_kmedoids.fit(X=madrid_houses_df)

Distance matrix size: (217, 217)


In [8]:
# Cluster labels produced by the fitted model (integer array).
fast_kmedoids.labels

array([2, 1, 1, ..., 0, 0, 0], dtype=int64)

In [12]:
# One label per row of madrid_houses_df: the length should equal the frame's row count.
fast_kmedoids.labels.shape

(21739,)

In [10]:
# Distance matrix stored on the fitted model.
# NOTE(review): presumably the Generalised Gower distances between the sampled
# rows — confirm against the FastKmedoids documentation.
fast_kmedoids.D_GG

array([[0.        , 2.0632824 , 2.09686916, ..., 3.20285353, 1.84380918,
        3.14611266],
       [2.0632824 , 0.        , 2.49128404, ..., 3.09380609, 3.01923008,
        1.43032853],
       [2.09686916, 2.49128404, 0.        , ..., 2.81596982, 3.36492227,
        2.98877842],
       ...,
       [3.20285353, 3.09380609, 2.81596982, ..., 0.        , 3.21190291,
        2.87445442],
       [1.84380918, 3.01923008, 3.36492227, ..., 3.21190291, 0.        ,
        2.97473498],
       [3.14611266, 1.43032853, 2.98877842, ..., 2.87445442, 2.97473498,
        0.        ]])

In [11]:
# Size of the stored distance matrix; it matches the "Distance matrix size"
# reported while fitting, not the full row count of the frame.
fast_kmedoids.D_GG.shape

(217, 217)

## **KFold FastKmedoids**

In [6]:
# K-medoids settings applied within each fold. With frac_sample_size=0.1 the fit
# reports a 217 x 217 distance matrix per fold of 2173 rows.
kfold_clustering_args = dict(
    n_clusters=3,
    method='pam',
    init='heuristic',
    max_iter=100,
    random_state=123,
    frac_sample_size=0.1,
)

# Fold splitting: 10 shuffled folds with a fixed seed for reproducibility.
kfold_split_args = dict(
    n_splits=10,
    shuffle=True,
    kfold_random_state=123,
)

# Generalised Gower distance settings.
# NOTE(review): p1/p2/p3 match the 5 quantitative, 4 binary and 2 multi-class
# columns of madrid_houses_df (in that column order) — confirm against the
# FastKmedoids documentation, together with the meaning of the VG_* options.
kfold_distance_args = dict(
    p1=5,
    p2=4,
    p3=2,
    d1='robust_mahalanobis',
    d2='jaccard',
    d3='matching',
    robust_maha_method='trimmed',
    alpha=0.05,
    epsilon=0.05,
    n_iters=20,
    fast_VG=False,
    VG_sample_size=1000,
    VG_n_samples=5,
)

kfold_fast_kmedoids = KFoldFastKmedoidsGG(
    **kfold_clustering_args,
    **kfold_split_args,
    **kfold_distance_args,
)
kfold_fast_kmedoids.fit(X=madrid_houses_df)

Num.Folds: 10. Fold size: 2173.
Distance matrix size: 218 (0.1*2173) 
Clustering Fold 0
Distance matrix size: (217, 217)
Clustering Fold 1
Distance matrix size: (217, 217)
Clustering Fold 2
Distance matrix size: (217, 217)
Clustering Fold 3
Distance matrix size: (217, 217)
Clustering Fold 4
Distance matrix size: (217, 217)
Clustering Fold 5
Distance matrix size: (217, 217)
Clustering Fold 6
Distance matrix size: (217, 217)
Clustering Fold 7
Distance matrix size: (217, 217)
Clustering Fold 8
Distance matrix size: (217, 217)
Clustering Fold 9
Distance matrix size: (217, 217)
X_medoids size: (30, 11)
Distance matrix size: (24, 24)


In [7]:
# Cluster labels produced by the fitted K-fold model (integer array).
kfold_fast_kmedoids.labels

array([0, 1, 1, ..., 1, 1, 1])

In [8]:
# One label per row of madrid_houses_df: the length should equal the frame's row count.
kfold_fast_kmedoids.labels.shape

(21739,)