# **SupervisedClustering Usage**

## **Requirements**

In [None]:
pip install SupervisedClustering --upgrade

In [117]:
import polars as pl
import numpy as np
from PyMachineLearning.preprocessing import encoder, imputer, scaler, features_selector
from PyMachineLearning.evaluation import SimpleEvaluation
from sklearn.ensemble import RandomForestRegressor
from  xgboost import XGBRegressor
from SupervisedClustering.models import FastKmedoidsEstimator, KFoldFastKmedoidsEstimator, KmeansEstimator, MiniBatchKmeansEstimator
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore') 

In [102]:
def processing(df):
    columns_to_exclude = ['', 'id','sq_mt_allotment','floor', 'neighborhood', 'district'] 
    df = df.select(pl.exclude(columns_to_exclude))
    binary_cols = ['is_renewal_needed', 'has_lift', 'is_exterior', 'has_parking']
    multi_cols = ['energy_certificate', 'house_type']
    quant_cols = [x for x in df.columns if x not in binary_cols + multi_cols]
    encoding = encoder(method='ordinal')
    encoded_arr = encoding.fit_transform(df[binary_cols + multi_cols])
    cat_df = pl.DataFrame(encoded_arr)
    cat_df.columns =  binary_cols + multi_cols
    cat_df = cat_df.with_columns([pl.col(col).cast(pl.Int64) for col in cat_df.columns])
    quant_df = df[quant_cols]
    df = pl.concat([quant_df, cat_df], how='horizontal')
    response = 'buy_price'
    quant_predictors = [x for x in quant_cols if x != response]
    binary_predictors = [x for x in binary_cols if x != response]
    multi_predictors = [x for x in multi_cols if x != response]
    cat_predictors = binary_predictors + multi_predictors
    p1, p2, p3 = len(quant_predictors), len(binary_predictors), len(multi_predictors)
    return df, p1, p2, p3, response, quant_predictors, cat_predictors

## **Data processing**

In [103]:
madrid_houses_df = pl.read_csv('madrid_houses.csv')
madrid_houses_df, p1, p2, p3, response, quant_predictors, cat_predictors = processing(madrid_houses_df)
predictors = quant_predictors + cat_predictors

In [104]:
madrid_houses_df.head()

sq_mt_built,n_rooms,n_bathrooms,n_floors,buy_price,is_renewal_needed,has_lift,is_exterior,has_parking,energy_certificate,house_type
f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
64.0,2,1,1,85000,0,0,1,0,4,0
70.0,3,1,1,129900,1,1,1,0,0,0
94.0,2,2,1,144247,0,1,1,0,0,0
64.0,2,1,1,109900,0,1,1,0,0,0
108.0,2,2,1,260000,0,1,1,1,0,0


In [105]:
X = madrid_houses_df[predictors].to_pandas()
Y = madrid_houses_df[response].to_pandas()

In [106]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.75, random_state=123)

## **Clustering estimators usage**

In [None]:
class FastKmedoidsEstimator(BaseEstimator, RegressorMixin) :
    """
    Implements the Fast-K-medoids-Estimator based on Fast-K-medoids and Sklearn estimators.
    """

    def __init__(self, estimators, n_clusters, method='pam', init='heuristic', max_iter=100, random_state=123,
                    frac_sample_size=0.1, p1=None, p2=None, p3=None, d1='robust_mahalanobis', d2='jaccard', d3='matching', 
                    robust_maha_method='trimmed', alpha=0.05, epsilon=0.05, n_iters=20, q=1,
                    fast_VG=False, VG_sample_size=1000, VG_n_samples=5, y_type=None) :
        """
        Constructor method.
        
        Parameters:
            estimators: a dictionary with the sklearn estimators (single models or pipelines) to be used in each clusters (keys: cluster indexes, values: estimators initialized).
            n_clusters: the number of clusters.
            method: the k-medoids clustering method. Must be in ['pam', 'alternate']. PAM is the classic one, more accurate but slower.
            init: the k-medoids initialization method. Must be in ['heuristic', 'random']. Heuristic is the classic one, smarter burt slower.
            max_iter: the maximum number of iterations run by k-medodis.
            frac_sample_size: the sample size in proportional terms.
            p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be a non negative integer.
            d1: name of the distance to be computed for quantitative variables. Must be an string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis']. 
            d2: name of the distance to be computed for binary variables. Must be an string in ['sokal', 'jaccard'].
            d3: name of the distance to be computed for multi-class variables. Must be an string in ['matching'].
            q: the parameter that defines the Minkowski distance. Must be a positive integer.
            robust_maha_method: the method to be used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
            alpha : a real number in [0,1] that is used if `method` is 'trimmed' or 'winsorized'. Only needed when d1 = 'robust_mahalanobis'.
            epsilon: parameter used by the Delvin algorithm that is used when computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
            n_iters: maximum number of iterations used by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
            fast_VG: whether the geometric variability estimation will be full (False) or fast (True).
            VG_sample_size: sample size to be used to make the estimation of the geometric variability.
            VG_n_samples: number of samples to be used to make the estimation of the geometric variability.
            random_state: the random seed used for the (random) sample elements.
            y_type: the type of response variable. Must be in ['quantitative', 'binary', 'multiclass'].
        """ 
    def set_params(self, **params):
        """
        Set params method: for setting params properly.
        
        Parameters:
            params: a dictionary with params as values and params keys as names, following the sklearn conventions.
        """   
    def fit(self, X, y, weights=None):
        """
        Fit method: fitting the Fast KMedoids algorithm to `X` (and `y` if needed).
        
        Parameters:
            X: a Pandas or Polars data-frame or a NumPy array. Represents a predictors matrix. Is required.
            y: a Pandas or Polars series or a NumPy array. Represents a response variable. Is required.
            weights: the sample weights, if exists.
        """  
    def predict(self, X):
        """
        Predict method: predicting the response variable for `X`.

        Parameters:
            X: a pandas/polars data-frame or a numpy array. Represents a predictors matrix. Is required.
        """

In [None]:
class KFoldFastKmedoidsEstimator(BaseEstimator, RegressorMixin) :
    """
    Implements the KFold-Fast-K-medoids-Estimator based on KFold-Fast-K-medoids and Sklearn estiamators.
    """

    def __init__(self, estimators, n_clusters, method='pam', init='heuristic', max_iter=100, random_state=123,
                        frac_sample_size=0.1, p1=None, p2=None, p3=None, d1='robust_mahalanobis', d2='jaccard', d3='matching', 
                        robust_maha_method='trimmed', alpha=0.05, epsilon=0.05, n_iters=20, q=1,
                        fast_VG=False, VG_sample_size=1000, VG_n_samples=5, 
                        n_splits=5, shuffle=True, kfold_random_state=123, y_type=None, verbose=True) :
            """
            Constructor method.
            
            Parameters:
                estimators: a dictionary with the sklearn estimators (single models or pipelines) to be used in each clusters (keys: cluster indexes, values: estimators initialized).
                n_clusters: the number of clusters.
                method: the k-medoids clustering method. Must be in ['pam', 'alternate']. PAM is the classic one, more accurate but slower.
                init: the k-medoids initialization method. Must be in ['heuristic', 'random']. Heuristic is the classic one, smarter burt slower.
                max_iter: the maximum number of iterations run by k-medodis.
                frac_sample_size: the sample size in proportional terms.
                p1, p2, p3: number of quantitative, binary and multi-class variables in the considered data matrix, respectively. Must be a non negative integer.
                d1: name of the distance to be computed for quantitative variables. Must be an string in ['euclidean', 'minkowski', 'canberra', 'mahalanobis', 'robust_mahalanobis']. 
                d2: name of the distance to be computed for binary variables. Must be an string in ['sokal', 'jaccard'].
                d3: name of the distance to be computed for multi-class variables. Must be an string in ['matching'].
                q: the parameter that defines the Minkowski distance. Must be a positive integer.
                robust_maha_method: the method to be used for computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
                alpha : a real number in [0,1] that is used if `method` is 'trimmed' or 'winsorized'. Only needed when d1 = 'robust_mahalanobis'.
                epsilon: parameter used by the Delvin algorithm that is used when computing the robust covariance matrix. Only needed when d1 = 'robust_mahalanobis'.
                n_iters: maximum number of iterations used by the Delvin algorithm. Only needed when d1 = 'robust_mahalanobis'.
                fast_VG: whether the geometric variability estimation will be full (False) or fast (True).
                VG_sample_size: sample size to be used to make the estimation of the geometric variability.
                VG_n_samples: number of samples to be used to make the estimation of the geometric variability.
                random_state: the random seed used for the (random) sample elements.
                y_type: the type of response variable. Must be in ['quantitative', 'binary', 'multiclass'].
                n_splits: number of folds to be used.
                shuffle: whether data is shuffled before applying KFold or not, must be in [True, False]. 
                kfold_random_state: the random seed for KFold if shuffle = True.
            """  
    def set_params(self, **params):
        """
        Set params method: for setting params properly.
        
        Parameters:
            params: a dictionary with params as values and params keys as names, following the sklearn conventions.
        """
    def fit(self, X, y, weights=None):
        """
        Fit method: fitting the KFold Fast KMedoids algorithm to `X` (and `y` if needed).
        
        Parameters:
            X: a Pandas or Polars data-frame or a NumPy array. Represents a predictors matrix. Is required.
            y: a Pandas or Polars series or a NumPy array. Represents a response variable. Is required.
            weights: the sample weights, if exists.
        """ 
    def predict(self, X):
        """
        Predict method: predicting the response variable for `X`.

        Parameters:
            X: a pandas/polars data-frame or a numpy array. Represents a predictors matrix. Is required.
        """

In [None]:
class KmeansEstimator(BaseEstimator, RegressorMixin, ClusterMixin):
    """
    Implements the K-means-Estimator based on K-means and Sklearn estimators.
    """

    def __init__(self, estimators, n_clusters, random_state=123):
        """
        Constructor method.
        
        Parameters:
            estimators: a dictionary with the sklearn estimators (single models or pipelines) to be used in each clusters (keys: cluster indexes, values: estimators initialized).
            n_clusters: the number of clusters.
            random_state: the random seed used for the (random) sample elements.
        """  
    def set_params(self, **params):
        """
        Set params method: for setting params properly.
        
        Parameters:
            params: a dictionary with params as values and params keys as names, following the sklearn conventions.
        """   
    def fit(self, X, y=None):
        """
        Fit method: fitting the KMeans algorithm to `X` (and `y` if needed).
        
        Parameters:
            X: a Pandas or Polars data-frame or a NumPy array. Represents a predictors matrix. Is required.
            y: a Pandas or Polars series or a NumPy array. Represents a response variable. Is required.
            weights: the sample weights, if exists.
        """ 
    def predict(self, X):
        """
        Predict method: predicting the response variable for `X`.

        Parameters:
            X: a pandas/polars data-frame or a numpy array. Represents a predictors matrix. Is required.
        """

In [None]:
class MiniBatchKmeansEstimator(BaseEstimator, RegressorMixin, ClusterMixin):
    """
    Implements the Mini Batch K-means-Estimator based on K-means and Sklearn estimators.
    """

    def __init__(self, estimators, n_clusters, random_state=123):
        """
        Constructor method.
        
        Parameters:
            estimators: a dictionary with the sklearn estimators (single models or pipelines) to be used in each clusters (keys: cluster indexes, values: estimators initialized).
            n_clusters: the number of clusters.
            random_state: the random seed used for the (random) sample elements.
        """  
    def set_params(self, **params):
        """
        Set params method: for setting params properly.
        
        Parameters:
            params: a dictionary with params as values and params keys as names, following the sklearn conventions.
        """   
    def fit(self, X, y=None):
        """
        Fit method: fitting the Mini Batch KMeans algorithm to `X` (and `y` if needed).
        
        Parameters:
            X: a Pandas or Polars data-frame or a NumPy array. Represents a predictors matrix. Is required.
            y: a Pandas or Polars series or a NumPy array. Represents a response variable. Is required.
            weights: the sample weights, if exists.
        """ 
    def predict(self, X):
        """
        Predict method: predicting the response variable for `X`.

        Parameters:
            X: a pandas/polars data-frame or a numpy array. Represents a predictors matrix. Is required.
        """

In [107]:
meta_models = {'XGB': XGBRegressor(random_state=123),
               'RF': RandomForestRegressor(random_state=123)}

clusters_RF = [0,2,4]
clusters_XGB = [1,3]

estimators_RF_XGB = {j: meta_models['RF'] for j in clusters_RF}
estimators_RF_XGB.update({j: meta_models['XGB'] for j in clusters_XGB}) 

fast_kmedoids_estimator = FastKmedoidsEstimator(estimators=estimators_RF_XGB, 
                                                n_clusters=2, method='pam', init='heuristic', max_iter=100, 
                                                random_state=123,  frac_sample_size=0.015, 
                                                p1=p1, p2=p2, p3=p3, 
                                                d1='robust_mahalanobis', d2='jaccard', d3='matching', q=1,
                                                robust_maha_method='trimmed', alpha=0.05, 
                                                y_type='quantitative')

kfold_fast_kmedoids_estimator = KFoldFastKmedoidsEstimator(estimators=estimators_RF_XGB, 
                                                           n_clusters=2, method='pam', init='heuristic', max_iter=100, 
                                                           random_state=123,  frac_sample_size=0.015, 
                                                           p1=p1, p2=p2, p3=p3, 
                                                           d1='robust_mahalanobis', d2='jaccard', d3='matching', q=1,
                                                           robust_maha_method='trimmed', alpha=0.05, 
                                                           n_splits=10, shuffle=True, kfold_random_state=123,
                                                           y_type='quantitative')

kmeans_estimator = KmeansEstimator(estimators=estimators_RF_XGB, n_clusters=2, random_state=123)

minibach_kmeans_estimator = MiniBatchKmeansEstimator(estimators=estimators_RF_XGB, n_clusters=2, random_state=123)

In [134]:
fast_kmedoids_estimator.fit(X=X, y=Y)
Y_test_hat = fast_kmedoids_estimator.predict(X=X_test)
print('\n-----------------------------------------\n\nFast-Kmedoids Estimator:', f'MAE = {np.round(mean_absolute_error(y_pred=Y_test_hat, y_true=Y_test),2)}')

Distance matrix size: (244, 244)
Num.Clusters: 2. Clusters proportions: [0.6890947 0.3109053]

-----------------------------------------

Fast-Kmedoids Estimator: MAE = 188007.61


In [133]:
kfold_fast_kmedoids_estimator.fit(X=X_train, y=Y_train)
Y_test_hat = kfold_fast_kmedoids_estimator.predict(X=X_test)
print('\n-----------------------------------------\n\nKFold-Fast-Kmedoids Estimator:', f'MAE = {np.round(mean_absolute_error(y_pred=Y_test_hat, y_true=Y_test),2)}')

Num.Folds: 10. Fold size: 1630.
Distance matrix size: 25 (0.015*1630) 
Clustering Fold 0
Distance matrix size: (24, 24)
Clustering Fold 1
Distance matrix size: (24, 24)
Clustering Fold 2
Distance matrix size: (24, 24)
Clustering Fold 3
Distance matrix size: (24, 24)
Clustering Fold 4
Distance matrix size: (24, 24)
Clustering Fold 5
Distance matrix size: (24, 24)
Clustering Fold 6
Distance matrix size: (24, 24)
Clustering Fold 7
Distance matrix size: (24, 24)
Clustering Fold 8
Distance matrix size: (24, 24)
Clustering Fold 9
Distance matrix size: (24, 24)
X_medoids size: (20, 11)
Distance matrix size: (16, 16)
Num.Clusters: 2. Clusters proportions: [0.86015702 0.13984298]

-----------------------------------------

KFold-Fast-Kmedoids Estimator: MAE = 187685.45


In [131]:
kmeans_estimator.fit(X=X_train, y=Y_train)
Y_test_hat = kmeans_estimator.predict(X=X_test)
print('\n-----------------------------------------\n\nKMeans Estimator:', f'MAE = {np.round(mean_absolute_error(y_pred=Y_test_hat, y_true=Y_test),2)}')

Clusters proportions: [0.87794406 0.12205594]

-----------------------------------------

KMeans Estimator: MAE = 194640.46


In [132]:
minibach_kmeans_estimator.fit(X=X_train, y=Y_train)
Y_test_hat = minibach_kmeans_estimator.predict(X=X_test)
print('\n-----------------------------------------\n\nMiniBatch-KMeans Estimator:', f'MAE = {np.round(mean_absolute_error(y_pred=Y_test_hat, y_true=Y_test),2)}')

Clusters weights (proportions): [0.87315996 0.12684004]

-----------------------------------------

MiniBatch-KMeans Estimator: MAE = 193026.66


## **Clustering estimators usage with pipelines**

In [87]:
madrid_houses_df = pl.read_csv('madrid_houses_NaNs.csv')
madrid_houses_df, p1, p2, p3, response, quant_predictors, cat_predictors = processing(madrid_houses_df)

In [88]:
X = madrid_houses_df[predictors].to_pandas()
Y = madrid_houses_df[response].to_pandas()
# The Null values of the Polars columns that are define as Object type by Pandas are treated as None and not as NaN (what we would like)
# The avoid this behavior the next step is necessary
X = X.fillna(value=np.nan)

In [89]:
inner = KFold(n_splits=4, shuffle=True, random_state=111)

In [90]:
quant_pipeline = Pipeline([('imputer', imputer(apply=True)),
                           ('scaler', scaler())
                          ])

cat_pipeline = Pipeline([('encoder', encoder(method='ordinal')),
                         ('imputer', imputer(apply=True))
                        ])

quant_cat_preprocessing = ColumnTransformer(transformers=[('quant', quant_pipeline, quant_predictors),
                                                          ('cat', cat_pipeline, cat_predictors)]) 

In [91]:
meta_models = {'XGB': XGBRegressor(random_state=123),
               'RF': RandomForestRegressor(random_state=123)}

clusters_RF = [0,2,4]
clusters_XGB = [1,3]
max_n_clusters = len(clusters_RF + clusters_XGB)

estimators_RF_XGB = {j: meta_models['RF'] for j in clusters_RF}
estimators_RF_XGB.update({j: meta_models['XGB'] for j in clusters_XGB}) 

fast_kmedoids_estimator = FastKmedoidsEstimator(estimators=estimators_RF_XGB, 
                                                    n_clusters=2, method='pam', init='heuristic', max_iter=100, 
                                                    random_state=123,  frac_sample_size=0.015, 
                                                    p1=p1, p2=p2, p3=p3, 
                                                    d1='robust_mahalanobis', d2='jaccard', d3='matching', q=1,
                                                    robust_maha_method='trimmed', alpha=0.05, 
                                                    y_type='quantitative')

pipeline_fast_kmedoids_estimator = Pipeline([('preprocessing', quant_cat_preprocessing),
                                             ('features_selector', features_selector()),
                                             ('clustering_model', fast_kmedoids_estimator)
                                            ]) 

In [92]:
def preprocessing_param_grid(trial):

    # Fix Grid
    param_grid = {
        'preprocessing__quant__imputer__method': trial.suggest_categorical('preprocessing__quant__imputer__method', ['simple_mean', 'simple_median', 'iterative_mean', 'iterative_median']),
        'preprocessing__cat__imputer__method': trial.suggest_categorical('preprocessing__cat__imputer__method', ['simple_most_frequent']),
        'preprocessing__quant__scaler__apply': trial.suggest_categorical('preprocessing__quant__scaler__apply', [True, False]),
        'preprocessing__cat__encoder__method': trial.suggest_categorical('preprocessing__cat__encoder__method', ['ordinal']),  # with FastKmedoids only ordinal is suitable
        'features_selector__apply': trial.suggest_categorical('features_selector__apply', [False]) # with FastKmedoids we must not select predictors
        }

    # Conditioned Grid
    if param_grid['features_selector__apply'] == True:

        param_grid.update({'features_selector__method': trial.suggest_categorical('features_selector__method', ['Fpr_f_reg', 'Fdr_f_reg'])})
        
    if param_grid['preprocessing__quant__scaler__apply'] == True:
    
        param_grid.update({'preprocessing__quant__scaler__method': trial.suggest_categorical('preprocessing__quant__scaler__method', ['standard', 'min-max'])})

    return param_grid

In [93]:
def param_grid_clustering_model(trial):

    param_grid = {
        'clustering_model__n_clusters': trial.suggest_categorical('clustering_model__n_clusters', [2, 3, 4, 5]),
        'clustering_model__method': trial.suggest_categorical('clustering_model__method', ['pam', 'alternate']),
        'clustering_model__init': trial.suggest_categorical('clustering_model__init', ['random', 'heuristic', 'k-medoids++']),
        'clustering_model__frac_sample_size': trial.suggest_categorical('clustering_model__frac_sample_size', [0.01, 0.015, 0.02, 0.03]),
        'clustering_model__d1': trial.suggest_categorical('clustering_model__d1', ['robust_mahalanobis', 'mahalanobis', 'euclidean', 'minkowski', 'canberra']),
        'clustering_model__d2': trial.suggest_categorical('clustering_model__d2', ['jaccard', 'sokal'])
    }

    if param_grid['clustering_model__d1'] == 'robust_mahalanobis':

        param_grid.update({
            'clustering_model__robust_maha_method': trial.suggest_categorical('clustering_model__robust_maha_method', ['trimmed', 'winsorized', 'MAD']),
            })

        if param_grid['clustering_model__robust_maha_method'] in ['trimmed', 'winsorized']:

                param_grid.update({
                        'clustering_model__alpha': trial.suggest_categorical('clustering_model__alpha', [0.05, 0.1, 0.15, 0.2, 0.25]),
                    })

    return param_grid

In [94]:
def param_grid_RF_XGB(trial):

    param_grid = preprocessing_param_grid(trial)
    param_grid.update(param_grid_clustering_model(trial))

    n_clusters = param_grid['clustering_model__n_clusters']

    for j in range(0, n_clusters): # Create grids only for the effective clusters

        # Grids for RF
        if j in clusters_RF:

            param_grid.update({
                f'clustering_model__estimators__{j}__n_estimators': trial.suggest_int(f'clustering_model__estimators__{j}__n_estimators', 50, 120),
                f'clustering_model__estimators__{j}__max_depth': trial.suggest_categorical(f'clustering_model__estimators__{j}__max_depth', [None, 3, 5, 7, 10, 20, 30, 40, 50]),
                f'clustering_model__estimators__{j}__min_samples_split': trial.suggest_int(f'clustering_model__estimators__{j}__min_samples_split', 2, 25),
                f'clustering_model__estimators__{j}__min_samples_leaf': trial.suggest_int(f'clustering_model__estimators__{j}__min_samples_leaf', 2, 25)
            })

        # Grids for XGB
        if j in clusters_XGB:  

            param_grid.update({
                f'clustering_model__estimators__{j}__max_depth': trial.suggest_categorical(f'clustering_model__estimators__{j}__max_depth', [None, 3, 5, 7, 10, 20, 30, 40, 50]),
                f'clustering_model__estimators__{j}__lambda': trial.suggest_float(f'clustering_model__estimators__{j}__lambda', 0, 0.5, step=0.1, log=False),
                f'clustering_model__estimators__{j}__n_estimators': trial.suggest_categorical(f'clustering_model__estimators__{j}__n_estimators', [30, 50, 70, 100, 150, 180]),
                f'clustering_model__estimators__{j}__eta': trial.suggest_float(f'clustering_model__estimators__{j}__eta', 0, 0.3, step=0.02, log=False),
                f'clustering_model__estimators__{j}__alpha': trial.suggest_float(f'clustering_model__estimators__{j}__alpha', 0.2, 1, step=0.01, log=False)
            })

    return param_grid

In [None]:
simple_eval = SimpleEvaluation(estimator=pipeline_fast_kmedoids_estimator,  
                                cv=inner, 
                                param_grid=param_grid_RF_XGB,
                                search_method='optuna',
                                scoring='neg_mean_absolute_error', 
                                direction='maximize', 
                                n_trials=10, 
                                random_state=666)

simple_eval.fit(X=X, y=Y)

In [97]:
simple_eval.inner_results

Unnamed: 0,preprocessing__quant__imputer__method,preprocessing__cat__imputer__method,preprocessing__quant__scaler__apply,preprocessing__cat__encoder__method,features_selector__apply,preprocessing__quant__scaler__method,clustering_model__n_clusters,clustering_model__method,clustering_model__init,clustering_model__frac_sample_size,clustering_model__d1,clustering_model__d2,clustering_model__estimators__0__n_estimators,clustering_model__estimators__0__max_depth,clustering_model__estimators__0__min_samples_split,clustering_model__estimators__0__min_samples_leaf,clustering_model__estimators__1__max_depth,clustering_model__estimators__1__lambda,clustering_model__estimators__1__n_estimators,clustering_model__estimators__1__eta,clustering_model__estimators__1__alpha,clustering_model__estimators__2__n_estimators,clustering_model__estimators__2__max_depth,clustering_model__estimators__2__min_samples_split,clustering_model__estimators__2__min_samples_leaf,clustering_model__estimators__3__max_depth,clustering_model__estimators__3__lambda,clustering_model__estimators__3__n_estimators,clustering_model__estimators__3__eta,clustering_model__estimators__3__alpha,clustering_model__estimators__4__n_estimators,clustering_model__estimators__4__max_depth,clustering_model__estimators__4__min_samples_split,clustering_model__estimators__4__min_samples_leaf,clustering_model__robust_maha_method,score,time
7,simple_mean,simple_most_frequent,True,ordinal,False,standard,3,pam,heuristic,0.03,minkowski,sokal,55,40,12,3,3,0.5,30,0.28,0.54,51.0,40.0,4.0,11.0,,,,,,,,,,,-189925.033692,73.682768
1,simple_mean,simple_most_frequent,False,ordinal,False,,3,pam,k-medoids++,0.03,robust_mahalanobis,jaccard,77,20,7,22,50,0.2,30,0.12,0.84,54.0,50.0,20.0,22.0,,,,,,,,,,MAD,-193352.354,93.964026
6,simple_median,simple_most_frequent,True,ordinal,False,min-max,2,pam,k-medoids++,0.015,mahalanobis,sokal,62,7,9,4,3,0.5,150,0.06,0.61,,,,,,,,,,,,,,,-194842.623889,66.683697
4,iterative_mean,simple_most_frequent,True,ordinal,False,min-max,2,alternate,random,0.02,mahalanobis,sokal,80,5,5,19,10,0.4,100,0.16,0.75,,,,,,,,,,,,,,,-198370.937385,65.778179
2,simple_mean,simple_most_frequent,False,ordinal,False,,2,alternate,heuristic,0.02,mahalanobis,sokal,77,7,10,17,40,0.5,150,0.26,0.87,,,,,,,,,,,,,,,-198629.430744,88.212142
9,simple_median,simple_most_frequent,False,ordinal,False,,2,pam,random,0.03,minkowski,sokal,84,3,15,11,30,0.2,70,0.12,0.91,,,,,,,,,,,,,,,-209762.624869,77.228467
3,iterative_mean,simple_most_frequent,False,ordinal,False,,3,alternate,heuristic,0.015,canberra,sokal,75,20,4,17,50,0.5,30,0.18,0.21,112.0,50.0,7.0,25.0,,,,,,,,,,,-209935.43226,139.501305
8,simple_median,simple_most_frequent,True,ordinal,False,standard,3,pam,random,0.02,robust_mahalanobis,jaccard,71,50,5,10,10,0.1,30,0.02,0.75,72.0,3.0,12.0,12.0,,,,,,,,,,MAD,-226212.430843,74.882898
5,simple_median,simple_most_frequent,True,ordinal,False,standard,3,pam,random,0.01,mahalanobis,jaccard,64,50,24,23,20,0.0,70,0.2,0.29,51.0,20.0,3.0,22.0,,,,,,,,,,,-249498.494575,63.639676
0,simple_median,simple_most_frequent,True,ordinal,False,standard,5,alternate,heuristic,0.02,canberra,jaccard,91,3,20,7,40,0.2,150,0.24,0.54,105.0,5.0,14.0,9.0,10.0,0.1,150.0,0.0,0.78,50.0,20.0,6.0,4.0,,-259158.519557,128.728094


In [98]:
simple_eval.inner_best_params

{'preprocessing__quant__imputer__method': 'simple_mean',
 'preprocessing__cat__imputer__method': 'simple_most_frequent',
 'preprocessing__quant__scaler__apply': True,
 'preprocessing__cat__encoder__method': 'ordinal',
 'features_selector__apply': False,
 'preprocessing__quant__scaler__method': 'standard',
 'clustering_model__n_clusters': 3,
 'clustering_model__method': 'pam',
 'clustering_model__init': 'heuristic',
 'clustering_model__frac_sample_size': 0.03,
 'clustering_model__d1': 'minkowski',
 'clustering_model__d2': 'sokal',
 'clustering_model__estimators__0__n_estimators': 55,
 'clustering_model__estimators__0__max_depth': 40,
 'clustering_model__estimators__0__min_samples_split': 12,
 'clustering_model__estimators__0__min_samples_leaf': 3,
 'clustering_model__estimators__1__max_depth': 3,
 'clustering_model__estimators__1__lambda': 0.5,
 'clustering_model__estimators__1__n_estimators': 30,
 'clustering_model__estimators__1__eta': 0.28,
 'clustering_model__estimators__1__alpha': 

In [99]:
simple_eval.inner_score

-189925.0336917885