In [1]:
import os
from pathlib import Path

import optuna
from sklearn.cluster import DBSCAN

from cohirf.experiment.hpo_open_ml_clustering_experiment import HPOOpenmlClusteringExperiment
from cohirf.models.batch_cohirf import BatchCoHiRF
from cohirf.models.cohirf import BaseCoHiRF

In [2]:
# Root directory for experiment outputs. Set the COHIRF_RESULTS_DIR environment
# variable to run the notebook on another machine; when it is unset the original
# author's local checkout path is used, so existing behaviour is unchanged.
results_dir = Path(os.environ.get("COHIRF_RESULTS_DIR", "/home/belucci/code/cohirf/results")).expanduser() / "real"
# MLflow tracking store: a SQLite database file located inside results_dir.
mlflow_tracking_uri = f"sqlite:///{results_dir}/mlflow.db"

In [3]:
# Keyword arguments unpacked into every experiment constructed in this notebook.
# NOTE(review): check_if_exists=False presumably disables skipping runs that are
# already recorded in the tracking store -- confirm against the experiment class.
experiment_params = {
    "mlflow_tracking_uri": mlflow_tracking_uri,
    "check_if_exists": False,
}

# Example: running one of the predefined, tested models

In [None]:
# Hyperparameter search for a model selected by name ("DBSCAN") on an OpenML dataset.
# The dataset is identified by dataset_id only: the synthetic-sphere arguments that
# used to be passed here (n_samples, n_spheres, radius_*, seed_dataset) do not describe
# an OpenML dataset and do not appear in the parameters logged by the recorded run,
# which used dataset_id 61.
experiment = HPOOpenmlClusteringExperiment(
    # hpo
    n_trials=20,
    hpo_seed=0,
    hpo_metric="adjusted_rand",
    direction="maximize",
    # model
    experiment_name="test-dbscan-hpo",
    model="DBSCAN",
    seed_model=0,
    # dataset
    dataset_id=61,  # OpenML dataset ID for "Iris"
    verbose=0,
    **experiment_params,
)
# A single (model, dataset) combination is run, so only the first result is kept.
result = experiment.run(return_results=True)[0]
# Headline numbers from the run: the "best/adjusted_rand" score, the elapsed time
# reported by the fit (HPO) step, and the "best/elapsed_time" from evaluation.
ari = result["evaluate_model_return"]["best/adjusted_rand"]
hpo_time = result["fit_model_return"]["elapsed_time"]
best_time = result["evaluate_model_return"]["best/elapsed_time"]

2025-07-19 18:48:47
Starting experiment...
combination_names: ['model', 'seed_model', 'dataset_id', 'task_id', 'task_repeat', 'task_fold', 'task_sample']
combinations: [('DBSCAN', 0, 61, None, 0, 0, 0)]
unique_params: {'timeout_fit': None, 'timeout_combination': None, 'n_jobs': 1, 'model_params': {}, 'max_threads': None, 'calculate_davies_bouldin': False, 'calculate_full_silhouette': False, 'standardize': False, 'hpo_framework': 'optuna', 'n_trials': 20, 'timeout_hpo': 0, 'timeout_trial': 0, 'max_concurrent_trials': 1, 'hpo_seed': 0, 'sampler': 'tpe', 'pruner': 'none', 'direction': 'maximize', 'hpo_metric': 'adjusted_rand'}
extra_params: {}



Combinations completed:   0%|          | 0/1 [00:00<?, ?it/s]

2025/07/19 18:48:47 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/19 18:48:47 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


2025-07-19 18:48:48
Running...
model: DBSCAN
seed_model: 0
dataset_id: 61
task_id: None
task_repeat: 0
task_fold: 0
task_sample: 0
timeout_fit: None
timeout_combination: None
n_jobs: 1
model_params: {}
max_threads: None
calculate_davies_bouldin: False
calculate_full_silhouette: False
standardize: False
hpo_framework: optuna
n_trials: 20
timeout_hpo: 0
timeout_trial: 0
max_concurrent_trials: 1
hpo_seed: 0
sampler: tpe
pruner: none
direction: maximize
hpo_metric: adjusted_rand



Trials:   0%|          | 0/20 [00:00<?, ?it/s]

2025-07-19 18:48:51
Finished!
total_elapsed_time: 3.138907971999288
model: DBSCAN
seed_model: 0
dataset_id: 61
task_id: None
task_repeat: 0
task_fold: 0
task_sample: 0
timeout_fit: None
timeout_combination: None
n_jobs: 1
model_params: {'eps': 6.1405029293612525, 'min_samples': 11}
max_threads: None
calculate_davies_bouldin: False
calculate_full_silhouette: False
standardize: False
hpo_framework: optuna
n_trials: 20
timeout_hpo: 0
timeout_trial: 0
max_concurrent_trials: 1
hpo_seed: 0
sampler: tpe
pruner: none
direction: maximize
hpo_metric: adjusted_rand

2025-07-19 18:48:51
Combinations completed:   0%|          | 0/1 [00:04<?, ?it/s]
succesfully_completed: 1
failed: 0
none: 0



# Example of running a custom model

In [5]:
# Custom model: BatchCoHiRF configured with BaseCoHiRF, which in turn uses DBSCAN
# as its base model.
model_cls = BatchCoHiRF

# Constructor arguments that stay fixed during the search.
model_params = {
    "cohirf_model": BaseCoHiRF,
    "cohirf_kwargs": {"base_model": DBSCAN, "max_iter": 1},
    "n_batches": 10,
    "n_jobs": 10,
}

# Optuna distributions for the tuned hyperparameters. The nesting follows the
# layout of model_params; the recorded run output shows the selected values ending
# up inside model_params["cohirf_kwargs"].
search_space = {
    "cohirf_kwargs": {
        "n_features": optuna.distributions.FloatDistribution(0.1, 1),
        "repetitions": optuna.distributions.IntDistribution(1, 10),
        "base_model_kwargs": {
            "eps": optuna.distributions.FloatDistribution(1e-1, 10),
            "min_samples": optuna.distributions.IntDistribution(2, 50),
        },
    },
}

# One hand-picked configuration, using the same nesting as search_space.
# NOTE(review): presumably evaluated as an initial trial before sampling -- confirm.
default_values = [
    {
        "cohirf_kwargs": {
            "n_features": 0.3,
            "repetitions": 5,
            "base_model_kwargs": {
                "eps": 0.5,
                "min_samples": 5,
            },
        },
    },
]

In [7]:
# Hyperparameter search for the custom model class configured in the previous cell,
# on the same OpenML dataset.
experiment = HPOOpenmlClusteringExperiment(
    # dataset
    dataset_id=61,  # OpenML dataset ID for "Iris"
    # model
    model=model_cls,
    model_params=model_params,
    search_space=search_space,
    default_values=default_values,
    # hpo
    hpo_metric="adjusted_rand",
    direction="maximize",
    n_trials=20,
    hpo_seed=0,
    **experiment_params,
)
# Only one combination is run, so keep just the first entry of the returned results.
result = experiment.run(return_results=True)[0]
# Pull out the "best/adjusted_rand" score, the fit-step elapsed time and the
# "best/elapsed_time" value in a single assignment.
ari, hpo_time, best_time = (
    result["evaluate_model_return"]["best/adjusted_rand"],
    result["fit_model_return"]["elapsed_time"],
    result["evaluate_model_return"]["best/elapsed_time"],
)

2025-07-19 18:49:04
Starting experiment...
combination_names: ['model', 'seed_model', 'dataset_id', 'task_id', 'task_repeat', 'task_fold', 'task_sample']
combinations: [(<class 'cohirf.models.batch_cohirf.BatchCoHiRF'>, 0, 61, None, 0, 0, 0)]
unique_params: {'timeout_fit': None, 'timeout_combination': None, 'n_jobs': 1, 'model_params': {'cohirf_model': <class 'cohirf.models.cohirf.BaseCoHiRF'>, 'cohirf_kwargs': {'base_model': <class 'sklearn.cluster._dbscan.DBSCAN'>, 'max_iter': 1}, 'n_batches': 10, 'n_jobs': 10}, 'max_threads': None, 'calculate_davies_bouldin': False, 'calculate_full_silhouette': False, 'standardize': False, 'hpo_framework': 'optuna', 'n_trials': 20, 'timeout_hpo': 0, 'timeout_trial': 0, 'max_concurrent_trials': 1, 'hpo_seed': 0, 'sampler': 'tpe', 'pruner': 'none', 'direction': 'maximize', 'hpo_metric': 'adjusted_rand'}
extra_params: {}



Combinations completed:   0%|          | 0/1 [00:00<?, ?it/s]

2025-07-19 18:49:05
Running...
model: <class 'cohirf.models.batch_cohirf.BatchCoHiRF'>
seed_model: 0
dataset_id: 61
task_id: None
task_repeat: 0
task_fold: 0
task_sample: 0
timeout_fit: None
timeout_combination: None
n_jobs: 1
model_params: {'cohirf_model': <class 'cohirf.models.cohirf.BaseCoHiRF'>, 'cohirf_kwargs': {'base_model': <class 'sklearn.cluster._dbscan.DBSCAN'>, 'max_iter': 1}, 'n_batches': 10, 'n_jobs': 10}
max_threads: None
calculate_davies_bouldin: False
calculate_full_silhouette: False
standardize: False
hpo_framework: optuna
n_trials: 20
timeout_hpo: 0
timeout_trial: 0
max_concurrent_trials: 1
hpo_seed: 0
sampler: tpe
pruner: none
direction: maximize
hpo_metric: adjusted_rand



Trials:   0%|          | 0/20 [00:00<?, ?it/s]

  warn(f'metric {metric} not found in dict returned by training_fn, available metrics are '
  warn(f'metric {metric} not found in dict returned by training_fn, available metrics are '
  warn(f'metric {metric} not found in dict returned by training_fn, available metrics are '
  warn(f'metric {metric} not found in dict returned by training_fn, available metrics are '
  warn(f'metric {metric} not found in dict returned by training_fn, available metrics are '
  warn(f'metric {metric} not found in dict returned by training_fn, available metrics are '
  warn(f'metric {metric} not found in dict returned by training_fn, available metrics are '
  warn(f'metric {metric} not found in dict returned by training_fn, available metrics are '
  warn(f'metric {metric} not found in dict returned by training_fn, available metrics are '
  warn(f'metric {metric} not found in dict returned by training_fn, available metrics are '


2025-07-19 18:50:09
Finished!
total_elapsed_time: 61.5913247890021
model: <class 'cohirf.models.batch_cohirf.BatchCoHiRF'>
seed_model: 0
dataset_id: 61
task_id: None
task_repeat: 0
task_fold: 0
task_sample: 0
timeout_fit: None
timeout_combination: None
n_jobs: 1
model_params: {'cohirf_model': <class 'cohirf.models.cohirf.BaseCoHiRF'>, 'cohirf_kwargs': {'base_model': <class 'sklearn.cluster._dbscan.DBSCAN'>, 'max_iter': 1, 'n_features': 0.1176288295482597, 'repetitions': 9, 'base_model_kwargs': {'eps': 0.4751105083193954, 'min_samples': 21}}, 'n_batches': 10, 'n_jobs': 10}
max_threads: None
calculate_davies_bouldin: False
calculate_full_silhouette: False
standardize: False
hpo_framework: optuna
n_trials: 20
timeout_hpo: 0
timeout_trial: 0
max_concurrent_trials: 1
hpo_seed: 0
sampler: tpe
pruner: none
direction: maximize
hpo_metric: adjusted_rand

2025-07-19 18:50:09
Combinations completed:   0%|          | 0/1 [01:04<?, ?it/s]
succesfully_completed: 1
failed: 0
none: 0



  warn(f'metric {metric} not found in dict returned by training_fn, available metrics are '
