# Imports

In [1]:
import os
from pathlib import Path

import pandas as pd
from IPython.display import clear_output

from cohirf.experiment.hpo_spherical_clustering_experiment import HPOSphericalClusteringExperiment
from cohirf.experiment.spherical_clustering_experiment import SphericalClusteringExperiment

In [2]:
# Root directory for experiment artifacts. The default is the original
# hard-coded location; set the COHIRF_RESULTS_DIR environment variable to
# relocate the results without editing the notebook.
results_root = Path(os.environ.get('COHIRF_RESULTS_DIR', '/home/belucci/code/cohirf/results')).expanduser()
results_dir = results_root / 'spherical_clustering'
# SQLite URI (inside the results directory) handed to every experiment as its
# `mlflow_tracking_uri`.
mlflow_tracking_uri = f'sqlite:///{results_dir}/mlflow.db'

In [3]:
# Keyword arguments unpacked into each experiment constructor in the cells below.
experiment_params = {
    "profile_memory": False,
    "mlflow_tracking_uri": mlflow_tracking_uri,
    "check_if_exists": False,
}

# DBSCAN

In [4]:
# Run a 20-trial HPO study (maximizing "adjusted_rand") for the "DBSCAN" model
# once per seed, then summarize the collected metrics across seeds.
seeds = list(range(5))
aris, hpo_times, best_times = [], [], []
for seed in seeds:
    # The settings dicts are rebuilt on every iteration so each experiment
    # receives fresh objects; the loop seed drives both the HPO and the dataset.
    hpo_settings = dict(
        n_trials=20,
        hpo_seed=seed,
        hpo_metric="adjusted_rand",
        direction="maximize",
    )
    model_settings = dict(
        experiment_name="test-dbscan-hpo",
        model="DBSCAN",
        seed_model=0,
    )
    dataset_settings = dict(
        seed_dataset=seed,
        n_samples=200,
        n_spheres=2,
        radius_separation=0.5,
        radius_std=0.01,
    )
    experiment = HPOSphericalClusteringExperiment(
        **hpo_settings,
        **model_settings,
        **dataset_settings,
        verbose=0,
        **experiment_params,
    )
    # Only the first element of the returned results is used.
    result = experiment.run(return_results=True)[0]
    evaluation = result["evaluate_model_return"]
    aris.append(evaluation["best/adjusted_rand"])
    hpo_times.append(result["fit_model_return"]["elapsed_time"])
    best_times.append(evaluation["best/elapsed_time"])
# Drop whatever the runs wrote to the cell output before showing the summary.
clear_output(wait=True)
df = pd.DataFrame({"ARI": aris, "HPO Time": hpo_times, "Best Time": best_times})
print(df.describe())

            ARI  HPO Time  Best Time
count  5.000000  5.000000   5.000000
mean   0.900896  2.820578   0.001754
std    0.114463  0.065207   0.000815
min    0.735843  2.768930   0.001306
25%    0.827239  2.778518   0.001352
50%    0.961399  2.807963   0.001409
75%    0.980000  2.815603   0.001496
max    1.000000  2.931877   0.003206


# SC-SRGF

In [9]:
# Run a 20-trial HPO study (maximizing "adjusted_rand") for the
# "SpectralSubspaceRandomization" model once per seed, then summarize the
# collected metrics across seeds.
# NOTE(review): the run is logged under experiment_name="test-dbscan-hpo" even
# though the model is not DBSCAN -- confirm this shared name is intentional.
seeds = list(range(5))
aris, hpo_times, best_times = [], [], []
for seed in seeds:
    # The settings dicts are rebuilt on every iteration so each experiment
    # receives fresh objects; the loop seed drives both the HPO and the dataset.
    hpo_settings = dict(
        n_trials=20,
        hpo_seed=seed,
        hpo_metric="adjusted_rand",
        direction="maximize",
    )
    model_settings = dict(
        experiment_name="test-dbscan-hpo",
        model="SpectralSubspaceRandomization",
        seed_model=0,
    )
    dataset_settings = dict(
        seed_dataset=seed,
        n_samples=200,
        n_spheres=2,
        radius_separation=0.5,
        radius_std=0.01,
    )
    experiment = HPOSphericalClusteringExperiment(
        **hpo_settings,
        **model_settings,
        **dataset_settings,
        verbose=0,
        **experiment_params,
    )
    # Only the first element of the returned results is used.
    result = experiment.run(return_results=True)[0]
    evaluation = result["evaluate_model_return"]
    aris.append(evaluation["best/adjusted_rand"])
    hpo_times.append(result["fit_model_return"]["elapsed_time"])
    best_times.append(evaluation["best/elapsed_time"])
# Drop whatever the runs wrote to the cell output before showing the summary.
clear_output(wait=True)
df = pd.DataFrame({"ARI": aris, "HPO Time": hpo_times, "Best Time": best_times})
print(df.describe())

            ARI   HPO Time  Best Time
count  5.000000   5.000000   5.000000
mean   0.181591  20.159330   0.917017
std    0.057476   1.547310   0.183727
min    0.120345  17.911583   0.707207
25%    0.147016  19.231248   0.756030
50%    0.164911  20.906629   0.927398
75%    0.209017  21.094633   1.093446
max    0.266663  21.652558   1.101003


# CoHiRF DBSCAN

In [5]:
# Run a 50-trial HPO study (maximizing "adjusted_rand") for the "CoHiRF-DBSCAN"
# model once per seed, then summarize the collected metrics across seeds.
seeds = list(range(5))
aris, hpo_times, best_times = [], [], []
for seed in seeds:
    # The settings dicts (including the nested model_params) are rebuilt on
    # every iteration so each experiment receives fresh objects.
    hpo_settings = dict(
        n_trials=50,
        hpo_seed=seed,
        hpo_metric="adjusted_rand",
        direction="maximize",
    )
    model_settings = dict(
        experiment_name="test-dbscan-hpo",
        model="CoHiRF-DBSCAN",
        model_params=dict(n_samples_representative=1000),
        seed_model=0,
    )
    dataset_settings = dict(
        seed_dataset=seed,
        n_samples=200,
        n_spheres=2,
        radius_separation=0.5,
        radius_std=0.01,
    )
    experiment = HPOSphericalClusteringExperiment(
        **hpo_settings,
        **model_settings,
        **dataset_settings,
        verbose=0,
        **experiment_params,
    )
    # Only the first element of the returned results is used.
    result = experiment.run(return_results=True)[0]
    evaluation = result["evaluate_model_return"]
    aris.append(evaluation["best/adjusted_rand"])
    hpo_times.append(result["fit_model_return"]["elapsed_time"])
    best_times.append(evaluation["best/elapsed_time"])
# Drop whatever the runs wrote to the cell output before showing the summary.
clear_output(wait=True)
df = pd.DataFrame({"ARI": aris, "HPO Time": hpo_times, "Best Time": best_times})
print(df.describe())

            ARI  HPO Time  Best Time
count  5.000000  5.000000   5.000000
mean   0.796080  7.218724   0.003512
std    0.444882  0.208034   0.000364
min    0.000302  6.974463   0.003181
25%    0.990050  7.078472   0.003229
50%    0.990050  7.187930   0.003380
75%    1.000000  7.368977   0.003731
max    1.000000  7.483778   0.004037


# Batch-CoHiRF DBSCAN

In [6]:
# Run a 50-trial HPO study (maximizing "adjusted_rand") for the
# "BatchCoHiRF-DBSCAN-1iter" model (n_batches=1, n_jobs=5) once per seed, then
# summarize the collected metrics across seeds.
seeds = list(range(5))
aris, hpo_times, best_times = [], [], []
for seed in seeds:
    # The settings dicts (including the nested model_params) are rebuilt on
    # every iteration so each experiment receives fresh objects.
    hpo_settings = dict(
        n_trials=50,
        hpo_seed=seed,
        hpo_metric="adjusted_rand",
        direction="maximize",
    )
    model_settings = dict(
        experiment_name="test-dbscan-hpo",
        model="BatchCoHiRF-DBSCAN-1iter",
        model_params=dict(cohirf_kwargs=dict(n_samples_representative=1000), n_batches=1),
        seed_model=0,
        n_jobs=5,
    )
    dataset_settings = dict(
        seed_dataset=seed,
        n_samples=200,
        n_spheres=2,
        radius_separation=0.5,
        radius_std=0.01,
    )
    experiment = HPOSphericalClusteringExperiment(
        **hpo_settings,
        **model_settings,
        **dataset_settings,
        verbose=0,
        **experiment_params,
    )
    # Only the first element of the returned results is used.
    result = experiment.run(return_results=True)[0]
    evaluation = result["evaluate_model_return"]
    aris.append(evaluation["best/adjusted_rand"])
    hpo_times.append(result["fit_model_return"]["elapsed_time"])
    best_times.append(evaluation["best/elapsed_time"])
# Drop whatever the runs wrote to the cell output before showing the summary.
clear_output(wait=True)
df = pd.DataFrame({"ARI": aris, "HPO Time": hpo_times, "Best Time": best_times})
print(df.describe())

            ARI   HPO Time  Best Time
count  5.000000   5.000000   5.000000
mean   0.796080   9.092230   0.011124
std    0.444882   2.825623   0.000113
min    0.000302   7.538738   0.011016
25%    0.990050   7.586757   0.011040
50%    0.990050   7.980529   0.011114
75%    1.000000   8.234461   0.011146
max    1.000000  14.120664   0.011303


In [7]:
# Re-print the summary statistics of whichever `df` the kernel currently holds
# (this cell defines no data of its own).
summary = df.describe()
print(summary)

            ARI   HPO Time  Best Time
count  5.000000   5.000000   5.000000
mean   0.796080   9.092230   0.011124
std    0.444882   2.825623   0.000113
min    0.000302   7.538738   0.011016
25%    0.990050   7.586757   0.011040
50%    0.990050   7.980529   0.011114
75%    1.000000   8.234461   0.011146
max    1.000000  14.120664   0.011303


In [8]:
# Run a 50-trial HPO study (maximizing "adjusted_rand") for the
# "BatchCoHiRF-DBSCAN" model (n_batches=1, n_jobs=5) once per seed on the
# larger n_samples=2000 dataset, then summarize the metrics across seeds.
seeds = list(range(5))
aris, hpo_times, best_times = [], [], []
for seed in seeds:
    # The settings dicts (including the nested model_params) are rebuilt on
    # every iteration so each experiment receives fresh objects.
    hpo_settings = dict(
        n_trials=50,
        hpo_seed=seed,
        hpo_metric="adjusted_rand",
        direction="maximize",
    )
    model_settings = dict(
        experiment_name="test-dbscan-hpo",
        model="BatchCoHiRF-DBSCAN",
        model_params=dict(cohirf_kwargs=dict(n_samples_representative=1000), n_batches=1),
        seed_model=0,
        n_jobs=5,
    )
    dataset_settings = dict(
        seed_dataset=seed,
        n_samples=2000,
        n_spheres=2,
        radius_separation=0.5,
        radius_std=0.01,
    )
    experiment = HPOSphericalClusteringExperiment(
        **hpo_settings,
        **model_settings,
        **dataset_settings,
        verbose=0,
        **experiment_params,
    )
    # Only the first element of the returned results is used.
    result = experiment.run(return_results=True)[0]
    evaluation = result["evaluate_model_return"]
    aris.append(evaluation["best/adjusted_rand"])
    hpo_times.append(result["fit_model_return"]["elapsed_time"])
    best_times.append(evaluation["best/elapsed_time"])
# Drop whatever the runs wrote to the cell output before showing the summary.
clear_output(wait=True)
df = pd.DataFrame({"ARI": aris, "HPO Time": hpo_times, "Best Time": best_times})
print(df.describe())

       ARI   HPO Time  Best Time
count  5.0   5.000000   5.000000
mean   1.0  11.259293   0.040712
std    0.0   0.114155   0.009013
min    1.0  11.099574   0.031728
25%    1.0  11.247864   0.032486
50%    1.0  11.257421   0.042201
75%    1.0  11.269930   0.043534
max    1.0  11.421675   0.053609
