# Imports

In [1]:
import os
from pathlib import Path

import optuna
import pandas as pd
from IPython.display import clear_output
from sklearn.cluster import SpectralClustering, KMeans

from cohirf.experiment.classification_clustering_experiment import ClassificationClusteringExperiment
from cohirf.experiment.hpo_classification_clustering_experiment import HPOClassificationClusteringExperiment
from cohirf.models.batch_cohirf import BatchCoHiRF
from cohirf.models.cohirf import BaseCoHiRF
from cohirf.models.scsrgf import SpectralSubspaceRandomization

In [2]:
# Root directory for everything this notebook writes. It defaults to the
# original hard-coded checkout location; set the COHIRF_RESULTS_DIR environment
# variable to run the notebook on another machine without editing this cell.
# An unset or empty variable falls back to the default.
results_root = Path(os.environ.get('COHIRF_RESULTS_DIR') or '/home/belucci/code/cohirf/results')
results_dir = results_root / 'features'
log_dir = results_dir / 'logs'
work_dir = results_dir / 'work'
# SQLite-backed MLflow tracking URI pointing at a database file inside results_dir.
mlflow_tracking_uri = f'sqlite:///{results_dir}/mlflow.db'

In [3]:
# Show the MLflow tracking URI built in the configuration cell as a sanity check.
mlflow_tracking_uri

'sqlite:////home/belucci/code/cohirf/results/features/mlflow.db'

In [4]:
# Show the resolved log directory as a sanity check.
log_dir

PosixPath('/home/belucci/code/cohirf/results/features/logs')

In [5]:
# Keyword arguments unpacked into every experiment constructed in this notebook.
experiment_params = {
    "work_root_dir": work_dir,
    "profile_memory": False,
    "mlflow_tracking_uri": mlflow_tracking_uri,
    "check_if_exists": False,
}

# SC-SRGF

In [6]:
# Run the HPO experiment for SC-SRGF once per seed and summarise the best
# adjusted Rand index together with the HPO and best-model timings.
seeds = list(range(1))
aris, hpo_times, best_times = [], [], []

# Settings that are identical for every seed.
hpo_settings = {
    "n_trials": 20,
    "hpo_metric": "adjusted_rand",
    "direction": "maximize",
}
model_settings = {
    # NOTE(review): the experiment name mentions DBSCAN although the model is
    # SpectralSubspaceRandomization — presumably a copy-paste leftover; confirm
    # before renaming.
    "experiment_name": "test-dbscan-hpo",
    "model": "SpectralSubspaceRandomization",
    "seed_model": 0,
}
dataset_settings = {
    "n_samples": 1000,
    "n_informative": 3,
    "n_random": 10000,
    "n_classes": 5,
    "class_sep": 3 * (3**0.5),
}

for seed in seeds:
    # The same seed drives both the HPO sampler and the dataset generation.
    experiment = HPOClassificationClusteringExperiment(
        hpo_seed=seed,
        seed_dataset=seed,
        verbose=0,
        **hpo_settings,
        **model_settings,
        **dataset_settings,
        **experiment_params,
    )
    # run() returns a sequence of results; only the first entry is used here.
    result = experiment.run(return_results=True)[0]
    ari, hpo_time, best_time = (
        result["evaluate_model_return"]["best/adjusted_rand"],
        result["fit_model_return"]["elapsed_time"],
        result["evaluate_model_return"]["best/elapsed_time"],
    )
    aris.append(ari)
    hpo_times.append(hpo_time)
    best_times.append(best_time)

# Drop whatever the runs printed before showing the summary table.
clear_output(wait=True)
df = pd.DataFrame(
    {
        "ARI": aris,
        "HPO Time": hpo_times,
        "Best Time": best_times,
    }
)
print(df.describe())

            ARI    HPO Time  Best Time
count  1.000000    1.000000   1.000000
mean   0.906606  449.128962  28.506031
std         NaN         NaN        NaN
min    0.906606  449.128962  28.506031
25%    0.906606  449.128962  28.506031
50%    0.906606  449.128962  28.506031
75%    0.906606  449.128962  28.506031
max    0.906606  449.128962  28.506031


# Spectral Clustering

In [25]:
# Model under test: scikit-learn SpectralClustering with a fixed label-assignment strategy.
model_cls = SpectralClustering
model_params = {"assign_labels": "discretize"}
# Hyperparameter ranges, expressed as optuna distributions.
search_space = {
    "n_clusters": optuna.distributions.IntDistribution(2, 10),
    "gamma": optuna.distributions.FloatDistribution(0.1, 30),
}
# Explicit parameter set handed to the experiment as default_values
# (presumably evaluated as an initial trial — TODO confirm).
default_values = [
    {
        "n_clusters": 5,
        "gamma": 1.0,
    }
]

In [26]:
# Run the HPO experiment for the model configured in model_cls / model_params
# once per seed and summarise the best adjusted Rand index and the timings.
seeds = list(range(1))
aris, hpo_times, best_times = [], [], []

# Settings that are identical for every seed.
hpo_settings = {
    "n_trials": 30,
    "hpo_metric": "adjusted_rand",
    "direction": "maximize",
}
model_settings = {
    # NOTE(review): the experiment name mentions DBSCAN, which is not the model
    # configured here — presumably a copy-paste leftover; confirm before renaming.
    "experiment_name": "test-dbscan-hpo",
    "model": model_cls,
    "model_params": model_params,
    "search_space": search_space,
    "default_values": default_values,
    "seed_model": 0,
}
dataset_settings = {
    "n_samples": 1000,
    "n_informative": 3,
    # NOTE(review): the other experiment cells in this notebook use
    # n_random=10000 — confirm the smaller value here is intentional.
    "n_random": 1000,
    "n_classes": 5,
    "class_sep": 3 * (3**0.5),
}

for seed in seeds:
    # The same seed drives both the HPO sampler and the dataset generation.
    experiment = HPOClassificationClusteringExperiment(
        hpo_seed=seed,
        seed_dataset=seed,
        verbose=0,
        raise_on_error=True,
        **hpo_settings,
        **model_settings,
        **dataset_settings,
        **experiment_params,
    )
    # run() returns a sequence of results; only the first entry is used here.
    result = experiment.run(return_results=True)[0]
    ari, hpo_time, best_time = (
        result["evaluate_model_return"]["best/adjusted_rand"],
        result["fit_model_return"]["elapsed_time"],
        result["evaluate_model_return"]["best/elapsed_time"],
    )
    aris.append(ari)
    hpo_times.append(hpo_time)
    best_times.append(best_time)

# Drop whatever the runs printed before showing the summary table.
clear_output(wait=True)
df = pd.DataFrame(
    {
        "ARI": aris,
        "HPO Time": hpo_times,
        "Best Time": best_times,
    }
)
print(df.describe())

            ARI   HPO Time  Best Time
count  1.000000   1.000000   1.000000
mean   0.004565  45.184022   0.178408
std         NaN        NaN        NaN
min    0.004565  45.184022   0.178408
25%    0.004565  45.184022   0.178408
50%    0.004565  45.184022   0.178408
75%    0.004565  45.184022   0.178408
max    0.004565  45.184022   0.178408


In [27]:
# Print summary statistics of the current `df` (whichever cell assigned it last).
print(df.describe())

            ARI   HPO Time  Best Time
count  1.000000   1.000000   1.000000
mean   0.004565  45.184022   0.178408
std         NaN        NaN        NaN
min    0.004565  45.184022   0.178408
25%    0.004565  45.184022   0.178408
50%    0.004565  45.184022   0.178408
75%    0.004565  45.184022   0.178408
max    0.004565  45.184022   0.178408


# Batch CoHiRF SC-SRGF

In [None]:
# Model under test: BatchCoHiRF driving BaseCoHiRF, with
# SpectralSubspaceRandomization as the base clustering model.
model_cls = BatchCoHiRF
model_params = {
    "cohirf_model": BaseCoHiRF,
    "cohirf_kwargs": {
        "base_model": SpectralSubspaceRandomization,
        "n_features": 1.0,
        "max_iter": 1,
    },
}
# Hyperparameter ranges, expressed as optuna distributions; the nesting mirrors
# the nesting of the keyword arguments in model_params.
search_space = {
    "cohirf_kwargs": {
        "repetitions": optuna.distributions.IntDistribution(2, 10),
        "base_model_kwargs": {
            "n_similarities": optuna.distributions.IntDistribution(10, 30),
            "sampling_ratio": optuna.distributions.FloatDistribution(0.2, 0.8),
            "sc_n_clusters": optuna.distributions.IntDistribution(2, 5),
        },
    }
}
# Explicit parameter set handed to the experiment as default_values
# (presumably evaluated as an initial trial — TODO confirm).
default_values = [
    {
        "cohirf_kwargs": {
            "repetitions": 5,
            "base_model_kwargs": {
                "n_similarities": 20,
                "sampling_ratio": 0.5,
                "sc_n_clusters": 3,
            },
        }
    }
]

In [None]:
# Run the HPO experiment (20 trials) for the model configured in
# model_cls / model_params once per seed and summarise ARI and timings.
seeds = list(range(1))
aris, hpo_times, best_times = [], [], []

# Settings that are identical for every seed.
hpo_settings = {
    "n_trials": 20,
    "hpo_metric": "adjusted_rand",
    "direction": "maximize",
}
model_settings = {
    "experiment_name": "test",
    "n_jobs": 10,
    "model": model_cls,
    "model_params": model_params,
    "search_space": search_space,
    "default_values": default_values,
    "seed_model": 0,
}
dataset_settings = {
    "n_samples": 1000,
    "n_informative": 3,
    "n_random": 10000,
    "n_classes": 5,
    "class_sep": 3 * (3**0.5),
}

for seed in seeds:
    # The same seed drives both the HPO sampler and the dataset generation.
    experiment = HPOClassificationClusteringExperiment(
        hpo_seed=seed,
        seed_dataset=seed,
        verbose=0,
        raise_on_error=True,
        **hpo_settings,
        **model_settings,
        **dataset_settings,
        **experiment_params,
    )
    # run() returns a sequence of results; only the first entry is used here.
    result = experiment.run(return_results=True)[0]
    ari, hpo_time, best_time = (
        result["evaluate_model_return"]["best/adjusted_rand"],
        result["fit_model_return"]["elapsed_time"],
        result["evaluate_model_return"]["best/elapsed_time"],
    )
    aris.append(ari)
    hpo_times.append(hpo_time)
    best_times.append(best_time)

# Drop whatever the runs printed before showing the summary table.
clear_output(wait=True)
df = pd.DataFrame(
    {
        "ARI": aris,
        "HPO Time": hpo_times,
        "Best Time": best_times,
    }
)
print(df.describe())

            ARI    HPO Time  Best Time
count  1.000000    1.000000   1.000000
mean   0.686725  445.280781  10.681087
std         NaN         NaN        NaN
min    0.686725  445.280781  10.681087
25%    0.686725  445.280781  10.681087
50%    0.686725  445.280781  10.681087
75%    0.686725  445.280781  10.681087
max    0.686725  445.280781  10.681087


In [None]:
# Same experiment as the 20-trial run, but with 40 HPO trials: run once per
# seed and summarise ARI and timings.
seeds = list(range(1))
aris, hpo_times, best_times = [], [], []

# Settings that are identical for every seed.
hpo_settings = {
    "n_trials": 40,
    "hpo_metric": "adjusted_rand",
    "direction": "maximize",
}
model_settings = {
    "experiment_name": "test",
    "n_jobs": 10,
    "model": model_cls,
    "model_params": model_params,
    "search_space": search_space,
    "default_values": default_values,
    "seed_model": 0,
}
dataset_settings = {
    "n_samples": 1000,
    "n_informative": 3,
    "n_random": 10000,
    "n_classes": 5,
    "class_sep": 3 * (3**0.5),
}

for seed in seeds:
    # The same seed drives both the HPO sampler and the dataset generation.
    experiment = HPOClassificationClusteringExperiment(
        hpo_seed=seed,
        seed_dataset=seed,
        verbose=0,
        raise_on_error=True,
        **hpo_settings,
        **model_settings,
        **dataset_settings,
        **experiment_params,
    )
    # run() returns a sequence of results; only the first entry is used here.
    result = experiment.run(return_results=True)[0]
    ari, hpo_time, best_time = (
        result["evaluate_model_return"]["best/adjusted_rand"],
        result["fit_model_return"]["elapsed_time"],
        result["evaluate_model_return"]["best/elapsed_time"],
    )
    aris.append(ari)
    hpo_times.append(hpo_time)
    best_times.append(best_time)

# Drop whatever the runs printed before showing the summary table.
clear_output(wait=True)
df = pd.DataFrame(
    {
        "ARI": aris,
        "HPO Time": hpo_times,
        "Best Time": best_times,
    }
)
print(df.describe())

            ARI    HPO Time  Best Time
count  1.000000    1.000000     1.0000
mean   0.634477  802.911014    10.7565
std         NaN         NaN        NaN
min    0.634477  802.911014    10.7565
25%    0.634477  802.911014    10.7565
50%    0.634477  802.911014    10.7565
75%    0.634477  802.911014    10.7565
max    0.634477  802.911014    10.7565


In [21]:
# Model under test: BatchCoHiRF driving BaseCoHiRF with
# SpectralSubspaceRandomization as the base model, using n_batches=5.
model_cls = BatchCoHiRF
model_params = {
    "cohirf_model": BaseCoHiRF,
    "cohirf_kwargs": {
        "base_model": SpectralSubspaceRandomization,
        "n_features": 1.0,
        "max_iter": 1,
    },
    "n_batches": 5,
}
# Hyperparameter ranges, expressed as optuna distributions; the nesting mirrors
# the nesting of the keyword arguments in model_params.
search_space = {
    "cohirf_kwargs": {
        "repetitions": optuna.distributions.IntDistribution(2, 10),
        "base_model_kwargs": {
            "n_similarities": optuna.distributions.IntDistribution(10, 30),
            "sampling_ratio": optuna.distributions.FloatDistribution(0.2, 0.8),
            "sc_n_clusters": optuna.distributions.IntDistribution(2, 5),
        },
    }
}
# Explicit parameter set handed to the experiment as default_values
# (presumably evaluated as an initial trial — TODO confirm).
default_values = [
    {
        "cohirf_kwargs": {
            "repetitions": 5,
            "base_model_kwargs": {
                "n_similarities": 20,
                "sampling_ratio": 0.5,
                "sc_n_clusters": 3,
            },
        }
    }
]

In [None]:
# Run the HPO experiment (20 trials, n_jobs=5) for the model configured in
# model_cls / model_params once per seed and summarise ARI and timings.
seeds = list(range(1))
aris, hpo_times, best_times = [], [], []

# Settings that are identical for every seed.
hpo_settings = {
    "n_trials": 20,
    "hpo_metric": "adjusted_rand",
    "direction": "maximize",
}
model_settings = {
    "experiment_name": "test",
    "n_jobs": 5,
    "model": model_cls,
    "model_params": model_params,
    "search_space": search_space,
    "default_values": default_values,
    "seed_model": 0,
}
dataset_settings = {
    "n_samples": 1000,
    "n_informative": 3,
    "n_random": 10000,
    "n_classes": 5,
    "class_sep": 3 * (3**0.5),
}

for seed in seeds:
    # The same seed drives both the HPO sampler and the dataset generation.
    experiment = HPOClassificationClusteringExperiment(
        hpo_seed=seed,
        seed_dataset=seed,
        verbose=0,
        raise_on_error=True,
        **hpo_settings,
        **model_settings,
        **dataset_settings,
        **experiment_params,
    )
    # run() returns a sequence of results; only the first entry is used here.
    result = experiment.run(return_results=True)[0]
    ari, hpo_time, best_time = (
        result["evaluate_model_return"]["best/adjusted_rand"],
        result["fit_model_return"]["elapsed_time"],
        result["evaluate_model_return"]["best/elapsed_time"],
    )
    aris.append(ari)
    hpo_times.append(hpo_time)
    best_times.append(best_time)

# Drop whatever the runs printed before showing the summary table.
clear_output(wait=True)
df = pd.DataFrame(
    {
        "ARI": aris,
        "HPO Time": hpo_times,
        "Best Time": best_times,
    }
)
print(df.describe())

            ARI    HPO Time  Best Time
count  1.000000    1.000000   1.000000
mean   0.735021  610.747898  21.380293
std         NaN         NaN        NaN
min    0.735021  610.747898  21.380293
25%    0.735021  610.747898  21.380293
50%    0.735021  610.747898  21.380293
75%    0.735021  610.747898  21.380293
max    0.735021  610.747898  21.380293


In [23]:
# Model under test: BatchCoHiRF driving BaseCoHiRF with
# SpectralSubspaceRandomization as the base model, using n_batches=2.
model_cls = BatchCoHiRF
model_params = {
    "cohirf_model": BaseCoHiRF,
    "cohirf_kwargs": {
        "base_model": SpectralSubspaceRandomization,
        "n_features": 1.0,
        "max_iter": 1,
    },
    "n_batches": 2,
}
# Hyperparameter ranges, expressed as optuna distributions; the nesting mirrors
# the nesting of the keyword arguments in model_params.
search_space = {
    "cohirf_kwargs": {
        "repetitions": optuna.distributions.IntDistribution(2, 10),
        "base_model_kwargs": {
            "n_similarities": optuna.distributions.IntDistribution(10, 30),
            "sampling_ratio": optuna.distributions.FloatDistribution(0.2, 0.8),
            "sc_n_clusters": optuna.distributions.IntDistribution(2, 5),
        },
    }
}
# Explicit parameter set handed to the experiment as default_values
# (presumably evaluated as an initial trial — TODO confirm).
default_values = [
    {
        "cohirf_kwargs": {
            "repetitions": 5,
            "base_model_kwargs": {
                "n_similarities": 20,
                "sampling_ratio": 0.5,
                "sc_n_clusters": 3,
            },
        }
    }
]

In [None]:
# Run the HPO experiment (20 trials, n_jobs=5) for the model configured in
# model_cls / model_params once per seed and summarise ARI and timings.
seeds = list(range(1))
aris, hpo_times, best_times = [], [], []

# Settings that are identical for every seed.
hpo_settings = {
    "n_trials": 20,
    "hpo_metric": "adjusted_rand",
    "direction": "maximize",
}
model_settings = {
    "experiment_name": "test",
    "n_jobs": 5,
    "model": model_cls,
    "model_params": model_params,
    "search_space": search_space,
    "default_values": default_values,
    "seed_model": 0,
}
dataset_settings = {
    "n_samples": 1000,
    "n_informative": 3,
    "n_random": 10000,
    "n_classes": 5,
    "class_sep": 3 * (3**0.5),
}

for seed in seeds:
    # The same seed drives both the HPO sampler and the dataset generation.
    experiment = HPOClassificationClusteringExperiment(
        hpo_seed=seed,
        seed_dataset=seed,
        verbose=0,
        raise_on_error=True,
        **hpo_settings,
        **model_settings,
        **dataset_settings,
        **experiment_params,
    )
    # run() returns a sequence of results; only the first entry is used here.
    result = experiment.run(return_results=True)[0]
    ari, hpo_time, best_time = (
        result["evaluate_model_return"]["best/adjusted_rand"],
        result["fit_model_return"]["elapsed_time"],
        result["evaluate_model_return"]["best/elapsed_time"],
    )
    aris.append(ari)
    hpo_times.append(hpo_time)
    best_times.append(best_time)

# Drop whatever the runs printed before showing the summary table.
clear_output(wait=True)
df = pd.DataFrame(
    {
        "ARI": aris,
        "HPO Time": hpo_times,
        "Best Time": best_times,
    }
)
print(df.describe())

            ARI   HPO Time  Best Time
count  1.000000     1.0000   1.000000
mean   0.916119  1327.5609  83.862078
std         NaN        NaN        NaN
min    0.916119  1327.5609  83.862078
25%    0.916119  1327.5609  83.862078
50%    0.916119  1327.5609  83.862078
75%    0.916119  1327.5609  83.862078
max    0.916119  1327.5609  83.862078


# CoHiRF

In [10]:
# Model under test: BaseCoHiRF with scikit-learn KMeans as the base model.
model_cls = BaseCoHiRF
model_params = {"base_model": KMeans}
# Hyperparameter ranges, expressed as optuna distributions.
search_space = {
    "n_features": optuna.distributions.FloatDistribution(0.1, 0.6),
    "repetitions": optuna.distributions.IntDistribution(2, 10),
    "base_model_kwargs": {
        "n_clusters": optuna.distributions.IntDistribution(2, 5),
    },
}
# Explicit parameter set handed to the experiment as default_values
# (presumably evaluated as an initial trial — TODO confirm).
default_values = [
    {
        "n_features": 0.3,
        "repetitions": 5,
        "base_model_kwargs": {"n_clusters": 3},
    }
]

In [None]:
# Run the HPO experiment (20 trials) for the model configured in
# model_cls / model_params once per seed and summarise ARI and timings.
seeds = list(range(1))
aris, hpo_times, best_times = [], [], []

# Settings that are identical for every seed.
hpo_settings = {
    "n_trials": 20,
    "hpo_metric": "adjusted_rand",
    "direction": "maximize",
}
model_settings = {
    "experiment_name": "test",
    "n_jobs": 10,
    "model": model_cls,
    "model_params": model_params,
    "search_space": search_space,
    "default_values": default_values,
    "seed_model": 0,
}
dataset_settings = {
    "n_samples": 1000,
    "n_informative": 3,
    "n_random": 10000,
    "n_classes": 5,
    "class_sep": 3 * (3**0.5),
}

for seed in seeds:
    # The same seed drives both the HPO sampler and the dataset generation.
    experiment = HPOClassificationClusteringExperiment(
        hpo_seed=seed,
        seed_dataset=seed,
        verbose=0,
        raise_on_error=True,
        **hpo_settings,
        **model_settings,
        **dataset_settings,
        **experiment_params,
    )
    # run() returns a sequence of results; only the first entry is used here.
    result = experiment.run(return_results=True)[0]
    ari, hpo_time, best_time = (
        result["evaluate_model_return"]["best/adjusted_rand"],
        result["fit_model_return"]["elapsed_time"],
        result["evaluate_model_return"]["best/elapsed_time"],
    )
    aris.append(ari)
    hpo_times.append(hpo_time)
    best_times.append(best_time)

# Drop whatever the runs printed before showing the summary table.
clear_output(wait=True)
df = pd.DataFrame(
    {
        "ARI": aris,
        "HPO Time": hpo_times,
        "Best Time": best_times,
    }
)
print(df.describe())

            ARI    HPO Time  Best Time
count  1.000000    1.000000   1.000000
mean   0.503586  116.776287   0.633516
std         NaN         NaN        NaN
min    0.503586  116.776287   0.633516
25%    0.503586  116.776287   0.633516
50%    0.503586  116.776287   0.633516
75%    0.503586  116.776287   0.633516
max    0.503586  116.776287   0.633516


In [12]:
# Same experiment as the 20-trial run, but with 100 HPO trials: run once per
# seed and summarise ARI and timings.
seeds = list(range(1))
aris, hpo_times, best_times = [], [], []

# Settings that are identical for every seed.
hpo_settings = {
    "n_trials": 100,
    "hpo_metric": "adjusted_rand",
    "direction": "maximize",
}
model_settings = {
    "experiment_name": "test",
    "n_jobs": 10,
    "model": model_cls,
    "model_params": model_params,
    "search_space": search_space,
    "default_values": default_values,
    "seed_model": 0,
}
dataset_settings = {
    "n_samples": 1000,
    "n_informative": 3,
    "n_random": 10000,
    "n_classes": 5,
    "class_sep": 3 * (3**0.5),
}

for seed in seeds:
    # The same seed drives both the HPO sampler and the dataset generation.
    experiment = HPOClassificationClusteringExperiment(
        hpo_seed=seed,
        seed_dataset=seed,
        verbose=0,
        raise_on_error=True,
        **hpo_settings,
        **model_settings,
        **dataset_settings,
        **experiment_params,
    )
    # run() returns a sequence of results; only the first entry is used here.
    result = experiment.run(return_results=True)[0]
    ari, hpo_time, best_time = (
        result["evaluate_model_return"]["best/adjusted_rand"],
        result["fit_model_return"]["elapsed_time"],
        result["evaluate_model_return"]["best/elapsed_time"],
    )
    aris.append(ari)
    hpo_times.append(hpo_time)
    best_times.append(best_time)

# Drop whatever the runs printed before showing the summary table.
clear_output(wait=True)
df = pd.DataFrame(
    {
        "ARI": aris,
        "HPO Time": hpo_times,
        "Best Time": best_times,
    }
)
print(df.describe())

            ARI   HPO Time  Best Time
count  1.000000    1.00000   1.000000
mean   0.599225  251.60227   0.390373
std         NaN        NaN        NaN
min    0.599225  251.60227   0.390373
25%    0.599225  251.60227   0.390373
50%    0.599225  251.60227   0.390373
75%    0.599225  251.60227   0.390373
max    0.599225  251.60227   0.390373
