# Imports

In [1]:
from sklearn.datasets import make_classification
from cohirf.experiment.gaussian_clustering_experiment import make_multivariate_normal
from sklearn.manifold import TSNE
import plotly.express as px
import numpy as np
from cohirf.models.cohirf import BaseCoHiRF
from sklearn.metrics import adjusted_rand_score
from ml_experiments.tuners import OptunaTuner
from ml_experiments.utils import unflatten_dict, update_recursively
import optuna
from sklearn.cluster import KMeans, DBSCAN
from types import SimpleNamespace
from time import perf_counter
from functools import partial
import pandas as pd
from cohirf.models.batch_cohirf import BatchCoHiRF

In [2]:
def training_fn(trial, model_cls, model_kwargs, X, y):
    params = trial.params
    parsed_params = unflatten_dict(params)
    parsed_params = update_recursively(parsed_params, model_kwargs)
    model = model_cls(**parsed_params)
    labels = model.fit_predict(X)
    # if -1 in labels (for example noise data for DBSCAN) we offset it to the last cluster
    if -1 in labels:
        labels = np.where(labels == -1, np.max(labels) + 1, labels)
    return adjusted_rand_score(y, labels)

In [3]:
def train_best_model_and_show_results(model_cls, model_kwargs, study, X, y):
    best_params = study.best_params.copy()
    best_value = study.best_value
    best_params = unflatten_dict(best_params)
    best_params = update_recursively(best_params, model_kwargs)
    best_model = model_cls(**best_params)
    start_time = perf_counter()
    labels = best_model.fit_predict(X)
    fit_time = perf_counter() - start_time
    # if -1 in labels (for example noise data for DBSCAN) we offset it to the last cluster
    if -1 in labels:
        labels = np.where(labels == -1, np.max(labels) + 1, labels)
    ari = adjusted_rand_score(y, labels)
    print(f"Model: {model_cls.__name__}, Best study params: {study.best_params}, Best study value: {best_value:.3f}\n"
          f"ARI: {ari:.3f}, Fit time: {fit_time:.3f} seconds")
    return best_model, labels

# Hypercube harder

In [4]:
n_samples = int(1e4)
n_informative_features = 3
n_redundant_features = 0
n_repeated_features = 0
n_random_features = int(1e4)
n_features = n_informative_features + n_redundant_features + n_repeated_features + n_random_features
n_classes = 5
class_sep = 3 * np.sqrt(n_features)  # sqrt(d) -> intercluster distance ~ cluster radius
seed = 42
X, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative_features,
    n_redundant=n_redundant_features,
    n_repeated=n_repeated_features,
    n_classes=n_classes,
    class_sep=class_sep,
    random_state=seed,
    n_clusters_per_class=1,
    flip_y=0.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
)

## KMeans

In [5]:
master_seed = 42
generator = np.random.default_rng(master_seed)
seeds = generator.integers(0, int(1e6), size=10).tolist()
results = dict()
for seed in seeds:
    model_cls = KMeans
    model_kwargs = dict()
    search_space = dict(
        n_clusters=optuna.distributions.IntUniformDistribution(2, 10),
        random_state=optuna.distributions.IntUniformDistribution(0, int(1e6)),
    )
    tuner = OptunaTuner(sampler="tpe", seed=seed, n_trials=20)
    training_fn_partial = partial(training_fn, model_cls=model_cls, model_kwargs=model_kwargs, X=X, y=y)
    study = tuner.tune(training_fn=training_fn_partial, search_space=search_space, direction="maximize")
    results[seed] = {
        "best_params": study.best_params,
        "best_value": study.best_value,
    }

[I 2025-06-09 14:09:29,198] A new study created in memory with name: no-name-83af263a-1363-4aab-bd73-83cb6c53badb


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 14:10:12,549] A new study created in memory with name: no-name-4d6e61a0-0ebb-44ee-b129-971a5831e061


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 14:10:58,152] A new study created in memory with name: no-name-f5b49080-a862-47fe-bc52-411ac2b4d200


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 14:11:43,448] A new study created in memory with name: no-name-daaca44b-2252-4e37-9743-df8f177f2a85


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 14:12:23,239] A new study created in memory with name: no-name-5c13826b-fa6f-462b-9df0-9e037ed482c4


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 14:12:59,621] A new study created in memory with name: no-name-7ab9d17f-018a-4087-9e3a-e9c7df2698d2


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 14:13:42,125] A new study created in memory with name: no-name-079b8566-ebaf-4a04-883f-73ab65b27e46


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 14:14:27,777] A new study created in memory with name: no-name-9457cca8-47de-4e9c-8f81-8c2223face9f


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 14:15:10,624] A new study created in memory with name: no-name-3a92310d-44aa-4838-81cc-7cdf01271088


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 14:15:51,266] A new study created in memory with name: no-name-b83f047e-12c0-4b0a-a3b6-0c3fa9768a0d


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

In [6]:
df = pd.DataFrame.from_dict(results, orient="index")
df = pd.concat([df.drop("best_params", axis=1), df["best_params"].apply(pd.Series)], axis=1)
display(df), display(df.describe().T)

Unnamed: 0,best_value,n_clusters,random_state
89250,1.0,5,38400
773956,1.0,5,680942
654571,1.0,5,530328
438878,1.0,5,935238
433015,1.0,5,333173
858597,1.0,5,69470
85945,1.0,5,250731
697368,1.0,5,261015
201469,1.0,5,956253
94177,1.0,5,459540


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
best_value,10.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
n_clusters,10.0,5.0,0.0,5.0,5.0,5.0,5.0,5.0
random_state,10.0,451509.0,325728.705419,38400.0,253302.0,396356.5,643288.5,956253.0


(None, None)

## CoHiRF KMeans


In [8]:
master_seed = 42
generator = np.random.default_rng(master_seed)
seeds = generator.integers(0, int(1e6), size=10).tolist()
results = dict()
for seed in seeds:
    model_cls = BaseCoHiRF
    model_kwargs = dict(model_cls=KMeans, random_state=seed)
    search_space = dict(
        repetitions=optuna.distributions.IntDistribution(2, 8),
        n_features=optuna.distributions.FloatDistribution(0.1, 0.6),
        base_model_kwargs=dict(
            n_clusters=optuna.distributions.IntUniformDistribution(2, 5),
        ),
    )
    tuner = OptunaTuner(sampler="tpe", seed=seed, n_trials=20)
    training_fn_partial = partial(training_fn, model_cls=model_cls, model_kwargs=model_kwargs, X=X, y=y)
    study = tuner.tune(training_fn=training_fn_partial, search_space=search_space, direction="maximize")
    results[seed] = {
        "best_params": study.best_params,
        "best_value": study.best_value,
    }

[I 2025-06-09 14:18:59,683] A new study created in memory with name: no-name-2b6f3ddd-9ab5-40d2-ab68-504c341dd9e8


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 14:24:43,076] A new study created in memory with name: no-name-804d6d2e-b26f-44c9-a4f4-67c6b6f5a01b


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 14:30:33,572] A new study created in memory with name: no-name-37c88e21-0045-4cb2-bcd3-a35a92ff56af


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 14:36:31,328] A new study created in memory with name: no-name-eeb5cead-5434-4f6b-b790-04d9b91d1c75


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 14:42:47,984] A new study created in memory with name: no-name-5cd7399a-7145-4412-a6bc-cd70ee317160


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 14:47:52,831] A new study created in memory with name: no-name-afb26a97-6034-4754-8dab-3b2d5bfb3778


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 14:51:49,294] A new study created in memory with name: no-name-95732bdb-b5a4-4426-9222-94a800dddc5d


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 14:57:33,367] A new study created in memory with name: no-name-860691f8-ec4d-479b-b9bb-140eb94b68d7


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 15:01:28,501] A new study created in memory with name: no-name-b648ee6d-f7fd-4508-94b9-ea3a79e63038


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 15:07:29,353] A new study created in memory with name: no-name-68e79184-db65-4bac-b021-751dbc666bb2


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

In [9]:
df = pd.DataFrame.from_dict(results, orient="index")
df = pd.concat([df.drop("best_params", axis=1), df["best_params"].apply(pd.Series)], axis=1)
display(df), display(df.describe().T)

Unnamed: 0,best_value,repetitions,n_features,base_model_kwargs/n_clusters
89250,1.0,7.0,0.563897,3.0
773956,0.909205,8.0,0.584245,3.0
654571,1.0,8.0,0.596435,3.0
438878,1.0,8.0,0.53126,3.0
433015,0.782541,6.0,0.541691,2.0
858597,1.0,2.0,0.471549,4.0
85945,1.0,8.0,0.564472,3.0
697368,1.0,2.0,0.459299,5.0
201469,0.960762,8.0,0.564322,3.0
94177,1.0,6.0,0.352235,2.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
best_value,10.0,0.965251,0.070708,0.782541,0.970572,1.0,1.0,1.0
repetitions,10.0,6.3,2.406011,2.0,6.0,7.5,8.0,8.0
n_features,10.0,0.522941,0.074927,0.352235,0.486477,0.552794,0.564435,0.596435
base_model_kwargs/n_clusters,10.0,3.1,0.875595,2.0,3.0,3.0,3.0,5.0


(None, None)

## Batch CoHiRF KMeans

In [10]:
master_seed = 42
generator = np.random.default_rng(master_seed)
seeds = generator.integers(0, int(1e6), size=10).tolist()
results = dict()
for seed in seeds:
    batch_size = int(1e3)
    model_cls = BatchCoHiRF
    model_kwargs = dict(
        cohirf_model=BaseCoHiRF,
        cohirf_kwargs=dict(base_model=KMeans, random_state=seed, max_iter=1),
        batch_size=batch_size,
        n_jobs=10,
    )
    search_space = dict(
        cohirf_kwargs=dict(
            repetitions=optuna.distributions.IntDistribution(2, 8),
            n_features=optuna.distributions.FloatDistribution(0.1, 0.6),
            base_model_kwargs=dict(
                n_clusters=optuna.distributions.IntUniformDistribution(2, 5),
            ),
        ),
    )
    tuner = OptunaTuner(sampler="tpe", seed=seed, n_trials=20)
    training_fn_partial = partial(training_fn, model_cls=model_cls, model_kwargs=model_kwargs, X=X, y=y)
    study = tuner.tune(training_fn=training_fn_partial, search_space=search_space, direction="maximize")
    results[seed] = {
        "best_params": study.best_params,
        "best_value": study.best_value,
    }

[I 2025-06-09 15:12:39,567] A new study created in memory with name: no-name-ea787c48-a187-459b-bd4c-47c8fd70e75e


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 15:14:25,566] A new study created in memory with name: no-name-a367e1ba-b09f-4ccd-a5e5-4f70c3622f2a


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 15:16:47,655] A new study created in memory with name: no-name-471db1fd-34ff-4811-8fb3-8eff00d9a888


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 15:18:50,585] A new study created in memory with name: no-name-f3a2b5c5-d980-4a2e-98f4-efb4604f965d


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 15:20:57,297] A new study created in memory with name: no-name-489eb5e6-f6f8-4205-b197-68adbb4842f6


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 15:22:49,236] A new study created in memory with name: no-name-f375f61d-f3e3-43e9-b395-2bddc988302e


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 15:24:15,808] A new study created in memory with name: no-name-f694a99d-61c7-464b-82ba-e391066e3958


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 15:26:08,939] A new study created in memory with name: no-name-5fc5e466-e16b-426a-8a24-81bd0855bc5a


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 15:27:54,453] A new study created in memory with name: no-name-b510b64e-f7b6-4f87-b518-093b260cd822


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 15:29:48,298] A new study created in memory with name: no-name-820334da-12e8-4893-967a-c44bee8d986c


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

In [11]:
df = pd.DataFrame.from_dict(results, orient="index")
df = pd.concat([df.drop("best_params", axis=1), df["best_params"].apply(pd.Series)], axis=1)
display(df), display(df.describe().T)

Unnamed: 0,best_value,cohirf_kwargs/repetitions,cohirf_kwargs/n_features,cohirf_kwargs/base_model_kwargs/n_clusters
89250,1.0,4.0,0.429208,2.0
773956,0.954124,5.0,0.578758,2.0
654571,0.956607,5.0,0.433937,2.0
438878,0.818819,8.0,0.556271,2.0
433015,0.782541,2.0,0.489002,4.0
858597,0.95706,2.0,0.348147,4.0
85945,0.954124,6.0,0.595337,2.0
697368,1.0,6.0,0.475096,2.0
201469,0.641705,8.0,0.391961,2.0
94177,1.0,7.0,0.58245,2.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
best_value,10.0,0.906498,0.119722,0.641705,0.852645,0.955365,0.989265,1.0
cohirf_kwargs/repetitions,10.0,5.3,2.162817,2.0,4.25,5.5,6.75,8.0
cohirf_kwargs/n_features,10.0,0.488017,0.087349,0.348147,0.430391,0.482049,0.573136,0.595337
cohirf_kwargs/base_model_kwargs/n_clusters,10.0,2.4,0.843274,2.0,2.0,2.0,2.0,4.0


(None, None)

## Batch CoHiRF KMeans last 5 batches


In [15]:
master_seed = 42
generator = np.random.default_rng(master_seed)
seeds = generator.integers(0, int(1e6), size=10).tolist()
results = dict()
for seed in seeds:
	batch_size = int(1e3)
	model_cls = BatchCoHiRF
	model_kwargs = dict(
		cohirf_model=BaseCoHiRF,
		cohirf_kwargs=dict(base_model=KMeans, random_state=seed, max_iter=1),
		batch_size=batch_size,
		n_jobs=10,
	)
	search_space = dict(
		cohirf_kwargs=dict(
			repetitions=optuna.distributions.IntDistribution(2, 8),
			n_features=optuna.distributions.FloatDistribution(0.1, 0.6),
			base_model_kwargs=dict(
				n_clusters=optuna.distributions.IntUniformDistribution(2, 5),
			),
		),
	)
	tuner = OptunaTuner(sampler="tpe", seed=seed, n_trials=20)
	X_batch = X[-5*batch_size:]
	y_batch = y[-5*batch_size:]
	training_fn_partial = partial(training_fn, model_cls=model_cls, model_kwargs=model_kwargs, X=X_batch, y=y_batch)
	study = tuner.tune(training_fn=training_fn_partial, search_space=search_space, direction="maximize")
	best_value = training_fn(study.best_trial, model_cls=model_cls, model_kwargs=model_kwargs, X=X, y=y)
	results[seed] = {
		"best_params": study.best_params,
		"best_value": best_value,
	}

[I 2025-06-09 17:04:11,659] A new study created in memory with name: no-name-9cfeca5b-2205-4e45-9789-9d646fde6244


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 17:05:12,226] A new study created in memory with name: no-name-59bd38de-6353-4039-b3eb-88da81675e98


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 17:06:49,295] A new study created in memory with name: no-name-d42ce799-bd2e-48ba-9154-99430c92b094


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 17:08:12,597] A new study created in memory with name: no-name-392e61b3-99a0-4ad9-9a3c-0810f80688f7


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 17:09:53,486] A new study created in memory with name: no-name-843dcc87-e96c-451d-a1af-5b5023a6fbdf


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 17:11:01,735] A new study created in memory with name: no-name-8bc61ec5-15f0-4128-b136-4ee9f56ecec4


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 17:12:00,419] A new study created in memory with name: no-name-b6e1d4c3-c7ed-4833-98db-68954b806118


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 17:13:20,526] A new study created in memory with name: no-name-705e28b0-8b28-451e-bbb7-764484f27bcb


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 17:14:19,334] A new study created in memory with name: no-name-75ad4324-4cf6-4c7f-b9d6-94eba3b45fb0


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-06-09 17:15:29,643] A new study created in memory with name: no-name-d8252191-db6e-4c58-bf9b-0c6ea23cbde3


Trials:   0%|          | 0/20 [00:00<?, ?it/s]

In [16]:
df = pd.DataFrame.from_dict(results, orient="index")
df = pd.concat([df.drop("best_params", axis=1), df["best_params"].apply(pd.Series)], axis=1)
display(df), display(df.describe().T)

Unnamed: 0,best_value,cohirf_kwargs/repetitions,cohirf_kwargs/n_features,cohirf_kwargs/base_model_kwargs/n_clusters
89250,1.0,4.0,0.429208,2.0
773956,0.753984,5.0,0.598921,2.0
654571,1.0,8.0,0.532976,2.0
438878,0.432418,5.0,0.584439,3.0
433015,0.782541,2.0,0.449955,4.0
858597,0.953449,2.0,0.402036,4.0
85945,0.954124,6.0,0.595337,2.0
697368,0.880709,2.0,0.412695,5.0
201469,0.752464,8.0,0.429691,2.0
94177,0.956154,8.0,0.422847,2.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
best_value,10.0,0.846584,0.175213,0.432418,0.761123,0.917079,0.955647,1.0
cohirf_kwargs/repetitions,10.0,5.0,2.494438,2.0,2.5,5.0,7.5,8.0
cohirf_kwargs/n_features,10.0,0.48581,0.082131,0.402036,0.424437,0.439823,0.571573,0.598921
cohirf_kwargs/base_model_kwargs/n_clusters,10.0,2.8,1.135292,2.0,2.0,2.0,3.75,5.0


(None, None)