# Definitions

In [9]:
import pandas as pd
import optuna
from cohirf.models.cohirf import BaseCoHiRF, CoHiRF
from cohirf.models.vecohirf import VeCoHiRF
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, silhouette_score, calinski_harabasz_score
from ml_experiments.tuners import OptunaTuner
from ml_experiments.utils import unflatten_any, update_recursively, flatten_any
import numpy as np
import pickle

In [10]:
X = pd.read_csv("/home/belucci/code/cohirf/data/wine/X.csv", index_col=0)
y = pd.read_csv("/home/belucci/code/cohirf/data/wine/y.csv", index_col=0)

In [13]:
X.shape

(67211, 876)

In [16]:
len(features_groups[0])

768

In [17]:
len(features_groups[1])

108

In [11]:
y = y.iloc[:, 0]

In [12]:
# keep only 10 most frequent classes
y = y[y.isin(y.value_counts().index[:10])]
X = X.loc[y.index]

In [5]:
description_group = [i for i, col in enumerate(X.columns) if "description" in col]
other_group = [i for i in range(X.shape[1]) if i not in description_group]
features_groups = [description_group, other_group]

In [6]:
def training_fn(trial, X, y, model_cls, model_params, random_generator, features_groups=None):
    if isinstance(random_generator, int):
        seed_model = random_generator
    else:
        seed_model = int(random_generator.integers(0, 2**31 - 1))
    trial.set_user_attr("seed_model", seed_model)
    model_params = model_params.copy()
    model_params = flatten_any(model_params)
    trial_params = trial.params.copy()
    model_params = update_recursively(model_params, trial_params)
    model_params = unflatten_any(model_params)
    model = model_cls(**model_params)
    if hasattr(model, "random_state"):
        model.set_params(random_state=seed_model)
    if hasattr(model, "seed"):
        model.set_params(seed=seed_model)
    if features_groups is not None:
        y_pred = model.fit_predict(X, features_groups=features_groups)
    else:
        y_pred = model.fit_predict(X)
    ari = adjusted_rand_score(y, y_pred)
    return ari

# Runs

In [None]:
model_cls = KMeans
model_params = dict()
search_space = dict(
    n_clusters=optuna.distributions.IntDistribution(2, 30),
)
default_values = [
    dict(
        n_clusters=10,
    )
]
tuner = OptunaTuner(sampler='tpe', n_trials=30, seed=0)
study = tuner.tune(
	training_fn=training_fn,
	search_space=search_space,
	direction='maximize',
	enqueue_configurations=default_values,
	random_generator=np.random.default_rng(0),
	X=X,
	y=y,
	model_cls=model_cls,
	model_params=model_params,
	features_groups=None,
)
print(f"Best trial: {study.best_trial}")

Trials:   0%|          | 0/30 [00:00<?, ?it/s]

Best trial: FrozenTrial(number=13, state=1, values=[0.5205148875338225], datetime_start=datetime.datetime(2025, 9, 15, 16, 48, 46, 355900), datetime_complete=datetime.datetime(2025, 9, 15, 16, 48, 48, 428535), params={'n_clusters': 8}, user_attrs={'seed_model': 1302740407, 'result': 0.5205148875338225}, system_attrs={'tpe:relative_params:0': '{"n_clusters": 8}'}, intermediate_values={}, distributions={'n_clusters': IntDistribution(high=30, log=False, low=2, step=1)}, trial_id=13, value=None)


In [7]:
model_cls = KMeans
model_params = dict()
search_space = dict(
    n_clusters=optuna.distributions.IntDistribution(2, 30),
)
default_values = [
    dict(
        n_clusters=10,
    )
]
tuner = OptunaTuner(sampler="tpe", n_trials=30, seed=0)
study = tuner.tune(
    training_fn=training_fn,
    search_space=search_space,
    direction="maximize",
    enqueue_configurations=default_values,
    random_generator=np.random.default_rng(0),
    X=X.iloc[:, features_groups[0]],
    y=y,
    model_cls=model_cls,
    model_params=model_params,
    features_groups=None,
)
print(f"Best trial: {study.best_trial}")

Trials:   0%|          | 0/30 [00:00<?, ?it/s]

Best trial: FrozenTrial(number=29, state=1, values=[0.5257704700655731], datetime_start=datetime.datetime(2025, 9, 18, 10, 13, 56, 235813), datetime_complete=datetime.datetime(2025, 9, 18, 10, 13, 59, 839104), params={'n_clusters': 9}, user_attrs={'seed_model': 1566923138, 'result': 0.5257704700655731}, system_attrs={'tpe:relative_params:0': '{"n_clusters": 9}'}, intermediate_values={}, distributions={'n_clusters': IntDistribution(high=30, log=False, low=2, step=1)}, trial_id=29, value=None)


In [8]:
model_cls = KMeans
model_params = dict()
search_space = dict(
    n_clusters=optuna.distributions.IntDistribution(2, 30),
)
default_values = [
    dict(
        n_clusters=10,
    )
]
tuner = OptunaTuner(sampler="tpe", n_trials=30, seed=0)
study = tuner.tune(
    training_fn=training_fn,
    search_space=search_space,
    direction="maximize",
    enqueue_configurations=default_values,
    random_generator=np.random.default_rng(0),
    X=X.iloc[:, features_groups[1]],
    y=y,
    model_cls=model_cls,
    model_params=model_params,
    features_groups=None,
)
print(f"Best trial: {study.best_trial}")

Trials:   0%|          | 0/30 [00:00<?, ?it/s]

Best trial: FrozenTrial(number=16, state=1, values=[0.10670567675679409], datetime_start=datetime.datetime(2025, 9, 18, 10, 14, 13, 898122), datetime_complete=datetime.datetime(2025, 9, 18, 10, 14, 14, 668709), params={'n_clusters': 24}, user_attrs={'seed_model': 1357791198, 'result': 0.10670567675679409}, system_attrs={'tpe:relative_params:0': '{"n_clusters": 24}'}, intermediate_values={}, distributions={'n_clusters': IntDistribution(high=30, log=False, low=2, step=1)}, trial_id=16, value=None)


In [51]:
model_cls = CoHiRF
model_params = dict(n_samples_representative=1000, n_jobs=1, max_iter=10)
search_space = dict(
    n_features=optuna.distributions.FloatDistribution(0.1, 1.0),
    repetitions=optuna.distributions.IntDistribution(2, 10),
    kmeans_n_clusters=optuna.distributions.IntDistribution(2, 5),
)
default_values = [
    dict(
        n_features=0.3,
        repetitions=5,
        kmeans_n_clusters=3,
    )
]
tuner = OptunaTuner(sampler="tpe", n_trials=30, seed=0)
study_1 = tuner.tune(
    training_fn=training_fn,
    search_space=search_space,
    direction="maximize",
    enqueue_configurations=default_values,
    random_generator=np.random.default_rng(0),
    X=X.iloc[:, features_groups[0]],
    y=y,
    model_cls=model_cls,
    model_params=model_params,
    features_groups=None,
)
print(study_1.best_trial)

Trials:   0%|          | 0/30 [00:00<?, ?it/s]

FrozenTrial(number=27, state=1, values=[0.3930684675858943], datetime_start=datetime.datetime(2025, 9, 15, 18, 8, 31, 259773), datetime_complete=datetime.datetime(2025, 9, 15, 18, 8, 45, 957550), params={'n_features': 0.7377827727817188, 'repetitions': 7, 'kmeans_n_clusters': 5}, user_attrs={'seed_model': 72124473, 'result': 0.3930684675858943}, system_attrs={'tpe:relative_params:0': '{"kmeans_n_clusters": 5, "n_features": 0.7377827727817188, "repetitions": 7}'}, intermediate_values={}, distributions={'n_features': FloatDistribution(high=1.0, log=False, low=0.1, step=None), 'repetitions': IntDistribution(high=10, log=False, low=2, step=1), 'kmeans_n_clusters': IntDistribution(high=5, log=False, low=2, step=1)}, trial_id=27, value=None)


In [52]:
top_10_model_1_trials = study_1.best_trials[:10]

In [53]:
model_cls = CoHiRF
model_params = dict(n_samples_representative=1000, n_jobs=1, max_iter=10)
search_space = dict(
    n_features=optuna.distributions.FloatDistribution(0.1, 1.0),
    repetitions=optuna.distributions.IntDistribution(2, 10),
    kmeans_n_clusters=optuna.distributions.IntDistribution(2, 5),
)
default_values = [
    dict(
        n_features=0.3,
        repetitions=5,
        kmeans_n_clusters=3,
    )
]
tuner = OptunaTuner(sampler="tpe", n_trials=30, seed=0)
study_2 = tuner.tune(
    training_fn=training_fn,
    search_space=search_space,
    direction="maximize",
    enqueue_configurations=default_values,
    random_generator=np.random.default_rng(0),
    X=X.iloc[:, features_groups[1]],
    y=y,
    model_cls=model_cls,
    model_params=model_params,
    features_groups=None,
)
print(study_2.best_trial)

Trials:   0%|          | 0/30 [00:00<?, ?it/s]

FrozenTrial(number=22, state=1, values=[0.11092563978476842], datetime_start=datetime.datetime(2025, 9, 15, 18, 9, 58, 156948), datetime_complete=datetime.datetime(2025, 9, 15, 18, 10, 0, 559569), params={'n_features': 0.12849005392209062, 'repetitions': 9, 'kmeans_n_clusters': 5}, user_attrs={'seed_model': 1440696407, 'result': 0.11092563978476842}, system_attrs={'tpe:relative_params:0': '{"kmeans_n_clusters": 5, "n_features": 0.12849005392209062, "repetitions": 9}'}, intermediate_values={}, distributions={'n_features': FloatDistribution(high=1.0, log=False, low=0.1, step=None), 'repetitions': IntDistribution(high=10, log=False, low=2, step=1), 'kmeans_n_clusters': IntDistribution(high=5, log=False, low=2, step=1)}, trial_id=22, value=None)


In [35]:
print(study_2.best_trial)

FrozenTrial(number=22, state=1, values=[0.10995734534189136], datetime_start=datetime.datetime(2025, 9, 15, 17, 8, 18, 645489), datetime_complete=datetime.datetime(2025, 9, 15, 17, 8, 21, 349815), params={'n_features': 0.12849005392209062, 'repetitions': 9, 'kmeans_n_clusters': 5}, user_attrs={'seed_model': 1440696407, 'result': 0.10995734534189136}, system_attrs={'tpe:relative_params:0': '{"kmeans_n_clusters": 5, "n_features": 0.12849005392209062, "repetitions": 9}'}, intermediate_values={}, distributions={'n_features': FloatDistribution(high=1.0, log=False, low=0.1, step=None), 'repetitions': IntDistribution(high=10, log=False, low=2, step=1), 'kmeans_n_clusters': IntDistribution(high=5, log=False, low=2, step=1)}, trial_id=22, value=None)


In [54]:
top_10_model_2_trials = study_2.best_trials[:10]

In [55]:
# save study_1 and study_2 to disk
with open("study_1.pkl", "wb") as f:
	pickle.dump(study_1, f)
with open("study_2.pkl", "wb") as f:
	pickle.dump(study_2, f)

In [7]:
# load study_1 and study_2 from disk
with open("study_1.pkl", "rb") as f:
	study_1 = pickle.load(f)
with open("study_2.pkl", "rb") as f:
	study_2 = pickle.load(f)

In [9]:
top_10_model_1_trials = study_1.best_trials[:10]
top_10_model_2_trials = study_2.best_trials[:10]
search_space = dict(
    model_1_trial=optuna.distributions.CategoricalDistribution(top_10_model_1_trials),
    model_2_trial=optuna.distributions.CategoricalDistribution(top_10_model_2_trials),
)



In [10]:
def training_fn_2(trial, X, y, random_generator, features_groups=None):
    if isinstance(random_generator, int):
        seed_model = random_generator
    else:
        seed_model = int(random_generator.integers(0, 2**31 - 1))
    trial.set_user_attr("seed_model", seed_model)
    model_1_kwargs = trial.params["model_1_trial"].params
    model_1_kwargs["random_state"] = trial.params["model_1_trial"].user_attrs["seed_model"]
    model_1_kwargs["max_iter"] = 100
    model_2_kwargs = trial.params["model_2_trial"].params
    model_2_kwargs["random_state"] = trial.params["model_2_trial"].user_attrs["seed_model"]
    model_2_kwargs["max_iter"] = 100
    model = VeCoHiRF(
        cohirf_kwargs=[model_1_kwargs, model_2_kwargs],
        n_jobs=1,
        n_samples_representative=1000,
        random_state=seed_model,
    )
    y_pred = model.fit_predict(X, features_groups=features_groups)
    ari = adjusted_rand_score(y, y_pred)
    return ari

In [58]:
search_space = dict(
    model_1_trial=optuna.distributions.CategoricalDistribution(top_10_model_1_trials),
    model_2_trial=optuna.distributions.CategoricalDistribution(top_10_model_2_trials),
)
tuner = OptunaTuner(sampler="tpe", n_trials=30, seed=0)
study = tuner.tune(
    training_fn=training_fn_2,
    search_space=search_space,
    direction="maximize",
    enqueue_configurations=default_values,
    random_generator=np.random.default_rng(0),
    X=X,
    y=y,
    features_groups=features_groups,
)
print(study.best_trial)

Trials:   0%|          | 0/30 [00:00<?, ?it/s]

FrozenTrial(number=21, state=1, values=[0.21848206467398817], datetime_start=datetime.datetime(2025, 9, 15, 18, 23, 39, 361283), datetime_complete=datetime.datetime(2025, 9, 15, 18, 24, 17, 860876), params={'model_1_trial': FrozenTrial(number=27, state=1, values=[0.3930684675858943], datetime_start=datetime.datetime(2025, 9, 15, 18, 8, 31, 259773), datetime_complete=datetime.datetime(2025, 9, 15, 18, 8, 45, 957550), params={'n_features': 0.7377827727817188, 'repetitions': 7, 'kmeans_n_clusters': 5}, user_attrs={'seed_model': 72124473, 'result': 0.3930684675858943}, system_attrs={'tpe:relative_params:0': '{"kmeans_n_clusters": 5, "n_features": 0.7377827727817188, "repetitions": 7}'}, intermediate_values={}, distributions={'n_features': FloatDistribution(high=1.0, log=False, low=0.1, step=None), 'repetitions': IntDistribution(high=10, log=False, low=2, step=1), 'kmeans_n_clusters': IntDistribution(high=5, log=False, low=2, step=1)}, trial_id=27, value=None), 'model_2_trial': FrozenTrial(

In [11]:
def training_fn_2(trial, X, y, random_generator, features_groups=None):
    if isinstance(random_generator, int):
        seed_model = random_generator
    else:
        seed_model = int(random_generator.integers(0, 2**31 - 1))
    trial.set_user_attr("seed_model", seed_model)
    model_1_kwargs = trial.params["model_1_trial"].params
    model_1_kwargs["max_iter"] = 100
    model_2_kwargs = trial.params["model_2_trial"].params
    model_2_kwargs["max_iter"] = 100
    model = VeCoHiRF(
        cohirf_kwargs=[model_1_kwargs, model_2_kwargs],
        n_jobs=1,
        n_samples_representative=1000,
        random_state=seed_model,
    )
    y_pred = model.fit_predict(X, features_groups=features_groups)
    ari = adjusted_rand_score(y, y_pred)
    return ari

In [13]:
search_space = dict(
    model_1_trial=optuna.distributions.CategoricalDistribution(top_10_model_1_trials),
    model_2_trial=optuna.distributions.CategoricalDistribution(top_10_model_2_trials),
)
tuner = OptunaTuner(sampler="tpe", n_trials=30, seed=0)
study = tuner.tune(
    training_fn=training_fn_2,
    search_space=search_space,
    direction="maximize",
    enqueue_configurations=[],
    random_generator=np.random.default_rng(0),
    X=X,
    y=y,
    features_groups=features_groups,
)
print(study.best_trial)

Trials:   0%|          | 0/30 [00:00<?, ?it/s]

FrozenTrial(number=25, state=1, values=[0.24032016573967158], datetime_start=datetime.datetime(2025, 9, 15, 18, 58, 39, 295275), datetime_complete=datetime.datetime(2025, 9, 15, 18, 59, 16, 614288), params={'model_1_trial': FrozenTrial(number=27, state=1, values=[0.3930684675858943], datetime_start=datetime.datetime(2025, 9, 15, 18, 8, 31, 259773), datetime_complete=datetime.datetime(2025, 9, 15, 18, 8, 45, 957550), params={'n_features': 0.7377827727817188, 'repetitions': 7, 'kmeans_n_clusters': 5}, user_attrs={'seed_model': 72124473, 'result': 0.3930684675858943}, system_attrs={'tpe:relative_params:0': '{"kmeans_n_clusters": 5, "n_features": 0.7377827727817188, "repetitions": 7}'}, intermediate_values={}, distributions={'n_features': FloatDistribution(high=1.0, log=False, low=0.1, step=None), 'repetitions': IntDistribution(high=10, log=False, low=2, step=1), 'kmeans_n_clusters': IntDistribution(high=5, log=False, low=2, step=1)}, trial_id=27, value=None), 'model_2_trial': FrozenTrial(

In [29]:
model_cls = VeCoHiRF
model_params = dict(
	cohirf_model=[CoHiRF, CoHiRF],
	cohirf_kwargs=[
		dict(n_samples_representative=1000, n_jobs=1),
		dict(n_samples_representative=1000, n_jobs=1),
	],
	n_samples_representative=1000,
	n_jobs=1
)
search_space = dict(
	cohirf_kwargs=[
		dict(
			n_features=optuna.distributions.FloatDistribution(0.1, 1.0),
			repetitions=optuna.distributions.IntDistribution(2, 10),
			kmeans_n_clusters=optuna.distributions.IntDistribution(2, 5),
		),
		dict(
			n_features=optuna.distributions.FloatDistribution(0.1, 1.0),
			repetitions=optuna.distributions.IntDistribution(2, 10),
			kmeans_n_clusters=optuna.distributions.IntDistribution(2, 5),
		)
	]
)
default_values = [
    dict(
		cohirf_kwargs=[
			dict(
				n_features=0.3,
				repetitions=5,
				kmeans_n_clusters=3,
			),
			dict(
				n_features=0.3,
				repetitions=5,
				kmeans_n_clusters=3,
			)
		]
    )
]
tuner = OptunaTuner(sampler="tpe", n_trials=30, seed=0)
study = tuner.tune(
    training_fn=training_fn,
    search_space=search_space,
    direction="maximize",
    enqueue_configurations=default_values,
    random_generator=np.random.default_rng(0),
    X=X,
    y=y,
    model_cls=model_cls,
    model_params=model_params,
    features_groups=features_groups,
)
print(study.best_trial)

Trials:   0%|          | 0/30 [00:00<?, ?it/s]

FrozenTrial(number=26, state=1, values=[0.38874976007534146], datetime_start=datetime.datetime(2025, 9, 15, 16, 56, 58, 447039), datetime_complete=datetime.datetime(2025, 9, 15, 16, 57, 16, 88236), params={'cohirf_kwargs/item_0/n_features': 0.46156708806428093, 'cohirf_kwargs/item_0/repetitions': 10, 'cohirf_kwargs/item_0/kmeans_n_clusters': 5, 'cohirf_kwargs/item_1/n_features': 0.5460508029741323, 'cohirf_kwargs/item_1/repetitions': 3, 'cohirf_kwargs/item_1/kmeans_n_clusters': 2}, user_attrs={'seed_model': 1190382222, 'result': 0.38874976007534146}, system_attrs={'tpe:relative_params:0': '{"cohirf_kwargs/item_0/kmeans_n_clusters": 5, "cohirf_kwargs/item_0/n_features": 0.46156708806428093, "cohirf_kwargs/item_0/repetitions": 10, "cohirf_kwargs/item_1/kmeans_n_clusters": 2, "cohirf_kwargs/item_1/n_features": 0.5460508029741323, "cohirf_kwargs/item_1/repetitions": 3}'}, intermediate_values={}, distributions={'cohirf_kwargs/item_0/n_features': FloatDistribution(high=1.0, log=False, low=0.