In [1]:
from sqlalchemy import create_engine, text
import pandas as pd
from ml_experiments.analyze import get_df_runs_from_mlflow_sql, get_missing_entries
from pathlib import Path
import os
import pickle
from functools import partial

# Save Results

## Load mlflow runs

In [2]:
results_dir = Path.cwd().parent / "results" / "real"
os.makedirs(results_dir, exist_ok=True)

In [3]:
db_port = 5101
db_name = "cohirf"
url = f"postgresql://beluccib@localhost:{db_port}/{db_name}"
# url = f"postgresql://beluccib@clust5:{db_port}/{db_name}"
engine = create_engine(url)
query = "SELECT experiments.name from experiments"
experiment_names = pd.read_sql(query, engine)["name"].tolist()

In [4]:
experiments_names = [exp for exp in experiment_names if (exp.startswith("real-"))]

In [5]:
experiments_names

['real-adjusted_rand-KernelRBFKMeans',
 'real-adjusted_rand-BatchCoHiRF-KernelRBF-1iter',
 'real-adjusted_rand-KMeans',
 'real-adjusted_rand-DBSCAN',
 'real-adjusted_rand-CoHiRF-DBSCAN',
 'real-adjusted_rand-BatchCoHiRF',
 'real-adjusted_rand-CoHiRF',
 'real-adjusted_rand-CoHiRF-KernelRBF',
 'real-adjusted_rand-BatchCoHiRF-DBSCAN-1iter',
 'real-davies_bouldin_score-BatchCoHiRF-1iter',
 'real-adjusted_mutual_info-BatchCoHiRF-KernelRBF-1iter',
 'real-silhouette-BatchCoHiRF-1iter',
 'real-silhouette-DBSCAN',
 'real-calinski_harabasz_score-BatchCoHiRF-1iter',
 'real-davies_bouldin_score-KernelRBFKMeans',
 'real-silhouette-KMeans',
 'real-davies_bouldin_score-KMeans',
 'real-calinski_harabasz_score-CoHiRF',
 'real-adjusted_mutual_info-BatchCoHiRF-DBSCAN-1iter',
 'real-davies_bouldin_score-BatchCoHiRF-KernelRBF-1iter',
 'real-adjusted_mutual_info-CoHiRF',
 'real-adjusted_mutual_info-CoHiRF-KernelRBF',
 'real-davies_bouldin_score-CoHiRF-DBSCAN',
 'real-adjusted_mutual_info-CoHiRF-DBSCAN',
 'r

In [6]:
query = "SELECT DISTINCT(key) FROM params WHERE key LIKE 'best/%%'"
best_params = pd.read_sql(query, engine)["key"].tolist()

In [7]:
params_columns = [
    "model",
    "dataset_id",
	"n_trials",
	"dataset_name",
	"standardize",
	"hpo_metric",
	"direction",
	"hpo_seed",
] + best_params

In [8]:
latest_metrics_columns = [
    "fit_model_return_elapsed_time",
    "max_memory_used_after_fit",
    "max_memory_used",
	"best/n_clusters_",
    "best/rand_score",
    "best/adjusted_rand",
    "best/mutual_info",
    "best/adjusted_mutual_info",
    "best/normalized_mutual_info",
    "best/homogeneity_completeness_v_measure",
    "best/silhouette",
    "best/calinski_harabasz_score",
    "best/davies_bouldin_score",
    "best/inertia_score",
    "best/homogeneity",
    "best/completeness",
    "best/v_measure",
    "best/elapsed_time",
]

In [9]:
tags_columns = ["raised_exception", "EXCEPTION", "mlflow.parentRunId", "Last step finished"]

In [10]:
runs_columns = ['run_uuid', 'status', 'start_time', 'end_time']
experiments_columns = []
other_table = 'params'
other_table_keys = params_columns

In [11]:
df_params = get_df_runs_from_mlflow_sql(engine, runs_columns=runs_columns, experiments_columns=experiments_columns, experiments_names=experiments_names, other_table=other_table, other_table_keys=other_table_keys)
df_latest_metrics = get_df_runs_from_mlflow_sql(engine, runs_columns=['run_uuid'], experiments_columns=experiments_columns, experiments_names=experiments_names, other_table='latest_metrics', other_table_keys=latest_metrics_columns)
df_tags = get_df_runs_from_mlflow_sql(engine, runs_columns=['run_uuid'], experiments_columns=experiments_columns, experiments_names=experiments_names, other_table='tags', other_table_keys=tags_columns)

In [12]:
dataset_characteristics = pd.read_csv(results_dir / "datasets_characteristics.csv", index_col=0)
dataset_characteristics.index = dataset_characteristics["openml_id"].astype(str)

In [13]:
df_runs_raw_1 = df_params.join(df_latest_metrics)
df_runs_raw_1 = df_runs_raw_1.join(df_tags)

In [14]:
db_port = 5001
db_name = "cohirf"
url = f"postgresql://belucci@localhost:{db_port}/{db_name}"
# url = f"postgresql://beluccib@clust5:{db_port}/{db_name}"
engine = create_engine(url)
query = "SELECT experiments.name from experiments"
experiment_names = pd.read_sql(query, engine)["name"].tolist()
experiments_names = [exp for exp in experiment_names if (exp.startswith('real-'))]

In [15]:
experiments_names

['real-ari-BatchCoHiRF-1iter',
 'real-ari-AverageAgglomerativeClustering',
 'real-ari-BatchCoHiRF-DBSCAN-1iter',
 'real-ari-AffinityPropagation',
 'real-ari-BatchCoHiRF-SC-SRGF',
 'real-ari-CoHiRF-DBSCAN',
 'real-ari-CoHiRF-KernelRBF',
 'real-ari-CoHiRF',
 'real-ari-CompleteAgglomerativeClustering',
 'real-ari-DBSCAN',
 'real-ari-HDBSCAN',
 'real-ari-IRFLLRR',
 'real-ari-KMeans',
 'real-ari-MeanShift',
 'real-ari-OPTICS',
 'real-ari-Proclus',
 'real-ari-SingleAgglomerativeClustering',
 'real-ari-SpectralClustering',
 'real-ari-SpectralSubspaceRandomization',
 'real-ari-WardAgglomerativeClustering',
 'real-adjusted_mutual_info-BatchCoHiRF-DBSCAN-1iter',
 'real-adjusted_mutual_info-DBSCAN',
 'real-adjusted_mutual_info-BatchCoHiRF-SC-SRGF',
 'real-adjusted_mutual_info-CompleteAgglomerativeClustering',
 'real-adjusted_mutual_info-AverageAgglomerativeClustering',
 'real-adjusted_mutual_info-BatchCoHiRF-1iter',
 'real-adjusted_mutual_info-CoHiRF',
 'real-adjusted_mutual_info-CoHiRF-KernelRBF

In [16]:
df_params = get_df_runs_from_mlflow_sql(engine, runs_columns=runs_columns, experiments_columns=experiments_columns, experiments_names=experiments_names, other_table=other_table, other_table_keys=other_table_keys)
df_latest_metrics = get_df_runs_from_mlflow_sql(engine, runs_columns=['run_uuid'], experiments_columns=experiments_columns, experiments_names=experiments_names, other_table='latest_metrics', other_table_keys=latest_metrics_columns)
df_tags = get_df_runs_from_mlflow_sql(engine, runs_columns=['run_uuid'], experiments_columns=experiments_columns, experiments_names=experiments_names, other_table='tags', other_table_keys=tags_columns)

In [17]:
df_runs_raw_2 = df_params.join(df_latest_metrics)
df_runs_raw_2 = df_runs_raw_2.join(df_tags)

In [18]:
df_runs_raw_2

key,status,start_time,end_time,best/base_model_kwargs/eps,best/base_model_kwargs/min_samples,best/base_model_kwargs/n_clusters,best/base_model_kwargs/n_similarities,best/base_model_kwargs/sampling_ratio,best/base_model_kwargs/sc_n_clusters,best/child_run_id,...,best/rand_score,best/silhouette,best/v_measure,fit_model_return_elapsed_time,max_memory_used,max_memory_used_after_fit,EXCEPTION,Last step finished,mlflow.parentRunId,raised_exception
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000158194df4bbc83acad810a6126e8,FINISHED,1753568824186,1.753569e+12,,,,,,,,...,,,,0.220166,499.724,499.724,,,35bc54052da54c98b5c8f43b71499982,False
0001535570df44ea95a4d402de597513,FINISHED,1762818248019,1.762818e+12,,,,,,,,...,,,,4.103539,405.120,405.120,,_on_train_end,f48fff122669440cb13ef97bfdc5b32c,False
0001e2df550247b7a3e752cefeeba0e7,FINISHED,1762771963219,1.762772e+12,,,2,,,,336e251e84e149d89a4df9219bffc888,...,0.839463,0.364308,0.59746,333.427949,373.932,373.932,,_on_train_end,,False
0001e8fa8b8e4568a763c6900949e517,FINISHED,1763044887769,1.763045e+12,,,,,,,,...,,,,120.652804,409.836,409.836,,_on_train_end,bfeb346a88a14c7f9b080e684e983cf5,False
00024364e3674a14992d65e47cde9e3d,FINISHED,1763061897103,1.763062e+12,,,,,,,,...,,,,0.280940,389.508,389.508,,_on_train_end,97f7f09de59c4dfebae0ff2e5b82a478,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffee9233db84b49b5fa6003f8f89366,FINISHED,1753549823592,1.753550e+12,,,,,,,,...,,,,0.073361,480.604,480.604,,,c462b7097d5e4812bc69f5eb17bf69f7,False
fffefd1d0af949a2bf3a3785c7dac124,FINISHED,1763043371271,1.763044e+12,,,,,,,,...,,,,3.477920,1299.324,1299.324,,_on_train_end,bf9d376100f4417a9d8d36609209568f,False
ffff93fb569c4b8793a5380d13025643,FINISHED,1762772184197,1.762772e+12,,,,,,,,...,,,,0.071934,363.192,363.192,,_on_train_end,b4200f58c720472487e5fde672a402c5,False
ffffa4d82cc24aa28aebbd044001f31d,FINISHED,1762821197811,1.762821e+12,,,,,,,,...,,,,0.294223,381.604,381.604,,_on_train_end,0e6b84d4521f45c2ab5e82436368a988,False


In [19]:
df_runs_raw_2["start_time"] = pd.to_datetime(df_runs_raw_2["start_time"], unit='ms')
df_runs_raw_2 = df_runs_raw_2.loc[df_runs_raw_2["start_time"] > "2025-11-01"] # to filter out old runs

In [20]:
df_runs_raw_2

key,status,start_time,end_time,best/base_model_kwargs/eps,best/base_model_kwargs/min_samples,best/base_model_kwargs/n_clusters,best/base_model_kwargs/n_similarities,best/base_model_kwargs/sampling_ratio,best/base_model_kwargs/sc_n_clusters,best/child_run_id,...,best/rand_score,best/silhouette,best/v_measure,fit_model_return_elapsed_time,max_memory_used,max_memory_used_after_fit,EXCEPTION,Last step finished,mlflow.parentRunId,raised_exception
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001535570df44ea95a4d402de597513,FINISHED,2025-11-10 23:44:08.019,1.762818e+12,,,,,,,,...,,,,4.103539,405.120,405.120,,_on_train_end,f48fff122669440cb13ef97bfdc5b32c,False
0001e2df550247b7a3e752cefeeba0e7,FINISHED,2025-11-10 10:52:43.219,1.762772e+12,,,2,,,,336e251e84e149d89a4df9219bffc888,...,0.839463,0.364308,0.59746,333.427949,373.932,373.932,,_on_train_end,,False
0001e8fa8b8e4568a763c6900949e517,FINISHED,2025-11-13 14:41:27.769,1.763045e+12,,,,,,,,...,,,,120.652804,409.836,409.836,,_on_train_end,bfeb346a88a14c7f9b080e684e983cf5,False
00024364e3674a14992d65e47cde9e3d,FINISHED,2025-11-13 19:24:57.103,1.763062e+12,,,,,,,,...,,,,0.280940,389.508,389.508,,_on_train_end,97f7f09de59c4dfebae0ff2e5b82a478,False
0002931b889849979ec42e6c98d8f992,FINISHED,2025-11-13 14:08:07.218,1.763043e+12,,,,,,,,...,,,,0.352017,427.628,427.628,,_on_train_end,ff737b59729a4ee6ba69f9fd2b1578ed,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffe744b3b7e428d8a9e66e294f235ea,FINISHED,2025-11-11 07:49:05.036,1.762862e+12,,,,,,,,...,,,,100.511987,6572.368,6572.368,,_on_train_end,5d93c8448d7943d9bb0a83f87ddab367,False
fffefd1d0af949a2bf3a3785c7dac124,FINISHED,2025-11-13 14:16:11.271,1.763044e+12,,,,,,,,...,,,,3.477920,1299.324,1299.324,,_on_train_end,bf9d376100f4417a9d8d36609209568f,False
ffff93fb569c4b8793a5380d13025643,FINISHED,2025-11-10 10:56:24.197,1.762772e+12,,,,,,,,...,,,,0.071934,363.192,363.192,,_on_train_end,b4200f58c720472487e5fde672a402c5,False
ffffa4d82cc24aa28aebbd044001f31d,FINISHED,2025-11-11 00:33:17.811,1.762821e+12,,,,,,,,...,,,,0.294223,381.604,381.604,,_on_train_end,0e6b84d4521f45c2ab5e82436368a988,False


In [21]:
df_runs_raw = pd.concat([df_runs_raw_1, df_runs_raw_2], axis=0)
df_runs_raw = df_runs_raw.join(dataset_characteristics, on="dataset_id", rsuffix="_dataset")
df_runs_raw.to_csv(results_dir / 'df_runs_raw_cer_tgcc.csv', index=True)

In [3]:
df_runs_raw = pd.read_csv(results_dir / "df_runs_raw_cer_tgcc.csv", index_col=0, low_memory=False)
df_runs_raw["model"] = df_runs_raw["model"] + "-" + df_runs_raw["n_trials"].astype(int).astype(str)
df_runs_raw_parents = df_runs_raw.copy()
df_runs_raw_parents = df_runs_raw_parents.loc[df_runs_raw_parents["mlflow.parentRunId"].isna()]

In [4]:
df_runs_raw_parents.head(5)

Unnamed: 0_level_0,status,start_time,end_time,best/base_model_kwargs/eps,best/base_model_kwargs/min_samples,best/base_model_kwargs/n_clusters,best/base_model_kwargs/n_similarities,best/base_model_kwargs/sampling_ratio,best/base_model_kwargs/sc_n_clusters,best/child_run_id,...,EXCEPTION,Last step finished,mlflow.parentRunId,raised_exception,dataset,openml_id,n_instances,n_features,n_classes,n_categorical
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001f124c40c40b6b902fcabc76e7eea,FINISHED,1757511120768,1757511000000.0,,,,,,,c854172d93d945b5b98d1d2e7f8b8244,...,,,,False,iris,61,150,5,3,1
00027aa26863479689cb9159eb05d27d,FINISHED,1759251752094,1759254000000.0,,,21.0,,,,c968bd4c602b4cc1b80ea7ebf3ef514a,...,,,,False,mnist_784,554,70000,785,10,1
000ed0a4c2254d48a2ea293ecf9480b9,FINISHED,1761159995601,1761160000000.0,,,,,,,775080a847654e7885ed3003bbb9d622,...,,_on_train_end,,False,nursery,1568,12958,9,4,9
0016b21b09e44c7595b26130cbf448f9,FINISHED,1757524956961,1757525000000.0,,,3.0,,,,318c5fee63ac48ee93c0f8897a15e567,...,,,,False,golub-1999-v2,46780,72,1869,3,0
001b3f77c3e44c12b14761e15c808ec9,FINISHED,1757510889568,1757511000000.0,,,,,,,c3df07fdb54f487a922cb765d3b9c0a7,...,,,,False,ecoli,39,336,8,8,1


## Delete duplicate runs (if any) and complete some models that cannot run with some datasets

In [5]:
non_duplicate_columns = [
    "model",
    "dataset_id",
	"standardize",
	"hpo_metric",
	"hpo_seed",
]
# df_runs_parents.loc[df_runs_parents["best/n_clusters_"]*0.5 > df_runs_parents["n_instances"], "best/adjusted_rand"] = 
df_runs_parents = df_runs_raw_parents.dropna(axis=0, how="all", subset=["best/adjusted_rand"]).copy()
# add back runs that were not evaluated because we judged too many clusters (but they run anyway)
# df_valid_runs = df_runs_raw_parents.loc[df_runs_raw_parents["best/n_clusters_"] > df_runs_raw_parents["n_instances"]*0.5].copy()
# df_runs_parents = pd.concat([df_runs_parents, df_valid_runs], axis=0)
df_runs_parents = df_runs_parents.loc[(~df_runs_parents.duplicated(non_duplicate_columns))]
# fill missing values with "None"
df_runs_parents = df_runs_parents.fillna("None")

In [6]:
# get number of children runs that raised exception for each parent run
children_exceptions = df_runs_raw.groupby("mlflow.parentRunId")["raised_exception"].sum()
df_runs_parents["n_children_raised_exception"] = df_runs_parents.index.map(children_exceptions).fillna(0)

In [7]:
df_runs_parents.loc[(df_runs_parents["n_children_raised_exception"] > 0) & (df_runs_parents["raised_exception"] == False) & (df_runs_parents["model"].str.find("SC-SRGF") == -1), ["dataset_id", "model", "hpo_metric", "n_children_raised_exception"]]

Unnamed: 0_level_0,dataset_id,model,hpo_metric,n_children_raised_exception
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0523d99695664a3294fa128a1eae79af,554,CoHiRF-DBSCAN-60,calinski_harabasz_score,9
075cbb1e8b9f41a0b1b1cb54ff6673af,39,CoHiRF-top-down-60,davies_bouldin_score,4
08433992474241cdbe44aca2692cd7df,554,CoHiRF-DBSCAN-60,davies_bouldin_score,2
086989a27cdb467b85ed2248ce8b32d3,554,CoHiRF-DBSCAN-60,calinski_harabasz_score,8
0abf67098276481cabbcb002ae6bdf84,1568,CoHiRF-top-down-60,calinski_harabasz_score,2
...,...,...,...,...
52dc295568c149cfb492a083a3b1c635,554,CoHiRF-KernelRBF-top-down-60,silhouette,1
6355a1f80c8d4b42938fe69c8f3bcfad,554,CoHiRF-KernelRBF-top-down-60,calinski_harabasz_score,3
80eadee4a6674b8eaa6e1667f31ab97d,554,CoHiRF-KernelRBF-top-down-60,silhouette,1
b82795273adf4479ac73d14ba64ade30,554,CoHiRF-KernelRBF-top-down-inv-60,adjusted_rand,1


In [8]:
df_to_cat = []
hpo_metrics = [
    "adjusted_rand",
    "adjusted_mutual_info",
    "calinski_harabasz_score",
    "silhouette",
    "davies_bouldin_score",
    "normalized_mutual_info",
]
standardize = [True]
hpo_seed = [i for i in range(5)]
fill_value = pd.NA
fill_columns = ["best/adjusted_rand", "best/adjusted_mutual_info", "best/calinski_harabasz_score", "best/silhouette", "best/davies_bouldin_score", "best/normalized_mutual_info"]

In [9]:
# Too memory intensive
dataset_ids_to_complete = [182, 554, 1478, 1568, 40685]
model_names = [
    "CoHiRF-SC-SRGF-60",
    "CoHiRF-SC-SRGF-top-down-60",
    "CoHiRF-SC-SRGF-top-down-inv-60",
    "SpectralSubspaceRandomization-60",
    "CoHiRF-SC-SRGF-1R-60",
    "CoHiRF-SC-SRGF-top-down-1R-60",
    "CoHiRF-SC-SRGF-top-down-inv-1R-60",
    "CoHiRF-SC-SRGF-2R-60",
    "CoHiRF-SC-SRGF-top-down-2R-60",
	"CoHiRF-SC-SRGF-top-down-inv-2R-60",
]
for dataset_id in dataset_ids_to_complete:
    for model_name in model_names:
        for hpo_metric in hpo_metrics:
            for std in standardize:
                for seed in hpo_seed:
                    new_row = {
						"dataset_id": dataset_id,
						"model": model_name,
						"hpo_metric": hpo_metric,
						"standardize": std,
						"hpo_seed": seed
					}
                    for col in fill_columns:
                        new_row[col] = fill_value
                    df_to_cat.append(new_row)

In [10]:
# # Too few examples (<100) to run in batch
# dataset_ids_to_complete = [46773, 46774, 46775, 46776, 46777, 46779, 46780, 46781]
# model_names = [
#     "BatchCoHiRF-1iter-30",
#     "BatchCoHiRF-DBSCAN-1iter-30",
#     "BatchCoHiRF-SC-SRGF-30",
#     "BatchCoHiRF-KernelRBF-1iter-30",
#     "BatchCoHiRF-1iter-60",
# 	"BatchCoHiRF-DBSCAN-1iter-60",
# 	# "BatchCoHiRF-SC-SRGF-60",
# 	# "BatchCoHiRF-KernelRBF-1iter-60",
# ]
# for dataset_id in dataset_ids_to_complete:
# 	for model_name in model_names:
# 		for hpo_metric in hpo_metrics:
# 			for std in standardize:
# 				mask = (df_runs_parents["dataset_id"] == dataset_id) & (df_runs_parents["model"] == model_name) & (df_runs_parents["hpo_metric"] == hpo_metric) & (df_runs_parents["standardize"] == std)
# 				if not mask.any():
# 					new_row = {
# 						"dataset_id": dataset_id,
# 						"model": model_name,
# 						"hpo_metric": hpo_metric,
# 						"standardize": std,
# 					}
# 					for col in fill_columns:
# 						new_row[col] = fill_value
# 					df_to_cat.append(new_row)

In [11]:
# # Too many examples for IRFLLRR
# dataset_ids_to_complete = [40685]
# model_names = [
#     "IRFLLRR-30",
#     "IRFLLRR-60",
# ]
# hpo_metrics = ["adjusted_rand", "adjusted_mutual_info", "calinski_harabasz_score", "silhouette", "davies_bouldin_score", "normalized_mutual_info"]
# standardize = [True]
# fill_value = pd.NA
# fill_columns = ["best/adjusted_rand", "best/adjusted_mutual_info", "best/calinski_harabasz_score", "best/silhouette", "best/davies_bouldin_score", "best/normalized_mutual_info"]
# for dataset_id in dataset_ids_to_complete:
#     for model_name in model_names:
#         for hpo_metric in hpo_metrics:
#             for std in standardize:
#                 mask = (
#                     (df_runs_parents["dataset_id"] == dataset_id)
#                     & (df_runs_parents["model"] == model_name)
#                     & (df_runs_parents["hpo_metric"] == hpo_metric)
#                     & (df_runs_parents["standardize"] == std)
#                 )
#                 if not mask.any():
#                     new_row = {
#                         "dataset_id": dataset_id,
#                         "model": model_name,
#                         "hpo_metric": hpo_metric,
#                         "standardize": std,
#                     }
#                     for col in fill_columns:
#                         new_row[col] = fill_value
#                     df_to_cat.append(new_row)

In [12]:
df_runs_parents = pd.concat([df_runs_parents, pd.DataFrame(df_to_cat)], axis=0)

# Missing

In [13]:
model_nickname = df_runs_parents['model'].unique().tolist()
model_nickname.sort()
model_nickname

['BatchCoHiRF-1iter-60',
 'BatchCoHiRF-1iter-random-60',
 'BatchCoHiRF-1iter-stratified-60',
 'BatchCoHiRF-60',
 'BatchCoHiRF-DBSCAN-1iter-60',
 'BatchCoHiRF-DBSCAN-1iter-random-60',
 'BatchCoHiRF-DBSCAN-1iter-stratified-60',
 'BatchCoHiRF-KernelRBF-1iter-60',
 'BatchCoHiRF-KernelRBF-1iter-random-60',
 'BatchCoHiRF-KernelRBF-1iter-stratified-60',
 'BatchCoHiRF-SC-SRGF-1iter-60',
 'BatchCoHiRF-SC-SRGF-2-60',
 'CoHiRF-1000-60',
 'CoHiRF-60',
 'CoHiRF-DBSCAN-60',
 'CoHiRF-DBSCAN-top-down-60',
 'CoHiRF-DBSCAN-top-down-inv-60',
 'CoHiRF-KernelRBF-60',
 'CoHiRF-KernelRBF-top-down-60',
 'CoHiRF-KernelRBF-top-down-inv-60',
 'CoHiRF-SC-SRGF-1R-60',
 'CoHiRF-SC-SRGF-1R-top-down-60',
 'CoHiRF-SC-SRGF-1R-top-down-inv-60',
 'CoHiRF-SC-SRGF-2R-60',
 'CoHiRF-SC-SRGF-60',
 'CoHiRF-SC-SRGF-top-down-1R-60',
 'CoHiRF-SC-SRGF-top-down-2R-60',
 'CoHiRF-SC-SRGF-top-down-60',
 'CoHiRF-SC-SRGF-top-down-inv-1R-60',
 'CoHiRF-SC-SRGF-top-down-inv-2R-60',
 'CoHiRF-SC-SRGF-top-down-inv-60',
 'CoHiRF-top-down-60',


In [14]:
non_duplicate_columns = [
	"model",
	"dataset_id",
	"standardize",
	"hpo_metric",
	"hpo_seed",
]

In [15]:
model_nickname = [
    "BatchCoHiRF-1iter-60",
    # "BatchCoHiRF-1iter-random-60",
    # "BatchCoHiRF-1iter-stratified-60",
    "BatchCoHiRF-DBSCAN-1iter-60",
    # "BatchCoHiRF-DBSCAN-1iter-random-60",
    # "BatchCoHiRF-DBSCAN-1iter-stratified-60",
    "BatchCoHiRF-KernelRBF-1iter-60",
    # "BatchCoHiRF-KernelRBF-1iter-random-60",
    # "BatchCoHiRF-KernelRBF-1iter-stratified-60",
    "BatchCoHiRF-SC-SRGF-1iter-60",
    # "BatchCoHiRF-SC-SRGF-1iter-random-60",
    # "BatchCoHiRF-SC-SRGF-1iter-stratified-60",
    # "BatchCoHiRF-SC-SRGF-2-60",
    "CoHiRF-60",
    "CoHiRF-top-down-60",
    "CoHiRF-top-down-inv-60",
    # "CoHiRF-1000-60",
    "CoHiRF-DBSCAN-60",
    "CoHiRF-DBSCAN-top-down-60",
    "CoHiRF-DBSCAN-top-down-inv-60",
    "CoHiRF-KernelRBF-60",
	"CoHiRF-KernelRBF-top-down-60",
    "CoHiRF-KernelRBF-top-down-inv-60",
    # "CoHiRF-SC-SRGF-60",
    "CoHiRF-SC-SRGF-1R-60",
    # "CoHiRF-SC-SRGF-1R-top-down-60",
    # "CoHiRF-SC-SRGF-1R-top-down-inv-60",
    "DBSCAN-60",
    "KMeans-60",
    "KernelRBFKMeans-60",
    "SpectralSubspaceRandomization-60",
]
dataset_id = [
    39,
    61,
    182,
    1478,
    1568,
    40685,
    40984,
    46773,
    46774,
    46775,
    46776,
    46777,
    46778,
    46779,
    46780,
    46781,
    46782,
    46783,
    554,
    # 1110,
    # 47039
]
standardize = [True]
hpo_metric = [
    "adjusted_rand",
    # "adjusted_mutual_info",
    "calinski_harabasz_score",
    # "normalized_mutual_info",
    # "davies_bouldin_score",
    "silhouette",
]
hpo_seed = [i for i in range(5)]
columns_names = non_duplicate_columns
should_contain_values = [model_nickname, dataset_id, standardize, hpo_metric, hpo_seed]
df_missing = get_missing_entries(df_runs_parents, columns_names, should_contain_values)
df_missing

Unnamed: 0,model,dataset_id,standardize,hpo_metric,hpo_seed
0,CoHiRF-DBSCAN-top-down-60,40685,True,adjusted_rand,0
1,CoHiRF-DBSCAN-top-down-60,40685,True,adjusted_rand,1
2,CoHiRF-DBSCAN-top-down-60,40685,True,adjusted_rand,2
3,CoHiRF-DBSCAN-top-down-60,40685,True,adjusted_rand,3
4,CoHiRF-DBSCAN-top-down-60,40685,True,adjusted_rand,4
...,...,...,...,...,...
87,CoHiRF-KernelRBF-top-down-inv-60,40685,True,silhouette,1
88,CoHiRF-KernelRBF-top-down-inv-60,40685,True,silhouette,2
89,CoHiRF-KernelRBF-top-down-inv-60,40685,True,silhouette,3
90,CoHiRF-KernelRBF-top-down-inv-60,40685,True,silhouette,4


In [16]:
df_runs_parents[["model", "dataset_id", "max_memory_used"]].sort_values(by="max_memory_used", ascending=False)

Unnamed: 0,model,dataset_id,max_memory_used
8002d12704c142ff99021098b82ac3a7,CoHiRF-DBSCAN-60,40685,52893.696
569719f398044e5d8b5056c702355584,CoHiRF-DBSCAN-60,40685,52854.720
640e3f59ca4144e68cc3a497d7787280,CoHiRF-DBSCAN-60,40685,52814.868
a5f9c8d0d9ca497ca067be35b8cb5b3e,CoHiRF-DBSCAN-60,40685,52742.684
02ec0fef9d864abd8da2ac7adeb7d505,CoHiRF-DBSCAN-60,40685,51440.404
...,...,...,...
1495,CoHiRF-SC-SRGF-top-down-inv-2R-60,40685,
1496,CoHiRF-SC-SRGF-top-down-inv-2R-60,40685,
1497,CoHiRF-SC-SRGF-top-down-inv-2R-60,40685,
1498,CoHiRF-SC-SRGF-top-down-inv-2R-60,40685,


In [17]:
model_nickname = [
    "BatchCoHiRF-1iter-60",
    # "BatchCoHiRF-1iter-random-60",
    # "BatchCoHiRF-1iter-stratified-60",
    "BatchCoHiRF-DBSCAN-1iter-60",
    # "BatchCoHiRF-DBSCAN-1iter-random-60",
    # "BatchCoHiRF-DBSCAN-1iter-stratified-60",
    "BatchCoHiRF-KernelRBF-1iter-60",
    # "BatchCoHiRF-KernelRBF-1iter-random-60",
    # "BatchCoHiRF-KernelRBF-1iter-stratified-60",
    "BatchCoHiRF-SC-SRGF-1iter-60",
    # "BatchCoHiRF-SC-SRGF-1iter-random-60",
    # "BatchCoHiRF-SC-SRGF-1iter-stratified-60",
    # "BatchCoHiRF-SC-SRGF-2-60",
    "CoHiRF-60",
    "CoHiRF-top-down-60",
    "CoHiRF-top-down-inv-60",
    # "CoHiRF-1000-60",
    "CoHiRF-DBSCAN-60",
    "CoHiRF-KernelRBF-60",
    "CoHiRF-SC-SRGF-60",
    "DBSCAN-60",
    "KMeans-60",
    "KernelRBFKMeans-60",
    "SpectralSubspaceRandomization-60",
]
dataset_id = [
    39,
    61,
    182,
    1478,
    1568,
    40685,
    40984,
    46773,
    46774,
    46775,
    46776,
    46777,
    46778,
    46779,
    46780,
    46781,
    46782,
    46783,
    554,
    # 1110,
    # 47039
]
standardize = [True]
hpo_metric = [
    "adjusted_rand",
    "adjusted_mutual_info",
    "calinski_harabasz_score",
    # "normalized_mutual_info",
    "davies_bouldin_score",
    "silhouette",
]
hpo_seed = [i for i in range(5)]
columns_names = non_duplicate_columns
should_contain_values = [model_nickname, dataset_id, standardize, hpo_metric, hpo_seed]
df_missing = get_missing_entries(df_runs_parents, columns_names, should_contain_values)
df_missing

Unnamed: 0,model,dataset_id,standardize,hpo_metric,hpo_seed


In [18]:
model_nickname = [
    "CoHiRF-1000-60",
]
dataset_id = [
    182,
    1478,
    1568,
    40685,
    40984,
    46782,
    46783,
    554,
]
standardize = [True]
hpo_metric = [
    "adjusted_rand",
    "adjusted_mutual_info",
    "calinski_harabasz_score",
    # "normalized_mutual_info",
    "davies_bouldin_score",
    "silhouette",
]
hpo_seed = [i for i in range(5)]
columns_names = non_duplicate_columns
should_contain_values = [model_nickname, dataset_id, standardize, hpo_metric, hpo_seed]
df_missing = get_missing_entries(df_runs_parents, columns_names, should_contain_values)
df_missing

Unnamed: 0,model,dataset_id,standardize,hpo_metric,hpo_seed


In [19]:
model_nickname = [
    "BatchCoHiRF-1iter-60",
    # "BatchCoHiRF-1iter-random-60",
    # "BatchCoHiRF-1iter-stratified-60",
    "BatchCoHiRF-DBSCAN-1iter-60",
    # "BatchCoHiRF-DBSCAN-1iter-random-60",
    # "BatchCoHiRF-DBSCAN-1iter-stratified-60",
    "BatchCoHiRF-KernelRBF-1iter-60",
    # "BatchCoHiRF-KernelRBF-1iter-random-60",
    # "BatchCoHiRF-KernelRBF-1iter-stratified-60",
    "BatchCoHiRF-SC-SRGF-1iter-60",
    # "BatchCoHiRF-SC-SRGF-1iter-random-60",
    # "BatchCoHiRF-SC-SRGF-1iter-stratified-60",
    # "BatchCoHiRF-SC-SRGF-2-60",
    # "CoHiRF-60",
    # "CoHiRF-1000-60",
    # "CoHiRF-DBSCAN-60",
    # "CoHiRF-KernelRBF-60",
    # "CoHiRF-SC-SRGF-60",
    # "DBSCAN-60",
    "KMeans-60",
    # "KernelRBFKMeans-60",
    # "SpectralSubspaceRandomization-60",
]
dataset_id = [47039]
standardize = [True]
hpo_metric = [
    "adjusted_rand",
    "adjusted_mutual_info",
    "calinski_harabasz_score",
    # "normalized_mutual_info",
    # "davies_bouldin_score",
    # "silhouette",
]
hpo_seed = [i for i in range(5)]
columns_names = non_duplicate_columns
should_contain_values = [model_nickname, dataset_id, standardize, hpo_metric, hpo_seed]
df_missing = get_missing_entries(df_runs_parents, columns_names, should_contain_values)
df_missing

Unnamed: 0,model,dataset_id,standardize,hpo_metric,hpo_seed
0,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_rand,0
1,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_rand,1
2,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_rand,2
3,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_rand,3
4,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_rand,4
5,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_mutual_info,0
6,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_mutual_info,1
7,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_mutual_info,2
8,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_mutual_info,3
9,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_mutual_info,4


In [20]:
model_nickname = ["CoHiRF-SC-SRGF-1R-60", "CoHiRF-SC-SRGF-2R-60", "CoHiRF-SC-SRGF-60", "SpectralSubspaceRandomization-60"]
dataset_id = [
    39,
    61,
    46773,
    46774,
    46775,
    46776,
    46777,
    46778,
    46779,
    46780,
    46781,
]
standardize = [True]
hpo_metric = [
    "adjusted_rand",
    "adjusted_mutual_info",
    "calinski_harabasz_score",
    # "normalized_mutual_info",
    "davies_bouldin_score",
    "silhouette",
]
hpo_seed = [i for i in range(5)]
columns_names = non_duplicate_columns
should_contain_values = [model_nickname, dataset_id, standardize, hpo_metric, hpo_seed]
df_missing = get_missing_entries(df_runs_parents, columns_names, should_contain_values)
df_missing

Unnamed: 0,model,dataset_id,standardize,hpo_metric,hpo_seed
0,CoHiRF-SC-SRGF-2R-60,39,True,adjusted_rand,2


In [21]:
# Join df_runs_raw_parents into df_missing using non_duplicate_columns to get the EXCEPTION column
df_missing_with_exception = df_missing.merge(
    df_runs_raw_parents[non_duplicate_columns + ["raised_exception", "EXCEPTION", "Last step finished"]],
    how="left",
    left_on=["model", "dataset_id", "standardize", "hpo_metric", "hpo_seed"],
    right_on=["model", "dataset_id", "standardize", "hpo_metric", "hpo_seed"],
)
df_missing_with_exception[
    [
        "model",
        "dataset_id",
        "standardize",
        "hpo_metric",
        "hpo_seed",
        "raised_exception",
        "EXCEPTION",
        "Last step finished",
    ]
]

Unnamed: 0,model,dataset_id,standardize,hpo_metric,hpo_seed,raised_exception,EXCEPTION,Last step finished
0,CoHiRF-SC-SRGF-2R-60,39,True,adjusted_rand,2,,,


In [22]:
df_missing_dict = df_missing.copy()
# get only rows from high_mem_tuples
# df_missing_dict = df_missing_dict.merge(high_mem_tuples, on=["model", "dataset_id"], how="left", indicator=True)
# df_missing_dict = df_missing_dict[df_missing_dict["_merge"] == "both"].drop(columns="_merge")
# exclude rows that are in missing_ari_tuples
# df_missing_dict = df_missing_dict.merge(
# 	missing_ari_tuples, on=["model", "dataset_id"], how="left", indicator=True
# )|
# df_missing_dict = df_missing_dict[df_missing_dict["_merge"] == "left_only"].drop(columns="_merge")
# exclude rows that are in high_mem_tuples
# df_missing_dict = df_missing_dict.merge(
# 	high_mem_tuples, on=["model", "dataset_id"], how="left", indicator=True
# )
# df_missing_dict = df_missing_dict[df_missing_dict["_merge"] == "left_only"].drop(columns="_merge")
# to_drop = pd.concat([missing_ari_tuples, high_mem_tuples], ignore_index=True)
# df_missing_dict = df_missing_dict[df_missing_dict["_merge"] == "left_only"].drop(columns="_merge")

In [23]:
# get rid of -60
df_missing_dict["model"] = df_missing_dict["model"].str.replace("-60", "")
df_missing_dict["seed_dataset_order"] = df_missing_dict["hpo_seed"]
# df_missing_dict = df_missing_dict.loc[~df_missing_dict["dataset_id"].isin([40685, 554])]
df_missing_dict.to_csv(results_dir / "df_missing_dict.csv", index=False)

In [24]:
df_missing_dict

Unnamed: 0,model,dataset_id,standardize,hpo_metric,hpo_seed,seed_dataset_order
0,CoHiRF-SC-SRGF-2R,39,True,adjusted_rand,2,2


# Tables

In [25]:
def get_parameters_string(row):
    parameter_names = {
		"best/alpha": "\\alpha",
		"best/avg_dims": "d",
		"best/base_model_kwargs/eps": "\\epsilon",
		"best/base_model_kwargs/min_samples": "n_{\\text{min}}",
		"best/base_model_kwargs/n_clusters": "C",
		"best/c": "c",
		"best/cohirf_kwargs/base_model_kwargs/eps": "\\epsilon",
		"best/cohirf_kwargs/base_model_kwargs/min_samples": "n_{\\text{min}}",
		"best/cohirf_kwargs/kmeans_n_clusters": "C",
		"best/cohirf_kwargs/n_features": "q",
		"best/cohirf_kwargs/repetitions": "R",
		"best/damping": "\\lambda",
		# "best/density_threshold": "\\tau",
		"best/eps": "\\epsilon",
		"best/kmeans_n_clusters": "C",
		"best/lambda_": "\\lambda",
		"best/min_bin_freq": "bin_{\\text{min}}",
		"best/min_cluster_size": "C_{\\text{min}}",
		"best/min_samples": "n_{\\text{min}}",
		"best/n_clusters": "C",
		"best/n_features": "q",
		# "best/n_partitions": "P",
		"best/n_similarities": "m",
		"best/p": "p",
		"best/repetitions": "R",
		"best/sampling_ratio": "r",
		"best/sc_n_clusters": "C",
		"best/transform_kwargs/gamma": "\\gamma",
	}
    first = True
    str = ""
    for p in parameter_names.keys():
        if not pd.isna(row[p]) and row[p] != "None":
            if not first:
                str += "; "
            else:
                first = False
            value = float(row[p])
            if value.is_integer():
                value = int(value)
                str += f"${parameter_names[p]}={value}$"
            else:
                str += f"${parameter_names[p]}={value:0.2f}$"
    return str

In [26]:
def highlight_max(df, column_name, level=0):
    df_column = df[column_name]
    max_values = df_column.groupby(level=level).transform('max')
    is_highlighted = df_column.round(3) == max_values.round(3)
    df_css = df.copy().astype(str)
    df_css.loc[:, :] = ''
    df_css[is_highlighted] = 'font-weight: bold'
    return df_css

In [27]:
def highlight_min(df, column_name, level=0):
    df_column = df[column_name]
    min_values = df_column.groupby(level=level).transform("min")
    is_highlighted = df_column.round(3) == min_values.round(3)
    df_css = df.copy().astype(str)
    df_css.loc[:, :] = ""
    df_css[is_highlighted] = "font-weight: bold"
    return df_css

In [28]:
def highlight_max_index(series_index, df_column, level=0):
    max_values = df_column.groupby(level=level).transform('max')
    is_highlighted = df_column.round(3) == max_values.round(3)
    series_css = series_index.copy().astype(str)
    series_css[:] = ''
    series_css[is_highlighted.values] = 'font-weight: bold'
    return series_css

In [29]:
def underline_2nd_max(df, column_name, level=0):
    df_column = df[column_name]
    # get the second max value
    second_max_values = df_column.groupby(level=level).transform(lambda x: x.round(3).drop_duplicates().nlargest(2).iloc[-1])
    is_underlined = df_column.round(3) == second_max_values.round(3)
    df_css = df.copy().astype(str)
    df_css.loc[:, :] = ''
    df_css[is_underlined] = 'underline: --latex--rwrap'
    return df_css

In [30]:
def underline_2nd_min(df, column_name, level=0):
    df_column = df[column_name]
    # get the second min value
    second_min_values = df_column.groupby(level=level).transform(
        lambda x: x.round(3).drop_duplicates().nsmallest(2).iloc[-1]
    )
    is_underlined = df_column.round(3) == second_min_values.round(3)
    df_css = df.copy().astype(str)
    df_css.loc[:, :] = ""
    df_css[is_underlined] = "underline: --latex--rwrap"
    return df_css

In [31]:
def underline_2nd_max_index(series_index, df_column, level=0):
    # get the second max value
    second_max_values = df_column.groupby(level=level).transform(lambda x: x.nlargest(2).iloc[-1])
    is_underlined = df_column.round(3) == second_max_values.round(3)
    series_css = series_index.copy().astype(str)
    series_css.loc[:] = ''
    series_css[is_underlined.values] = 'underline: --latex--rwrap'
    return series_css

## Some Models

In [32]:
print(*df_runs_parents["model"].sort_values().unique(), sep="\n")

BatchCoHiRF-1iter-60
BatchCoHiRF-1iter-random-60
BatchCoHiRF-1iter-stratified-60
BatchCoHiRF-60
BatchCoHiRF-DBSCAN-1iter-60
BatchCoHiRF-DBSCAN-1iter-random-60
BatchCoHiRF-DBSCAN-1iter-stratified-60
BatchCoHiRF-KernelRBF-1iter-60
BatchCoHiRF-KernelRBF-1iter-random-60
BatchCoHiRF-KernelRBF-1iter-stratified-60
BatchCoHiRF-SC-SRGF-1iter-60
BatchCoHiRF-SC-SRGF-2-60
CoHiRF-1000-60
CoHiRF-60
CoHiRF-DBSCAN-60
CoHiRF-DBSCAN-top-down-60
CoHiRF-DBSCAN-top-down-inv-60
CoHiRF-KernelRBF-60
CoHiRF-KernelRBF-top-down-60
CoHiRF-KernelRBF-top-down-inv-60
CoHiRF-SC-SRGF-1R-60
CoHiRF-SC-SRGF-1R-top-down-60
CoHiRF-SC-SRGF-1R-top-down-inv-60
CoHiRF-SC-SRGF-2R-60
CoHiRF-SC-SRGF-60
CoHiRF-SC-SRGF-top-down-1R-60
CoHiRF-SC-SRGF-top-down-2R-60
CoHiRF-SC-SRGF-top-down-60
CoHiRF-SC-SRGF-top-down-inv-1R-60
CoHiRF-SC-SRGF-top-down-inv-2R-60
CoHiRF-SC-SRGF-top-down-inv-60
CoHiRF-top-down-60
CoHiRF-top-down-inv-60
DBSCAN-60
KMeans-60
KernelRBFKMeans-60
SpectralSubspaceRandomization-60


In [33]:
model_names = {
    "BatchCoHiRF-1iter-60": "BatchCoHiRF",
    "BatchCoHiRF-1iter-random-60": "BatchCoHiRF-random",
    "BatchCoHiRF-1iter-stratified-60": "BatchCoHiRF-stratified",
    # "BatchCoHiRF-60": "BatchCoHiRF",
    "BatchCoHiRF-DBSCAN-1iter-60": "BatchCoHiRF-DBSCAN",
    "BatchCoHiRF-DBSCAN-1iter-random-60": "BatchCoHiRF-DBSCAN-random",
    "BatchCoHiRF-DBSCAN-1iter-stratified-60": "BatchCoHiRF-DBSCAN-stratified",
    "BatchCoHiRF-KernelRBF-1iter-60": "BatchCoHiRF-KernelRBF",
    "BatchCoHiRF-KernelRBF-1iter-random-60": "BatchCoHiRF-KernelRBF-random",
    "BatchCoHiRF-KernelRBF-1iter-stratified-60": "BatchCoHiRF-KernelRBF-stratified",
    "BatchCoHiRF-SC-SRGF-1iter-60": "BatchCoHiRF-SC-SRGF",
    # "BatchCoHiRF-SC-SRGF-2-60": "BatchCoHiRF-SC-SRGF-2",
    "CoHiRF-1000-60": "CoHiRF-1000",
    "CoHiRF-60": "CoHiRF",
    "CoHiRF-top-down-60": "CoHiRF-top-down",
    "CoHiRF-top-down-inv-60": "CoHiRF-top-down-inv",
    "CoHiRF-DBSCAN-60": "CoHiRF-DBSCAN",
    "CoHiRF-DBSCAN-top-down-60": "CoHiRF-DBSCAN-top-down",
    "CoHiRF-DBSCAN-top-down-inv-60": "CoHiRF-DBSCAN-top-down-inv",
    "CoHiRF-KernelRBF-60": "CoHiRF-KernelRBF",
    "CoHiRF-KernelRBF-top-down-60": "CoHiRF-KernelRBF-top-down",
    "CoHiRF-KernelRBF-top-down-inv-60": "CoHiRF-KernelRBF-top-down-inv",
    "CoHiRF-SC-SRGF-1R-60": "CoHiRF-SC-SRGF-1R",
    "CoHiRF-SC-SRGF-2R-60": "CoHiRF-SC-SRGF-2R",
    "CoHiRF-SC-SRGF-60": "CoHiRF-SC-SRGF",
    # "CoHiRF-SC-SRGF-top-down-1R-60": "CoHiRF-SC-SRGF-top-down-1R",
    # "CoHiRF-SC-SRGF-top-down-2R-60": "CoHiRF-SC-SRGF-top-down-2R",
    # "CoHiRF-SC-SRGF-top-down-60": "CoHiRF-SC-SRGF-top-down",
    # "CoHiRF-SC-SRGF-top-down-inv-1R-60": "CoHiRF-SC-SRGF-top-down-inv-1R",
    # "CoHiRF-SC-SRGF-top-down-inv-2R-60": "CoHiRF-SC-SRGF-top-down-inv-2R",
    # "CoHiRF-SC-SRGF-top-down-inv-60": "CoHiRF-SC-SRGF-top-down-inv",
    "DBSCAN-60": "DBSCAN",
    "KMeans-60": "KMeans",
    "KernelRBFKMeans-60": "KernelRBFKMeans",
    "SpectralSubspaceRandomization-60": "SC-SRGF",
}

dataset_names = {
    "binary_alpha_digits": "binary-alpha-digits",
	"mnist_784": "mnist",
}  # otherwise we get an error in latex

dataset_id = [
    39,
    61,
    182,
    1478,
    1568,
    40685,
    40984,
    46773,
    46774,
    46775,
    46776,
    46777,
    46778,
    46779,
    46780,
    46781,
    46782,
    46783,
    554,
]

hpo_metrics = [
	"adjusted_rand",
    "calinski_harabasz_score",
	"silhouette",
]

# Filter to only standardized runs
df = df_runs_parents.copy()
df = df.loc[df['standardize'] == True]
df = df.loc[df['model'].isin(model_names.keys())]
df = df.loc[df["dataset_id"].isin(dataset_id)]
df = df.loc[df['hpo_metric'].isin(hpo_metrics)]
df = df.replace({"model": model_names})
df = df.replace({"dataset_name": dataset_names})

# Filter to only runs with hpo_seed in range(5)
df = df.loc[df['hpo_seed'].isin(range(5))]

# Filter to only show batch methods for datasets with more than 1000 instances
df = df.loc[~((df['n_instances'] < 1000) & (df['model'].str.find('Batch') != -1))]

# define group of models
model_groups = {
	"KMeans": ["KMeans", "CoHiRF", "CoHiRF-top-down", "CoHiRF-top-down-inv", "CoHiRF-1000", "BatchCoHiRF"],
	"KernelKMeans": ["KernelRBFKMeans", "CoHiRF-KernelRBF", "CoHiRF-KernelRBF-top-down", "CoHiRF-KernelRBF-top-down-inv", "BatchCoHiRF-KernelRBF"],
    "DBSCAN": ["DBSCAN", "CoHiRF-DBSCAN", "CoHiRF-DBSCAN-top-down", "CoHiRF-DBSCAN-top-down-inv", "BatchCoHiRF-DBSCAN"],
    "SC-SRGF": ["SC-SRGF", "CoHiRF-SC-SRGF", "CoHiRF-SC-SRGF-1R", "CoHiRF-SC-SRGF-2R", "BatchCoHiRF-SC-SRGF"],
}
df['model_group'] = df['model'].apply(lambda x: next((group for group, models in model_groups.items() if x in models), 'Other'))

# re-scale some metrics and build composite metric
# re-scale ari to be between 0 and 1 (originally between -0.5 and 1), by considering everything below 0 as 0
df["best/adjusted_rand_rescaled"] = df["best/adjusted_rand"].apply(lambda x: 0.0 if x < 0 else x)

# re-scale silhouette to be between 0 and 1 (originally between -1 and 1)
df["best/silhouette_rescaled"] = (df["best/silhouette"] - (-1)) / (1 - (-1)) 

# re-scale calinski to be between 0 and 1 normalized by dataset, model_group and hpo_metric
# replace calinksi -1.0 with 0.0
df["best/calinski_harabasz_score_rescaled"] = df["best/calinski_harabasz_score"].replace(-1.0, 0.0)
df["best/calinski_harabasz_score_rescaled"] = df.groupby(["dataset_id", "model_group", "hpo_metric"])[
    "best/calinski_harabasz_score_rescaled"
].transform(lambda x: (x - x.min()) / (x.max() - x.min()) if x.max() != x.min() else (0.0 if x.max() == 0 else 1.0))

In [34]:
hpo_metrics = [
    # "adjusted_rand",
    # "adjusted_mutual_info",
    # "calinski_harabasz_score",
    # "silhouette",
    # "davies_bouldin_score",
    # "normalized_mutual_info",
    "adjusted_rand",
    "adjusted_rand_rescaled",
    "calinski_harabasz_score",
    "calinski_harabasz_score_rescaled",
    "silhouette",
    "silhouette_rescaled",
]

hpo_metrics_rename = [
    # "ARI",
    # "AMI",
    # "Calinski",
    # "Silhouette",
    # "Davies-Bouldin",
    # "NMI",
    "ARI",
    "Rescaled ARI",
    "Calinski",
    "Rescaled Calinski",
    "Silhouette",
    "Rescaled Silhouette",
]

dfs_metrics = {}

for hpo_metric, hpo_metric_rename in zip(hpo_metrics, hpo_metrics_rename):
    if hpo_metric.find("_rescaled") != -1:
        original_metric = hpo_metric.replace("_rescaled", "")
    else:
        original_metric = hpo_metric
    df_metric = df.loc[df["hpo_metric"] == original_metric][
        ["dataset_name", "model", "hpo_seed", f"best/{hpo_metric}"]
    ].rename(columns={f"best/{hpo_metric}": hpo_metric_rename})
    df_metric = df_metric.dropna(subset=[hpo_metric_rename])
    df_metric = df_metric.set_index(['dataset_name', 'model', 'hpo_seed'])
    df_metric = df_metric.astype({hpo_metric_rename: float})
    dfs_metrics[hpo_metric_rename] = df_metric

df_metrics = pd.concat(dfs_metrics.values(), axis=1, join="outer")
df_metrics = df_metrics.reset_index()

# calculate mean and std
df_metrics = df_metrics.groupby(['dataset_name', 'model']).agg(['mean', 'std'])
# flatten multiindex columns
df_metrics.columns = [' '.join(col).strip() for col in df_metrics.columns.values]
# drop hpo_seed level
df_metrics = df_metrics.drop(columns=['hpo_seed mean', 'hpo_seed std'])
# Rename index levels
df_metrics.index.names = ["Dataset", "Model"]

# create a composite metric as the average of the metrics
df_metrics["Composite Metric mean"] = df_metrics[[f"{metric} mean" for metric in hpo_metrics_rename if "Rescaled" in metric]].mean(axis=1)
df_metrics["Composite Metric std"] = 1/len(hpo_metrics_rename) * (df_metrics[[f"{metric} std" for metric in hpo_metrics_rename if "Rescaled" in metric]]**2).sum(axis=1)**0.5
hpo_metrics_rename.append("Composite Metric")


for metric in hpo_metrics_rename:
    df_metrics[f"{metric}"] = (
        df_metrics[f"{metric} mean"].apply(lambda x: f"{x:.3f}" if not pd.isna(x) else "No Run")
        + " $\\pm$ "
        + df_metrics[f"{metric} std"].apply(lambda x: f"{x:.3f}" if not pd.isna(x) else "No Run")
    )

In [35]:
# Calculate mean and std times for each dataset-model combination across all metrics
df_times = (
    df.groupby(["dataset_name", "model"])
    .agg({"best/elapsed_time": ["mean", "std"], "fit_model_return_elapsed_time": ["mean", "std"]})
    .rename(columns={"best/elapsed_time": "Best Time", "fit_model_return_elapsed_time": "HPO Time"})
)

# Flatten multiindex columns
df_times.columns = [' '.join(col).strip() for col in df_times.columns.values]
# Set the same index structure as df_metrics
df_times.index.names = ["Dataset", "Model"]

df_times["Best Time"] = (
	df_times["Best Time mean"].apply(lambda x: f"{x:4.3f}" if not pd.isna(x) else "No Run")
	+ " $\\pm$ " 
	+ df_times["Best Time std"].apply(lambda x: f"{x:4.3f}" if not pd.isna(x) else "No Run")
)
df_times["HPO Time"] = (
	df_times["HPO Time mean"].apply(lambda x: f"{x:4.3f}" if not pd.isna(x) else "No Run")
	+ " $\\pm$ "
	+ df_times["HPO Time std"].apply(lambda x: f"{x:4.3f}" if not pd.isna(x) else "No Run")
)

# Join with the existing df_metrics (verify we have the same number of rows!)
df_metrics = df_metrics.join(df_times, how="outer")

In [36]:
df_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,ARI mean,ARI std,Rescaled ARI mean,Rescaled ARI std,Calinski mean,Calinski std,Rescaled Calinski mean,Rescaled Calinski std,Silhouette mean,Silhouette std,...,Rescaled Calinski,Silhouette,Rescaled Silhouette,Composite Metric,Best Time mean,Best Time std,HPO Time mean,HPO Time std,Best Time,HPO Time
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
alizadeh-2000-v2,CoHiRF,0.865875,0.006962,0.865875,0.006962,15.094428,0.125896,0.800000,0.447214,0.195661,0.014860,...,0.800 $\pm$ 0.447,0.196 $\pm$ 0.015,0.598 $\pm$ 0.007,0.755 $\pm$ 0.075,0.036928,0.012063,141.002382,4.138507,0.037 $\pm$ 0.012,141.002 $\pm$ 4.139
alizadeh-2000-v2,CoHiRF-DBSCAN,0.000000,0.000000,0.000000,0.000000,-1.000000,0.000000,0.000000,0.000000,-1.000000,0.000000,...,0.000 $\pm$ 0.000,-1.000 $\pm$ 0.000,0.000 $\pm$ 0.000,0.000 $\pm$ 0.000,1.622059,0.128318,142.119993,4.960189,1.622 $\pm$ 0.128,142.120 $\pm$ 4.960
alizadeh-2000-v2,CoHiRF-DBSCAN-top-down,0.000000,0.000000,0.000000,0.000000,-1.000000,0.000000,0.000000,0.000000,-1.000000,0.000000,...,0.000 $\pm$ 0.000,-1.000 $\pm$ 0.000,0.000 $\pm$ 0.000,0.000 $\pm$ 0.000,3.367055,0.242905,438.809734,19.707713,3.367 $\pm$ 0.243,438.810 $\pm$ 19.708
alizadeh-2000-v2,CoHiRF-DBSCAN-top-down-inv,0.000000,0.000000,0.000000,0.000000,-1.000000,0.000000,0.000000,0.000000,-1.000000,0.000000,...,0.000 $\pm$ 0.000,-1.000 $\pm$ 0.000,0.000 $\pm$ 0.000,0.000 $\pm$ 0.000,3.409933,0.307794,549.589531,90.319148,3.410 $\pm$ 0.308,549.590 $\pm$ 90.319
alizadeh-2000-v2,CoHiRF-KernelRBF,0.095651,0.046741,0.095651,0.046741,1.726348,0.463955,0.262125,0.236603,0.027594,0.013708,...,0.262 $\pm$ 0.237,0.028 $\pm$ 0.014,0.514 $\pm$ 0.007,0.291 $\pm$ 0.040,0.172984,0.116717,154.118753,8.905365,0.173 $\pm$ 0.117,154.119 $\pm$ 8.905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
shuttle,CoHiRF-top-down,0.645521,0.024557,0.645521,0.024557,20315.310691,180.882042,0.535965,0.007674,0.969336,0.004637,...,0.536 $\pm$ 0.008,0.969 $\pm$ 0.005,0.985 $\pm$ 0.002,0.722 $\pm$ 0.004,17.545554,1.891131,1565.533466,908.117926,17.546 $\pm$ 1.891,1565.533 $\pm$ 908.118
shuttle,CoHiRF-top-down-inv,0.604689,0.052787,0.604689,0.052787,20425.366028,149.914779,0.540635,0.006361,0.652623,0.174395,...,0.541 $\pm$ 0.006,0.653 $\pm$ 0.174,0.826 $\pm$ 0.087,0.657 $\pm$ 0.017,16.421603,3.854344,1363.672504,926.578025,16.422 $\pm$ 3.854,1363.673 $\pm$ 926.578
shuttle,DBSCAN,0.708522,0.012708,0.708522,0.012708,7504.081967,12.888252,0.998572,0.001958,0.948136,0.000715,...,0.999 $\pm$ 0.002,0.948 $\pm$ 0.001,0.974 $\pm$ 0.000,0.894 $\pm$ 0.002,35.036979,11.071918,3514.463884,1254.458490,35.037 $\pm$ 11.072,3514.464 $\pm$ 1254.458
shuttle,KMeans,0.608404,0.000000,0.608404,0.000000,30845.870523,297.411898,0.982752,0.012618,0.960408,0.008693,...,0.983 $\pm$ 0.013,0.960 $\pm$ 0.009,0.980 $\pm$ 0.004,0.857 $\pm$ 0.002,0.068935,0.066632,750.227468,1000.558793,0.069 $\pm$ 0.067,750.227 $\pm$ 1000.559


The following will provide the latex code for a clean table, we only need to make a little adjustement in the first line to delete the "key" and have only one header. For the longtable environment (full data) we need to add the "\*" at the end of lines we dont want to have a page break. We also should replace the entire begin{table} ... end{table} by begin{longtable} ... end{longtable} in the latex file, if you want to put caption and labels you should break the line after with '\\' (put both on the same line!)


In [37]:
df_latex = df_metrics.copy()
hpo_metrics_to_hide = [metric for metric in hpo_metrics_rename if metric.find("Rescaled") != -1] 
columns_to_hide = [col for col in df_latex.columns if col not in (hpo_metrics_rename + ["Best Time", "HPO Time"])]
columns_to_hide += hpo_metrics_to_hide
df_latex = df_latex.style.hide(columns_to_hide, axis=1)
for col in hpo_metrics_rename + ["Best Time", "HPO Time"]:
    highlight_metric = partial(highlight_max, column_name=f"{col} mean")
    underline_2nd_metric = partial(underline_2nd_max, column_name=f"{col} mean")
    if col in ["Davies-Bouldin", "Best Time", "HPO Time"]:
        highlight_metric = partial(highlight_min, column_name=f"{col} mean")
        underline_2nd_metric = partial(underline_2nd_min, column_name=f"{col} mean")
    (df_latex.apply(highlight_metric, subset=[col, f"{col} mean"], axis=None)
    .apply(underline_2nd_metric, subset=[col, f"{col} mean"], axis=None))

environment = 'longtable'
latex_output = df_latex.to_latex(
    hrules=True,
    clines="skip-last;data",
    convert_css=True,
    column_format="ll" + "l" * (len(df_latex.columns) - len(columns_to_hide)),
    environment=environment,
)

# fix header
columns = df_latex.index.names + [col for col in df_latex.columns if col not in columns_to_hide]
header_line = ' & '.join(columns) + r' \\'
latex_output = latex_output.splitlines()
if environment is None:
    # remove 3th and 4th line and replace with header_line
    latex_output = latex_output[:2] + [header_line] + latex_output[4:]
else:
    # remove 3rd and 4th line and 8th and 9th line and replace with header_line
    latex_output = latex_output[:2] + [header_line] + latex_output[4:7] + [header_line] + latex_output[9:]

latex_output = "\n".join(latex_output)
print(latex_output)

\begin{longtable}{llllllll}
\toprule
Dataset & Model & ARI & Calinski & Silhouette & Composite Metric & Best Time & HPO Time \\
\midrule
\endfirsthead
\toprule
Dataset & Model & ARI & Calinski & Silhouette & Composite Metric & Best Time & HPO Time \\
\midrule
\endhead
\midrule
\multicolumn{8}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
\multirow[c]{16}{*}{alizadeh-2000-v2} & CoHiRF & 0.866 $\pm$ 0.007 & \underline{15.094 $\pm$ 0.126} & 0.196 $\pm$ 0.015 & 0.755 $\pm$ 0.075 & \underline{0.037 $\pm$ 0.012} & 141.002 $\pm$ 4.139 \\
 & CoHiRF-DBSCAN & 0.000 $\pm$ 0.000 & -1.000 $\pm$ 0.000 & -1.000 $\pm$ 0.000 & 0.000 $\pm$ 0.000 & 1.622 $\pm$ 0.128 & 142.120 $\pm$ 4.960 \\
 & CoHiRF-DBSCAN-top-down & 0.000 $\pm$ 0.000 & -1.000 $\pm$ 0.000 & -1.000 $\pm$ 0.000 & 0.000 $\pm$ 0.000 & 3.367 $\pm$ 0.243 & 438.810 $\pm$ 19.708 \\
 & CoHiRF-DBSCAN-top-down-inv & 0.000 $\pm$ 0.000 & -1.000 $\pm$ 0.000 & -1.000 $\pm$ 0.000 & 0.000 $\pm$ 0.000 & 3.410 $\pm$ 0.308 & 549.

# per dataset per model group

In [38]:
model_groups

{'KMeans': ['KMeans',
  'CoHiRF',
  'CoHiRF-top-down',
  'CoHiRF-top-down-inv',
  'CoHiRF-1000',
  'BatchCoHiRF'],
 'KernelKMeans': ['KernelRBFKMeans',
  'CoHiRF-KernelRBF',
  'CoHiRF-KernelRBF-top-down',
  'CoHiRF-KernelRBF-top-down-inv',
  'BatchCoHiRF-KernelRBF'],
 'DBSCAN': ['DBSCAN',
  'CoHiRF-DBSCAN',
  'CoHiRF-DBSCAN-top-down',
  'CoHiRF-DBSCAN-top-down-inv',
  'BatchCoHiRF-DBSCAN'],
 'SC-SRGF': ['SC-SRGF',
  'CoHiRF-SC-SRGF',
  'CoHiRF-SC-SRGF-1R',
  'CoHiRF-SC-SRGF-2R',
  'BatchCoHiRF-SC-SRGF']}

In [39]:
df_latex = df_metrics.copy()
df_latex = df_latex.reset_index()
# reapply model groups
df_latex['Base Model'] = df_latex['Model'].apply(lambda x: next((group for group, models in model_groups.items() if x in models), 'Other'))
# redefine index with model_group
df_latex = df_latex.set_index(['Dataset', 'Base Model', 'Model'])
# sort by dataset, model_group, model
df_latex = df_latex.sort_index(level=['Dataset', 'Base Model', 'Model'])


# print per dataset
for dataset in df_latex.index.get_level_values('Dataset').unique():
    df_print = df_latex.copy()
    df_print = df_print.loc[dataset]
    hpo_metrics_to_hide = [metric for metric in hpo_metrics_rename if metric.find("Rescaled") != -1] 
    columns_to_hide = [col for col in df_latex.columns if col not in (hpo_metrics_rename + ["Best Time", "HPO Time"])]
    columns_to_hide += hpo_metrics_to_hide
    df_print = df_print.style.hide(columns_to_hide, axis=1)
    for col in hpo_metrics_rename + ["Best Time", "HPO Time"]:
        highlight_metric = partial(highlight_max, column_name=f"{col} mean")
        underline_2nd_metric = partial(underline_2nd_max, column_name=f"{col} mean")
        if col in ["Davies-Bouldin", "Best Time", "HPO Time"]:
            highlight_metric = partial(highlight_min, column_name=f"{col} mean")
            underline_2nd_metric = partial(underline_2nd_min, column_name=f"{col} mean")
        (
            df_print.apply(highlight_metric, subset=[col, f"{col} mean"], axis=None).apply(
                underline_2nd_metric, subset=[col, f"{col} mean"], axis=None
            )
        )

    df_print = df_print.hide(level=0, axis=0)
    latex_output = (
        df_print.to_latex(
            hrules=True,
            clines="skip-last;data",
            convert_css=True,
            column_format="ll" + "l" * (len(df_print.columns) - len(columns_to_hide)),
            # environment="longtable",
            caption=f"Clustering results on dataset {dataset}",
        )
    )

    # fix header
    columns = df_print.index.names[:1] + [col for col in df_print.columns if col not in columns_to_hide]
    header_line = ' & '.join(columns) + r' \\'

    # split into lines
    latex_output = latex_output.splitlines()
    # remove 5th and 6th line and replace with header_line
    latex_output = latex_output[:4] + [header_line] + latex_output[6:]
    latex_output = '\n'.join(latex_output)

    # manually add clines after model groups
    model_groups_in_data = df_print.index.get_level_values('Base Model').unique().tolist()
    lines = latex_output.splitlines()
    new_lines = []
    last_line = ""
    for i, line in enumerate(lines[6:-3]): # skip first 6 lines and last 3 lines
        model_last_line = last_line.split('&')[0].strip()
        model_current_line = line.split('&')[0].strip()
        model_group_last_line = next((group for group, models in model_groups.items() if model_last_line in models), 'Other')
        model_group_current_line = next((group for group, models in model_groups.items() if model_current_line in models), 'Other')
        if model_group_last_line != model_group_current_line and i != 0:
            new_lines.append(r'\cline{' + f'1-{len(columns)}' + r'}')
        new_lines.append(line)
        last_line = line

    latex_output = '\n'.join(lines[:6] + new_lines + lines[-3:])

    print(latex_output)
    print("\n\n")
    print("\pagebreak")

\begin{table}
\caption{Clustering results on dataset alizadeh-2000-v2}
\begin{tabular}{llllllll}
\toprule
Base Model & ARI & Calinski & Silhouette & Composite Metric & Best Time & HPO Time \\
\midrule
CoHiRF-DBSCAN & \bfseries \underline{0.000 $\pm$ 0.000} & \bfseries \underline{-1.000 $\pm$ 0.000} & \bfseries \underline{-1.000 $\pm$ 0.000} & \bfseries \underline{0.000 $\pm$ 0.000} & \underline{1.622 $\pm$ 0.128} & \underline{142.120 $\pm$ 4.960} \\
CoHiRF-DBSCAN-top-down & \bfseries \underline{0.000 $\pm$ 0.000} & \bfseries \underline{-1.000 $\pm$ 0.000} & \bfseries \underline{-1.000 $\pm$ 0.000} & \bfseries \underline{0.000 $\pm$ 0.000} & 3.367 $\pm$ 0.243 & 438.810 $\pm$ 19.708 \\
CoHiRF-DBSCAN-top-down-inv & \bfseries \underline{0.000 $\pm$ 0.000} & \bfseries \underline{-1.000 $\pm$ 0.000} & \bfseries \underline{-1.000 $\pm$ 0.000} & \bfseries \underline{0.000 $\pm$ 0.000} & 3.410 $\pm$ 0.308 & 549.590 $\pm$ 90.319 \\
DBSCAN & \bfseries \underline{0.000 $\pm$ 0.000} & \bfseries \un

# KMeans

In [56]:
df_latex = df_metrics.copy()
columns_to_hide = [col for col in df_latex.columns if col not in (hpo_metrics_rename + ["Best Time", "HPO Time"])]
columns_to_hide += ["NMI"]
datasets_to_keep = [
    "garber-2001",
    "alizadeh-2000-v2",
    "golub-1999-v2",
    "armstrong-2002-v1",
    "nursery",
    "segment",
]
models_to_keep = [
    "K-Means",
    "CoHiRF",
	"CoHiRF-1000",
    "Batch CoHiRF",
]
df_latex = df_latex.loc[
    df_latex.index.get_level_values("Dataset").isin(datasets_to_keep)
    & df_latex.index.get_level_values("Model").isin(models_to_keep),
    :,
]
highlight_max_ari = partial(highlight_max, column_name="ARI mean")
highlight_max_ami = partial(highlight_max, column_name="AMI mean")
highlight_max_calinski = partial(highlight_max, column_name="Calinski mean")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette mean")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin mean")
highlight_min_best_time = partial(highlight_min, column_name="Best Time mean")
highlight_min_hpo_time = partial(highlight_min, column_name="HPO Time mean")
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI mean")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI mean")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski mean")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette mean")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin mean")
underline_2nd_min_best_time = partial(underline_2nd_min, column_name="Best Time mean")
underline_2nd_min_hpo_time = partial(underline_2nd_min, column_name="HPO Time mean")
print(
    df_latex.style.apply(highlight_max_ari, subset=["ARI", "ARI mean"], axis=None)
    .apply(underline_2nd_max_ari, subset=["ARI", "ARI mean"], axis=None)
    .apply(highlight_max_ami, subset=["AMI", "AMI mean"], axis=None)
    .apply(underline_2nd_max_ami, subset=["AMI", "AMI mean"], axis=None)
    .apply(highlight_max_calinski, subset=["Calinski", "Calinski mean"], axis=None)
    .apply(underline_2nd_max_calinski, subset=["Calinski", "Calinski mean"], axis=None)
    .apply(highlight_max_silhouette, subset=["Silhouette", "Silhouette mean"], axis=None)
    .apply(underline_2nd_max_silhouette, subset=["Silhouette", "Silhouette mean"], axis=None)
    .apply(highlight_min_davies_bouldin, subset=["Davies-Bouldin", "Davies-Bouldin mean"], axis=None)
    .apply(underline_2nd_min_davies_bouldin, subset=["Davies-Bouldin", "Davies-Bouldin mean"], axis=None)
    .hide(columns_to_hide, axis=1)
    .to_latex(
        hrules=True,
        clines="skip-last;data",
        convert_css=True,
        column_format="ll" + "l" * (len(df_latex.columns) - len(columns_to_hide)),
        # environment="longtable",
    )
)

\begin{tabular}{lllllllll}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Best Time & HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{3}{*}{alizadeh-2000-v2} & Batch CoHiRF & 0.360 $\pm$ 0.014 & 0.507 $\pm$ 0.021 & 4.274 $\pm$ 0.238 & 0.049 $\pm$ 0.010 & \bfseries 0.411 $\pm$ 0.084 & 0.143 $\pm$ 0.069 & 154.717 $\pm$ 4.251 \\
 & CoHiRF & \bfseries 0.866 $\pm$ 0.007 & \bfseries 0.771 $\pm$ 0.030 & \underline{15.094 $\pm$ 0.126} & \underline{0.196 $\pm$ 0.015} & \underline{0.896 $\pm$ 0.083} & 0.054 $\pm$ 0.033 & 142.623 $\pm$ 4.180 \\
 & K-Means & \underline{0.838 $\pm$ 0.014} & \underline{0.767 $\pm$ 0.037} & \bfseries 15.151 $\pm$ 0.000 & \bfseries 0.204 $\pm$ 0.036 & 0.934 $\pm$ 0.021 & 0.012 $\pm$ 0.003 & 137.865 $\pm$ 3.673 \\
\cline{1-9}
\multirow[c]{3}{*}{armstrong-2002-v1} & Batch CoHiRF & 0.225 $\pm$ 0.071 & 0.267 $\pm$ 0.061 & 2.933 $\pm$ 0.333 & 0.003 $\pm$ 0.007 & \bfseries 0.401 $\pm$ 0.025 & 0.125 $\pm$ 0.039 & 90.402 $\pm$ 3.

# Kernel KMeans

In [57]:
df_latex = df_metrics.copy()
columns_to_hide = [col for col in df_latex.columns if col not in (hpo_metrics_rename + ["Best Time", "HPO Time"])]
columns_to_hide += ["NMI"]
datasets_to_keep = [
    "khan-2001",
    "bittner-2000",
    "iris",
    "satimage",
]
models_to_keep = [
    "Kernel RBF K-Means",
    "CoHiRF-KernelRBF",
    "Batch CoHiRF-KernelRBF",
]
df_latex = df_latex.loc[
    df_latex.index.get_level_values("Dataset").isin(datasets_to_keep)
    & df_latex.index.get_level_values("Model").isin(models_to_keep),
    :,
]
df_latex = df_latex.loc[
    df_latex.index.get_level_values("Dataset").isin(datasets_to_keep)
    & df_latex.index.get_level_values("Model").isin(models_to_keep),
    :,
]
highlight_max_ari = partial(highlight_max, column_name="ARI mean")
highlight_max_ami = partial(highlight_max, column_name="AMI mean")
highlight_max_calinski = partial(highlight_max, column_name="Calinski mean")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette mean")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin mean")
highlight_min_best_time = partial(highlight_min, column_name="Best Time mean")
highlight_min_hpo_time = partial(highlight_min, column_name="HPO Time mean")
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI mean")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI mean")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski mean")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette mean")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin mean")
underline_2nd_min_best_time = partial(underline_2nd_min, column_name="Best Time mean")
underline_2nd_min_hpo_time = partial(underline_2nd_min, column_name="HPO Time mean")
print(
    df_latex.style.apply(highlight_max_ari, subset=["ARI", "ARI mean"], axis=None)
    .apply(underline_2nd_max_ari, subset=["ARI", "ARI mean"], axis=None)
    .apply(highlight_max_ami, subset=["AMI", "AMI mean"], axis=None)
    .apply(underline_2nd_max_ami, subset=["AMI", "AMI mean"], axis=None)
    .apply(highlight_max_calinski, subset=["Calinski", "Calinski mean"], axis=None)
    .apply(underline_2nd_max_calinski, subset=["Calinski", "Calinski mean"], axis=None)
    .apply(highlight_max_silhouette, subset=["Silhouette", "Silhouette mean"], axis=None)
    .apply(underline_2nd_max_silhouette, subset=["Silhouette", "Silhouette mean"], axis=None)
    .apply(highlight_min_davies_bouldin, subset=["Davies-Bouldin", "Davies-Bouldin mean"], axis=None)
    .apply(underline_2nd_min_davies_bouldin, subset=["Davies-Bouldin", "Davies-Bouldin mean"], axis=None)
    .hide(columns_to_hide, axis=1)
    .to_latex(
        hrules=True,
        clines="skip-last;data",
        convert_css=True,
        column_format="ll" + "l" * (len(df_latex.columns) - len(columns_to_hide)),
        # environment="longtable",
    )
)

\begin{tabular}{lllllllll}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Best Time & HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{3}{*}{bittner-2000} & Batch CoHiRF-KernelRBF & 0.088 $\pm$ 0.015 & 0.098 $\pm$ 0.011 & \bfseries 1.918 $\pm$ 0.217 & 0.041 $\pm$ 0.020 & \bfseries 0.499 $\pm$ 0.083 & 0.224 $\pm$ 0.180 & 171.649 $\pm$ 4.442 \\
 & CoHiRF-KernelRBF & \bfseries 0.104 $\pm$ 0.049 & \underline{0.136 $\pm$ 0.043} & \underline{1.569 $\pm$ 0.146} & \bfseries 0.054 $\pm$ 0.066 & \underline{0.530 $\pm$ 0.053} & 0.191 $\pm$ 0.118 & 158.762 $\pm$ 4.244 \\
 & Kernel RBF K-Means & \underline{0.090 $\pm$ 0.038} & \bfseries 0.137 $\pm$ 0.034 & 1.334 $\pm$ 0.128 & \underline{0.043 $\pm$ 0.021} & 0.905 $\pm$ 0.014 & 0.062 $\pm$ 0.017 & 148.491 $\pm$ 2.636 \\
\cline{1-9}
\multirow[c]{3}{*}{iris} & Batch CoHiRF-KernelRBF & \underline{0.687 $\pm$ 0.090} & 0.704 $\pm$ 0.073 & 121.522 $\pm$ 26.880 & 0.396 $\pm$ 0.162 & 0.889 $\pm$ 0.169 & 0.351 $\

# DBSCAN

In [58]:
df_latex = df_metrics.copy()
columns_to_hide = [col for col in df_latex.columns if col not in (hpo_metrics_rename + ["Best Time", "HPO Time"])]
columns_to_hide += ["NMI"]
datasets_to_keep = ["ecoli", "binary-alpha-digits", "segment", "chowdary-2006", "shuttle"]
models_to_keep = [
    "DBSCAN",
    "CoHiRF-DBSCAN",
    "Batch CoHiRF-DBSCAN",
]
df_latex = df_latex.loc[
    df_latex.index.get_level_values("Dataset").isin(datasets_to_keep)
    & df_latex.index.get_level_values("Model").isin(models_to_keep),
    :,
]
highlight_max_ari = partial(highlight_max, column_name="ARI mean")
highlight_max_ami = partial(highlight_max, column_name="AMI mean")
highlight_max_calinski = partial(highlight_max, column_name="Calinski mean")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette mean")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin mean")
highlight_min_best_time = partial(highlight_min, column_name="Best Time mean")
highlight_min_hpo_time = partial(highlight_min, column_name="HPO Time mean")
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI mean")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI mean")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski mean")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette mean")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin mean")
underline_2nd_min_best_time = partial(underline_2nd_min, column_name="Best Time mean")
underline_2nd_min_hpo_time = partial(underline_2nd_min, column_name="HPO Time mean")
print(
    df_latex.style.apply(highlight_max_ari, subset=["ARI", "ARI mean"], axis=None)
    .apply(underline_2nd_max_ari, subset=["ARI", "ARI mean"], axis=None)
    .apply(highlight_max_ami, subset=["AMI", "AMI mean"], axis=None)
    .apply(underline_2nd_max_ami, subset=["AMI", "AMI mean"], axis=None)
    .apply(highlight_max_calinski, subset=["Calinski", "Calinski mean"], axis=None)
    .apply(underline_2nd_max_calinski, subset=["Calinski", "Calinski mean"], axis=None)
    .apply(highlight_max_silhouette, subset=["Silhouette", "Silhouette mean"], axis=None)
    .apply(underline_2nd_max_silhouette, subset=["Silhouette", "Silhouette mean"], axis=None)
    .apply(highlight_min_davies_bouldin, subset=["Davies-Bouldin", "Davies-Bouldin mean"], axis=None)
    .apply(underline_2nd_min_davies_bouldin, subset=["Davies-Bouldin", "Davies-Bouldin mean"], axis=None)
    .hide(columns_to_hide, axis=1)
    .to_latex(
        hrules=True,
        clines="skip-last;data",
        convert_css=True,
        column_format="ll" + "l" * (len(df_latex.columns) - len(columns_to_hide)),
        # environment="longtable",
    )
)

\begin{tabular}{lllllllll}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Best Time & HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{3}{*}{binary-alpha-digits} & Batch CoHiRF-DBSCAN & \underline{0.006 $\pm$ 0.008} & \underline{0.032 $\pm$ 0.033} & \underline{1.473 $\pm$ 0.131} & \bfseries 0.080 $\pm$ 0.025 & \bfseries 0.837 $\pm$ 0.037 & 0.284 $\pm$ 0.238 & 58.526 $\pm$ 3.098 \\
 & CoHiRF-DBSCAN & \bfseries 0.009 $\pm$ 0.005 & \bfseries 0.035 $\pm$ 0.026 & 1.353 $\pm$ 0.109 & \underline{0.011 $\pm$ 0.035} & \underline{0.862 $\pm$ 0.035} & 0.189 $\pm$ 0.041 & 57.093 $\pm$ 1.612 \\
 & DBSCAN & 0.000 $\pm$ 0.000 & 0.002 $\pm$ 0.003 & \bfseries 1.574 $\pm$ 3.525 & -0.561 $\pm$ 0.602 & 600.462 $\pm$ 547.090 & 0.053 $\pm$ 0.036 & 44.413 $\pm$ 2.076 \\
\cline{1-9}
\multirow[c]{3}{*}{chowdary-2006} & Batch CoHiRF-DBSCAN & \underline{0.068 $\pm$ 0.053} & \underline{0.099 $\pm$ 0.047} & \underline{37.353 $\pm$ 0.000} & 0.398 $\pm$ 0.366 & \underlin

# SC-SRGF


In [59]:
df_latex = df_metrics.copy()
columns_to_hide = [col for col in df_latex.columns if col not in (hpo_metrics_rename + ["Best Time", "HPO Time"])]
columns_to_hide += ["NMI"]
datasets_to_keep = ["alizadeh-2000-v3", "alizadeh-2000-v2", "har", "satimage", "chowdary-2006"]
models_to_keep = [
    "SC-SRGF",
	"CoHiRF-SC-SRGF",
    "Batch CoHiRF-SC-SRGF",
]
df_latex = df_latex.loc[
    df_latex.index.get_level_values("Dataset").isin(datasets_to_keep)
    & df_latex.index.get_level_values("Model").isin(models_to_keep),
    :,
]
highlight_max_ari = partial(highlight_max, column_name="ARI mean")
highlight_max_ami = partial(highlight_max, column_name="AMI mean")
highlight_max_calinski = partial(highlight_max, column_name="Calinski mean")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette mean")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin mean")
highlight_min_best_time = partial(highlight_min, column_name="Best Time mean")
highlight_min_hpo_time = partial(highlight_min, column_name="HPO Time mean")
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI mean")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI mean")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski mean")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette mean")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin mean")
underline_2nd_min_best_time = partial(underline_2nd_min, column_name="Best Time mean")
underline_2nd_min_hpo_time = partial(underline_2nd_min, column_name="HPO Time mean")
print(
    df_latex.style.apply(highlight_max_ari, subset=["ARI", "ARI mean"], axis=None)
    .apply(underline_2nd_max_ari, subset=["ARI", "ARI mean"], axis=None)
    .apply(highlight_max_ami, subset=["AMI", "AMI mean"], axis=None)
    .apply(underline_2nd_max_ami, subset=["AMI", "AMI mean"], axis=None)
    .apply(highlight_max_calinski, subset=["Calinski", "Calinski mean"], axis=None)
    .apply(underline_2nd_max_calinski, subset=["Calinski", "Calinski mean"], axis=None)
    .apply(highlight_max_silhouette, subset=["Silhouette", "Silhouette mean"], axis=None)
    .apply(underline_2nd_max_silhouette, subset=["Silhouette", "Silhouette mean"], axis=None)
    .apply(highlight_min_davies_bouldin, subset=["Davies-Bouldin", "Davies-Bouldin mean"], axis=None)
    .apply(underline_2nd_min_davies_bouldin, subset=["Davies-Bouldin", "Davies-Bouldin mean"], axis=None)
    .hide(columns_to_hide, axis=1)
    .to_latex(
        hrules=True,
        clines="skip-last;data",
        convert_css=True,
        column_format="ll" + "l" * (len(df_latex.columns) - len(columns_to_hide)),
        # environment="longtable",
    )
)

\begin{tabular}{lllllllll}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Best Time & HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{2}{*}{alizadeh-2000-v2} & CoHiRF-SC-SRGF & \underline{0.633 $\pm$ 0.042} & \underline{0.758 $\pm$ 0.031} & \underline{7.422 $\pm$ 0.041} & \underline{0.094 $\pm$ 0.001} & \underline{1.550 $\pm$ 0.019} & 1.503 $\pm$ 1.527 & 254.264 $\pm$ 14.525 \\
 & SC-SRGF & \bfseries 0.891 $\pm$ 0.125 & \bfseries 0.854 $\pm$ 0.093 & \bfseries 12.338 $\pm$ 0.000 & \bfseries 0.194 $\pm$ 0.000 & \bfseries 0.936 $\pm$ 0.014 & 0.132 $\pm$ 0.061 & 142.764 $\pm$ 4.481 \\
\cline{1-9}
\multirow[c]{2}{*}{alizadeh-2000-v3} & CoHiRF-SC-SRGF & \underline{0.443 $\pm$ 0.013} & \underline{0.628 $\pm$ 0.019} & \underline{7.411 $\pm$ 0.060} & \underline{0.093 $\pm$ 0.002} & \underline{1.562 $\pm$ 0.047} & 0.928 $\pm$ 1.146 & 245.332 $\pm$ 13.850 \\
 & SC-SRGF & \bfseries 0.519 $\pm$ 0.000 & \bfseries 0.702 $\pm$ 0.048 & \bfseries 12.341 $\p

# COIL 20

In [61]:
df_latex = df_metrics.copy()
columns_to_hide = [col for col in df_latex.columns if col not in (hpo_metrics_rename + ["Best Time", "HPO Time"])]
columns_to_hide += ["NMI"]
datasets_to_keep = ["coil-20", "mnist"]
models_to_keep = [
    "K-Means",
    "CoHiRF",
	"CoHiRF-1000",
    "Batch CoHiRF",
    "Kernel RBF K-Means",
    "CoHiRF-KernelRBF",
    "Batch CoHiRF-KernelRBF",
    "DBSCAN",
    "CoHiRF-DBSCAN",
    "Batch CoHiRF-DBSCAN",
    "SC-SRGF",
    "Batch CoHiRF-SC-SRGF",
]
df_latex = df_latex.loc[
    df_latex.index.get_level_values("Dataset").isin(datasets_to_keep)
    & df_latex.index.get_level_values("Model").isin(models_to_keep),
    :,
]
highlight_max_ari = partial(highlight_max, column_name="ARI mean")
highlight_max_ami = partial(highlight_max, column_name="AMI mean")
highlight_max_calinski = partial(highlight_max, column_name="Calinski mean")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette mean")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin mean")
highlight_min_best_time = partial(highlight_min, column_name="Best Time mean")
highlight_min_hpo_time = partial(highlight_min, column_name="HPO Time mean")
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI mean")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI mean")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski mean")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette mean")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin mean")
underline_2nd_min_best_time = partial(underline_2nd_min, column_name="Best Time mean")
underline_2nd_min_hpo_time = partial(underline_2nd_min, column_name="HPO Time mean")
print(
    df_latex.style.apply(highlight_max_ari, subset=["ARI", "ARI mean"], axis=None)
    .apply(underline_2nd_max_ari, subset=["ARI", "ARI mean"], axis=None)
    .apply(highlight_max_ami, subset=["AMI", "AMI mean"], axis=None)
    .apply(underline_2nd_max_ami, subset=["AMI", "AMI mean"], axis=None)
    .apply(highlight_max_calinski, subset=["Calinski", "Calinski mean"], axis=None)
    .apply(underline_2nd_max_calinski, subset=["Calinski", "Calinski mean"], axis=None)
    .apply(highlight_max_silhouette, subset=["Silhouette", "Silhouette mean"], axis=None)
    .apply(underline_2nd_max_silhouette, subset=["Silhouette", "Silhouette mean"], axis=None)
    .apply(highlight_min_davies_bouldin, subset=["Davies-Bouldin", "Davies-Bouldin mean"], axis=None)
    .apply(underline_2nd_min_davies_bouldin, subset=["Davies-Bouldin", "Davies-Bouldin mean"], axis=None)
    .hide(columns_to_hide, axis=1)
    .to_latex(
        hrules=True,
        clines="skip-last;data",
        convert_css=True,
        column_format="ll" + "l" * (len(df_latex.columns) - len(columns_to_hide)),
        # environment="longtable",
    )
)

\begin{tabular}{lllllllll}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Best Time & HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{11}{*}{coil-20} & Batch CoHiRF & 0.381 $\pm$ 0.008 & 0.632 $\pm$ 0.010 & 62.639 $\pm$ 4.789 & 0.107 $\pm$ 0.018 & 1.532 $\pm$ 0.065 & 0.541 $\pm$ 0.153 & 172.626 $\pm$ 7.695 \\
 & Batch CoHiRF-DBSCAN & 0.335 $\pm$ 0.055 & 0.588 $\pm$ 0.051 & \bfseries 291.162 $\pm$ 71.197 & -0.001 $\pm$ 0.008 & \bfseries 0.077 $\pm$ 0.041 & 1.305 $\pm$ 1.005 & 175.064 $\pm$ 19.429 \\
 & Batch CoHiRF-KernelRBF & 0.004 $\pm$ 0.006 & 0.023 $\pm$ 0.020 & 1.804 $\pm$ 0.276 & 0.005 $\pm$ 0.002 & 2.404 $\pm$ 0.100 & 10.855 $\pm$ 17.100 & 510.956 $\pm$ 280.500 \\
 & CoHiRF & 0.355 $\pm$ 0.041 & 0.627 $\pm$ 0.017 & 287.989 $\pm$ 0.164 & \underline{0.220 $\pm$ 0.095} & 1.597 $\pm$ 0.166 & 0.290 $\pm$ 0.087 & 151.293 $\pm$ 4.393 \\
 & CoHiRF-1000 & 0.338 $\pm$ 0.009 & 0.632 $\pm$ 0.006 & 288.053 $\pm$ 0.078 & 0.180 $\pm$ 0.004 & 1.725 

# Debug and explore

In [25]:
df = df_runs_raw_2.copy()

In [26]:
df = df.loc[df["status"].isin(["FAILED", "RUNNING"])]
df

key,status,start_time,end_time,best/base_model_kwargs/eps,best/base_model_kwargs/min_samples,best/base_model_kwargs/n_clusters,best/base_model_kwargs/n_similarities,best/base_model_kwargs/sampling_ratio,best/base_model_kwargs/sc_n_clusters,best/child_run_id,...,best/rand_score,best/silhouette,best/v_measure,fit_model_return_elapsed_time,max_memory_used,max_memory_used_after_fit,EXCEPTION,Last step finished,mlflow.parentRunId,raised_exception
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0002f1471a6f4a639bcfd8a3d7c90f1f,FAILED,2025-07-16 14:26:07.607,1.752676e+12,,,,,,,,...,,,,21.065334,1492.632,1492.632,Expected a 2-dimensional container but got <cl...,,4b3d5799ec4448719f0a0c5c291eda0c,True
0002f564f6e3405d859abc0abd6c448e,FAILED,2025-07-15 15:39:34.468,1.752594e+12,,,,,,,,...,,,,0.013465,455.484,455.484,could not convert string to float: 'OG',,742cb3f363ef47a79da84cc6e4086844,True
000303ea6e564058a7def8cddeeda1b0,FAILED,2025-07-15 13:18:50.524,1.752586e+12,,,,,,,,...,,,,0.025954,595.816,595.816,"could not convert string to float: '""SCC""'",,67862e364b9e4bf9a3a723cf9fae0873,True
00065738f4ba45a9b05cf8d62618e24b,FAILED,2025-07-15 12:30:22.107,1.752583e+12,,,,,,,,...,,,,0.029382,591.684,591.684,"could not convert string to float: '""SCC""'",,53b4533a3e62427d90f539c1e9ac0337,True
0012a8230edf41cca17eb741cbc86867,FAILED,2025-07-16 14:24:32.448,1.752676e+12,,,,,,,,...,,,,0.015206,493.316,493.316,could not convert string to float: 'DLBCL2',,181c0696d3b643758607c7dec14dfe0c,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffc589efa744f54b3378fcda6ccbcfe,FAILED,2025-11-06 11:50:01.603,1.762430e+12,,,,,,,,...,,,,,359.500,,kth(=3) out of bounds (3),_before_fit_model,bff12221a1684a2f88776ab5859934c6,True
fffca7628fd3470d98a993a367c8aa3d,RUNNING,2025-08-12 18:54:10.939,,,,,,,,,...,,,,,,,,,236c3ffdfb1b441da0c1db9ca53994b3,
fffe9e46b22949c48aa2eaffc64b7028,FAILED,2025-07-16 18:04:27.271,1.752689e+12,,,,,,,,...,,,,0.067094,485.088,485.088,"could not convert string to float: '""ML2""'",,8c5ee25a49bc4dd4a8c0f5960653aba4,True
ffff2ab265274063bd33d808a0f27dfe,FAILED,2025-07-16 09:24:41.361,1.752658e+12,,,,,,,,...,,,,0.343937,404.880,404.760,could not convert string to float: 'A',,09a7b833676745ce8fd47a6c5048db70,True


In [28]:
runs_to_delete_parents = list(df.index)
df = df_runs_raw_2.copy()
df = df.loc[df["mlflow.parentRunId"].isin(runs_to_delete_parents)]
runs_to_delete_children = list(df.index)
runs_to_delete = runs_to_delete_children + runs_to_delete_parents
run_uuid_query = [f"'{run_id}'" for run_id in runs_to_delete]
run_uuid_query = ", ".join(run_uuid_query)

In [29]:
query = f"""
UPDATE runs
SET lifecycle_stage = 'deleted'
WHERE run_uuid IN ({run_uuid_query}) 
"""
with engine.begin() as conn:
    conn.execute(text(query))

In [30]:
query = f"""
DELETE
FROM
	experiment_tags
WHERE
	experiment_id = ANY(
	SELECT
		experiment_id
	FROM
		experiments
	WHERE
		lifecycle_stage = 'deleted');

DELETE
FROM
	latest_metrics
WHERE
	run_uuid = ANY(
	SELECT
		run_uuid
	FROM
		runs
	WHERE
		lifecycle_stage = 'deleted');
	
DELETE
FROM
	metrics
WHERE
	run_uuid = ANY(
	SELECT
		run_uuid
	FROM
		runs
	WHERE
		lifecycle_stage = 'deleted');
	
DELETE
FROM
	params
WHERE
	run_uuid = ANY(
	SELECT
		run_uuid
	FROM
		runs
	WHERE
		lifecycle_stage = 'deleted');

DELETE
FROM
	tags
WHERE
	run_uuid = ANY(
	SELECT
		run_uuid
	FROM
		runs
	WHERE
		lifecycle_stage = 'deleted');
	
DELETE 
FROM 
	runs
WHERE 
	lifecycle_stage = 'deleted';

DELETE 
FROM 
	experiments
WHERE 
	lifecycle_stage = 'deleted';
"""
with engine.begin() as conn:
    conn.execute(text(query))