In [1]:
from sqlalchemy import create_engine, text
import pandas as pd
from ml_experiments.analyze import get_df_runs_from_mlflow_sql, get_missing_entries
from pathlib import Path
import os
import pickle
from functools import partial

# Save Results

## Load mlflow runs

In [2]:
# Folder holding the result files, resolved relative to the parent of the
# current working directory; it is created on first use.
results_dir = Path.cwd().parent / "results" / "real"
results_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Connection settings for the PostgreSQL database queried in this notebook.
# NOTE(review): the user name and host are hard-coded in the URL; consider
# reading them from the environment.
db_port = 5101
db_name = "cohirf"
url = f"postgresql://beluccib@localhost:{db_port}/{db_name}"
# url = f"postgresql://beluccib@clust5:{db_port}/{db_name}"
engine = create_engine(url)
# List the name of every row of the `experiments` table.
query = "SELECT experiments.name from experiments"
experiment_names = pd.read_sql(query, engine)["name"].tolist()

In [4]:
experiment_names

['Default',
 'real-adjusted_rand-KernelRBFKMeans',
 'real-adjusted_rand-BatchCoHiRF-KernelRBF-1iter',
 'real-adjusted_rand-KMeans',
 'real-adjusted_rand-DBSCAN',
 'real-adjusted_rand-CoHiRF-DBSCAN',
 'real-adjusted_rand-BatchCoHiRF',
 'real-adjusted_rand-CoHiRF',
 'real-adjusted_rand-CoHiRF-KernelRBF',
 'real-adjusted_rand-BatchCoHiRF-DBSCAN-1iter',
 'real-davies_bouldin_score-BatchCoHiRF-1iter',
 'real-adjusted_mutual_info-BatchCoHiRF-KernelRBF-1iter',
 'real-silhouette-BatchCoHiRF-1iter',
 'real-silhouette-DBSCAN',
 'real-calinski_harabasz_score-BatchCoHiRF-1iter',
 'real-davies_bouldin_score-KernelRBFKMeans',
 'real-silhouette-KMeans',
 'real-davies_bouldin_score-KMeans',
 'real-calinski_harabasz_score-CoHiRF',
 'real-adjusted_mutual_info-BatchCoHiRF-DBSCAN-1iter',
 'real-davies_bouldin_score-BatchCoHiRF-KernelRBF-1iter',
 'real-adjusted_mutual_info-CoHiRF',
 'real-adjusted_mutual_info-CoHiRF-KernelRBF',
 'real-davies_bouldin_score-CoHiRF-DBSCAN',
 'real-adjusted_mutual_info-CoHiRF-

In [5]:
# Keep only the experiments whose name starts with "real-".
# Note the two similar names: `experiment_names` is the full list,
# `experiments_names` (plural) is the filtered one.
experiments_names = [name for name in experiment_names if name.startswith("real-")]

In [7]:
# Collect every distinct parameter key that starts with "best/".
# NOTE(review): the doubled "%%" is presumably there so the DB driver does not
# read "%" as a parameter placeholder — confirm against the driver in use.
query = "SELECT DISTINCT(key) FROM params WHERE key LIKE 'best/%%'"
best_params = pd.read_sql(query, engine)["key"].tolist()

In [8]:
# Parameter keys fetched for every run: a fixed set describing the run set-up,
# followed by every "best/..." key found in the database.
run_setup_params = [
    "model",
    "dataset_id",
    "n_trials",
    "dataset_name",
    "standardize",
    "hpo_metric",
    "direction",
    "hpo_seed",
]
params_columns = run_setup_params + best_params

In [9]:
# Metric keys read from the `latest_metrics` table: three resource metrics
# followed by the "best/"-prefixed metrics.
resource_metric_names = [
    "fit_model_return_elapsed_time",
    "max_memory_used_after_fit",
    "max_memory_used",
]
best_metric_names = [
    "n_clusters_",
    "rand_score",
    "adjusted_rand",
    "mutual_info",
    "adjusted_mutual_info",
    "normalized_mutual_info",
    "homogeneity_completeness_v_measure",
    "silhouette",
    "calinski_harabasz_score",
    "davies_bouldin_score",
    "inertia_score",
    "homogeneity",
    "completeness",
    "v_measure",
    "elapsed_time",
]
latest_metrics_columns = resource_metric_names + [f"best/{metric_name}" for metric_name in best_metric_names]

In [10]:
# Tag keys read from the `tags` table for every run.
tags_columns = [
    "raised_exception",
    "EXCEPTION",
    "mlflow.parentRunId",
    "Last step finished",
]

In [11]:
# Fetch parameters, latest metrics and tags of the selected experiments.
runs_columns = ['run_uuid', 'status', 'start_time', 'end_time']
experiments_columns = []
other_table = 'params'
other_table_keys = params_columns

# The three queries share the engine and the experiment selection; bind them once.
fetch_runs = partial(
    get_df_runs_from_mlflow_sql,
    engine,
    experiments_columns=experiments_columns,
    experiments_names=experiments_names,
)
df_params = fetch_runs(
    runs_columns=runs_columns,
    other_table=other_table,
    other_table_keys=other_table_keys,
)
df_latest_metrics = fetch_runs(
    runs_columns=['run_uuid'],
    other_table='latest_metrics',
    other_table_keys=latest_metrics_columns,
)
df_tags = fetch_runs(
    runs_columns=['run_uuid'],
    other_table='tags',
    other_table_keys=tags_columns,
)

In [4]:
# Load the per-dataset characteristics and index them by the OpenML id as a
# string; the `openml_id` column itself is kept.
dataset_characteristics = pd.read_csv(results_dir / "datasets_characteristics.csv", index_col=0)
openml_ids_as_text = dataset_characteristics["openml_id"].astype(str)
dataset_characteristics = dataset_characteristics.set_index(openml_ids_as_text)

In [5]:
# Assemble one wide table per run (params + latest metrics + tags + dataset
# characteristics) and cache it on disk.
# NOTE: this needs `df_params`, `df_latest_metrics` and `df_tags` from the
# database cells; on a kernel where they were not run it raises NameError.
df_runs_raw = (
    df_params
    .join(df_latest_metrics)
    .join(df_tags)
    .join(dataset_characteristics, on='dataset_id', rsuffix='_dataset')
)
df_runs_raw.to_csv(results_dir / 'df_runs_raw_tgcc2.csv', index=True)

NameError: name 'df_params' is not defined

In [6]:
# Reload the cached table so the analysis below can run without the database.
df_runs_raw = pd.read_csv(results_dir / "df_runs_raw_tgcc2.csv", index_col=0)
# Model names become "<model>-<n_trials>".
n_trials_text = df_runs_raw["n_trials"].astype(str)
df_runs_raw["model"] = df_runs_raw["model"] + "-" + n_trials_text
# mask = df_runs_raw["model"].str.contains("CoHiRF")
# df_runs_raw.loc[mask, "model"] = df_runs_raw.loc[mask].apply(lambda row: f"{row['model']}-{row['n_trials']}", axis=1)
# Parent runs are the rows without a parent run id.
has_no_parent = df_runs_raw["mlflow.parentRunId"].isna()
df_runs_raw_parents = df_runs_raw.loc[has_no_parent].copy()

In [7]:
df_runs_raw_parents.head(5)

Unnamed: 0_level_0,status,start_time,end_time,best/base_model_kwargs/eps,best/base_model_kwargs/min_samples,best/base_model_kwargs/n_clusters,best/base_model_kwargs/n_similarities,best/base_model_kwargs/sampling_ratio,best/base_model_kwargs/sc_n_clusters,best/child_run_id,...,EXCEPTION,Last step finished,mlflow.parentRunId,raised_exception,dataset,openml_id,n_instances,n_features,n_classes,n_categorical
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001f124c40c40b6b902fcabc76e7eea,FINISHED,1757511120768,1757511000000.0,,,,,,,c854172d93d945b5b98d1d2e7f8b8244,...,,,,False,iris,61,150,5,3,1
00027aa26863479689cb9159eb05d27d,FINISHED,1759251752094,1759254000000.0,,,21.0,,,,c968bd4c602b4cc1b80ea7ebf3ef514a,...,,,,False,mnist_784,554,70000,785,10,1
000ed0a4c2254d48a2ea293ecf9480b9,FINISHED,1761159995601,1761160000000.0,,,,,,,775080a847654e7885ed3003bbb9d622,...,,_on_train_end,,False,nursery,1568,12958,9,4,9
0016b21b09e44c7595b26130cbf448f9,FINISHED,1757524956961,1757525000000.0,,,3.0,,,,318c5fee63ac48ee93c0f8897a15e567,...,,,,False,golub-1999-v2,46780,72,1869,3,0
001b3f77c3e44c12b14761e15c808ec9,FINISHED,1757510889568,1757511000000.0,,,,,,,c3df07fdb54f487a922cb765d3b9c0a7,...,,,,False,ecoli,39,336,8,8,1


## Delete duplicate runs (if any) and add placeholder rows for models that cannot run on some datasets

In [8]:
# Columns that together identify one experiment configuration.
non_duplicate_columns = ["model", "dataset_id", "standardize", "hpo_metric", "hpo_seed"]
# df_runs_parents.loc[df_runs_parents["best/n_clusters_"]*0.5 > df_runs_parents["n_instances"], "best/adjusted_rand"] = 
# add back runs that were not evaluated because we judged too many clusters (but they run anyway)
# df_valid_runs = df_runs_raw_parents.loc[df_runs_raw_parents["best/n_clusters_"] > df_runs_raw_parents["n_instances"]*0.5].copy()
# df_runs_parents = pd.concat([df_runs_parents, df_valid_runs], axis=0)
df_runs_parents = (
    df_runs_raw_parents
    # keep only parent runs that have an adjusted Rand value
    .dropna(subset=["best/adjusted_rand"])
    # keep the first run of every configuration
    .drop_duplicates(subset=non_duplicate_columns)
    # fill missing values with "None"
    .fillna("None")
)

In [9]:
# get number of children runs that raised exception for each parent run
# Count, for every parent run, how many of its child runs raised an exception.
raised_by_parent = df_runs_raw.groupby("mlflow.parentRunId")["raised_exception"]
children_exceptions = raised_by_parent.sum()
# Parents with no entry in `children_exceptions` get 0.
failed_children_per_parent = df_runs_parents.index.map(children_exceptions)
df_runs_parents["n_children_raised_exception"] = failed_children_per_parent.fillna(0)

In [10]:
# Parent runs that did not raise themselves but have failing children,
# leaving out the models whose name contains "SC-SRGF".
has_failed_children = df_runs_parents["n_children_raised_exception"] > 0
parent_did_not_raise = df_runs_parents["raised_exception"] == False
is_not_sc_srgf = df_runs_parents["model"].str.find("SC-SRGF") == -1
report_columns = ["dataset_id", "model", "hpo_metric", "n_children_raised_exception"]
df_runs_parents.loc[has_failed_children & parent_did_not_raise & is_not_sc_srgf, report_columns]

Unnamed: 0_level_0,dataset_id,model,hpo_metric,n_children_raised_exception
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0523d99695664a3294fa128a1eae79af,554,CoHiRF-DBSCAN-60,calinski_harabasz_score,9
075cbb1e8b9f41a0b1b1cb54ff6673af,39,CoHiRF-top-down-60,davies_bouldin_score,4
08433992474241cdbe44aca2692cd7df,554,CoHiRF-DBSCAN-60,davies_bouldin_score,2
086989a27cdb467b85ed2248ce8b32d3,554,CoHiRF-DBSCAN-60,calinski_harabasz_score,8
0abf67098276481cabbcb002ae6bdf84,1568,CoHiRF-top-down-60,calinski_harabasz_score,2
...,...,...,...,...
f7504ad3ccad429e82fb12541282747b,40685,CoHiRF-top-down-60,calinski_harabasz_score,1
f84f03dfb6964911a05142a3072c699f,1568,CoHiRF-top-down-60,adjusted_mutual_info,3
fa554932b2f842d6833bc8331388e3f7,47039,BatchCoHiRF-1iter-60,adjusted_rand,7
fe81e6a0ceac403db992df84e6e5d682,39,CoHiRF-top-down-60,davies_bouldin_score,4


In [11]:
# Configuration for the placeholder rows built in the next cell.
df_to_cat = []  # accumulates one dict per placeholder row
hpo_metrics = [
    "adjusted_rand",
    "adjusted_mutual_info",
    "calinski_harabasz_score",
    "silhouette",
    "davies_bouldin_score",
    "normalized_mutual_info",
]
standardize = [True]
hpo_seed = list(range(5))
fill_value = pd.NA
# One "best/<metric>" result column per HPO metric, in the same order.
fill_columns = [f"best/{metric_name}" for metric_name in hpo_metrics]

In [12]:
# Too memory intensive
# Too memory intensive: add one placeholder row per
# (dataset, model, metric, standardize, seed) combination, with every result
# column set to `fill_value`.
dataset_ids_to_complete = [182, 554, 1478, 1568, 40685]
model_names = ["CoHiRF-SC-SRGF-60", "SpectralSubspaceRandomization-60"]
df_to_cat.extend(
    {
        "dataset_id": placeholder_dataset,
        "model": placeholder_model,
        "hpo_metric": placeholder_metric,
        "standardize": placeholder_std,
        "hpo_seed": placeholder_seed,
        **dict.fromkeys(fill_columns, fill_value),
    }
    for placeholder_dataset in dataset_ids_to_complete
    for placeholder_model in model_names
    for placeholder_metric in hpo_metrics
    for placeholder_std in standardize
    for placeholder_seed in hpo_seed
)

In [13]:
# # Too few examples (<100) to run in batch
# dataset_ids_to_complete = [46773, 46774, 46775, 46776, 46777, 46779, 46780, 46781]
# model_names = [
#     "BatchCoHiRF-1iter-30",
#     "BatchCoHiRF-DBSCAN-1iter-30",
#     "BatchCoHiRF-SC-SRGF-30",
#     "BatchCoHiRF-KernelRBF-1iter-30",
#     "BatchCoHiRF-1iter-60",
# 	"BatchCoHiRF-DBSCAN-1iter-60",
# 	# "BatchCoHiRF-SC-SRGF-60",
# 	# "BatchCoHiRF-KernelRBF-1iter-60",
# ]
# for dataset_id in dataset_ids_to_complete:
# 	for model_name in model_names:
# 		for hpo_metric in hpo_metrics:
# 			for std in standardize:
# 				mask = (df_runs_parents["dataset_id"] == dataset_id) & (df_runs_parents["model"] == model_name) & (df_runs_parents["hpo_metric"] == hpo_metric) & (df_runs_parents["standardize"] == std)
# 				if not mask.any():
# 					new_row = {
# 						"dataset_id": dataset_id,
# 						"model": model_name,
# 						"hpo_metric": hpo_metric,
# 						"standardize": std,
# 					}
# 					for col in fill_columns:
# 						new_row[col] = fill_value
# 					df_to_cat.append(new_row)

In [14]:
# # Too many examples for IRFLLRR
# dataset_ids_to_complete = [40685]
# model_names = [
#     "IRFLLRR-30",
#     "IRFLLRR-60",
# ]
# hpo_metrics = ["adjusted_rand", "adjusted_mutual_info", "calinski_harabasz_score", "silhouette", "davies_bouldin_score", "normalized_mutual_info"]
# standardize = [True]
# fill_value = pd.NA
# fill_columns = ["best/adjusted_rand", "best/adjusted_mutual_info", "best/calinski_harabasz_score", "best/silhouette", "best/davies_bouldin_score", "best/normalized_mutual_info"]
# for dataset_id in dataset_ids_to_complete:
#     for model_name in model_names:
#         for hpo_metric in hpo_metrics:
#             for std in standardize:
#                 mask = (
#                     (df_runs_parents["dataset_id"] == dataset_id)
#                     & (df_runs_parents["model"] == model_name)
#                     & (df_runs_parents["hpo_metric"] == hpo_metric)
#                     & (df_runs_parents["standardize"] == std)
#                 )
#                 if not mask.any():
#                     new_row = {
#                         "dataset_id": dataset_id,
#                         "model": model_name,
#                         "hpo_metric": hpo_metric,
#                         "standardize": std,
#                     }
#                     for col in fill_columns:
#                         new_row[col] = fill_value
#                     df_to_cat.append(new_row)

In [15]:
# Append the placeholder rows to the parent runs.
# NOTE: not idempotent — running this cell again appends the rows once more.
placeholder_rows = pd.DataFrame(df_to_cat)
df_runs_parents = pd.concat([df_runs_parents, placeholder_rows], axis=0)

# Slow / Memory intensive datasets/models

In [16]:
# Rank the runs by peak memory and collect the (model, dataset) pairs above
# the 10000 threshold (unit as logged in `max_memory_used` — TODO confirm).
memory_columns = ["model", "dataset_id", "max_memory_used", "max_memory_used_after_fit"]
df = df_runs_parents[memory_columns].sort_values(by="max_memory_used", ascending=False)
is_memory_hungry = df["max_memory_used"] > 10000
high_mem_tuples = df.loc[is_memory_hungry, ["model", "dataset_id"]].copy()

In [17]:
df

Unnamed: 0,model,dataset_id,max_memory_used,max_memory_used_after_fit
8002d12704c142ff99021098b82ac3a7,CoHiRF-DBSCAN-60,40685,52893.696,52893.696
569719f398044e5d8b5056c702355584,CoHiRF-DBSCAN-60,40685,52854.720,52854.720
640e3f59ca4144e68cc3a497d7787280,CoHiRF-DBSCAN-60,40685,52814.868,52814.868
a5f9c8d0d9ca497ca067be35b8cb5b3e,CoHiRF-DBSCAN-60,40685,52742.684,52742.684
02ec0fef9d864abd8da2ac7adeb7d505,CoHiRF-DBSCAN-60,40685,51440.404,51440.404
...,...,...,...,...
295,SpectralSubspaceRandomization-60,40685,,
296,SpectralSubspaceRandomization-60,40685,,
297,SpectralSubspaceRandomization-60,40685,,
298,SpectralSubspaceRandomization-60,40685,,


In [18]:
# Model names (without the "-<n_trials>" suffix) considered when building the
# run grid; commented-out entries are excluded.
# NOTE: the only consumer visible in this notebook is the commented-out
# `runs_dict` cell below, and later cells reassign `model_nickname`.
model_nickname = [
    # "AffinityPropagation",
    # "AverageAgglomerativeClustering",
    # "BatchCoHiRF",
    "BatchCoHiRF-1iter",
    # "BatchCoHiRF-DBSCAN",
    "BatchCoHiRF-DBSCAN-1iter",
    "BatchCoHiRF-KernelRBF-1iter",
    "BatchCoHiRF-SC-SRGF",
    # "Clique",
    "CoHiRF",
    "CoHiRF-DBSCAN",
    "CoHiRF-KernelRBF",
    # "CompleteAgglomerativeClustering",
    "DBSCAN",
    # "HDBSCAN",
    # "IRFLLRR",
    "KMeans",
    # "KMeansProj",
    "KernelRBFKMeans",
    # "MeanShift",
    # "OPTICS",
    # "Proclus",
    # "SingleAgglomerativeClustering",
    # "SpectralClustering",
    "SpectralSubspaceRandomization",
    # "WardAgglomerativeClustering",
]

In [19]:
# Dataset ids considered when building the run grid (the ids match the
# `openml_id` values shown in the tables above).
# NOTE: `dataset_id` is reassigned by several later cells.
dataset_id = [
    39,
    61,
    182,
    1478,
    1568,
    40685,
    40984,
    46773,
    46774,
    46775,
    46776,
    46777,
    46778,
    46779,
    46780,
    46781,
    46782,
    46783,
    554,
]

In [20]:
# Remaining axes of the run grid: standardization flag and HPO metric.
# NOTE: both names are reassigned by the "Missing" cells further down.
standardize = [True]
hpo_metric = [
    "adjusted_rand",
    "normalized_mutual_info",
    "adjusted_mutual_info",
    "calinski_harabasz_score",
    "silhouette",
    "davies_bouldin_score",
]

In [21]:
# from itertools import product
# combinations_keys = ["model", "dataset_id", "hpo_metric", "standardize"]
# combinations = product(model_nickname, dataset_id, hpo_metric, standardize)
# combination_list = [dict(zip(combinations_keys, comb)) for comb in combinations]
# df_combination = pd.DataFrame(combination_list)
# # exclude slow/mem intensive runs
# # df_combination = df_combination.merge(
# # 	high_mem_tuples, on=["model", "dataset_id"], how="left", indicator=True
# # )
# # df_combination = df_combination.loc[df_combination["_merge"] == "left_only"].drop(columns=["_merge"])
# # get only slow/mem intensive runs
# df_combination = df_combination.merge(high_mem_tuples, on=["model", "dataset_id"], how="left", indicator=True)
# df_combination = df_combination.loc[df_combination["_merge"] == "both"].drop(columns=["_merge"])
# runs_dict = {}
# for model in df_combination["model"].unique():
#     sub = df_combination[df_combination["model"] == model].drop(columns=["model"])
#     # standardize = True
#     sub_standardized = sub.loc[sub["standardize"] == True].copy()
#     sub_standardized["standardize"] = ""
#     sub_standardized_dict = sub_standardized.to_dict(orient="records")
#     # standardize = False
#     sub_not_standardized = sub.loc[sub["standardize"] == False].copy()
#     sub_not_standardized.drop(columns=["standardize"], inplace=True)
#     sub_not_standardized_dict = sub_not_standardized.to_dict(orient="records")
#     # combine both dictionaries
#     runs_dict[model] = sub_standardized_dict + sub_not_standardized_dict
# if len(runs_dict) != 0:
#     with open(results_dir / "runs_dict.pkl", "wb") as f:
#         pickle.dump(runs_dict, f)

In [22]:
df_runs_raw.head(5)

Unnamed: 0_level_0,status,start_time,end_time,best/base_model_kwargs/eps,best/base_model_kwargs/min_samples,best/base_model_kwargs/n_clusters,best/base_model_kwargs/n_similarities,best/base_model_kwargs/sampling_ratio,best/base_model_kwargs/sc_n_clusters,best/child_run_id,...,EXCEPTION,Last step finished,mlflow.parentRunId,raised_exception,dataset,openml_id,n_instances,n_features,n_classes,n_categorical
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000208c805246b99faffa1de27af1ef,FINISHED,1757711218952,1757711000000.0,,,,,,,,...,,,60d4079cf33149eea4faeb9dd9edcd76,False,khan-2001,46781,83,1070,4,0
0000219091d04b54b2deb488d80372cb,FINISHED,1757519633653,1757520000000.0,,,,,,,,...,,,6a51f509c15c4027b51831a6932f1da1,False,bredel-2005,46777,50,1740,3,0
0000328e9d8c48a2b65b0b0068626eff,FINISHED,1757513591211,1757514000000.0,,,,,,,,...,,,5aa95abbcb3348e0bbb0c8f13ca888d6,False,satimage,182,6430,37,6,1
0000ed82407546de89923a2ae5c18f40,FINISHED,1757766655932,1757767000000.0,,,,,,,,...,,,147979577e7c40368773e106a98585b4,False,har,1478,10299,562,6,1
00018ba47c2c4d358096ab82decc67d1,FAILED,1757725820112,1757730000000.0,,,,,,,,...,"Expected n_neighbors <= n_samples_fit, but n_n...",,895d17c603cf4e1e81430030acf2933b,True,binary_alpha_digits,46782,1404,321,36,0


# Missing runs

In [44]:
# Inspect the parent runs of one model on one dataset.
is_selected_model = df_runs_parents["model"] == "BatchCoHiRF-SC-SRGF-1iter-60"
is_selected_dataset = df_runs_parents["dataset_id"] == 40685
df = df_runs_parents.loc[is_selected_model & is_selected_dataset].copy()
df

Unnamed: 0,status,start_time,end_time,best/base_model_kwargs/eps,best/base_model_kwargs/min_samples,best/base_model_kwargs/n_clusters,best/base_model_kwargs/n_similarities,best/base_model_kwargs/sampling_ratio,best/base_model_kwargs/sc_n_clusters,best/child_run_id,...,Last step finished,mlflow.parentRunId,raised_exception,dataset,openml_id,n_instances,n_features,n_classes,n_categorical,n_children_raised_exception
13d74a0a35a149258003a50224eaec8a,FINISHED,1760610000000.0,1760637000000.0,,,,,,,43b369e06e004816a7c7ccd116eda6ab,...,_on_train_end,,False,shuttle,40685.0,58000.0,10.0,7.0,1.0,0.0
15718296e50c492fbc450ff2f1041b2b,FINISHED,1760610000000.0,1760633000000.0,,,,,,,fca881bee1784d00a5a770528cfaa187,...,_on_train_end,,False,shuttle,40685.0,58000.0,10.0,7.0,1.0,0.0
188a09ec8f744c7e8b4f1dbc6528ba66,FINISHED,1760610000000.0,1760638000000.0,,,,,,,d7dda8649df94288bc168d898c14f096,...,_on_train_end,,False,shuttle,40685.0,58000.0,10.0,7.0,1.0,0.0
2747789790104448ae6876643a5fe49c,FINISHED,1760610000000.0,1760645000000.0,,,,,,,c028a35d6c4d46be8c5de564fd9aebce,...,_on_train_end,,False,shuttle,40685.0,58000.0,10.0,7.0,1.0,0.0
2dce548e975c4e2392d7662076dfaaaa,FINISHED,1760610000000.0,1760639000000.0,,,,,,,30ba366c81954233bbbc7af20bdeb164,...,_on_train_end,,False,shuttle,40685.0,58000.0,10.0,7.0,1.0,0.0
2e4a96562c1b4e2c995719c9fd074353,FINISHED,1760521000000.0,1760545000000.0,,,,,,,7ff74e173713447ba9d5c71022c7c2c6,...,_on_train_end,,False,shuttle,40685.0,58000.0,10.0,7.0,1.0,0.0
342dd1052eb5415db3081955d28ba35c,FINISHED,1760610000000.0,1760637000000.0,,,,,,,f0094f15cb4644139ff5901dd1af02e4,...,_on_train_end,,False,shuttle,40685.0,58000.0,10.0,7.0,1.0,0.0
60c8265022f5421680ce4cde8ba54619,FINISHED,1760610000000.0,1760637000000.0,,,,,,,2c632b3ee8e44f54aea6eb5eaff9cb56,...,_on_train_end,,False,shuttle,40685.0,58000.0,10.0,7.0,1.0,0.0
704f3aecfdbd451bab9c519bf6173b60,FINISHED,1760610000000.0,1760635000000.0,,,,,,,69000ffc0d96498fa6396ffdc9285e50,...,_on_train_end,,False,shuttle,40685.0,58000.0,10.0,7.0,1.0,0.0
7aed72d246b2439b86981eac048bffcb,FINISHED,1760610000000.0,1760648000000.0,,,,,,,5f08267fb45f4cabbc87cd90cec23e85,...,_on_train_end,,False,shuttle,40685.0,58000.0,10.0,7.0,1.0,0.0


In [23]:
# Alphabetical list of every model name present in the parent runs.
model_nickname = sorted(df_runs_parents['model'].unique().tolist())
model_nickname

['BatchCoHiRF-1iter-60',
 'BatchCoHiRF-1iter-random-60',
 'BatchCoHiRF-1iter-stratified-60',
 'BatchCoHiRF-60',
 'BatchCoHiRF-DBSCAN-1iter-60',
 'BatchCoHiRF-DBSCAN-1iter-random-60',
 'BatchCoHiRF-DBSCAN-1iter-stratified-60',
 'BatchCoHiRF-KernelRBF-1iter-60',
 'BatchCoHiRF-KernelRBF-1iter-random-60',
 'BatchCoHiRF-KernelRBF-1iter-stratified-60',
 'BatchCoHiRF-SC-SRGF-1iter-60',
 'BatchCoHiRF-SC-SRGF-2-60',
 'CoHiRF-1000-60',
 'CoHiRF-60',
 'CoHiRF-DBSCAN-60',
 'CoHiRF-KernelRBF-60',
 'CoHiRF-SC-SRGF-60',
 'CoHiRF-top-down-60',
 'CoHiRF-top-down-inv-60',
 'DBSCAN-60',
 'KMeans-60',
 'KernelRBFKMeans-60',
 'SpectralSubspaceRandomization-60']

In [24]:
# Columns that together identify one experiment configuration
# (same list as in the de-duplication cell above).
non_duplicate_columns = ["model", "dataset_id", "standardize", "hpo_metric", "hpo_seed"]

In [25]:
# Expected grid of runs: every combination of the lists below should have a
# row in `df_runs_parents`. Commented-out entries are left out of the check.
# NOTE(review): `get_missing_entries` is presumably returning the combinations
# that are absent — confirm against ml_experiments.analyze.
model_nickname = [
    "BatchCoHiRF-1iter-60",
    # "BatchCoHiRF-1iter-random-60",
    # "BatchCoHiRF-1iter-stratified-60",
    "BatchCoHiRF-DBSCAN-1iter-60",
    # "BatchCoHiRF-DBSCAN-1iter-random-60",
    # "BatchCoHiRF-DBSCAN-1iter-stratified-60",
    "BatchCoHiRF-KernelRBF-1iter-60",
    # "BatchCoHiRF-KernelRBF-1iter-random-60",
    # "BatchCoHiRF-KernelRBF-1iter-stratified-60",
    "BatchCoHiRF-SC-SRGF-1iter-60",
    # "BatchCoHiRF-SC-SRGF-1iter-random-60",
    # "BatchCoHiRF-SC-SRGF-1iter-stratified-60",
    # "BatchCoHiRF-SC-SRGF-2-60",
    "CoHiRF-60",
    "CoHiRF-top-down-60",
    "CoHiRF-top-down-inv-60",
    # "CoHiRF-1000-60",
    "CoHiRF-DBSCAN-60",
    "CoHiRF-KernelRBF-60",
    "CoHiRF-SC-SRGF-60",
    "DBSCAN-60",
    "KMeans-60",
    "KernelRBFKMeans-60",
    "SpectralSubspaceRandomization-60",
]
dataset_id = [
    39,
    61,
    182,
    1478,
    1568,
    40685,
    40984,
    46773,
    46774,
    46775,
    46776,
    46777,
    46778,
    46779,
    46780,
    46781,
    46782,
    46783,
    554,
    # 1110,
    # 47039
]
standardize = [True]
hpo_metric = [
    "adjusted_rand",
    "adjusted_mutual_info",
    "calinski_harabasz_score",
    # "normalized_mutual_info",
    "davies_bouldin_score",
    "silhouette",
]
hpo_seed = [i for i in range(5)]
# The value lists are passed in the same order as `non_duplicate_columns`.
columns_names = non_duplicate_columns
should_contain_values = [model_nickname, dataset_id, standardize, hpo_metric, hpo_seed]
df_missing = get_missing_entries(df_runs_parents, columns_names, should_contain_values)
df_missing

Unnamed: 0,model,dataset_id,standardize,hpo_metric,hpo_seed


In [104]:
# Same missing-run check, restricted to the CoHiRF-1000 model and a subset of
# the datasets.
model_nickname = ["CoHiRF-1000-60"]
dataset_id = [182, 1478, 1568, 40685, 40984, 46782, 46783, 554]
standardize = [True]
hpo_metric = [
    "adjusted_rand",
    "adjusted_mutual_info",
    "calinski_harabasz_score",
    # "normalized_mutual_info",
    "davies_bouldin_score",
    "silhouette",
]
hpo_seed = list(range(5))
# The value lists are passed in the same order as `non_duplicate_columns`.
columns_names = non_duplicate_columns
should_contain_values = [
    model_nickname,
    dataset_id,
    standardize,
    hpo_metric,
    hpo_seed,
]
df_missing = get_missing_entries(df_runs_parents, columns_names, should_contain_values)
df_missing

Unnamed: 0,model,dataset_id,standardize,hpo_metric,hpo_seed


In [38]:
# Same missing-run check for dataset 47039 only, with a reduced set of models
# and metrics. Commented-out entries are left out of the check.
# NOTE: this overwrites `df_missing` from the previous cells.
model_nickname = [
    "BatchCoHiRF-1iter-60",
    # "BatchCoHiRF-1iter-random-60",
    # "BatchCoHiRF-1iter-stratified-60",
    "BatchCoHiRF-DBSCAN-1iter-60",
    # "BatchCoHiRF-DBSCAN-1iter-random-60",
    # "BatchCoHiRF-DBSCAN-1iter-stratified-60",
    "BatchCoHiRF-KernelRBF-1iter-60",
    # "BatchCoHiRF-KernelRBF-1iter-random-60",
    # "BatchCoHiRF-KernelRBF-1iter-stratified-60",
    "BatchCoHiRF-SC-SRGF-1iter-60",
    # "BatchCoHiRF-SC-SRGF-1iter-random-60",
    # "BatchCoHiRF-SC-SRGF-1iter-stratified-60",
    # "BatchCoHiRF-SC-SRGF-2-60",
    # "CoHiRF-60",
    # "CoHiRF-1000-60",
    # "CoHiRF-DBSCAN-60",
    # "CoHiRF-KernelRBF-60",
    # "CoHiRF-SC-SRGF-60",
    # "DBSCAN-60",
    "KMeans-60",
    # "KernelRBFKMeans-60",
    # "SpectralSubspaceRandomization-60",
]
dataset_id = [
    47039
]
standardize = [True]
hpo_metric = [
    "adjusted_rand",
    "adjusted_mutual_info",
    "calinski_harabasz_score",
    # "normalized_mutual_info",
    # "davies_bouldin_score",
    # "silhouette",
]
hpo_seed = [i for i in range(5)]
# The value lists are passed in the same order as `non_duplicate_columns`.
columns_names = non_duplicate_columns
should_contain_values = [model_nickname, dataset_id, standardize, hpo_metric, hpo_seed]
df_missing = get_missing_entries(df_runs_parents, columns_names, should_contain_values)
df_missing

Unnamed: 0,model,dataset_id,standardize,hpo_metric,hpo_seed
0,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_rand,0
1,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_rand,1
2,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_rand,2
3,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_rand,3
4,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_rand,4
5,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_mutual_info,0
6,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_mutual_info,1
7,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_mutual_info,2
8,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_mutual_info,3
9,BatchCoHiRF-SC-SRGF-1iter-60,47039,True,adjusted_mutual_info,4


In [149]:
# Join df_runs_raw_parents into df_missing using non_duplicate_columns to get the EXCEPTION column
# Attach the exception diagnostics of the raw parent runs to every missing
# configuration. A configuration matched by several raw runs appears once per
# matching run.
configuration_keys = ["model", "dataset_id", "standardize", "hpo_metric", "hpo_seed"]
diagnostic_columns = ["raised_exception", "EXCEPTION", "Last step finished"]
df_missing_with_exception = df_missing.merge(
    df_runs_raw_parents[non_duplicate_columns + diagnostic_columns],
    how="left",
    on=configuration_keys,
)
df_missing_with_exception[configuration_keys + diagnostic_columns]

Unnamed: 0,model,dataset_id,standardize,hpo_metric,hpo_seed,raised_exception,EXCEPTION,Last step finished
0,CoHiRF-top-down-60,1568,True,adjusted_mutual_info,2,,,_before_fit_model
1,CoHiRF-top-down-60,1568,True,adjusted_mutual_info,2,,,_before_fit_model
2,CoHiRF-top-down-60,1568,True,adjusted_mutual_info,2,,,_before_fit_model
3,CoHiRF-top-down-60,1568,True,adjusted_mutual_info,3,,,_before_fit_model
4,CoHiRF-top-down-60,1568,True,adjusted_mutual_info,3,,,_before_fit_model
5,CoHiRF-top-down-60,1568,True,adjusted_mutual_info,3,,,_before_fit_model
6,CoHiRF-top-down-inv-60,1568,True,adjusted_rand,1,,,_before_fit_model
7,CoHiRF-top-down-inv-60,1568,True,adjusted_rand,1,,,_before_fit_model
8,CoHiRF-top-down-inv-60,1568,True,adjusted_mutual_info,1,,,_before_fit_model
9,CoHiRF-top-down-inv-60,1568,True,adjusted_mutual_info,1,,,_before_fit_model


In [197]:
# Work on a copy so the edits in the next cell do not touch `df_missing`.
# The commented-out blocks below are optional filters (keep only / exclude the
# high-memory pairs, exclude `missing_ari_tuples`); none of them is active.
df_missing_dict = df_missing.copy()
# get only rows from high_mem_tuples
# df_missing_dict = df_missing_dict.merge(high_mem_tuples, on=["model", "dataset_id"], how="left", indicator=True)
# df_missing_dict = df_missing_dict[df_missing_dict["_merge"] == "both"].drop(columns="_merge")
# exclude rows that are in missing_ari_tuples
# df_missing_dict = df_missing_dict.merge(
#     missing_ari_tuples, on=["model", "dataset_id"], how="left", indicator=True
# )
# df_missing_dict = df_missing_dict[df_missing_dict["_merge"] == "left_only"].drop(columns="_merge")
# exclude rows that are in high_mem_tuples
# df_missing_dict = df_missing_dict.merge(
#     high_mem_tuples, on=["model", "dataset_id"], how="left", indicator=True
# )
# df_missing_dict = df_missing_dict[df_missing_dict["_merge"] == "left_only"].drop(columns="_merge")
# to_drop = pd.concat([missing_ari_tuples, high_mem_tuples], ignore_index=True)
# df_missing_dict = df_missing_dict[df_missing_dict["_merge"] == "left_only"].drop(columns="_merge")

In [198]:
# get rid of -60
df_missing_dict["model"] = df_missing_dict["model"].str.replace("-60", "")
df_missing_dict["seed_dataset_order"] = df_missing_dict["hpo_seed"]

In [199]:
df_missing_dict

Unnamed: 0,model,dataset_id,standardize,hpo_metric,hpo_seed,seed_dataset_order
0,CoHiRF-top-down,554,True,adjusted_rand,0,0
1,CoHiRF-top-down,554,True,adjusted_rand,1,1
2,CoHiRF-top-down,554,True,adjusted_rand,2,2
3,CoHiRF-top-down,554,True,adjusted_rand,3,3
4,CoHiRF-top-down,554,True,adjusted_rand,4,4
5,CoHiRF-top-down,554,True,adjusted_mutual_info,0,0
6,CoHiRF-top-down,554,True,adjusted_mutual_info,1,1
7,CoHiRF-top-down,554,True,adjusted_mutual_info,2,2
8,CoHiRF-top-down,554,True,adjusted_mutual_info,3,3
9,CoHiRF-top-down,554,True,adjusted_mutual_info,4,4


In [200]:
# Build {model: [record, ...]} from the missing configurations and pickle it.
# Records with standardize == True carry standardize='' ; records with
# standardize == False have the key removed.
missing_dict = {}
for model in df_missing_dict["model"].unique():
    model_rows = df_missing_dict.loc[df_missing_dict["model"] == model].drop(columns=["model"])
    # standardize = True
    standardized_rows = model_rows.loc[model_rows["standardize"] == True].assign(standardize='')
    # standardize = False
    plain_rows = model_rows.loc[model_rows["standardize"] == False].drop(columns=["standardize"])
    # standardized records first, then the non-standardized ones
    missing_dict[model] = (
        standardized_rows.to_dict(orient="records")
        + plain_rows.to_dict(orient="records")
    )
# Nothing is written when there are no missing configurations.
if missing_dict:
    with open(results_dir / 'missing_dict.pkl', 'wb') as f:
        pickle.dump(missing_dict, f)

In [201]:
missing_dict

{'CoHiRF-top-down': [{'dataset_id': 554,
   'standardize': '',
   'hpo_metric': 'adjusted_rand',
   'hpo_seed': 0,
   'seed_dataset_order': 0},
  {'dataset_id': 554,
   'standardize': '',
   'hpo_metric': 'adjusted_rand',
   'hpo_seed': 1,
   'seed_dataset_order': 1},
  {'dataset_id': 554,
   'standardize': '',
   'hpo_metric': 'adjusted_rand',
   'hpo_seed': 2,
   'seed_dataset_order': 2},
  {'dataset_id': 554,
   'standardize': '',
   'hpo_metric': 'adjusted_rand',
   'hpo_seed': 3,
   'seed_dataset_order': 3},
  {'dataset_id': 554,
   'standardize': '',
   'hpo_metric': 'adjusted_rand',
   'hpo_seed': 4,
   'seed_dataset_order': 4},
  {'dataset_id': 554,
   'standardize': '',
   'hpo_metric': 'adjusted_mutual_info',
   'hpo_seed': 0,
   'seed_dataset_order': 0},
  {'dataset_id': 554,
   'standardize': '',
   'hpo_metric': 'adjusted_mutual_info',
   'hpo_seed': 1,
   'seed_dataset_order': 1},
  {'dataset_id': 554,
   'standardize': '',
   'hpo_metric': 'adjusted_mutual_info',
   'hpo

# Tables

In [45]:
def get_parameters_string(row):
    """Format the tuned hyper-parameters of one run as a LaTeX math string.

    Each known parameter that has a value in ``row`` is rendered as
    ``$<symbol>=<value>$``; the pieces are joined with ``"; "`` in the order
    of the symbol table below. Integral values are printed without decimals,
    other values with two decimals.

    Parameters
    ----------
    row : pandas.Series or mapping
        One run, keyed by parameter name (``"best/..."``). Parameters that are
        absent, missing (NaN/None) or equal to the string ``"None"`` are skipped.

    Returns
    -------
    str
        The formatted parameters, or ``""`` when none has a value.

    Raises
    ------
    ValueError
        If a present value cannot be converted to ``float``.
    """
    # Parameter key -> LaTeX symbol. Several keys share a symbol on purpose
    # (e.g. every "number of clusters" variant is shown as C).
    parameter_names = {
        "best/alpha": "\\alpha",
        "best/avg_dims": "d",
        "best/base_model_kwargs/eps": "\\epsilon",
        "best/base_model_kwargs/min_samples": "n_{\\text{min}}",
        "best/base_model_kwargs/n_clusters": "C",
        "best/c": "c",
        "best/cohirf_kwargs/base_model_kwargs/eps": "\\epsilon",
        "best/cohirf_kwargs/base_model_kwargs/min_samples": "n_{\\text{min}}",
        "best/cohirf_kwargs/kmeans_n_clusters": "C",
        "best/cohirf_kwargs/n_features": "q",
        "best/cohirf_kwargs/repetitions": "R",
        "best/damping": "\\lambda",
        # "best/density_threshold": "\\tau",
        "best/eps": "\\epsilon",
        "best/kmeans_n_clusters": "C",
        "best/lambda_": "\\lambda",
        "best/min_bin_freq": "bin_{\\text{min}}",
        "best/min_cluster_size": "C_{\\text{min}}",
        "best/min_samples": "n_{\\text{min}}",
        "best/n_clusters": "C",
        "best/n_features": "q",
        # "best/n_partitions": "P",
        "best/n_similarities": "m",
        "best/p": "p",
        "best/repetitions": "R",
        "best/sampling_ratio": "r",
        "best/sc_n_clusters": "C",
        "best/transform_kwargs/gamma": "\\gamma",
    }
    formatted = []
    for key, symbol in parameter_names.items():
        # .get() tolerates rows that lack some of the parameter columns.
        raw_value = row.get(key)
        if pd.isna(raw_value) or raw_value == "None":
            continue
        value = float(raw_value)
        if value.is_integer():
            formatted.append(f"${symbol}={int(value)}$")
        else:
            formatted.append(f"${symbol}={value:0.2f}$")
    return "; ".join(formatted)

In [46]:
def highlight_max(df, column_name, level=0):
    """Build a CSS frame that bolds the rows holding each group's maximum.

    Groups are formed on index level ``level``; values of ``column_name``
    are compared after rounding to 3 decimals, so ties at that precision
    are all marked. The returned frame has the same index and columns as
    ``df``; every cell of a winning row is ``'font-weight: bold'`` and
    every other cell is the empty string.
    """
    scores = df[column_name]
    group_best = scores.groupby(level=level).transform('max')
    wins_group = scores.round(3) == group_best.round(3)

    # Start from an all-empty string frame shaped like the input.
    css = df.astype(str)
    css.loc[:, :] = ''
    css[wins_group] = 'font-weight: bold'
    return css

In [47]:
def highlight_min(df, column_name, level=0):
    """Build a CSS frame that bolds the rows holding each group's minimum.

    Groups are formed on index level ``level``; values of ``column_name``
    are compared after rounding to 3 decimals, so ties at that precision
    are all marked. The returned frame has the same index and columns as
    ``df``; every cell of a winning row is ``"font-weight: bold"`` and
    every other cell is the empty string.
    """
    scores = df[column_name]
    group_lowest = scores.groupby(level=level).transform("min")
    wins_group = scores.round(3) == group_lowest.round(3)

    # Start from an all-empty string frame shaped like the input.
    css = df.astype(str)
    css.loc[:, :] = ""
    css[wins_group] = "font-weight: bold"
    return css

In [48]:
def highlight_max_index(series_index, df_column, level=0):
    """Return CSS for index labels, bolding those whose row is a group maximum.

    ``df_column`` supplies the values (grouped on its index level ``level``
    and compared after rounding to 3 decimals); ``series_index`` supplies the
    labels to style and is matched to ``df_column`` by position. The result
    is a string Series shaped like ``series_index`` containing
    ``'font-weight: bold'`` for winners and ``''`` elsewhere.
    """
    group_best = df_column.groupby(level=level).transform('max')
    wins_group = df_column.round(3) == group_best.round(3)

    label_css = series_index.astype(str)
    label_css[:] = ''
    # Positional mask: the labels do not share df_column's index.
    label_css[wins_group.values] = 'font-weight: bold'
    return label_css

In [49]:
def underline_2nd_max(df, column_name, level=0):
    """Build a CSS frame that underlines the rows holding each group's runner-up.

    Groups are formed on index level ``level``. Within a group the values of
    ``column_name`` are rounded to 3 decimals and de-duplicated; the second
    largest distinct value is the runner-up, and every row matching it (at
    3-decimal precision) is underlined.

    A group with fewer than two distinct non-missing values has no runner-up,
    so nothing in it is underlined (previously the maximum itself was marked
    as "second best" in that case).

    Returns a string frame with the same index and columns as ``df`` whose
    cells are ``'underline: --latex--rwrap'`` on runner-up rows and ``''``
    elsewhere.
    """
    def _second_largest(group):
        # Distinct, non-missing values at the precision shown in the table.
        distinct = group.round(3).dropna().drop_duplicates()
        if len(distinct) < 2:
            return float("nan")  # NaN never compares equal -> no underline
        return distinct.nlargest(2).iloc[-1]

    df_column = df[column_name]
    second_max_values = df_column.groupby(level=level).transform(_second_largest)
    is_underlined = df_column.round(3) == second_max_values.round(3)
    df_css = df.copy().astype(str)
    df_css.loc[:, :] = ''
    df_css[is_underlined] = 'underline: --latex--rwrap'
    return df_css

In [50]:
def underline_2nd_min(df, column_name, level=0):
    """Build a CSS frame that underlines the rows holding each group's second-lowest value.

    Groups are formed on index level ``level``. Within a group the values of
    ``column_name`` are rounded to 3 decimals and de-duplicated; the second
    smallest distinct value is the runner-up, and every row matching it (at
    3-decimal precision) is underlined.

    A group with fewer than two distinct non-missing values has no runner-up,
    so nothing in it is underlined (previously the minimum itself was marked
    as "second best" in that case).

    Returns a string frame with the same index and columns as ``df`` whose
    cells are ``"underline: --latex--rwrap"`` on runner-up rows and ``""``
    elsewhere.
    """
    def _second_smallest(group):
        # Distinct, non-missing values at the precision shown in the table.
        distinct = group.round(3).dropna().drop_duplicates()
        if len(distinct) < 2:
            return float("nan")  # NaN never compares equal -> no underline
        return distinct.nsmallest(2).iloc[-1]

    df_column = df[column_name]
    second_min_values = df_column.groupby(level=level).transform(_second_smallest)
    is_underlined = df_column.round(3) == second_min_values.round(3)
    df_css = df.copy().astype(str)
    df_css.loc[:, :] = ""
    df_css[is_underlined] = "underline: --latex--rwrap"
    return df_css

In [51]:
def underline_2nd_max_index(series_index, df_column, level=0):
    """Return CSS for index labels, underlining those whose row is a group runner-up.

    ``df_column`` supplies the values, grouped on its index level ``level``.
    The runner-up of a group is its second largest *distinct* value after
    rounding to 3 decimals, the same rule ``underline_2nd_max`` uses, so a
    tie for first place no longer causes the tied maxima to be underlined.
    A group with fewer than two distinct non-missing values has no runner-up
    and gets no underline.

    ``series_index`` supplies the labels to style and is matched to
    ``df_column`` by position. The result is a string Series shaped like
    ``series_index`` containing ``'underline: --latex--rwrap'`` for
    runner-ups and ``''`` elsewhere.
    """
    def _second_largest(group):
        # Distinct, non-missing values at the precision shown in the table.
        distinct = group.round(3).dropna().drop_duplicates()
        if len(distinct) < 2:
            return float("nan")  # NaN never compares equal -> no underline
        return distinct.nlargest(2).iloc[-1]

    second_max_values = df_column.groupby(level=level).transform(_second_largest)
    is_underlined = df_column.round(3) == second_max_values.round(3)
    series_css = series_index.copy().astype(str)
    series_css.loc[:] = ''
    # Positional mask: the labels do not share df_column's index.
    series_css[is_underlined.values] = 'underline: --latex--rwrap'
    return series_css

## Some Models

In [52]:
# Print every distinct value of the 'model' column of the parent runs, one per line.
print(*df_runs_parents['model'].unique(), sep="\n")

BatchCoHiRF-DBSCAN-1iter-60
KernelRBFKMeans-60
CoHiRF-top-down-inv-60
CoHiRF-KernelRBF-60
BatchCoHiRF-1iter-60
CoHiRF-top-down-60
CoHiRF-SC-SRGF-60
CoHiRF-1000-60
KMeans-60
BatchCoHiRF-SC-SRGF-1iter-60
SpectralSubspaceRandomization-60
BatchCoHiRF-KernelRBF-1iter-60
DBSCAN-60
CoHiRF-DBSCAN-60
BatchCoHiRF-DBSCAN-1iter-stratified-60
CoHiRF-60
BatchCoHiRF-DBSCAN-1iter-random-60
BatchCoHiRF-KernelRBF-1iter-stratified-60
BatchCoHiRF-60
BatchCoHiRF-KernelRBF-1iter-random-60
BatchCoHiRF-SC-SRGF-2-60
BatchCoHiRF-1iter-random-60
BatchCoHiRF-1iter-stratified-60


In [53]:
# Display names used in the tables, keyed by the raw model identifier stored
# in the 'model' column. Models that are not listed here are dropped below.
model_names = {
    "BatchCoHiRF-1iter-60": "Batch CoHiRF",
    "BatchCoHiRF-1iter-stratified-60": "Batch CoHiRF-Stratified",
    "BatchCoHiRF-1iter-random-60": "Batch CoHiRF-Random",
    "BatchCoHiRF-DBSCAN-1iter-60": "Batch CoHiRF-DBSCAN",
    "BatchCoHiRF-DBSCAN-1iter-stratified-60": "Batch CoHiRF-DBSCAN-Stratified",
    "BatchCoHiRF-DBSCAN-1iter-random-60": "Batch CoHiRF-DBSCAN-Random",
    "BatchCoHiRF-KernelRBF-1iter-60": "Batch CoHiRF-KernelRBF",
    "BatchCoHiRF-KernelRBF-1iter-stratified-60": "Batch CoHiRF-KernelRBF-Stratified",
    "BatchCoHiRF-KernelRBF-1iter-random-60": "Batch CoHiRF-KernelRBF-Random",
    "BatchCoHiRF-SC-SRGF-1iter-60": "Batch CoHiRF-SC-SRGF",
    # The runs are stored as "BatchCoHiRF-SC-SRGF-2-60" (see the list of unique
    # models printed above); with only the "-1iter" spelling this model was
    # silently filtered out. The old key is kept for backward compatibility.
    "BatchCoHiRF-SC-SRGF-2-60": "Batch CoHiRF-SC-SRGF-2",
    "BatchCoHiRF-SC-SRGF-2-1iter-60": "Batch CoHiRF-SC-SRGF-2",
    "CoHiRF-60": "CoHiRF",
    "CoHiRF-top-down-60": "CoHiRF Top-Down",
    "CoHiRF-top-down-inv-60": "CoHiRF Top-Down Inv",
    "CoHiRF-1000-60": "CoHiRF-1000",
    "CoHiRF-DBSCAN-60": "CoHiRF-DBSCAN",
    "CoHiRF-KernelRBF-60": "CoHiRF-KernelRBF",
    "CoHiRF-SC-SRGF-60": "CoHiRF-SC-SRGF",
    "DBSCAN-60": "DBSCAN",
    "KMeans-60": "K-Means",
    "KernelRBFKMeans-60": "Kernel RBF K-Means",
    "SpectralSubspaceRandomization-60": "SC-SRGF",
}

# Dataset names containing underscores are renamed,
# otherwise we get an error in latex.
dataset_names = {
    "binary_alpha_digits": "binary-alpha-digits",
    "mnist_784": "mnist",
}

# Filter to only standardized runs of the models listed above, then switch to
# the display names.
df = df_runs_parents.copy()
df = df.loc[df['standardize'] == True]
df = df.loc[df['model'].isin(model_names.keys())]
df = df.replace({"model": model_names})
df = df.replace({"dataset_name": dataset_names})

# Filter to only runs with hpo_seed in range(5)
df = df.loc[df['hpo_seed'].isin(range(5))]

In [54]:
# Metric identifiers as stored in the 'hpo_metric' column and in the
# 'best/<metric>' result columns.
hpo_metrics = [
    "adjusted_rand",
    "adjusted_mutual_info",
    "calinski_harabasz_score",
    "silhouette",
    "davies_bouldin_score",
    "normalized_mutual_info",
]

# Short table labels, positionally aligned with hpo_metrics.
hpo_metrics_rename = [
    "ARI",
    "AMI",
    "Calinski",
    "Silhouette",
    "Davies-Bouldin",
    "NMI",
]

# One single-column frame per metric, indexed by (dataset_name, model, hpo_seed).
# For each metric only the runs whose HPO objective was that metric are used.
run_keys = ['dataset_name', 'model', 'hpo_seed']
dfs_metrics = {}
for hpo_metric, hpo_metric_rename in zip(hpo_metrics, hpo_metrics_rename):
    best_column = f'best/{hpo_metric}'
    optimized_for_metric = df['hpo_metric'] == hpo_metric
    df_metric = (
        df.loc[optimized_for_metric, run_keys + [best_column]]
        .rename(columns={best_column: hpo_metric_rename})
        .dropna(subset=[hpo_metric_rename])
        .set_index(run_keys)
        .astype({hpo_metric_rename: float})
    )
    dfs_metrics[hpo_metric_rename] = df_metric

# Align the per-metric columns side by side, keeping every run key seen in any of them.
df_metrics = pd.concat(dfs_metrics.values(), axis=1, join="outer").reset_index()

# Mean and standard deviation for each (dataset, model) pair.
df_metrics = df_metrics.groupby(['dataset_name', 'model']).agg(['mean', 'std'])
# Collapse the two-level (metric, statistic) header into "metric statistic".
df_metrics.columns = [f"{name} {stat}".strip() for name, stat in df_metrics.columns]
# hpo_seed was aggregated together with the metrics; discard its statistics.
df_metrics = df_metrics.drop(columns=['hpo_seed mean', 'hpo_seed std'])
df_metrics.index.names = ["Dataset", "Model"]


def _format_stat(x):
    """Format a statistic with 3 decimals, or "No Run" when it is missing."""
    return "No Run" if pd.isna(x) else f"{x:.3f}"


# Printable "mean $\pm$ std" column for every metric label.
for metric in hpo_metrics_rename:
    mean_text = df_metrics[f"{metric} mean"].apply(_format_stat)
    std_text = df_metrics[f"{metric} std"].apply(_format_stat)
    df_metrics[f"{metric}"] = mean_text + " $\\pm$ " + std_text


# Reset Seed level
# df_metrics = df_metrics.reset_index(level="Seed")

In [55]:
df_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,ARI mean,ARI std,AMI mean,AMI std,Calinski mean,Calinski std,Silhouette mean,Silhouette std,Davies-Bouldin mean,Davies-Bouldin std,NMI mean,NMI std,ARI,AMI,Calinski,Silhouette,Davies-Bouldin,NMI
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
alizadeh-2000-v2,Batch CoHiRF,0.359896,0.014291,5.065753e-01,0.021294,4.273626,0.237620,0.048742,0.009521,0.411411,0.084283,,,0.360 $\pm$ 0.014,0.507 $\pm$ 0.021,4.274 $\pm$ 0.238,0.049 $\pm$ 0.010,0.411 $\pm$ 0.084,No Run $\pm$ No Run
alizadeh-2000-v2,Batch CoHiRF-DBSCAN,0.000000,0.000000,-1.355437e-14,0.000000,-1.000000,0.000000,-1.000000,0.000000,1000.000000,0.000000,,,0.000 $\pm$ 0.000,-0.000 $\pm$ 0.000,-1.000 $\pm$ 0.000,-1.000 $\pm$ 0.000,1000.000 $\pm$ 0.000,No Run $\pm$ No Run
alizadeh-2000-v2,Batch CoHiRF-KernelRBF,0.078146,0.055388,8.028088e-02,0.017931,1.575115,0.093583,0.044387,0.027604,0.470840,0.074694,,,0.078 $\pm$ 0.055,0.080 $\pm$ 0.018,1.575 $\pm$ 0.094,0.044 $\pm$ 0.028,0.471 $\pm$ 0.075,No Run $\pm$ No Run
alizadeh-2000-v2,Batch CoHiRF-SC-SRGF,0.785098,0.075567,6.652103e-01,0.097561,14.183740,1.269080,0.176404,0.012153,1.578932,0.133080,,,0.785 $\pm$ 0.076,0.665 $\pm$ 0.098,14.184 $\pm$ 1.269,0.176 $\pm$ 0.012,1.579 $\pm$ 0.133,No Run $\pm$ No Run
alizadeh-2000-v2,CoHiRF,0.865875,0.006962,7.713592e-01,0.029894,15.094428,0.125896,0.195661,0.014860,0.895849,0.082582,,,0.866 $\pm$ 0.007,0.771 $\pm$ 0.030,15.094 $\pm$ 0.126,0.196 $\pm$ 0.015,0.896 $\pm$ 0.083,No Run $\pm$ No Run
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
shuttle,CoHiRF-DBSCAN,0.691670,0.007741,6.041114e-01,0.072497,1661.914919,938.850704,0.947420,0.013002,0.043827,0.003614,,,0.692 $\pm$ 0.008,0.604 $\pm$ 0.072,1661.915 $\pm$ 938.851,0.947 $\pm$ 0.013,0.044 $\pm$ 0.004,No Run $\pm$ No Run
shuttle,CoHiRF-KernelRBF,0.460048,0.119175,4.577238e-01,0.062942,10955.561513,4532.351396,0.474109,0.131954,1.191542,0.132182,,,0.460 $\pm$ 0.119,0.458 $\pm$ 0.063,10955.562 $\pm$ 4532.351,0.474 $\pm$ 0.132,1.192 $\pm$ 0.132,No Run $\pm$ No Run
shuttle,DBSCAN,0.708522,0.012708,6.430448e-01,0.001591,7504.081967,12.888252,0.948136,0.000715,1.738127,0.306075,,,0.709 $\pm$ 0.013,0.643 $\pm$ 0.002,7504.082 $\pm$ 12.888,0.948 $\pm$ 0.001,1.738 $\pm$ 0.306,No Run $\pm$ No Run
shuttle,K-Means,0.608404,0.000000,4.727386e-01,0.013506,30845.870523,297.411898,0.960408,0.008693,0.556971,0.011138,,,0.608 $\pm$ 0.000,0.473 $\pm$ 0.014,30845.871 $\pm$ 297.412,0.960 $\pm$ 0.009,0.557 $\pm$ 0.011,No Run $\pm$ No Run


In [56]:
# Add mean/std time columns to the existing df_metrics dataframe.
# The runs are filtered exactly like the ones used to build df_metrics:
# standardized runs of the models in model_names, with hpo_seed in range(5).
df = df_runs_parents.copy()
df = df.loc[df["standardize"] == True]
df = df.loc[df["model"].isin(model_names.keys())]
df = df.replace({"model": model_names})
df = df.replace({"dataset_name": dataset_names})
df = df.loc[df["hpo_seed"].isin(range(5))]

# Mean and std of the times for each dataset-model combination. The runs are
# pooled across all HPO metrics, not split per metric.
df_times = (
    df.groupby(["dataset_name", "model"])
    .agg({"best/elapsed_time": ["mean", "std"], "fit_model_return_elapsed_time": ["mean", "std"]})
    .rename(columns={"best/elapsed_time": "Best Time", "fit_model_return_elapsed_time": "HPO Time"})
)

# Flatten multiindex columns into "<name> <statistic>"
df_times.columns = [' '.join(col).strip() for col in df_times.columns.values]
# Set the same index structure as df_metrics
df_times.index.names = ["Dataset", "Model"]

# Printable "mean $\pm$ std" columns; a missing statistic is shown as "No Run".
df_times["Best Time"] = (
    df_times["Best Time mean"].apply(lambda x: f"{x:4.3f}" if not pd.isna(x) else "No Run")
    + " $\\pm$ "
    + df_times["Best Time std"].apply(lambda x: f"{x:4.3f}" if not pd.isna(x) else "No Run")
)
df_times["HPO Time"] = (
    df_times["HPO Time mean"].apply(lambda x: f"{x:4.3f}" if not pd.isna(x) else "No Run")
    + " $\\pm$ "
    + df_times["HPO Time std"].apply(lambda x: f"{x:4.3f}" if not pd.isna(x) else "No Run")
)

# Join with the existing df_metrics (verify we have the same number of rows!).
# Time columns left over from a previous execution of this cell are dropped
# first: joining frames with overlapping column names and no suffix raises a
# ValueError, which made the cell fail when it was run a second time.
df_metrics = df_metrics.drop(columns=df_times.columns, errors="ignore").join(df_times, how="outer")

In [57]:
# # Create a time-based dataframe with elapsed times for each metric optimization
# # Using the same filtering approach as the original df_metrics
# df_filtered = df_runs_parents.loc[df_runs_parents['standardize'] == True].copy()
# df_filtered = df_filtered.loc[df_filtered['model'].isin(model_names.keys())]
# df_filtered = df_filtered.replace({"model": model_names})
# df_filtered = df_filtered.replace({"dataset_name": dataset_names})

# # Create separate dataframes for each metric optimization with time columns
# df_ari_time = df_filtered.loc[df_filtered['hpo_metric'] == 'adjusted_rand'][
#     ['dataset_name', 'model', 'best/elapsed_time', 'fit_model_return_elapsed_time']
# ].rename(columns={'best/elapsed_time': 'ARI_best_time', 'fit_model_return_elapsed_time': 'ARI_total_time'})

# df_ami_time = df_filtered.loc[df_filtered['hpo_metric'] == 'adjusted_mutual_info'][
#     ['dataset_name', 'model', 'best/elapsed_time', 'fit_model_return_elapsed_time']
# ].rename(columns={'best/elapsed_time': 'AMI_best_time', 'fit_model_return_elapsed_time': 'AMI_total_time'})

# df_nmi_time = df_filtered.loc[df_filtered['hpo_metric'] == 'normalized_mutual_info'][
#     ['dataset_name', 'model', 'best/elapsed_time', 'fit_model_return_elapsed_time']
# ].rename(columns={'best/elapsed_time': 'NMI_best_time', 'fit_model_return_elapsed_time': 'NMI_total_time'})

# df_calinski_time = df_filtered.loc[df_filtered['hpo_metric'] == 'calinski_harabasz_score'][
#     ['dataset_name', 'model', 'best/elapsed_time', 'fit_model_return_elapsed_time']
# ].rename(columns={'best/elapsed_time': 'Calinski_best_time', 'fit_model_return_elapsed_time': 'Calinski_total_time'})

# df_silhouette_time = df_filtered.loc[df_filtered['hpo_metric'] == 'silhouette'][
#     ['dataset_name', 'model', 'best/elapsed_time', 'fit_model_return_elapsed_time']
# ].rename(columns={'best/elapsed_time': 'Silhouette_best_time', 'fit_model_return_elapsed_time': 'Silhouette_total_time'})

# df_davies_bouldin_time = df_filtered.loc[df_filtered['hpo_metric'] == 'davies_bouldin_score'][
#     ['dataset_name', 'model', 'best/elapsed_time', 'fit_model_return_elapsed_time']
# ].rename(columns={'best/elapsed_time': 'Davies-Bouldin_best_time', 'fit_model_return_elapsed_time': 'Davies-Bouldin_total_time'})

# # Remove missing values before setting index
# df_ari_time = df_ari_time.dropna(subset=["ARI_best_time", "ARI_total_time"])
# df_ami_time = df_ami_time.dropna(subset=["AMI_best_time", "AMI_total_time"])
# df_nmi_time = df_nmi_time.dropna(subset=["NMI_best_time", "NMI_total_time"])
# df_calinski_time = df_calinski_time.dropna(subset=["Calinski_best_time", "Calinski_total_time"])
# df_silhouette_time = df_silhouette_time.dropna(subset=["Silhouette_best_time", "Silhouette_total_time"])
# df_davies_bouldin_time = df_davies_bouldin_time.dropna(subset=["Davies-Bouldin_best_time", "Davies-Bouldin_total_time"])

# # Set multi-index for all dataframes
# df_ari_time = df_ari_time.set_index(["dataset_name", "model"])
# df_ami_time = df_ami_time.set_index(["dataset_name", "model"])
# df_nmi_time = df_nmi_time.set_index(["dataset_name", "model"])
# df_calinski_time = df_calinski_time.set_index(["dataset_name", "model"])
# df_silhouette_time = df_silhouette_time.set_index(["dataset_name", "model"])
# df_davies_bouldin_time = df_davies_bouldin_time.set_index(["dataset_name", "model"])

# # Combine all time metrics into a single dataframe using outer join
# df_time_metrics = df_ari_time.join(df_ami_time, how="outer").join(df_nmi, how="outer").join(df_calinski_time, how="outer").join(df_silhouette_time, how="outer").join(df_davies_bouldin_time, how="outer")

# # Rename index levels
# df_time_metrics.index.names = ["Dataset", "Model"]

In [58]:
df_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,ARI mean,ARI std,AMI mean,AMI std,Calinski mean,Calinski std,Silhouette mean,Silhouette std,Davies-Bouldin mean,Davies-Bouldin std,...,Calinski,Silhouette,Davies-Bouldin,NMI,Best Time mean,Best Time std,HPO Time mean,HPO Time std,Best Time,HPO Time
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
alizadeh-2000-v2,Batch CoHiRF,0.359896,0.014291,5.065753e-01,0.021294,4.273626,0.237620,0.048742,0.009521,0.411411,0.084283,...,4.274 $\pm$ 0.238,0.049 $\pm$ 0.010,0.411 $\pm$ 0.084,No Run $\pm$ No Run,0.142813,0.068740,154.716981,4.251098,0.143 $\pm$ 0.069,154.717 $\pm$ 4.251
alizadeh-2000-v2,Batch CoHiRF-DBSCAN,0.000000,0.000000,-1.355437e-14,0.000000,-1.000000,0.000000,-1.000000,0.000000,1000.000000,0.000000,...,-1.000 $\pm$ 0.000,-1.000 $\pm$ 0.000,1000.000 $\pm$ 0.000,No Run $\pm$ No Run,2.309132,0.231695,138.359086,5.834786,2.309 $\pm$ 0.232,138.359 $\pm$ 5.835
alizadeh-2000-v2,Batch CoHiRF-KernelRBF,0.078146,0.055388,8.028088e-02,0.017931,1.575115,0.093583,0.044387,0.027604,0.470840,0.074694,...,1.575 $\pm$ 0.094,0.044 $\pm$ 0.028,0.471 $\pm$ 0.075,No Run $\pm$ No Run,0.394981,0.464414,167.077405,7.737474,0.395 $\pm$ 0.464,167.077 $\pm$ 7.737
alizadeh-2000-v2,Batch CoHiRF-SC-SRGF,0.785098,0.075567,6.652103e-01,0.097561,14.183740,1.269080,0.176404,0.012153,1.578932,0.133080,...,14.184 $\pm$ 1.269,0.176 $\pm$ 0.012,1.579 $\pm$ 0.133,No Run $\pm$ No Run,1.953276,0.851366,265.944189,32.207204,1.953 $\pm$ 0.851,265.944 $\pm$ 32.207
alizadeh-2000-v2,CoHiRF,0.865875,0.006962,7.713592e-01,0.029894,15.094428,0.125896,0.195661,0.014860,0.895849,0.082582,...,15.094 $\pm$ 0.126,0.196 $\pm$ 0.015,0.896 $\pm$ 0.083,No Run $\pm$ No Run,0.054494,0.032927,142.623437,4.180228,0.054 $\pm$ 0.033,142.623 $\pm$ 4.180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
shuttle,CoHiRF-DBSCAN,0.691670,0.007741,6.041114e-01,0.072497,1661.914919,938.850704,0.947420,0.013002,0.043827,0.003614,...,1661.915 $\pm$ 938.851,0.947 $\pm$ 0.013,0.044 $\pm$ 0.004,No Run $\pm$ No Run,146.100549,136.031869,12001.592468,7286.835987,146.101 $\pm$ 136.032,12001.592 $\pm$ 7286.836
shuttle,CoHiRF-KernelRBF,0.460048,0.119175,4.577238e-01,0.062942,10955.561513,4532.351396,0.474109,0.131954,1.191542,0.132182,...,10955.562 $\pm$ 4532.351,0.474 $\pm$ 0.132,1.192 $\pm$ 0.132,No Run $\pm$ No Run,11.999437,6.496682,1136.000458,864.371411,11.999 $\pm$ 6.497,1136.000 $\pm$ 864.371
shuttle,DBSCAN,0.708522,0.012708,6.430448e-01,0.001591,7504.081967,12.888252,0.948136,0.000715,1.738127,0.306075,...,7504.082 $\pm$ 12.888,0.948 $\pm$ 0.001,1.738 $\pm$ 0.306,No Run $\pm$ No Run,31.611373,13.034294,3234.574771,1047.159839,31.611 $\pm$ 13.034,3234.575 $\pm$ 1047.160
shuttle,K-Means,0.608404,0.000000,4.727386e-01,0.013506,30845.870523,297.411898,0.960408,0.008693,0.556971,0.011138,...,30845.871 $\pm$ 297.412,0.960 $\pm$ 0.009,0.557 $\pm$ 0.011,No Run $\pm$ No Run,0.073332,0.062088,478.330452,836.361835,0.073 $\pm$ 0.062,478.330 $\pm$ 836.362


The following cell prints the LaTeX code for a clean table; we only need to make a small adjustment to the first line to delete the "key" and keep a single header. For the longtable environment (full data), we need to add "\*" at the end of the lines after which we do not want a page break. We should also replace the entire begin{table} ... end{table} with begin{longtable} ... end{longtable} in the LaTeX file; if you want to add a caption and a label, put both on the same line and end that line with '\\'.


In [59]:
# Full results table (every dataset/model pair), printed as a LaTeX longtable.
df_latex = df_metrics.copy()

# Only the formatted "mean $\pm$ std" columns are printed; NMI is hidden too.
visible_columns = hpo_metrics_rename + ["Best Time", "HPO Time"]
columns_to_hide = [col for col in df_latex.columns if col not in visible_columns]
columns_to_hide += ["NMI"]

# Bold marks the best value per dataset: highest for ARI/AMI/Calinski/Silhouette,
# lowest for Davies-Bouldin and the times.
highlight_max_ari = partial(highlight_max, column_name="ARI mean")
highlight_max_ami = partial(highlight_max, column_name="AMI mean")
highlight_max_calinski = partial(highlight_max, column_name="Calinski mean")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette mean")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin mean")
highlight_min_best_time = partial(highlight_min, column_name="Best Time mean")
highlight_min_hpo_time = partial(highlight_min, column_name="HPO Time mean")

# Underline marks the runner-up, with the same direction per metric.
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI mean")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI mean")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski mean")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette mean")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin mean")
underline_2nd_min_best_time = partial(underline_2nd_min, column_name="Best Time mean")
underline_2nd_min_hpo_time = partial(underline_2nd_min, column_name="HPO Time mean")

# (printed column, bold function, underline function), applied in this order.
# The time partials above are defined but not applied to the table.
style_steps = [
    ("ARI", highlight_max_ari, underline_2nd_max_ari),
    ("AMI", highlight_max_ami, underline_2nd_max_ami),
    ("Calinski", highlight_max_calinski, underline_2nd_max_calinski),
    ("Silhouette", highlight_max_silhouette, underline_2nd_max_silhouette),
    ("Davies-Bouldin", highlight_min_davies_bouldin, underline_2nd_min_davies_bouldin),
]
styler = df_latex.style
for display_column, bold_best, underline_runner_up in style_steps:
    # Each function reads "<metric> mean" and styles both it and the printed column.
    styled_columns = [display_column, f"{display_column} mean"]
    styler = styler.apply(bold_best, subset=styled_columns, axis=None)
    styler = styler.apply(underline_runner_up, subset=styled_columns, axis=None)

n_visible_columns = len(df_latex.columns) - len(columns_to_hide)
latex_table = styler.hide(columns_to_hide, axis=1).to_latex(
    hrules=True,
    clines="skip-last;data",
    convert_css=True,
    column_format="ll" + "l" * n_visible_columns,
    environment="longtable",
)
print(latex_table)

\begin{longtable}{lllllllll}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Best Time & HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\endfirsthead
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Best Time & HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\endhead
\midrule
\multicolumn{9}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
\multirow[c]{14}{*}{alizadeh-2000-v2} & Batch CoHiRF & 0.360 $\pm$ 0.014 & 0.507 $\pm$ 0.021 & 4.274 $\pm$ 0.238 & 0.049 $\pm$ 0.010 & \bfseries 0.411 $\pm$ 0.084 & 0.143 $\pm$ 0.069 & 154.717 $\pm$ 4.251 \\
 & Batch CoHiRF-DBSCAN & 0.000 $\pm$ 0.000 & -0.000 $\pm$ 0.000 & -1.000 $\pm$ 0.000 & -1.000 $\pm$ 0.000 & 1000.000 $\pm$ 0.000 & 2.309 $\pm$ 0.232 & 138.359 $\pm$ 5.835 \\
 & Batch CoHiRF-KernelRBF & 0.078 $\pm$ 0.055 & 0.080 $\pm$ 0.018 & 1.575 $\pm$ 0.094 & 0.044 $\pm$ 0.028 & 0.471 $\pm$ 0.075 & 0.395 $\pm$ 0.464 & 167.077 $\pm$ 7.737 \\
 & Batch CoHiRF

# KMeans

In [56]:
# K-Means family table: a subset of datasets and models, printed with the
# default LaTeX environment of Styler.to_latex (no longtable).
df_latex = df_metrics.copy()

# Only the formatted "mean $\pm$ std" columns are printed; NMI is hidden too.
visible_columns = hpo_metrics_rename + ["Best Time", "HPO Time"]
columns_to_hide = [col for col in df_latex.columns if col not in visible_columns]
columns_to_hide += ["NMI"]

datasets_to_keep = [
    "garber-2001",
    "alizadeh-2000-v2",
    "golub-1999-v2",
    "armstrong-2002-v1",
    "nursery",
    "segment",
]
models_to_keep = [
    "K-Means",
    "CoHiRF",
    "CoHiRF-1000",
    "Batch CoHiRF",
]
# Keep only the rows whose dataset and model are both selected.
in_selected_datasets = df_latex.index.get_level_values("Dataset").isin(datasets_to_keep)
in_selected_models = df_latex.index.get_level_values("Model").isin(models_to_keep)
df_latex = df_latex.loc[in_selected_datasets & in_selected_models, :]

# Bold marks the best value per dataset: highest for ARI/AMI/Calinski/Silhouette,
# lowest for Davies-Bouldin and the times.
highlight_max_ari = partial(highlight_max, column_name="ARI mean")
highlight_max_ami = partial(highlight_max, column_name="AMI mean")
highlight_max_calinski = partial(highlight_max, column_name="Calinski mean")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette mean")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin mean")
highlight_min_best_time = partial(highlight_min, column_name="Best Time mean")
highlight_min_hpo_time = partial(highlight_min, column_name="HPO Time mean")

# Underline marks the runner-up, with the same direction per metric.
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI mean")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI mean")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski mean")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette mean")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin mean")
underline_2nd_min_best_time = partial(underline_2nd_min, column_name="Best Time mean")
underline_2nd_min_hpo_time = partial(underline_2nd_min, column_name="HPO Time mean")

# (printed column, bold function, underline function), applied in this order.
# The time partials above are defined but not applied to the table.
style_steps = [
    ("ARI", highlight_max_ari, underline_2nd_max_ari),
    ("AMI", highlight_max_ami, underline_2nd_max_ami),
    ("Calinski", highlight_max_calinski, underline_2nd_max_calinski),
    ("Silhouette", highlight_max_silhouette, underline_2nd_max_silhouette),
    ("Davies-Bouldin", highlight_min_davies_bouldin, underline_2nd_min_davies_bouldin),
]
styler = df_latex.style
for display_column, bold_best, underline_runner_up in style_steps:
    # Each function reads "<metric> mean" and styles both it and the printed column.
    styled_columns = [display_column, f"{display_column} mean"]
    styler = styler.apply(bold_best, subset=styled_columns, axis=None)
    styler = styler.apply(underline_runner_up, subset=styled_columns, axis=None)

n_visible_columns = len(df_latex.columns) - len(columns_to_hide)
latex_table = styler.hide(columns_to_hide, axis=1).to_latex(
    hrules=True,
    clines="skip-last;data",
    convert_css=True,
    column_format="ll" + "l" * n_visible_columns,
)
print(latex_table)

\begin{tabular}{lllllllll}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Best Time & HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{3}{*}{alizadeh-2000-v2} & Batch CoHiRF & 0.360 $\pm$ 0.014 & 0.507 $\pm$ 0.021 & 4.274 $\pm$ 0.238 & 0.049 $\pm$ 0.010 & \bfseries 0.411 $\pm$ 0.084 & 0.143 $\pm$ 0.069 & 154.717 $\pm$ 4.251 \\
 & CoHiRF & \bfseries 0.866 $\pm$ 0.007 & \bfseries 0.771 $\pm$ 0.030 & \underline{15.094 $\pm$ 0.126} & \underline{0.196 $\pm$ 0.015} & \underline{0.896 $\pm$ 0.083} & 0.054 $\pm$ 0.033 & 142.623 $\pm$ 4.180 \\
 & K-Means & \underline{0.838 $\pm$ 0.014} & \underline{0.767 $\pm$ 0.037} & \bfseries 15.151 $\pm$ 0.000 & \bfseries 0.204 $\pm$ 0.036 & 0.934 $\pm$ 0.021 & 0.012 $\pm$ 0.003 & 137.865 $\pm$ 3.673 \\
\cline{1-9}
\multirow[c]{3}{*}{armstrong-2002-v1} & Batch CoHiRF & 0.225 $\pm$ 0.071 & 0.267 $\pm$ 0.061 & 2.933 $\pm$ 0.333 & 0.003 $\pm$ 0.007 & \bfseries 0.401 $\pm$ 0.025 & 0.125 $\pm$ 0.039 & 90.402 $\pm$ 3.

# Kernel KMeans

In [57]:
# Kernel K-Means family table: a subset of datasets and models, printed with
# the default LaTeX environment of Styler.to_latex (no longtable).
df_latex = df_metrics.copy()

# Only the formatted "mean $\pm$ std" columns are printed; NMI is hidden too.
visible_columns = hpo_metrics_rename + ["Best Time", "HPO Time"]
columns_to_hide = [col for col in df_latex.columns if col not in visible_columns]
columns_to_hide += ["NMI"]

datasets_to_keep = [
    "khan-2001",
    "bittner-2000",
    "iris",
    "satimage",
]
models_to_keep = [
    "Kernel RBF K-Means",
    "CoHiRF-KernelRBF",
    "Batch CoHiRF-KernelRBF",
]
# Keep only the rows whose dataset and model are both selected. This filter
# was previously applied twice in a row; the second pass selected nothing new
# and has been removed.
in_selected_datasets = df_latex.index.get_level_values("Dataset").isin(datasets_to_keep)
in_selected_models = df_latex.index.get_level_values("Model").isin(models_to_keep)
df_latex = df_latex.loc[in_selected_datasets & in_selected_models, :]

# Bold marks the best value per dataset: highest for ARI/AMI/Calinski/Silhouette,
# lowest for Davies-Bouldin and the times.
highlight_max_ari = partial(highlight_max, column_name="ARI mean")
highlight_max_ami = partial(highlight_max, column_name="AMI mean")
highlight_max_calinski = partial(highlight_max, column_name="Calinski mean")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette mean")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin mean")
highlight_min_best_time = partial(highlight_min, column_name="Best Time mean")
highlight_min_hpo_time = partial(highlight_min, column_name="HPO Time mean")

# Underline marks the runner-up, with the same direction per metric.
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI mean")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI mean")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski mean")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette mean")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin mean")
underline_2nd_min_best_time = partial(underline_2nd_min, column_name="Best Time mean")
underline_2nd_min_hpo_time = partial(underline_2nd_min, column_name="HPO Time mean")

# (printed column, bold function, underline function), applied in this order.
# The time partials above are defined but not applied to the table.
style_steps = [
    ("ARI", highlight_max_ari, underline_2nd_max_ari),
    ("AMI", highlight_max_ami, underline_2nd_max_ami),
    ("Calinski", highlight_max_calinski, underline_2nd_max_calinski),
    ("Silhouette", highlight_max_silhouette, underline_2nd_max_silhouette),
    ("Davies-Bouldin", highlight_min_davies_bouldin, underline_2nd_min_davies_bouldin),
]
styler = df_latex.style
for display_column, bold_best, underline_runner_up in style_steps:
    # Each function reads "<metric> mean" and styles both it and the printed column.
    styled_columns = [display_column, f"{display_column} mean"]
    styler = styler.apply(bold_best, subset=styled_columns, axis=None)
    styler = styler.apply(underline_runner_up, subset=styled_columns, axis=None)

n_visible_columns = len(df_latex.columns) - len(columns_to_hide)
latex_table = styler.hide(columns_to_hide, axis=1).to_latex(
    hrules=True,
    clines="skip-last;data",
    convert_css=True,
    column_format="ll" + "l" * n_visible_columns,
)
print(latex_table)

\begin{tabular}{lllllllll}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Best Time & HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{3}{*}{bittner-2000} & Batch CoHiRF-KernelRBF & 0.088 $\pm$ 0.015 & 0.098 $\pm$ 0.011 & \bfseries 1.918 $\pm$ 0.217 & 0.041 $\pm$ 0.020 & \bfseries 0.499 $\pm$ 0.083 & 0.224 $\pm$ 0.180 & 171.649 $\pm$ 4.442 \\
 & CoHiRF-KernelRBF & \bfseries 0.104 $\pm$ 0.049 & \underline{0.136 $\pm$ 0.043} & \underline{1.569 $\pm$ 0.146} & \bfseries 0.054 $\pm$ 0.066 & \underline{0.530 $\pm$ 0.053} & 0.191 $\pm$ 0.118 & 158.762 $\pm$ 4.244 \\
 & Kernel RBF K-Means & \underline{0.090 $\pm$ 0.038} & \bfseries 0.137 $\pm$ 0.034 & 1.334 $\pm$ 0.128 & \underline{0.043 $\pm$ 0.021} & 0.905 $\pm$ 0.014 & 0.062 $\pm$ 0.017 & 148.491 $\pm$ 2.636 \\
\cline{1-9}
\multirow[c]{3}{*}{iris} & Batch CoHiRF-KernelRBF & \underline{0.687 $\pm$ 0.090} & 0.704 $\pm$ 0.073 & 121.522 $\pm$ 26.880 & 0.396 $\pm$ 0.162 & 0.889 $\pm$ 0.169 & 0.351 $\

# DBSCAN

In [58]:
# DBSCAN family table: a subset of datasets and models, printed with the
# default LaTeX environment of Styler.to_latex (no longtable).
df_latex = df_metrics.copy()

# Only the formatted "mean $\pm$ std" columns are printed; NMI is hidden too.
visible_columns = hpo_metrics_rename + ["Best Time", "HPO Time"]
columns_to_hide = [col for col in df_latex.columns if col not in visible_columns]
columns_to_hide += ["NMI"]

datasets_to_keep = ["ecoli", "binary-alpha-digits", "segment", "chowdary-2006", "shuttle"]
models_to_keep = [
    "DBSCAN",
    "CoHiRF-DBSCAN",
    "Batch CoHiRF-DBSCAN",
]
# Keep only the rows whose dataset and model are both selected.
in_selected_datasets = df_latex.index.get_level_values("Dataset").isin(datasets_to_keep)
in_selected_models = df_latex.index.get_level_values("Model").isin(models_to_keep)
df_latex = df_latex.loc[in_selected_datasets & in_selected_models, :]

# Bold marks the best value per dataset: highest for ARI/AMI/Calinski/Silhouette,
# lowest for Davies-Bouldin and the times.
highlight_max_ari = partial(highlight_max, column_name="ARI mean")
highlight_max_ami = partial(highlight_max, column_name="AMI mean")
highlight_max_calinski = partial(highlight_max, column_name="Calinski mean")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette mean")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin mean")
highlight_min_best_time = partial(highlight_min, column_name="Best Time mean")
highlight_min_hpo_time = partial(highlight_min, column_name="HPO Time mean")

# Underline marks the runner-up, with the same direction per metric.
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI mean")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI mean")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski mean")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette mean")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin mean")
underline_2nd_min_best_time = partial(underline_2nd_min, column_name="Best Time mean")
underline_2nd_min_hpo_time = partial(underline_2nd_min, column_name="HPO Time mean")

# (printed column, bold function, underline function), applied in this order.
# The time partials above are defined but not applied to the table.
style_steps = [
    ("ARI", highlight_max_ari, underline_2nd_max_ari),
    ("AMI", highlight_max_ami, underline_2nd_max_ami),
    ("Calinski", highlight_max_calinski, underline_2nd_max_calinski),
    ("Silhouette", highlight_max_silhouette, underline_2nd_max_silhouette),
    ("Davies-Bouldin", highlight_min_davies_bouldin, underline_2nd_min_davies_bouldin),
]
styler = df_latex.style
for display_column, bold_best, underline_runner_up in style_steps:
    # Each function reads "<metric> mean" and styles both it and the printed column.
    styled_columns = [display_column, f"{display_column} mean"]
    styler = styler.apply(bold_best, subset=styled_columns, axis=None)
    styler = styler.apply(underline_runner_up, subset=styled_columns, axis=None)

n_visible_columns = len(df_latex.columns) - len(columns_to_hide)
latex_table = styler.hide(columns_to_hide, axis=1).to_latex(
    hrules=True,
    clines="skip-last;data",
    convert_css=True,
    column_format="ll" + "l" * n_visible_columns,
)
print(latex_table)

\begin{tabular}{lllllllll}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Best Time & HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{3}{*}{binary-alpha-digits} & Batch CoHiRF-DBSCAN & \underline{0.006 $\pm$ 0.008} & \underline{0.032 $\pm$ 0.033} & \underline{1.473 $\pm$ 0.131} & \bfseries 0.080 $\pm$ 0.025 & \bfseries 0.837 $\pm$ 0.037 & 0.284 $\pm$ 0.238 & 58.526 $\pm$ 3.098 \\
 & CoHiRF-DBSCAN & \bfseries 0.009 $\pm$ 0.005 & \bfseries 0.035 $\pm$ 0.026 & 1.353 $\pm$ 0.109 & \underline{0.011 $\pm$ 0.035} & \underline{0.862 $\pm$ 0.035} & 0.189 $\pm$ 0.041 & 57.093 $\pm$ 1.612 \\
 & DBSCAN & 0.000 $\pm$ 0.000 & 0.002 $\pm$ 0.003 & \bfseries 1.574 $\pm$ 3.525 & -0.561 $\pm$ 0.602 & 600.462 $\pm$ 547.090 & 0.053 $\pm$ 0.036 & 44.413 $\pm$ 2.076 \\
\cline{1-9}
\multirow[c]{3}{*}{chowdary-2006} & Batch CoHiRF-DBSCAN & \underline{0.068 $\pm$ 0.053} & \underline{0.099 $\pm$ 0.047} & \underline{37.353 $\pm$ 0.000} & 0.398 $\pm$ 0.366 & \underlin

# SC-SRGF


In [59]:
# LaTeX table restricted to the SC-SRGF family of models on a handful of datasets.
df_latex = df_metrics.copy()

# Everything except the HPO metrics and the two timing columns is hidden; NMI is hidden too.
shown_columns = hpo_metrics_rename + ["Best Time", "HPO Time"]
columns_to_hide = [col for col in df_latex.columns if col not in shown_columns] + ["NMI"]

datasets_to_keep = ["alizadeh-2000-v3", "alizadeh-2000-v2", "har", "satimage", "chowdary-2006"]
models_to_keep = ["SC-SRGF", "CoHiRF-SC-SRGF", "Batch CoHiRF-SC-SRGF"]

# Keep only the (Dataset, Model) index entries selected above.
dataset_level = df_latex.index.get_level_values("Dataset")
model_level = df_latex.index.get_level_values("Model")
df_latex = df_latex.loc[dataset_level.isin(datasets_to_keep) & model_level.isin(models_to_keep), :]

# Styler callbacks, each bound to the "<metric> mean" column it ranks on.
# NOTE(review): judging by the printed table, the highlight_* helpers bold the best entry
# and the underline_2nd_* helpers underline the runner-up — confirm in their definitions.
highlight_max_ari = partial(highlight_max, column_name="ARI mean")
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI mean")
highlight_max_ami = partial(highlight_max, column_name="AMI mean")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI mean")
highlight_max_calinski = partial(highlight_max, column_name="Calinski mean")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski mean")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette mean")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette mean")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin mean")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin mean")
# The timing callbacks are defined but not applied to the table below.
highlight_min_best_time = partial(highlight_min, column_name="Best Time mean")
underline_2nd_min_best_time = partial(underline_2nd_min, column_name="Best Time mean")
highlight_min_hpo_time = partial(highlight_min, column_name="HPO Time mean")
underline_2nd_min_hpo_time = partial(underline_2nd_min, column_name="HPO Time mean")

# Apply the callbacks in a fixed order; each one sees the display column and its "mean" companion.
latex_styler = df_latex.style
for style_fn, metric_name in [
    (highlight_max_ari, "ARI"),
    (underline_2nd_max_ari, "ARI"),
    (highlight_max_ami, "AMI"),
    (underline_2nd_max_ami, "AMI"),
    (highlight_max_calinski, "Calinski"),
    (underline_2nd_max_calinski, "Calinski"),
    (highlight_max_silhouette, "Silhouette"),
    (underline_2nd_max_silhouette, "Silhouette"),
    (highlight_min_davies_bouldin, "Davies-Bouldin"),
    (underline_2nd_min_davies_bouldin, "Davies-Bouldin"),
]:
    latex_styler = latex_styler.apply(style_fn, subset=[metric_name, f"{metric_name} mean"], axis=None)

# Two "l" columns for the index levels plus one per column that stays visible.
n_visible_columns = len(df_latex.columns) - len(columns_to_hide)
latex_table = latex_styler.hide(columns_to_hide, axis=1).to_latex(
    hrules=True,
    clines="skip-last;data",
    convert_css=True,
    column_format="ll" + "l" * n_visible_columns,
    # environment="longtable",
)
print(latex_table)

\begin{tabular}{lllllllll}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Best Time & HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{2}{*}{alizadeh-2000-v2} & CoHiRF-SC-SRGF & \underline{0.633 $\pm$ 0.042} & \underline{0.758 $\pm$ 0.031} & \underline{7.422 $\pm$ 0.041} & \underline{0.094 $\pm$ 0.001} & \underline{1.550 $\pm$ 0.019} & 1.503 $\pm$ 1.527 & 254.264 $\pm$ 14.525 \\
 & SC-SRGF & \bfseries 0.891 $\pm$ 0.125 & \bfseries 0.854 $\pm$ 0.093 & \bfseries 12.338 $\pm$ 0.000 & \bfseries 0.194 $\pm$ 0.000 & \bfseries 0.936 $\pm$ 0.014 & 0.132 $\pm$ 0.061 & 142.764 $\pm$ 4.481 \\
\cline{1-9}
\multirow[c]{2}{*}{alizadeh-2000-v3} & CoHiRF-SC-SRGF & \underline{0.443 $\pm$ 0.013} & \underline{0.628 $\pm$ 0.019} & \underline{7.411 $\pm$ 0.060} & \underline{0.093 $\pm$ 0.002} & \underline{1.562 $\pm$ 0.047} & 0.928 $\pm$ 1.146 & 245.332 $\pm$ 13.850 \\
 & SC-SRGF & \bfseries 0.519 $\pm$ 0.000 & \bfseries 0.702 $\pm$ 0.048 & \bfseries 12.341 $\p

# COIL-20 and MNIST

In [61]:
# LaTeX table comparing the listed models on the coil-20 and mnist datasets.
df_latex = df_metrics.copy()

# Everything except the HPO metrics and the two timing columns is hidden; NMI is hidden too.
shown_columns = hpo_metrics_rename + ["Best Time", "HPO Time"]
columns_to_hide = [col for col in df_latex.columns if col not in shown_columns] + ["NMI"]

datasets_to_keep = ["coil-20", "mnist"]
models_to_keep = [
    "K-Means",
    "CoHiRF",
    "CoHiRF-1000",
    "Batch CoHiRF",
    "Kernel RBF K-Means",
    "CoHiRF-KernelRBF",
    "Batch CoHiRF-KernelRBF",
    "DBSCAN",
    "CoHiRF-DBSCAN",
    "Batch CoHiRF-DBSCAN",
    "SC-SRGF",
    "Batch CoHiRF-SC-SRGF",
]

# Keep only the (Dataset, Model) index entries selected above.
dataset_level = df_latex.index.get_level_values("Dataset")
model_level = df_latex.index.get_level_values("Model")
df_latex = df_latex.loc[dataset_level.isin(datasets_to_keep) & model_level.isin(models_to_keep), :]

# Styler callbacks, each bound to the "<metric> mean" column it ranks on.
# NOTE(review): judging by the printed table, the highlight_* helpers bold the best entry
# and the underline_2nd_* helpers underline the runner-up — confirm in their definitions.
highlight_max_ari = partial(highlight_max, column_name="ARI mean")
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI mean")
highlight_max_ami = partial(highlight_max, column_name="AMI mean")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI mean")
highlight_max_calinski = partial(highlight_max, column_name="Calinski mean")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski mean")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette mean")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette mean")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin mean")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin mean")
# The timing callbacks are defined but not applied to the table below.
highlight_min_best_time = partial(highlight_min, column_name="Best Time mean")
underline_2nd_min_best_time = partial(underline_2nd_min, column_name="Best Time mean")
highlight_min_hpo_time = partial(highlight_min, column_name="HPO Time mean")
underline_2nd_min_hpo_time = partial(underline_2nd_min, column_name="HPO Time mean")

# Apply the callbacks in a fixed order; each one sees the display column and its "mean" companion.
latex_styler = df_latex.style
for style_fn, metric_name in [
    (highlight_max_ari, "ARI"),
    (underline_2nd_max_ari, "ARI"),
    (highlight_max_ami, "AMI"),
    (underline_2nd_max_ami, "AMI"),
    (highlight_max_calinski, "Calinski"),
    (underline_2nd_max_calinski, "Calinski"),
    (highlight_max_silhouette, "Silhouette"),
    (underline_2nd_max_silhouette, "Silhouette"),
    (highlight_min_davies_bouldin, "Davies-Bouldin"),
    (underline_2nd_min_davies_bouldin, "Davies-Bouldin"),
]:
    latex_styler = latex_styler.apply(style_fn, subset=[metric_name, f"{metric_name} mean"], axis=None)

# Two "l" columns for the index levels plus one per column that stays visible.
n_visible_columns = len(df_latex.columns) - len(columns_to_hide)
latex_table = latex_styler.hide(columns_to_hide, axis=1).to_latex(
    hrules=True,
    clines="skip-last;data",
    convert_css=True,
    column_format="ll" + "l" * n_visible_columns,
    # environment="longtable",
)
print(latex_table)

\begin{tabular}{lllllllll}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Best Time & HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{11}{*}{coil-20} & Batch CoHiRF & 0.381 $\pm$ 0.008 & 0.632 $\pm$ 0.010 & 62.639 $\pm$ 4.789 & 0.107 $\pm$ 0.018 & 1.532 $\pm$ 0.065 & 0.541 $\pm$ 0.153 & 172.626 $\pm$ 7.695 \\
 & Batch CoHiRF-DBSCAN & 0.335 $\pm$ 0.055 & 0.588 $\pm$ 0.051 & \bfseries 291.162 $\pm$ 71.197 & -0.001 $\pm$ 0.008 & \bfseries 0.077 $\pm$ 0.041 & 1.305 $\pm$ 1.005 & 175.064 $\pm$ 19.429 \\
 & Batch CoHiRF-KernelRBF & 0.004 $\pm$ 0.006 & 0.023 $\pm$ 0.020 & 1.804 $\pm$ 0.276 & 0.005 $\pm$ 0.002 & 2.404 $\pm$ 0.100 & 10.855 $\pm$ 17.100 & 510.956 $\pm$ 280.500 \\
 & CoHiRF & 0.355 $\pm$ 0.041 & 0.627 $\pm$ 0.017 & 287.989 $\pm$ 0.164 & \underline{0.220 $\pm$ 0.095} & 1.597 $\pm$ 0.166 & 0.290 $\pm$ 0.087 & 151.293 $\pm$ 4.393 \\
 & CoHiRF-1000 & 0.338 $\pm$ 0.009 & 0.632 $\pm$ 0.006 & 288.053 $\pm$ 0.078 & 0.180 $\pm$ 0.004 & 1.725 

# Debug and explore

In [41]:
# Scratch copy of the parent-run table for ad-hoc inspection.
df = df_runs_raw_parents.copy()

In [59]:
# Narrow the scratch frame to one model / dataset_id combination.
is_target_run = (df["model"] == "BatchCoHiRF-DBSCAN-1iter-60") & (df["dataset_id"] == 47039)
df = df.loc[is_target_run]

In [60]:
# Display the currently selected runs so they can be checked by eye.
df

Unnamed: 0_level_0,status,start_time,end_time,best/base_model_kwargs/eps,best/base_model_kwargs/min_samples,best/base_model_kwargs/n_clusters,best/base_model_kwargs/n_similarities,best/base_model_kwargs/sampling_ratio,best/base_model_kwargs/sc_n_clusters,best/child_run_id,...,EXCEPTION,Last step finished,mlflow.parentRunId,raised_exception,dataset,openml_id,n_instances,n_features,n_classes,n_categorical
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0687a0e95efb434482a8174cda90d4cb,FINISHED,1759857341387,1759883000000.0,,,,,,,11edf35ba2aa4e67bfbda1a2e60600a4,...,,_on_train_end,,False,criteo-uplift-balanced,47039,1366544,14,4,2
194290e427024ea0a13f9ae54a19ad1d,FINISHED,1759857342388,1759875000000.0,,,,,,,5e6f3d3cf8c844f795bf7220090e113d,...,,_on_train_end,,False,criteo-uplift-balanced,47039,1366544,14,4,2
2d6bd7e8c4ba4385ba525787babef29a,FINISHED,1759746035261,1759772000000.0,,,,,,,cc9f24b21561489abf2010dfb6f18b20,...,,_on_train_end,,False,criteo-uplift-balanced,47039,1366544,14,4,2
3bb1b76f13684935a96c5f6921c1331d,FINISHED,1759857341387,1759876000000.0,,,,,,,92e2a42f740f494d955665bf6cd284df,...,,_on_train_end,,False,criteo-uplift-balanced,47039,1366544,14,4,2
3ce4ac0bd17943149fed033ad489646b,FINISHED,1759746035257,1759761000000.0,,,,,,,8b7dfabd8ba441ecba1950ade4c28e41,...,,_on_train_end,,False,criteo-uplift-balanced,47039,1366544,14,4,2
5136039b079740f39d60ff9070c2b98d,FINISHED,1759746035785,1759792000000.0,,,,,,,1d6e63c34e73416896a0976dba0d528f,...,,_on_train_end,,False,criteo-uplift-balanced,47039,1366544,14,4,2
5d063fc4745a4d70add5329b0640e06f,FINISHED,1759746033001,1759769000000.0,,,,,,,5e35df5e5f6e4572b12190e51a2ad5e8,...,,_on_train_end,,False,criteo-uplift-balanced,47039,1366544,14,4,2
71fea0d02579497b99daa7aaaf00a6bb,FAILED,1759510979843,1759520000000.0,,,,,,,,...,Best metric adjusted_rand not found in the bes...,_before_evaluate_model,,True,criteo-uplift-balanced,47039,1366544,14,4,2
85248d1527414425abd710547fe79470,FINISHED,1759671558525,1759708000000.0,,,,,,,4ac471428c644046a870cb5b0f7bca53,...,,_on_train_end,,False,criteo-uplift-balanced,47039,1366544,14,4,2
859d7eb2686240689573db3e60bb46c5,FINISHED,1759746035685,1759783000000.0,,,,,,,de08f3d88527492c83a42c20e2210558,...,,_on_train_end,,False,criteo-uplift-balanced,47039,1366544,14,4,2


In [61]:
# The runs currently held in `df` are the parents to remove.
runs_to_delete_parents = df.index.tolist()

# Their children are the rows of the full run table whose parent id is one of those parents.
df = df_runs_raw.copy()
has_selected_parent = df["mlflow.parentRunId"].isin(runs_to_delete_parents)
df = df.loc[has_selected_parent]
runs_to_delete_children = df.index.tolist()

# Children first, then parents; also rendered as a quoted, comma-separated SQL list.
runs_to_delete = runs_to_delete_children + runs_to_delete_parents
run_uuid_query = ", ".join(f"'{run_id}'" for run_id in runs_to_delete)

In [62]:
# Soft-delete the selected parent and child runs by setting lifecycle_stage to 'deleted'.
#
# The run ids are sent as an expanding bind parameter instead of being pasted into the
# SQL text: quoting is left to the driver, and an empty `runs_to_delete` no longer
# produces the invalid statement `... IN ()`.
from sqlalchemy import bindparam

query = text(
    """
UPDATE runs
SET lifecycle_stage = 'deleted'
WHERE run_uuid IN :run_uuids
"""
).bindparams(bindparam("run_uuids", expanding=True))

# engine.begin() runs the statement in a transaction: committed on success, rolled back on error.
with engine.begin() as conn:
    conn.execute(query, {"run_uuids": runs_to_delete})

In [63]:
# Permanently remove everything that is soft-deleted in the tracking database.
#
# WARNING: this is irreversible, and the WHERE clauses are not limited to one selection:
# every run and every experiment whose lifecycle_stage is 'deleted' is removed.
#
# Order of the statements: rows that reference a run or an experiment
# (experiment_tags, latest_metrics, metrics, params, tags) are deleted first, then the
# `runs` rows, then the `experiments` rows — presumably to satisfy foreign-key
# constraints (TODO confirm against the schema).
#
# NOTE(review): the f-prefix is unnecessary here; the string contains no placeholders.
query = f"""
DELETE
FROM
	experiment_tags
WHERE
	experiment_id = ANY(
	SELECT
		experiment_id
	FROM
		experiments
	WHERE
		lifecycle_stage = 'deleted');

DELETE
FROM
	latest_metrics
WHERE
	run_uuid = ANY(
	SELECT
		run_uuid
	FROM
		runs
	WHERE
		lifecycle_stage = 'deleted');
	
DELETE
FROM
	metrics
WHERE
	run_uuid = ANY(
	SELECT
		run_uuid
	FROM
		runs
	WHERE
		lifecycle_stage = 'deleted');
	
DELETE
FROM
	params
WHERE
	run_uuid = ANY(
	SELECT
		run_uuid
	FROM
		runs
	WHERE
		lifecycle_stage = 'deleted');

DELETE
FROM
	tags
WHERE
	run_uuid = ANY(
	SELECT
		run_uuid
	FROM
		runs
	WHERE
		lifecycle_stage = 'deleted');
	
DELETE 
FROM 
	runs
WHERE 
	lifecycle_stage = 'deleted';

DELETE 
FROM 
	experiments
WHERE 
	lifecycle_stage = 'deleted';
"""
# The whole batch runs inside one engine.begin() transaction, so a failure in any
# statement rolls back the statements before it.
with engine.begin() as conn:
    conn.execute(text(query))

# Re-open every run in df_runs_raw and log its `params.model_name` value as `model_nickname`.
# NOTE(review): `mlflow` is not among the imports in the notebook's first cell — confirm it
# is in scope before running this snippet.
for _, run_row in df_runs_raw.iterrows():
    run_id, model_name = run_row.run_id, run_row["params.model_name"]
    with mlflow.start_run(run_id):
        mlflow.log_param("model_nickname", model_name)

In [None]:
# Quick structural overview of df_metrics: shape, columns, index levels, a preview,
# dtypes and per-column non-null counts.
print("Dataframe shape:", df_metrics.shape)

# Short values are printed on the same line as their label.
for label, value in (
    ("\nColumns:", df_metrics.columns.tolist()),
    ("\nIndex levels:", df_metrics.index.names),
):
    print(label, value)

# Multi-line values are printed underneath their label.
for label, value in (
    ("\nFirst few rows:", df_metrics.head(10)),
    ("\nData types:", df_metrics.dtypes),
    ("\nNon-null counts:", df_metrics.count()),
):
    print(label)
    print(value)

Dataframe shape: (335, 3)

Columns: ['ARI', 'AMI', 'Calinski']

Index levels: ['Dataset', 'Model']

First few rows:
                                                       ARI       AMI  \
Dataset          Model                                                 
alizadeh-2000-v2 AffinityPropagation              0.362816       NaN   
                 AverageAgglomerativeClustering   0.809591  0.676228   
                 BatchCoHiRF-SC-SRGF              0.794763  0.671212   
                 CoHiRF                           0.864606  0.757990   
                 CoHiRF-KernelRBF                 0.046212  0.085768   
                 CompleteAgglomerativeClustering  0.382805  0.612835   
                 DBSCAN                           0.000000  0.000000   
                 HDBSCAN                          0.165928  0.267066   
                 IRFLLRR                          0.512283       NaN   
                 KMeans                           0.830673  0.750678   

                   