In [1]:
from sqlalchemy import create_engine, text
import pandas as pd
from ml_experiments.analyze import get_df_runs_from_mlflow_sql, get_missing_entries, get_common_combinations, get_df_with_combinations
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from pathlib import Path
import os
import pickle
from functools import partial

# Save Results

## Load mlflow runs

In [2]:
db_port = 5001
db_name = 'cohirf'
url = f'postgresql://belucci@localhost:{db_port}/{db_name}'
# url = f"postgresql://belucci@clust9:{db_port}/{db_name}"
engine = create_engine(url)
query = 'SELECT experiments.name from experiments'
experiment_names = pd.read_sql(query, engine)['name'].tolist()
results_dir = Path.cwd().parent / "results" / "real"
# results_dir = Path("/home/users/belucci/cohirf/results/real")
os.makedirs(results_dir, exist_ok=True)

In [3]:
experiment_names

['Default',
 'sfni-SpectralSubspaceRandomization',
 'sfni-KMeans',
 'sfni-BatchCoHiRF-SC-SRGF',
 'sfni-BatchCoHiRF-1iter',
 'sfni-CoHiRF',
 'sphere-BatchCoHiRF-DBSCAN-1iter',
 'sphere-DBSCAN',
 'sphere-CoHiRF-DBSCAN',
 'real-ari-BatchCoHiRF-1iter',
 'real-ari-AverageAgglomerativeClustering',
 'real-ari-BatchCoHiRF-DBSCAN-1iter',
 'real-ari-AffinityPropagation',
 'real-ari-BatchCoHiRF-SC-SRGF',
 'real-ari-CoHiRF-DBSCAN',
 'real-ari-CoHiRF-KernelRBF',
 'real-ari-CoHiRF',
 'real-ari-CompleteAgglomerativeClustering',
 'real-ari-DBSCAN',
 'real-ari-HDBSCAN',
 'real-ari-IRFLLRR',
 'real-ari-KMeans',
 'real-ari-MeanShift',
 'real-ari-OPTICS',
 'real-ari-Proclus',
 'real-ari-SingleAgglomerativeClustering',
 'real-ari-SpectralClustering',
 'real-ari-SpectralSubspaceRandomization',
 'real-ari-WardAgglomerativeClustering',
 'real-adjusted_mutual_info-BatchCoHiRF-DBSCAN-1iter',
 'real-adjusted_mutual_info-DBSCAN',
 'real-adjusted_mutual_info-BatchCoHiRF-SC-SRGF',
 'real-adjusted_mutual_info-Comple

In [4]:
experiments_names = [exp for exp in experiment_names if exp.startswith("real-")]

In [5]:
experiments_names

['real-ari-BatchCoHiRF-1iter',
 'real-ari-AverageAgglomerativeClustering',
 'real-ari-BatchCoHiRF-DBSCAN-1iter',
 'real-ari-AffinityPropagation',
 'real-ari-BatchCoHiRF-SC-SRGF',
 'real-ari-CoHiRF-DBSCAN',
 'real-ari-CoHiRF-KernelRBF',
 'real-ari-CoHiRF',
 'real-ari-CompleteAgglomerativeClustering',
 'real-ari-DBSCAN',
 'real-ari-HDBSCAN',
 'real-ari-IRFLLRR',
 'real-ari-KMeans',
 'real-ari-MeanShift',
 'real-ari-OPTICS',
 'real-ari-Proclus',
 'real-ari-SingleAgglomerativeClustering',
 'real-ari-SpectralClustering',
 'real-ari-SpectralSubspaceRandomization',
 'real-ari-WardAgglomerativeClustering',
 'real-adjusted_mutual_info-BatchCoHiRF-DBSCAN-1iter',
 'real-adjusted_mutual_info-DBSCAN',
 'real-adjusted_mutual_info-BatchCoHiRF-SC-SRGF',
 'real-adjusted_mutual_info-CompleteAgglomerativeClustering',
 'real-adjusted_mutual_info-AverageAgglomerativeClustering',
 'real-adjusted_mutual_info-BatchCoHiRF-1iter',
 'real-adjusted_mutual_info-CoHiRF',
 'real-adjusted_mutual_info-CoHiRF-KernelRBF

In [6]:
query = "SELECT DISTINCT(key) FROM params WHERE key LIKE 'best/%%'"
best_params = pd.read_sql(query, engine)["key"].tolist()

In [7]:
params_columns = [
    "model",
    "dataset_id",
	"n_trials",
	"dataset_name",
	"standardize",
	"hpo_metric",
	"direction",
] + best_params

In [8]:
latest_metrics_columns = [
    "fit_model_return_elapsed_time",
    "max_memory_used_after_fit",
    "max_memory_used",
	"best/n_clusters_",
    "best/rand_score",
    "best/adjusted_rand",
    "best/mutual_info",
    "best/adjusted_mutual_info",
    "best/normalized_mutual_info",
    "best/homogeneity_completeness_v_measure",
    "best/silhouette",
    "best/calinski_harabasz_score",
    "best/davies_bouldin_score",
    "best/inertia_score",
    "best/homogeneity",
    "best/completeness",
    "best/v_measure",
    "best/elapsed_time",
]

In [9]:
tags_columns = [
    'raised_exception',
    'EXCEPTION',
    'mlflow.parentRunId',
]

In [10]:
runs_columns = ['run_uuid', 'status', 'start_time', 'end_time']
experiments_columns = []
other_table = 'params'
other_table_keys = params_columns
df_params = get_df_runs_from_mlflow_sql(engine, runs_columns=runs_columns, experiments_columns=experiments_columns, experiments_names=experiments_names, other_table=other_table, other_table_keys=other_table_keys)
df_latest_metrics = get_df_runs_from_mlflow_sql(engine, runs_columns=['run_uuid'], experiments_columns=experiments_columns, experiments_names=experiments_names, other_table='latest_metrics', other_table_keys=latest_metrics_columns)
df_tags = get_df_runs_from_mlflow_sql(engine, runs_columns=['run_uuid'], experiments_columns=experiments_columns, experiments_names=experiments_names, other_table='tags', other_table_keys=tags_columns)

In [11]:
dataset_characteristics = pd.read_csv(results_dir / "datasets_characteristics.csv", index_col=0)
dataset_characteristics.index = dataset_characteristics["openml_id"].astype(str)

In [12]:
df_runs_raw = df_params.join(df_latest_metrics)
df_runs_raw = df_runs_raw.join(df_tags)
df_runs_raw = df_runs_raw.join(dataset_characteristics, on='dataset_id', rsuffix='_dataset')
df_runs_raw.to_csv(results_dir / 'df_runs_raw.csv', index=True)

In [13]:
df_runs_raw = pd.read_csv(results_dir / "df_runs_raw.csv", index_col=0)
df_runs_raw["model"] = df_runs_raw["model"] + "-" + df_runs_raw["n_trials"].astype(str)
# mask = df_runs_raw["model"].str.contains("CoHiRF")
# df_runs_raw.loc[mask, "model"] = df_runs_raw.loc[mask].apply(lambda row: f"{row['model']}-{row['n_trials']}", axis=1)
df_runs_raw_parents = df_runs_raw.copy()
df_runs_raw_parents = df_runs_raw_parents.loc[df_runs_raw_parents["mlflow.parentRunId"].isna()]

In [14]:
df_runs_raw_parents.head(5)

Unnamed: 0_level_0,status,start_time,end_time,best/alpha,best/avg_dims,best/base_model_kwargs/eps,best/base_model_kwargs/min_samples,best/base_model_kwargs/n_clusters,best/base_model_kwargs/n_similarities,best/base_model_kwargs/sampling_ratio,...,max_memory_used_after_fit,EXCEPTION,mlflow.parentRunId,raised_exception,dataset,openml_id,n_instances,n_features,n_classes,n_categorical
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000d15489674402a9dceb79067cad15e,FINISHED,1753589259762,1753589000000.0,,,,,,,,...,377.94,,,False,ecoli,39.0,336.0,8.0,8.0,1.0
0023396bbc3842289bd8a53f76ffb713,FINISHED,1751654973207,1751655000000.0,,,,,,,,...,438.28,,,False,armstrong-2002-v1,46775.0,72.0,1082.0,2.0,0.0
0029b8e4dd664b4ab4274dea73d7706c,FINISHED,1751923288529,1751923000000.0,,,,,,,,...,442.428,,,False,armstrong-2002-v1,46775.0,72.0,1082.0,2.0,0.0
002d76fd45ef426db8ad19091bc5b622,FINISHED,1751923103527,1751995000000.0,4.0,,,,,,,...,8327.696,,,False,nursery,1568.0,12958.0,9.0,4.0,9.0
003bc124a9bd4ff99c4e646bb808b509,FAILED,1752647956586,1752648000000.0,,,,,,,,...,428.496,Best metric normalized_mutual_info not found i...,,True,armstrong-2002-v1,46775.0,72.0,1082.0,2.0,0.0


## Delete duplicate runs (if any) and complete some models that cannot run with some datasets

In [15]:
non_duplicate_columns = [
    "model",
    "dataset_id",
	"standardize",
	"hpo_metric",	
]
# df_runs_parents.loc[df_runs_parents["best/n_clusters_"]*0.5 > df_runs_parents["n_instances"], "best/adjusted_rand"] = 
df_runs_parents = df_runs_raw_parents.dropna(axis=0, how="all", subset=["best/adjusted_rand"]).copy()
# add back runs that were not evaluated because we judged too many clusters (but they run anyway)
# df_valid_runs = df_runs_raw_parents.loc[df_runs_raw_parents["best/n_clusters_"] > df_runs_raw_parents["n_instances"]*0.5].copy()
# df_runs_parents = pd.concat([df_runs_parents, df_valid_runs], axis=0)
df_runs_parents = df_runs_parents.loc[(~df_runs_parents.duplicated(non_duplicate_columns))]
# fill missing values with "None"
df_runs_parents = df_runs_parents.fillna("None")

In [16]:
df_to_cat = []
hpo_metrics = [
    "adjusted_rand",
    "adjusted_mutual_info",
    "calinski_harabasz_score",
    "silhouette",
    "davies_bouldin_score",
    "normalized_mutual_info",
]
standardize = [True]
fill_value = pd.NA
fill_columns = ["best/adjusted_rand", "best/adjusted_mutual_info", "best/calinski_harabasz_score", "best/silhouette", "best/davies_bouldin_score", "best/normalized_mutual_info"]

In [17]:
# Too few examples (<100) to run in batch
dataset_ids_to_complete = [46773, 46774, 46775, 46776, 46777, 46779, 46780, 46781]
model_names = [
    "BatchCoHiRF-1iter-30",
    "BatchCoHiRF-DBSCAN-1iter-30",
    "BatchCoHiRF-SC-SRGF-30",
    "BatchCoHiRF-KernelRBF-1iter-30",
    "BatchCoHiRF-1iter-60",
	"BatchCoHiRF-DBSCAN-1iter-60",
	# "BatchCoHiRF-SC-SRGF-60",
	# "BatchCoHiRF-KernelRBF-1iter-60",
]
for dataset_id in dataset_ids_to_complete:
	for model_name in model_names:
		for hpo_metric in hpo_metrics:
			for std in standardize:
				mask = (df_runs_parents["dataset_id"] == dataset_id) & (df_runs_parents["model"] == model_name) & (df_runs_parents["hpo_metric"] == hpo_metric) & (df_runs_parents["standardize"] == std)
				if not mask.any():
					new_row = {
						"dataset_id": dataset_id,
						"model": model_name,
						"hpo_metric": hpo_metric,
						"standardize": std,
					}
					for col in fill_columns:
						new_row[col] = fill_value
					df_to_cat.append(new_row)

In [18]:
# Too many examples for IRFLLRR
dataset_ids_to_complete = [40685]
model_names = [
    "IRFLLRR-30",
    "IRFLLRR-60",
]
hpo_metrics = ["adjusted_rand", "adjusted_mutual_info", "calinski_harabasz_score", "silhouette", "davies_bouldin_score", "normalized_mutual_info"]
standardize = [True]
fill_value = pd.NA
fill_columns = ["best/adjusted_rand", "best/adjusted_mutual_info", "best/calinski_harabasz_score", "best/silhouette", "best/davies_bouldin_score", "best/normalized_mutual_info"]
for dataset_id in dataset_ids_to_complete:
    for model_name in model_names:
        for hpo_metric in hpo_metrics:
            for std in standardize:
                mask = (
                    (df_runs_parents["dataset_id"] == dataset_id)
                    & (df_runs_parents["model"] == model_name)
                    & (df_runs_parents["hpo_metric"] == hpo_metric)
                    & (df_runs_parents["standardize"] == std)
                )
                if not mask.any():
                    new_row = {
                        "dataset_id": dataset_id,
                        "model": model_name,
                        "hpo_metric": hpo_metric,
                        "standardize": std,
                    }
                    for col in fill_columns:
                        new_row[col] = fill_value
                    df_to_cat.append(new_row)

In [19]:
df_runs_parents = pd.concat([df_runs_parents, pd.DataFrame(df_to_cat)], axis=0)

# Slow / Memory intensive datasets/models

In [20]:
df = df_runs_parents.copy()
df = df[["model", "dataset_id", "max_memory_used", "max_memory_used_after_fit"]]
df = df.sort_values(by="max_memory_used", ascending=False)
high_mem_tuples = df.loc[df["max_memory_used"] > 10000].copy()
high_mem_tuples = high_mem_tuples[["model", "dataset_id"]]

In [21]:
df

Unnamed: 0,model,dataset_id,max_memory_used,max_memory_used_after_fit
ba156669bdb04a24b3e2e114390be16c,SpectralSubspaceRandomization-30,1568,128347.652,128347.652
81819661f8f3440290d9576971c1988d,SpectralSubspaceRandomization-60,1568,127538.808,127538.808
d9a94c4336f34618b9f0e25085444eea,SpectralSubspaceRandomization-60,1568,127059.340,127059.340
10a9397979ce41f0b09d2760a7495073,SpectralSubspaceRandomization-30,1568,126232.556,126232.556
b0ebd933c5c24b78b26f0ba1855e44f5,AffinityPropagation-30,40685,123946.660,123946.660
...,...,...,...,...
264,IRFLLRR-60,40685,,
265,IRFLLRR-60,40685,,
266,IRFLLRR-60,40685,,
267,IRFLLRR-60,40685,,


In [22]:
model_nickname = [
    # "AffinityPropagation",
    # "AverageAgglomerativeClustering",
    # "BatchCoHiRF",
    "BatchCoHiRF-1iter",
    # "BatchCoHiRF-DBSCAN",
    "BatchCoHiRF-DBSCAN-1iter",
    "BatchCoHiRF-KernelRBF-1iter",
    "BatchCoHiRF-SC-SRGF",
    # "Clique",
    "CoHiRF",
    "CoHiRF-DBSCAN",
    "CoHiRF-KernelRBF",
    # "CompleteAgglomerativeClustering",
    "DBSCAN",
    # "HDBSCAN",
    # "IRFLLRR",
    "KMeans",
    # "KMeansProj",
    "KernelRBFKMeans",
    # "MeanShift",
    # "OPTICS",
    # "Proclus",
    # "SingleAgglomerativeClustering",
    # "SpectralClustering",
    "SpectralSubspaceRandomization",
    # "WardAgglomerativeClustering",
]

In [23]:
dataset_id = [
    39,
    61,
    182,
    1478,
    1568,
    40685,
    40984,
    46773,
    46774,
    46775,
    46776,
    46777,
    46778,
    46779,
    46780,
    46781,
    46782,
    46783,
    554,
]

In [24]:
standardize = [True]
hpo_metric = [
    "adjusted_rand",
    "normalized_mutual_info",
    "adjusted_mutual_info",
    "calinski_harabasz_score",
    "silhouette",
    "davies_bouldin_score",
]

In [25]:
# from itertools import product
# combinations_keys = ["model", "dataset_id", "hpo_metric", "standardize"]
# combinations = product(model_nickname, dataset_id, hpo_metric, standardize)
# combination_list = [dict(zip(combinations_keys, comb)) for comb in combinations]
# df_combination = pd.DataFrame(combination_list)
# # exclude slow/mem intensive runs
# # df_combination = df_combination.merge(
# # 	high_mem_tuples, on=["model", "dataset_id"], how="left", indicator=True
# # )
# # df_combination = df_combination.loc[df_combination["_merge"] == "left_only"].drop(columns=["_merge"])
# # get only slow/mem intensive runs
# df_combination = df_combination.merge(high_mem_tuples, on=["model", "dataset_id"], how="left", indicator=True)
# df_combination = df_combination.loc[df_combination["_merge"] == "both"].drop(columns=["_merge"])
# runs_dict = {}
# for model in df_combination["model"].unique():
#     sub = df_combination[df_combination["model"] == model].drop(columns=["model"])
#     # standardize = True
#     sub_standardized = sub.loc[sub["standardize"] == True].copy()
#     sub_standardized["standardize"] = ""
#     sub_standardized_dict = sub_standardized.to_dict(orient="records")
#     # standardize = False
#     sub_not_standardized = sub.loc[sub["standardize"] == False].copy()
#     sub_not_standardized.drop(columns=["standardize"], inplace=True)
#     sub_not_standardized_dict = sub_not_standardized.to_dict(orient="records")
#     # combine both dictionaries
#     runs_dict[model] = sub_standardized_dict + sub_not_standardized_dict
# if len(runs_dict) != 0:
#     with open(results_dir / "runs_dict.pkl", "wb") as f:
#         pickle.dump(runs_dict, f)

# Missing

In [26]:
model_nickname = df_runs_parents['model'].unique().tolist()
model_nickname.sort()
model_nickname

['AffinityPropagation-30',
 'AverageAgglomerativeClustering-30',
 'BatchCoHiRF-1iter-30',
 'BatchCoHiRF-1iter-60',
 'BatchCoHiRF-DBSCAN-1iter-100',
 'BatchCoHiRF-DBSCAN-1iter-30',
 'BatchCoHiRF-DBSCAN-1iter-60',
 'BatchCoHiRF-KernelRBF-1iter-30',
 'BatchCoHiRF-KernelRBF-1iter-60',
 'BatchCoHiRF-SC-SRGF-30',
 'BatchCoHiRF-SC-SRGF-60',
 'CoHiRF-30',
 'CoHiRF-60',
 'CoHiRF-DBSCAN-30',
 'CoHiRF-DBSCAN-60',
 'CoHiRF-KernelRBF-30',
 'CoHiRF-KernelRBF-60',
 'CoHiRF-SC-SRGF-60',
 'CompleteAgglomerativeClustering-30',
 'DBSCAN-30',
 'DBSCAN-60',
 'HDBSCAN-30',
 'IRFLLRR-30',
 'IRFLLRR-60',
 'KMeans-30',
 'KMeans-60',
 'KernelRBFKMeans-30',
 'KernelRBFKMeans-60',
 'MeanShift-30',
 'OPTICS-30',
 'Proclus-30',
 'SingleAgglomerativeClustering-30',
 'SpectralClustering-30',
 'SpectralSubspaceRandomization-30',
 'SpectralSubspaceRandomization-60',
 'WardAgglomerativeClustering-30']

In [27]:
model_nickname = [
    # "BatchCoHiRF-1iter-60",
    # "BatchCoHiRF-DBSCAN-1iter-60",
    # "BatchCoHiRF-KernelRBF-1iter-60",
    "BatchCoHiRF-SC-SRGF-60",
    # "CoHiRF-60",
    # "CoHiRF-DBSCAN-60",
    # "CoHiRF-KernelRBF-60",
	# "CoHiRF-SC-SRGF-60",
    # "DBSCAN-60",
    # "KMeans-60",
    # "KernelRBFKMeans-60",
    # "SpectralSubspaceRandomization-60",
]

In [28]:
non_duplicate_columns = [
	"model",
	"dataset_id",
	"standardize",
	"hpo_metric",
]

In [29]:
dataset_id = [
    # 39,
    # 61,
    # 182,
    # 1478,
    # 1568,
    # 40685,
    # 40984,
    # 46773,
    # 46774,
    # 46775,
    # 46776,
    # 46777,
    # 46778,
    # 46779,
    # 46780,
    # 46781,
    # 46782,
    # 46783,
	554,
]
standardize = [True]
hpo_metric = [
	"adjusted_rand",
    # "normalized_mutual_info",
    "adjusted_mutual_info",
    "calinski_harabasz_score",
    "silhouette",
    "davies_bouldin_score",
]
columns_names = non_duplicate_columns
should_contain_values = [model_nickname, dataset_id, standardize, hpo_metric]
df_missing = get_missing_entries(df_runs_parents, columns_names, should_contain_values)

In [30]:
df_missing

Unnamed: 0,model,dataset_id,standardize,hpo_metric
0,BatchCoHiRF-SC-SRGF-60,554,True,adjusted_mutual_info
1,BatchCoHiRF-SC-SRGF-60,554,True,calinski_harabasz_score


In [64]:
# Join df_runs_raw_parents into df_missing using non_duplicate_columns to get the EXCEPTION column
df_missing_with_exception = df_missing.merge(
    df_runs_raw_parents[non_duplicate_columns + ["raised_exception", "EXCEPTION"]],
    how="left",
    left_on=["model", "dataset_id", "standardize", "hpo_metric"],
    right_on=["model", "dataset_id", "standardize", "hpo_metric"],
)
df_missing_with_exception[["model", "dataset_id", "standardize", "hpo_metric", "raised_exception", "EXCEPTION"]]

Unnamed: 0,model,dataset_id,standardize,hpo_metric,raised_exception,EXCEPTION
0,BatchCoHiRF-SC-SRGF-60,554,True,adjusted_mutual_info,True,Best metric adjusted_mutual_info not found in ...
1,BatchCoHiRF-SC-SRGF-60,554,True,adjusted_mutual_info,,
2,BatchCoHiRF-SC-SRGF-60,554,True,calinski_harabasz_score,,
3,BatchCoHiRF-SC-SRGF-60,554,True,calinski_harabasz_score,True,Best metric calinski_harabasz_score not found ...


In [55]:
df_missing_dict = df_missing.copy()
# get only rows from high_mem_tuples
# df_missing_dict = df_missing_dict.merge(high_mem_tuples, on=["model", "dataset_id"], how="left", indicator=True)
# df_missing_dict = df_missing_dict[df_missing_dict["_merge"] == "both"].drop(columns="_merge")
# exclude rows that are in missing_ari_tuples
# df_missing_dict = df_missing_dict.merge(
# 	missing_ari_tuples, on=["model", "dataset_id"], how="left", indicator=True
# )
# df_missing_dict = df_missing_dict[df_missing_dict["_merge"] == "left_only"].drop(columns="_merge")
# exclude rows that are in high_mem_tuples
# df_missing_dict = df_missing_dict.merge(
# 	high_mem_tuples, on=["model", "dataset_id"], how="left", indicator=True
# )
# df_missing_dict = df_missing_dict[df_missing_dict["_merge"] == "left_only"].drop(columns="_merge")
# to_drop = pd.concat([missing_ari_tuples, high_mem_tuples], ignore_index=True)
# df_missing_dict = df_missing_dict[df_missing_dict["_merge"] == "left_only"].drop(columns="_merge")

In [56]:
# get rid of -60
df_missing_dict["model"] = df_missing_dict["model"].str.replace("-60", "")

In [57]:
df_missing_dict

Unnamed: 0,model,dataset_id,standardize,hpo_metric
0,SpectralSubspaceRandomization,182,True,silhouette


In [58]:
missing_dict = {}
for model in df_missing_dict["model"].unique():
    sub = df_missing_dict[df_missing_dict["model"] == model].drop(columns=["model"])
    # standardize = True
    sub_standardized = sub.loc[sub["standardize"] == True].copy()
    sub_standardized["standardize"] = ''
    sub_standardized_dict = sub_standardized.to_dict(orient="records")
    # standardize = False
    sub_not_standardized = sub.loc[sub["standardize"] == False].copy()
    sub_not_standardized.drop(columns=["standardize"], inplace=True)
    sub_not_standardized_dict = sub_not_standardized.to_dict(orient="records")
    # combine both dictionaries
    missing_dict[model] = sub_standardized_dict + sub_not_standardized_dict
if len(missing_dict) != 0:
    with open(results_dir / 'missing_dict.pkl', 'wb') as f:
        pickle.dump(missing_dict, f)

In [59]:
missing_dict

{'SpectralSubspaceRandomization': [{'dataset_id': 182,
   'standardize': '',
   'hpo_metric': 'silhouette'}]}

# Tables

In [31]:
def get_parameters_string(row):
    parameter_names = {
		"best/alpha": "\\alpha",
		"best/avg_dims": "d",
		"best/base_model_kwargs/eps": "\\epsilon",
		"best/base_model_kwargs/min_samples": "n_{\\text{min}}",
		"best/base_model_kwargs/n_clusters": "C",
		"best/c": "c",
		"best/cohirf_kwargs/base_model_kwargs/eps": "\\epsilon",
		"best/cohirf_kwargs/base_model_kwargs/min_samples": "n_{\\text{min}}",
		"best/cohirf_kwargs/kmeans_n_clusters": "C",
		"best/cohirf_kwargs/n_features": "q",
		"best/cohirf_kwargs/repetitions": "R",
		"best/damping": "\\lambda",
		# "best/density_threshold": "\\tau",
		"best/eps": "\\epsilon",
		"best/kmeans_n_clusters": "C",
		"best/lambda_": "\\lambda",
		"best/min_bin_freq": "bin_{\\text{min}}",
		"best/min_cluster_size": "C_{\\text{min}}",
		"best/min_samples": "n_{\\text{min}}",
		"best/n_clusters": "C",
		"best/n_features": "q",
		# "best/n_partitions": "P",
		"best/n_similarities": "m",
		"best/p": "p",
		"best/repetitions": "R",
		"best/sampling_ratio": "r",
		"best/sc_n_clusters": "C",
		"best/transform_kwargs/gamma": "\\gamma",
	}
    first = True
    str = ""
    for p in parameter_names.keys():
        if not pd.isna(row[p]) and row[p] != "None":
            if not first:
                str += "; "
            else:
                first = False
            value = float(row[p])
            if value.is_integer():
                value = int(value)
                str += f"${parameter_names[p]}={value}$"
            else:
                str += f"${parameter_names[p]}={value:0.2f}$"
    return str

In [32]:
def highlight_max(df, column_name, level=0):
    df_column = df[column_name]
    max_values = df_column.groupby(level=level).transform('max')
    is_highlighted = df_column.round(3) == max_values.round(3)
    df_css = df.copy().astype(str)
    df_css.loc[:, :] = ''
    df_css[is_highlighted] = 'font-weight: bold'
    return df_css

In [33]:
def highlight_min(df, column_name, level=0):
    df_column = df[column_name]
    min_values = df_column.groupby(level=level).transform("min")
    is_highlighted = df_column.round(3) == min_values.round(3)
    df_css = df.copy().astype(str)
    df_css.loc[:, :] = ""
    df_css[is_highlighted] = "font-weight: bold"
    return df_css

In [34]:
def highlight_max_index(series_index, df_column, level=0):
    max_values = df_column.groupby(level=level).transform('max')
    is_highlighted = df_column.round(3) == max_values.round(3)
    series_css = series_index.copy().astype(str)
    series_css[:] = ''
    series_css[is_highlighted.values] = 'font-weight: bold'
    return series_css

In [35]:
def underline_2nd_max(df, column_name, level=0):
    df_column = df[column_name]
    # get the second max value
    second_max_values = df_column.groupby(level=level).transform(lambda x: x.round(3).drop_duplicates().nlargest(2).iloc[-1])
    is_underlined = df_column.round(3) == second_max_values.round(3)
    df_css = df.copy().astype(str)
    df_css.loc[:, :] = ''
    df_css[is_underlined] = 'underline: --latex--rwrap'
    return df_css

In [36]:
def underline_2nd_min(df, column_name, level=0):
    df_column = df[column_name]
    # get the second min value
    second_min_values = df_column.groupby(level=level).transform(
        lambda x: x.round(3).drop_duplicates().nsmallest(2).iloc[-1]
    )
    is_underlined = df_column.round(3) == second_min_values.round(3)
    df_css = df.copy().astype(str)
    df_css.loc[:, :] = ""
    df_css[is_underlined] = "underline: --latex--rwrap"
    return df_css

In [37]:
def underline_2nd_max_index(series_index, df_column, level=0):
    # get the second max value
    second_max_values = df_column.groupby(level=level).transform(lambda x: x.nlargest(2).iloc[-1])
    is_underlined = df_column.round(3) == second_max_values.round(3)
    series_css = series_index.copy().astype(str)
    series_css.loc[:] = ''
    series_css[is_underlined.values] = 'underline: --latex--rwrap'
    return series_css

## Some Models

In [38]:
df_runs_parents['model'].unique()

array(['BatchCoHiRF-DBSCAN-1iter-60', 'KMeans-30', 'IRFLLRR-30',
       'CoHiRF-DBSCAN-30', 'DBSCAN-30', 'MeanShift-30',
       'KernelRBFKMeans-60', 'CoHiRF-KernelRBF-60', 'DBSCAN-60',
       'CompleteAgglomerativeClustering-30', 'CoHiRF-KernelRBF-30',
       'CoHiRF-30', 'BatchCoHiRF-SC-SRGF-60',
       'AverageAgglomerativeClustering-30', 'SpectralClustering-30',
       'SingleAgglomerativeClustering-30', 'KernelRBFKMeans-30',
       'SpectralSubspaceRandomization-60', 'OPTICS-30', 'Proclus-30',
       'HDBSCAN-30', 'AffinityPropagation-30', 'CoHiRF-60',
       'BatchCoHiRF-KernelRBF-1iter-30', 'KMeans-60',
       'BatchCoHiRF-KernelRBF-1iter-60', 'CoHiRF-SC-SRGF-60',
       'WardAgglomerativeClustering-30',
       'SpectralSubspaceRandomization-30', 'BatchCoHiRF-1iter-60',
       'BatchCoHiRF-DBSCAN-1iter-30', 'BatchCoHiRF-1iter-30',
       'BatchCoHiRF-SC-SRGF-30', 'CoHiRF-DBSCAN-60',
       'BatchCoHiRF-DBSCAN-1iter-100', 'IRFLLRR-60'], dtype=object)

In [39]:
model_names = {
    # "AffinityPropagation": "Affinity Propagation",
    # "AverageAgglomerativeClustering": "Average Agglomerative Clustering",
    # 'BatchCoHiRF',
    "BatchCoHiRF-1iter-60": "Batch CoHiRF",
    # 'BatchCoHiRF-DBSCAN',
    "BatchCoHiRF-DBSCAN-1iter-60": "Batch CoHiRF-DBSCAN",
    "BatchCoHiRF-KernelRBF-1iter-60": "Batch CoHiRF-KernelRBF",
    "BatchCoHiRF-SC-SRGF-60": "Batch CoHiRF-SC-SRGF",
    # 'Clique',
    "CoHiRF-60": "CoHiRF",
    "CoHiRF-DBSCAN-60": "CoHiRF-DBSCAN",
    "CoHiRF-KernelRBF-60": "CoHiRF-KernelRBF",
    "CoHiRF-SC-SRGF-60": "CoHiRF-SC-SRGF",
    # "CompleteAgglomerativeClustering": "Complete Agglomerative Clustering",
    "DBSCAN-60": "DBSCAN",
    # "HDBSCAN": "HDBSCAN",
    # "IRFLLRR": "IRFLLRR",
    "KMeans-60": "K-Means",
    # 'KMeansProj': "K-Means Proj",
    "KernelRBFKMeans-60": "Kernel RBF K-Means",
    # "MeanShift": "Mean Shift",
    # "OPTICS": "OPTICS",
    # "Proclus": "Proclus",
    # "SingleAgglomerativeClustering": "Single Agglomerative Clustering",
    # "SpectralClustering": "Spectral Clustering",
    "SpectralSubspaceRandomization-60": "SC-SRGF",
    # "WardAgglomerativeClustering": "Ward's Method"
}

dataset_names = {
    "binary_alpha_digits": "binary-alpha-digits",
	"mnist_784": "mnist",
}  # otherwise we get an error in latex

# Filter to only standardized runs
df = df_runs_parents.copy()
df = df.loc[df['standardize'] == True]
df = df.loc[df['model'].isin(model_names.keys())]
df = df.replace({"model": model_names})
df = df.replace({"dataset_name": dataset_names})

# "adjusted_rand",
#     "adjusted_mutual_info",
#     "calinski_harabasz_score",
#     "silhouette",
#     "davies_bouldin_score",
#     "normalized_mutual_info",

# Create separate dataframes for each metric optimization
df_ari = df.loc[df['hpo_metric'] == 'adjusted_rand'][
	['dataset_name', 'model', 'best/adjusted_rand']
].rename(columns={'best/adjusted_rand': 'ARI'})

df_ami = df.loc[df['hpo_metric'] == 'adjusted_mutual_info'][
	['dataset_name', 'model', 'best/adjusted_mutual_info']
].rename(columns={'best/adjusted_mutual_info': 'AMI'})

df_nmi = df.loc[df["hpo_metric"] == "normalized_mutual_info"][
    ["dataset_name", "model", "best/normalized_mutual_info"]
].rename(columns={"best/normalized_mutual_info": "NMI"})

df_calinski = df.loc[df['hpo_metric'] == 'calinski_harabasz_score'][
	['dataset_name', 'model', 'best/calinski_harabasz_score']
].rename(columns={'best/calinski_harabasz_score': 'Calinski'})

df_silhouette = df.loc[df['hpo_metric'] == 'silhouette'][
    ['dataset_name', 'model', 'best/silhouette']
].rename(columns={'best/silhouette': 'Silhouette'})

df_davies_bouldin = df.loc[df['hpo_metric'] == 'davies_bouldin_score'][
    ['dataset_name', 'model', 'best/davies_bouldin_score']
].rename(columns={'best/davies_bouldin_score': 'Davies-Bouldin'})


# Remove missing values before setting index
df_ari = df_ari.dropna(subset=["ARI"])
df_ami = df_ami.dropna(subset=["AMI"])
df_nmi = df_nmi.dropna(subset=["NMI"])
df_calinski = df_calinski.dropna(subset=["Calinski"])
df_silhouette = df_silhouette.dropna(subset=["Silhouette"])
df_davies_bouldin = df_davies_bouldin.dropna(subset=["Davies-Bouldin"])

# Set multi-index for all dataframes
df_ari = df_ari.set_index(["dataset_name", "model"])
df_ami = df_ami.set_index(["dataset_name", "model"])
df_nmi = df_nmi.set_index(["dataset_name", "model"])
df_calinski = df_calinski.set_index(["dataset_name", "model"])
df_silhouette = df_silhouette.set_index(["dataset_name", "model"])
df_davies_bouldin = df_davies_bouldin.set_index(["dataset_name", "model"])

# Combine all metrics into a single dataframe using outer join to keep all combinations
df_metrics = df_ari.join(df_ami, how="outer").join(df_nmi, how="outer").join(df_calinski, how="outer").join(df_silhouette, how="outer").join(df_davies_bouldin, how="outer")

# Rename index levels
df_metrics.index.names = ["Dataset", "Model"]
df_metrics["Davies-Bouldin"] = df_metrics["Davies-Bouldin"].astype(float)

In [40]:
df_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,ARI,AMI,NMI,Calinski,Silhouette,Davies-Bouldin
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
alizadeh-2000-v2,Batch CoHiRF-KernelRBF,0.334320,0.329375,0.316570,5.016025,0.124942,
alizadeh-2000-v2,Batch CoHiRF-SC-SRGF,0.828731,0.692277,0.744643,8.641270,0.108252,3.567765
alizadeh-2000-v2,CoHiRF,0.864606,0.846338,0.768753,15.150730,0.189807,1.066668
alizadeh-2000-v2,CoHiRF-DBSCAN,0.000000,,,,,
alizadeh-2000-v2,CoHiRF-KernelRBF,0.056485,,0.255128,,0.008373,
...,...,...,...,...,...,...,...
shuttle,CoHiRF-DBSCAN,0.686407,0.631992,0.635230,483.101574,0.920931,0.043409
shuttle,CoHiRF-KernelRBF,0.429470,0.464621,0.465134,9682.080610,0.393403,1.123023
shuttle,DBSCAN,0.719113,0.642741,0.641843,7510.117827,0.948658,1.432657
shuttle,K-Means,0.608404,0.448867,0.449095,31180.812992,,


In [41]:
# Add mean time columns to the existing df_metrics dataframe
# Using the same filtering approach as the original df_metrics
df_filtered = df_runs_parents.loc[df_runs_parents['standardize'] == True].copy()
df_filtered = df_filtered.loc[df_filtered['model'].isin(model_names.keys())]
df_filtered = df_filtered.replace({"model": model_names})
df_filtered = df_filtered.replace({"dataset_name": dataset_names})

# Calculate mean times for each dataset-model combination across all metrics
df_times = df_filtered.groupby(['dataset_name', 'model']).agg({
    'best/elapsed_time': 'mean',
    'fit_model_return_elapsed_time': 'mean'
}).rename(columns={
    'best/elapsed_time': 'Mean Best Time',
    'fit_model_return_elapsed_time': 'Mean HPO Time'
})

# Set the same index structure as df_metrics
df_times.index.names = ["Dataset", "Model"]

# Join with the existing df_metrics (verify we have the same number of rows!)
df_metrics = df_metrics.join(df_times, how="outer")

In [42]:
# Create a time-based dataframe with elapsed times for each metric optimization
# Using the same filtering approach as the original df_metrics
df_filtered = df_runs_parents.loc[df_runs_parents['standardize'] == True].copy()
df_filtered = df_filtered.loc[df_filtered['model'].isin(model_names.keys())]
df_filtered = df_filtered.replace({"model": model_names})
df_filtered = df_filtered.replace({"dataset_name": dataset_names})

# Create separate dataframes for each metric optimization with time columns
df_ari_time = df_filtered.loc[df_filtered['hpo_metric'] == 'adjusted_rand'][
    ['dataset_name', 'model', 'best/elapsed_time', 'fit_model_return_elapsed_time']
].rename(columns={'best/elapsed_time': 'ARI_best_time', 'fit_model_return_elapsed_time': 'ARI_total_time'})

df_ami_time = df_filtered.loc[df_filtered['hpo_metric'] == 'adjusted_mutual_info'][
    ['dataset_name', 'model', 'best/elapsed_time', 'fit_model_return_elapsed_time']
].rename(columns={'best/elapsed_time': 'AMI_best_time', 'fit_model_return_elapsed_time': 'AMI_total_time'})

df_nmi_time = df_filtered.loc[df_filtered['hpo_metric'] == 'normalized_mutual_info'][
    ['dataset_name', 'model', 'best/elapsed_time', 'fit_model_return_elapsed_time']
].rename(columns={'best/elapsed_time': 'NMI_best_time', 'fit_model_return_elapsed_time': 'NMI_total_time'})

df_calinski_time = df_filtered.loc[df_filtered['hpo_metric'] == 'calinski_harabasz_score'][
    ['dataset_name', 'model', 'best/elapsed_time', 'fit_model_return_elapsed_time']
].rename(columns={'best/elapsed_time': 'Calinski_best_time', 'fit_model_return_elapsed_time': 'Calinski_total_time'})

df_silhouette_time = df_filtered.loc[df_filtered['hpo_metric'] == 'silhouette'][
    ['dataset_name', 'model', 'best/elapsed_time', 'fit_model_return_elapsed_time']
].rename(columns={'best/elapsed_time': 'Silhouette_best_time', 'fit_model_return_elapsed_time': 'Silhouette_total_time'})

df_davies_bouldin_time = df_filtered.loc[df_filtered['hpo_metric'] == 'davies_bouldin_score'][
    ['dataset_name', 'model', 'best/elapsed_time', 'fit_model_return_elapsed_time']
].rename(columns={'best/elapsed_time': 'Davies-Bouldin_best_time', 'fit_model_return_elapsed_time': 'Davies-Bouldin_total_time'})

# Remove missing values before setting index
df_ari_time = df_ari_time.dropna(subset=["ARI_best_time", "ARI_total_time"])
df_ami_time = df_ami_time.dropna(subset=["AMI_best_time", "AMI_total_time"])
df_nmi_time = df_nmi_time.dropna(subset=["NMI_best_time", "NMI_total_time"])
df_calinski_time = df_calinski_time.dropna(subset=["Calinski_best_time", "Calinski_total_time"])
df_silhouette_time = df_silhouette_time.dropna(subset=["Silhouette_best_time", "Silhouette_total_time"])
df_davies_bouldin_time = df_davies_bouldin_time.dropna(subset=["Davies-Bouldin_best_time", "Davies-Bouldin_total_time"])

# Set multi-index for all dataframes
df_ari_time = df_ari_time.set_index(["dataset_name", "model"])
df_ami_time = df_ami_time.set_index(["dataset_name", "model"])
df_nmi_time = df_nmi_time.set_index(["dataset_name", "model"])
df_calinski_time = df_calinski_time.set_index(["dataset_name", "model"])
df_silhouette_time = df_silhouette_time.set_index(["dataset_name", "model"])
df_davies_bouldin_time = df_davies_bouldin_time.set_index(["dataset_name", "model"])

# Combine all time metrics into a single dataframe using outer join
df_time_metrics = df_ari_time.join(df_ami_time, how="outer").join(df_nmi, how="outer").join(df_calinski_time, how="outer").join(df_silhouette_time, how="outer").join(df_davies_bouldin_time, how="outer")

# Rename index levels
df_time_metrics.index.names = ["Dataset", "Model"]

The following will provide the latex code for a clean table, we only need to make a little adjustement in the first line to delete the "key" and have only one header. For the longtable environment (full data) we need to add the "\*" at the end of lines we dont want to have a page break. We also should replace the entire begin{table} ... end{table} by begin{longtable} ... end{longtable} in the latex file, if you want to put caption and labels you should break the line after with '\\' (put both on the same line!)


In [43]:
df_latex = df_metrics.copy()
df_latex = df_latex.drop(columns=["NMI"])
highlight_max_ari = partial(highlight_max, column_name="ARI")
highlight_max_ami = partial(highlight_max, column_name="AMI")
highlight_max_calinski = partial(highlight_max, column_name="Calinski")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin")
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin")
print(
    df_latex.style.apply(highlight_max_ari, subset="ARI", axis=None)
    .apply(underline_2nd_max_ari, subset="ARI", axis=None)
    .apply(highlight_max_ami, subset="AMI", axis=None)
    .apply(underline_2nd_max_ami, subset="AMI", axis=None)
    .apply(highlight_max_calinski, subset="Calinski", axis=None)
    .apply(underline_2nd_max_calinski, subset="Calinski", axis=None)
	.apply(highlight_max_silhouette, subset="Silhouette", axis=None)
    .apply(underline_2nd_max_silhouette, subset="Silhouette", axis=None)
    .apply(highlight_min_davies_bouldin, subset="Davies-Bouldin", axis=None)
    .apply(underline_2nd_min_davies_bouldin, subset="Davies-Bouldin", axis=None)
    .format(precision=3, na_rep="No Run", subset=["ARI", "AMI", "Calinski"])
    .format(formatter="{:4.3f}", subset=["Mean Best Time", "Mean HPO Time"])
    .to_latex(hrules=True, clines="skip-last;data", convert_css=True, column_format="ll"+'r'*len(df_latex.columns), environment="longtable")
)

\begin{longtable}{llrrrrrrr}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Mean Best Time & Mean HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\endfirsthead
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Mean Best Time & Mean HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\endhead
\midrule
\multicolumn{9}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
\multirow[c]{10}{*}{alizadeh-2000-v2} & Batch CoHiRF-KernelRBF & 0.334 & 0.329 & 5.016 & 0.124942 & nan & 0.500 & 940.167 \\
 & Batch CoHiRF-SC-SRGF & 0.829 & 0.692 & 8.641 & 0.108252 & 3.567765 & 16.787 & 1353.557 \\
 & CoHiRF & \underline{0.865} & \underline{0.846} & \bfseries 15.151 & \underline{0.189807} & 1.066668 & 0.095 & 258.393 \\
 & CoHiRF-DBSCAN & 0.000 & No Run & No Run & nan & nan & 1.583 & 327.070 \\
 & CoHiRF-KernelRBF & 0.056 & No Run & No Run & 0.008373 & nan & 0.310 & 259.866 \\
 & CoHiRF-SC-SRGF & 0.589 & 0.774 & 7.297 & 0.

# KMeans

In [44]:
df_latex = df_metrics.copy()
df_latex = df_latex.drop(columns=["NMI"])
datasets_to_keep = [
    "garber-2001",
    "alizadeh-2000-v2",
    "golub-1999-v2",
    "armstrong-2002-v1",
    "nursery",
    "segment",
]
models_to_keep = [
	'K-Means',
	'CoHiRF',
	'Batch CoHiRF',
]
df_latex = df_latex.loc[df_latex.index.get_level_values('Dataset').isin(datasets_to_keep) & df_latex.index.get_level_values('Model').isin(models_to_keep), :]
highlight_max_ari = partial(highlight_max, column_name="ARI")
highlight_max_ami = partial(highlight_max, column_name="AMI")
highlight_max_calinski = partial(highlight_max, column_name="Calinski")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin")
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin")
print(
    df_latex.style.apply(highlight_max_ari, subset="ARI", axis=None)
    .apply(underline_2nd_max_ari, subset="ARI", axis=None)
    .apply(highlight_max_ami, subset="AMI", axis=None)
    .apply(underline_2nd_max_ami, subset="AMI", axis=None)
    .apply(highlight_max_calinski, subset="Calinski", axis=None)
    .apply(underline_2nd_max_calinski, subset="Calinski", axis=None)
	.apply(highlight_max_silhouette, subset="Silhouette", axis=None)
    .apply(underline_2nd_max_silhouette, subset="Silhouette", axis=None)
    .apply(highlight_min_davies_bouldin, subset="Davies-Bouldin", axis=None)
    .apply(underline_2nd_min_davies_bouldin, subset="Davies-Bouldin", axis=None)
    .format(precision=3, na_rep="No Run", subset=["ARI", "AMI", "Calinski"])
    .format(formatter="{:4.3f}", subset=["Mean Best Time", "Mean HPO Time"])
    .to_latex(hrules=True, clines="skip-last;data", convert_css=True, column_format="ll"+'r'*len(df_latex.columns),
			#    environment="longtable"
			   )
)

\begin{tabular}{llrrrrrrr}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Mean Best Time & Mean HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{2}{*}{alizadeh-2000-v2} & CoHiRF & \bfseries 0.865 & \bfseries 0.846 & \bfseries \underline{15.151} & \bfseries 0.189807 & \underline{1.066668} & 0.095 & 258.393 \\
 & K-Means & \underline{0.831} & \underline{0.759} & \bfseries \underline{15.151} & \underline{0.188241} & \bfseries 0.972517 & 0.020 & 222.753 \\
\cline{1-9}
\multirow[c]{2}{*}{armstrong-2002-v1} & CoHiRF & \bfseries 0.634 & \underline{0.670} & \underline{6.438} & \underline{0.147702} & \underline{1.062096} & 0.077 & 172.181 \\
 & K-Means & \underline{0.618} & \bfseries 0.901 & \bfseries 6.513 & \bfseries 0.188146 & \bfseries 0.955932 & 0.011 & 155.461 \\
\cline{1-9}
\multirow[c]{2}{*}{garber-2001} & CoHiRF & \bfseries 0.314 & \bfseries 0.232 & \underline{6.169} & \bfseries \underline{0.333445} & \underline{1.055637} & 0.308 & 532.405 

# Kernel KMeans

In [45]:
df_latex = df_metrics.copy()
df_latex = df_latex.drop(columns=["NMI"])
datasets_to_keep = [
    "khan-2001",
    "bittner-2000",
    "iris",
    "satimage",
]
models_to_keep = [
    "Kernel RBF K-Means",
    "CoHiRF-KernelRBF",
    "Batch CoHiRF-KernelRBF",
]
df_latex = df_latex.loc[
    df_latex.index.get_level_values("Dataset").isin(datasets_to_keep)
    & df_latex.index.get_level_values("Model").isin(models_to_keep),
    :,
]
df_latex = df_latex.loc[
    df_latex.index.get_level_values("Dataset").isin(datasets_to_keep)
    & df_latex.index.get_level_values("Model").isin(models_to_keep),
    :,
]
highlight_max_ari = partial(highlight_max, column_name="ARI")
highlight_max_ami = partial(highlight_max, column_name="AMI")
highlight_max_calinski = partial(highlight_max, column_name="Calinski")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin")
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin")
print(
    df_latex.style.apply(highlight_max_ari, subset="ARI", axis=None)
    .apply(underline_2nd_max_ari, subset="ARI", axis=None)
    .apply(highlight_max_ami, subset="AMI", axis=None)
    .apply(underline_2nd_max_ami, subset="AMI", axis=None)
    .apply(highlight_max_calinski, subset="Calinski", axis=None)
    .apply(underline_2nd_max_calinski, subset="Calinski", axis=None)
    .apply(highlight_max_silhouette, subset="Silhouette", axis=None)
    .apply(underline_2nd_max_silhouette, subset="Silhouette", axis=None)
    .apply(highlight_min_davies_bouldin, subset="Davies-Bouldin", axis=None)
    .apply(underline_2nd_min_davies_bouldin, subset="Davies-Bouldin", axis=None)
    .format(precision=3, na_rep="No Run", subset=["ARI", "AMI", "Calinski", "Silhouette", "Davies-Bouldin"])
    .format(formatter="{:4.3f}", subset=["Mean Best Time", "Mean HPO Time"])
    .to_latex(
        hrules=True,
        clines="skip-last;data",
        convert_css=True,
        column_format="ll" + "r" * len(df_latex.columns),
        #    environment="longtable"
    )
)

\begin{tabular}{llrrrrrrr}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Mean Best Time & Mean HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{3}{*}{bittner-2000} & Batch CoHiRF-KernelRBF & \bfseries 0.388 & \bfseries 0.333 & \bfseries 1.773 & \bfseries 0.113 & \bfseries 0.652 & 4.249 & 1145.689 \\
 & CoHiRF-KernelRBF & \underline{0.174} & \underline{0.141} & \underline{1.324} & 0.025 & 1.685 & 0.336 & 257.860 \\
 & Kernel RBF K-Means & 0.045 & 0.107 & 1.242 & \underline{0.035} & \underline{1.608} & 0.086 & 243.000 \\
\cline{1-9}
\multirow[c]{3}{*}{iris} & Batch CoHiRF-KernelRBF & \bfseries 0.856 & 0.699 & 139.623 & 0.429 & \underline{0.715} & 1.301 & 191.076 \\
 & CoHiRF-KernelRBF & 0.644 & \bfseries 0.886 & \bfseries 241.038 & \underline{0.564} & \bfseries 0.514 & 0.187 & 90.386 \\
 & Kernel RBF K-Means & \underline{0.679} & \underline{0.728} & \underline{195.816} & \bfseries 0.579 & 0.787 & 0.013 & 88.026 \\
\cline{1-9}
\multirow[c]{3}

# DBSCAN

In [46]:
df_latex = df_metrics.copy()
df_latex = df_latex.drop(columns=["NMI"])
datasets_to_keep = ["ecoli", "binary-alpha-digits", "segment", "chowdary-2006", "shuttle"]
models_to_keep = [
    "DBSCAN",
    "CoHiRF-DBSCAN",
    "Batch CoHiRF-DBSCAN",
]
df_latex = df_latex.loc[
    df_latex.index.get_level_values("Dataset").isin(datasets_to_keep)
    & df_latex.index.get_level_values("Model").isin(models_to_keep),
    :,
]
highlight_max_ari = partial(highlight_max, column_name="ARI")
highlight_max_ami = partial(highlight_max, column_name="AMI")
highlight_max_calinski = partial(highlight_max, column_name="Calinski")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin")
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin")
print(
    df_latex.style.apply(highlight_max_ari, subset="ARI", axis=None)
    .apply(underline_2nd_max_ari, subset="ARI", axis=None)
    .apply(highlight_max_ami, subset="AMI", axis=None)
    .apply(underline_2nd_max_ami, subset="AMI", axis=None)
    .apply(highlight_max_calinski, subset="Calinski", axis=None)
    .apply(underline_2nd_max_calinski, subset="Calinski", axis=None)
    .apply(highlight_max_silhouette, subset="Silhouette", axis=None)
    .apply(underline_2nd_max_silhouette, subset="Silhouette", axis=None)
    .apply(highlight_min_davies_bouldin, subset="Davies-Bouldin", axis=None)
    .apply(underline_2nd_min_davies_bouldin, subset="Davies-Bouldin", axis=None)
    .format(precision=3, na_rep="No Run", subset=["ARI", "AMI", "Calinski", "Silhouette", "Davies-Bouldin"])
    .format(formatter="{:4.3f}", subset=["Mean Best Time", "Mean HPO Time"])
    .to_latex(
        hrules=True,
        clines="skip-last;data",
        convert_css=True,
        column_format="ll" + "r" * len(df_latex.columns),
        #    environment="longtable"
    )
)

\begin{tabular}{llrrrrrrr}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Mean Best Time & Mean HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{3}{*}{binary-alpha-digits} & Batch CoHiRF-DBSCAN & \bfseries 0.360 & \bfseries 0.516 & \bfseries 16.135 & \underline{0.036} & \underline{0.915} & 4.872 & 734.124 \\
 & CoHiRF-DBSCAN & \underline{0.012} & \underline{0.035} & 1.494 & 0.006 & \bfseries 0.817 & 0.252 & 114.220 \\
 & DBSCAN & 0.000 & 0.006 & \underline{5.436} & \bfseries 0.099 & 1.156 & 0.030 & 116.279 \\
\cline{1-9}
\multirow[c]{3}{*}{chowdary-2006} & Batch CoHiRF-DBSCAN & \bfseries 0.708 & \bfseries 0.675 & \bfseries 37.353 & \bfseries 0.666 & \bfseries 0.168 & 0.122 & 216.296 \\
 & CoHiRF-DBSCAN & 0.060 & 0.058 & \bfseries 37.353 & \bfseries 0.666 & \bfseries 0.168 & 0.052 & 98.358 \\
 & DBSCAN & \underline{0.370} & \underline{0.308} & \underline{23.358} & \underline{0.581} & \underline{0.946} & 0.004 & 74.574 \\
\cline{1-9}
\multiro

# SC-SRGF


In [47]:
df_latex = df_metrics.copy()
df_latex = df_latex.drop(columns=["NMI"])
datasets_to_keep = ["alizadeh-2000-v3", "alizadeh-2000-v2", "har", "satimage", "chowdary-2006"]
models_to_keep = [
    "SC-SRGF",
	"CoHiRF-SC-SRGF",
    "Batch CoHiRF-SC-SRGF",
]
df_latex = df_latex.loc[
    df_latex.index.get_level_values("Dataset").isin(datasets_to_keep)
    & df_latex.index.get_level_values("Model").isin(models_to_keep),
    :,
]
highlight_max_ari = partial(highlight_max, column_name="ARI")
highlight_max_ami = partial(highlight_max, column_name="AMI")
highlight_max_calinski = partial(highlight_max, column_name="Calinski")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin")
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin")
print(
    df_latex.style.apply(highlight_max_ari, subset="ARI", axis=None)
    .apply(underline_2nd_max_ari, subset="ARI", axis=None)
    .apply(highlight_max_ami, subset="AMI", axis=None)
    .apply(underline_2nd_max_ami, subset="AMI", axis=None)
    .apply(highlight_max_calinski, subset="Calinski", axis=None)
    .apply(underline_2nd_max_calinski, subset="Calinski", axis=None)
    .apply(highlight_max_silhouette, subset="Silhouette", axis=None)
    .apply(underline_2nd_max_silhouette, subset="Silhouette", axis=None)
    .apply(highlight_min_davies_bouldin, subset="Davies-Bouldin", axis=None)
    .apply(underline_2nd_min_davies_bouldin, subset="Davies-Bouldin", axis=None)
    .format(precision=3, na_rep="No Run", subset=["ARI", "AMI", "Calinski", "Silhouette", "Davies-Bouldin"])
    .format(formatter="{:4.3f}", subset=["Mean Best Time", "Mean HPO Time"])
    .to_latex(
        hrules=True,
        clines="skip-last;data",
        convert_css=True,
        column_format="ll" + "r" * len(df_latex.columns),
        #    environment="longtable"
    )
)

\begin{tabular}{llrrrrrrr}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Mean Best Time & Mean HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{3}{*}{alizadeh-2000-v2} & Batch CoHiRF-SC-SRGF & \underline{0.829} & 0.692 & \underline{8.641} & \underline{0.108} & 3.568 & 16.787 & 1353.557 \\
 & CoHiRF-SC-SRGF & 0.589 & \underline{0.774} & 7.297 & 0.094 & \underline{1.545} & 3.367 & 406.406 \\
 & SC-SRGF & \bfseries 0.947 & \bfseries 0.922 & \bfseries 12.338 & \bfseries 0.194 & \bfseries 0.930 & 0.202 & 226.919 \\
\cline{1-9}
\multirow[c]{3}{*}{alizadeh-2000-v3} & Batch CoHiRF-SC-SRGF & 0.411 & 0.536 & \underline{11.858} & \underline{0.189} & \underline{1.167} & 13.581 & 1629.159 \\
 & CoHiRF-SC-SRGF & \underline{0.456} & \underline{0.596} & 7.455 & 0.094 & 1.545 & 2.923 & 432.780 \\
 & SC-SRGF & \bfseries 0.519 & \bfseries 0.724 & \bfseries 12.341 & \bfseries 0.194 & \bfseries 0.948 & 0.244 & 265.322 \\
\cline{1-9}
\multirow[c]{3}{*}{chowdary

# COIL 20

In [65]:
df_latex = df_metrics.copy()
df_latex = df_latex.drop(columns=["NMI"])
datasets_to_keep = ["coil-20", "mnist"]
models_to_keep = [
    "K-Means",
    "CoHiRF",
    "Batch CoHiRF",
    "Kernel RBF K-Means",
    "CoHiRF-KernelRBF",
    "Batch CoHiRF-KernelRBF",
    "DBSCAN",
    "CoHiRF-DBSCAN",
    "Batch CoHiRF-DBSCAN",
    "SC-SRGF",
    "Batch CoHiRF-SC-SRGF",
]
df_latex = df_latex.loc[
    df_latex.index.get_level_values("Dataset").isin(datasets_to_keep)
    & df_latex.index.get_level_values("Model").isin(models_to_keep),
    :,
]
highlight_max_ari = partial(highlight_max, column_name="ARI")
highlight_max_ami = partial(highlight_max, column_name="AMI")
highlight_max_calinski = partial(highlight_max, column_name="Calinski")
highlight_max_silhouette = partial(highlight_max, column_name="Silhouette")
highlight_min_davies_bouldin = partial(highlight_min, column_name="Davies-Bouldin")
underline_2nd_max_ari = partial(underline_2nd_max, column_name="ARI")
underline_2nd_max_ami = partial(underline_2nd_max, column_name="AMI")
underline_2nd_max_calinski = partial(underline_2nd_max, column_name="Calinski")
underline_2nd_max_silhouette = partial(underline_2nd_max, column_name="Silhouette")
underline_2nd_min_davies_bouldin = partial(underline_2nd_min, column_name="Davies-Bouldin")
print(
    df_latex.style.apply(highlight_max_ari, subset="ARI", axis=None)
    .apply(underline_2nd_max_ari, subset="ARI", axis=None)
    .apply(highlight_max_ami, subset="AMI", axis=None)
    .apply(underline_2nd_max_ami, subset="AMI", axis=None)
    .apply(highlight_max_calinski, subset="Calinski", axis=None)
    .apply(underline_2nd_max_calinski, subset="Calinski", axis=None)
    .apply(highlight_max_silhouette, subset="Silhouette", axis=None)
    .apply(underline_2nd_max_silhouette, subset="Silhouette", axis=None)
    .apply(highlight_min_davies_bouldin, subset="Davies-Bouldin", axis=None)
    .apply(underline_2nd_min_davies_bouldin, subset="Davies-Bouldin", axis=None)
    .format(precision=3, na_rep="No Run", subset=["ARI", "AMI", "Calinski", "Silhouette", "Davies-Bouldin"])
    .format(formatter="{:4.3f}", subset=["Mean Best Time", "Mean HPO Time"])
    .to_latex(
        hrules=True,
        clines="skip-last;data",
        convert_css=True,
        column_format="ll" + "r" * len(df_latex.columns),
        #    environment="longtable"
    )
)

\begin{tabular}{llrrrrrrr}
\toprule
 &  & ARI & AMI & Calinski & Silhouette & Davies-Bouldin & Mean Best Time & Mean HPO Time \\
Dataset & Model &  &  &  &  &  &  &  \\
\midrule
\multirow[c]{11}{*}{coil-20} & Batch CoHiRF & 0.467 & 0.730 & 75.334 & 0.134 & 1.229 & 0.575 & 204.292 \\
 & Batch CoHiRF-DBSCAN & \bfseries 0.782 & \underline{0.856} & 23.430 & \underline{0.188} & \underline{0.539} & 6.578 & 991.348 \\
 & Batch CoHiRF-KernelRBF & 0.077 & 0.251 & 43.180 & 0.034 & 2.227 & 14.393 & 1055.906 \\
 & Batch CoHiRF-SC-SRGF & 0.260 & 0.678 & 132.257 & 0.125 & 1.458 & 11.040 & 782.459 \\
 & CoHiRF & 0.336 & 0.615 & \underline{287.924} & 0.178 & 1.564 & 0.423 & 184.464 \\
 & CoHiRF-DBSCAN & 0.696 & 0.791 & 13.839 & 0.159 & \bfseries 0.529 & 0.317 & 177.882 \\
 & CoHiRF-KernelRBF & 0.002 & 0.013 & 1.799 & 0.003 & 2.498 & 18.076 & 1197.955 \\
 & DBSCAN & 0.143 & 0.563 & 56.906 & -0.005 & 1.143 & 0.077 & 142.999 \\
 & K-Means & 0.670 & 0.814 & \bfseries 288.095 & \bfseries 0.229 & 1.746 & 0.

In [None]:
datasets_ids = [39, 61, 46779]
model_names = {
    "AffinityPropagation": "Affinity Propagation",
    "AverageAgglomerativeClustering": "Average Agglomerative",
    # "BatchCoHiRF": "Batch CoHiRF",
    # "BatchCoHiRF-1iter": "Batch CoHiRF-1iter",
    # "BatchCoHiRF-DBSCAN": "Batch CoHiRF-DBSCAN",
    # "BatchCoHiRF-DBSCAN-1iter": "Batch CoHiRF-DBSCAN-1iter",
    "CoHiRF-50": "CoHiRF",
    "CoHiRF-DBSCAN-50": "CoHiRF-DBSCAN",
    "CoHiRF-KernelRBF-50": "CoHiRF-KernelRBF",
    # "CoHiRF-100": "CoHiRF",
    # "CoHiRF-DBSCAN-100": "CoHiRF-DBSCAN",
    # "CoHiRF-KernelRBF-100": "CoHiRF-KernelRBF",
    "CompleteAgglomerativeClustering": "Complete Agglomerative",
    "DBSCAN": "DBSCAN",
    "HDBSCAN": "HDBSCAN",
    "IRFLLRR": "IRFLLRR",
    "KMeans": "K-Means",
    "MeanShift": "Mean Shift",
    "OPTICS": "OPTICS",
    "Proclus": "Proclus",
    "SingleAgglomerativeClustering": "Single Agglomerative",
    "SpectralClustering": "Spectral Clustering",
    "SpectralSubspaceRandomization": "SC-SRGF",
    "WardAgglomerativeClustering": "Ward's Method",
}
df = df_runs_parents.copy()
# get time of the child run
df = df.loc[df['dataset_id'].isin(datasets_ids)]
df = df.loc[df["model"].isin(model_names.keys())]
# df = df.loc[df['hpo_metric'] == 'adjusted_rand']
df = df.loc[df['standardize'] == True]
df['Parameters'] = df.apply(get_parameters_string, axis=1)
df = df[['model', 'dataset_name', 'best/adjusted_rand', 'Parameters', 'best/elapsed_time']]
df = df.replace({"model": model_names})
df = df.rename(columns={'best/adjusted_rand': 'ARI', 'model': 'Model', 'dataset_name': 'Dataset', 'best/elapsed_time': 'Time (s)'})
df = df.groupby(['Dataset', 'Model']).agg({'ARI': 'mean', 'Time (s)': 'first', 'Parameters': 'first'})

In [None]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ARI,Time (s),Parameters
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ecoli,Affinity Propagation,0.248155,0.053642,$\lambda=0.58$
ecoli,Average Agglomerative,0.72761,0.007877,$C=11$
ecoli,CoHiRF,0.756428,0.143971,$C=4$; $q=0.99$; $R=6$
ecoli,CoHiRF-DBSCAN,0.454527,0.051831,$\epsilon=0.43$; $n_{\text{min}}=13$; $q=0.42$...
ecoli,CoHiRF-KernelRBF,0.70477,0.121266,$C=2$; $q=0.27$; $R=9$; $\gamma=1.12$
ecoli,Complete Agglomerative,0.785384,0.006143,$C=7$
ecoli,DBSCAN,0.573312,0.015961,$\epsilon=1.24$; $n_{\text{min}}=32$
ecoli,HDBSCAN,0.397572,0.019754,$C_{\text{min}}=10$
ecoli,IRFLLRR,0.452451,0.282542,$\alpha=1$; $c=100$; $\lambda=0.10$; $p=0.88$;...
ecoli,K-Means,0.723973,0.003528,$C=6$


In [None]:
# with time, appendix
df_latex = df.copy()
highlight_max_ari = partial(highlight_max, column_name='ARI')
highlight_max_ari_index = partial(highlight_max_index, df_column=df_latex['ARI'])
underline_2nd_max_ari = partial(underline_2nd_max, column_name='ARI')
underline_2nd_max_ari_index = partial(underline_2nd_max_index, df_column=df_latex['ARI'])
print(df_latex.style.apply(highlight_max_ari, subset='ARI', axis=None).apply_index(highlight_max_ari_index,'index', level=1).apply(underline_2nd_max_ari, subset='ARI', axis=None).apply_index(underline_2nd_max_ari_index, 'index', level=1).format(precision=3, na_rep='No Run', subset='ARI').format(formatter='{:4.3f}',subset='Time (s)').to_latex(hrules=True, clines='skip-last;data', convert_css=True, column_format='llrrl'))

\begin{tabular}{llrrl}
\toprule
 &  & ARI & Time (s) & Parameters \\
Dataset & Model &  &  &  \\
\midrule
\multirow[c]{17}{*}{ecoli} & Affinity Propagation & 0.248 & 0.054 & $\lambda=0.58$ \\
 & Average Agglomerative & 0.728 & 0.008 & $C=11$ \\
 & \underline{CoHiRF} & \underline{0.756} & 0.144 & $C=4$; $q=0.99$; $R=6$ \\
 & CoHiRF-DBSCAN & 0.455 & 0.052 & $\epsilon=0.43$; $n_{\text{min}}=13$; $q=0.42$; $R=8$ \\
 & CoHiRF-KernelRBF & 0.705 & 0.121 & $C=2$; $q=0.27$; $R=9$; $\gamma=1.12$ \\
 & \bfseries Complete Agglomerative & \bfseries 0.785 & 0.006 & $C=7$ \\
 & DBSCAN & 0.573 & 0.016 & $\epsilon=1.24$; $n_{\text{min}}=32$ \\
 & HDBSCAN & 0.398 & 0.020 & $C_{\text{min}}=10$ \\
 & IRFLLRR & 0.452 & 0.283 & $\alpha=1$; $c=100$; $\lambda=0.10$; $p=0.88$; $C=8$ \\
 & K-Means & 0.724 & 0.004 & $C=6$ \\
 & Mean Shift & 0.038 & 2.281 & $bin_{\text{min}}=1$ \\
 & OPTICS & 0.314 & 5.559 & $n_{\text{min}}=10$ \\
 & Proclus & 0.429 & 0.019 & $d=0.58$; $C=2$ \\
 & SC-SRGF & 0.725 & 2.121 & $m=21$

In [None]:
# no time, main text
df_latex = df.copy()[['ARI', 'Parameters']]
highlight_max_ari = partial(highlight_max, column_name='ARI')
highlight_max_ari_index = partial(highlight_max_index, df_column=df_latex['ARI'])
underline_2nd_max_ari = partial(underline_2nd_max, column_name='ARI')
underline_2nd_max_ari_index = partial(underline_2nd_max_index, df_column=df_latex['ARI'])
print(df_latex.style.apply(highlight_max_ari, subset='ARI', axis=None).apply_index(highlight_max_ari_index,'index', level=1).apply(underline_2nd_max_ari, subset='ARI', axis=None).apply_index(underline_2nd_max_ari_index, 'index', level=1).format(precision=3, na_rep='No Run', subset='ARI').to_latex(hrules=True, clines='skip-last;data', convert_css=True, column_format='p{0.95cm}lp{0.5cm}l'))

\begin{tabular}{llrrl}
\toprule
 & key & ARI & Parameters \\
Dataset & Model &  &  \\
\midrule
\multirow[c]{9}{*}{ecoli} & Affinity Propagation & 0.248 & $\lambda=0.58$ \\
 & \bfseries CoHiRF & \bfseries 0.758 & $C=7$; $q=11$; $R=10$ \\
 & \underline{CoHiRF-RBF} & \underline{0.742} & $C=7$; $q=25$; $R=4$ \\
 & DBSCAN & 0.345 & $n_{\text{min}}=7$; $\epsilon=0.78$ \\
 & HDBSCAN & 0.398 & $C_{\text{min}}=10$ \\
 & K-Means & 0.719 & $C=6$ \\
 & OPTICS & 0.314 & $n_{\text{min}}=10$ \\
 & SC-SRGF & 0.723 & $m=15$; $r=0.80$; $C=4$ \\
 & Ward's Method & 0.735 & $C=7$ \\
\cline{1-4}
\multirow[c]{10}{*}{har} & Affinity Propagation & 0.313 & $\lambda=1.00$ \\
 & CoHiRF & 0.491 & $C=4$; $q=11$; $R=8$ \\
 & CoHiRF-1000 & 0.341 & $C=4$; $q=18$; $R=3$ \\
 & CoHiRF-RBF & 0.495 & $C=6$; $q=13$; $R=4$ \\
 & DBSCAN & 0.302 & $n_{\text{min}}=3$; $\epsilon=13.91$ \\
 & HDBSCAN & 0.287 & $C_{\text{min}}=6$ \\
 & K-Means & 0.438 & $C=9$ \\
 & OPTICS & 0.001 & $n_{\text{min}}=4$ \\
 & \bfseries SC-SRGF & \bfs

# Debug and explore

In [None]:
df = df_runs_raw_parents.copy()

In [None]:
model_names = df.model.unique().tolist()
model_names.sort()
model_names

['AffinityPropagation',
 'AverageAgglomerativeClustering',
 'BatchCoHiRF-1iter',
 'BatchCoHiRF-DBSCAN-1iter',
 'BatchCoHiRF-SC-SRGF',
 'CoHiRF',
 'CoHiRF-DBSCAN',
 'CoHiRF-KernelRBF',
 'CompleteAgglomerativeClustering',
 'DBSCAN',
 'HDBSCAN',
 'IRFLLRR',
 'KMeans',
 'MeanShift',
 'OPTICS',
 'Proclus',
 'SingleAgglomerativeClustering',
 'SpectralClustering',
 'SpectralSubspaceRandomization',
 'WardAgglomerativeClustering']

In [None]:
model_names = [
    "AffinityPropagation",
    # "AverageAgglomerativeClustering",
    # "BatchCoHiRF-1iter",
    # "BatchCoHiRF-DBSCAN-1iter",
    # "BatchCoHiRF-SC-SRGF",
    # "CoHiRF",
    # "CoHiRF-DBSCAN",
    # "CoHiRF-KernelRBF",
    # "CompleteAgglomerativeClustering",
    # "DBSCAN",
    # "HDBSCAN",
    # "IRFLLRR",
    # "KMeans",
    # "MeanShift",
    # "OPTICS",
    # "Proclus",
    # "SingleAgglomerativeClustering",
    # "SpectralClustering",
    "SpectralSubspaceRandomization",
    # "WardAgglomerativeClustering",
]

In [None]:
df = df.loc[df["best/adjusted_rand"].isna()]
df = df.loc[~df["model"].isin(model_names)]

In [None]:
df

Unnamed: 0_level_0,status,start_time,end_time,best/alpha,best/avg_dims,best/base_model_kwargs/eps,best/base_model_kwargs/min_samples,best/base_model_kwargs/n_clusters,best/c,best/child_run_id,...,max_memory_used,max_memory_used_after_fit,mlflow.parentRunId,raised_exception,dataset,openml_id,n_instances,n_features,n_classes,n_categorical
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0901bfc08af5458ab0034ab7cb7d1b2e,FINISHED,1751916368453,1751917000000.0,,,,,,,820cc0feddd54385b382eb25ecca62f2,...,493.62,493.62,,False,bittner-2000,46776,38,2202,2,0
1d574921f2c543b78cee887c28f3946b,FINISHED,1751915421602,1751916000000.0,,,,,,,2d010d3203d94fa986be77f749c84e2e,...,482.804,482.804,,False,golub-1999-v2,46780,72,1869,3,0
33cb421921c041b584362f17a4cdc17d,FINISHED,1751914443733,1751915000000.0,,,,,,,731020e814df4d46a5ff2bd6984e121d,...,467.028,467.028,,False,bredel-2005,46777,50,1740,3,0
3fbaefa4ed844048ae4989264777d062,FINISHED,1751914724869,1751915000000.0,,,0.5,5.0,,,1b2f91a4346b4e3b83e2c881da933fe0,...,436.04,436.04,,False,khan-2001,46781,83,1070,4,0
6779db1b4fe343a99e9fb55206dfedb0,FINISHED,1751915142539,1751915000000.0,,,,,,,138f534b142d455c99599b996f185fa3,...,440.36,440.36,,False,khan-2001,46781,83,1070,4,0
695d4e9d098f4dc69268794dc6b7b2d6,FINISHED,1751915044106,1751915000000.0,,,,,,,b250df6764e44fd48dc6d7ca77495729,...,491.192,491.192,,False,alizadeh-2000-v3,46774,62,2092,4,0
6afd8f179fc54e86b856e958b268e4cd,FINISHED,1751914203808,1751915000000.0,4.0,,,,,0.1,eccef9f294b14c35a5a3bae860af6d25,...,30077.172,30077.172,,False,shuttle,40685,58000,10,7,1
6e6a7953d76542ea8dbabdb462ee2218,FINISHED,1751916127683,1751916000000.0,,,,,,,fca5ad03fa2846508b7d57905ae855e1,...,495.456,495.456,,False,alizadeh-2000-v3,46774,62,2092,4,0
76965c65b70f43ae855292070c4ca909,FINISHED,1751914487224,1751915000000.0,,,0.5,5.0,,,2799568ae82248748739957d0cce4efe,...,470.04,470.04,,False,bredel-2005,46777,50,1740,3,0
79cec1f2e7d34fff85ca42b7f1e807f0,FINISHED,1751915198538,1751915000000.0,,,,,,,6e8312c7218647b1bfe97335bdf2f56e,...,440.148,440.148,,False,khan-2001,46781,83,1070,4,0


In [None]:
runs_to_delete_parents = list(df.index)

In [None]:
df = df_runs_raw.copy()
df = df.loc[df["mlflow.parentRunId"].isin(runs_to_delete_parents)]

In [None]:
runs_to_delete_children = list(df.index)

In [None]:
runs_to_delete = runs_to_delete_children + runs_to_delete_parents

In [None]:
run_uuid_query = [f"'{run_id}'" for run_id in runs_to_delete]
run_uuid_query = ', '.join(run_uuid_query)

In [None]:
query = f"""
UPDATE runs
SET lifecycle_stage = 'deleted'
WHERE run_uuid IN ({run_uuid_query}) 
"""
with engine.begin() as conn:
    conn.execute(text(query))

In [None]:
query = f"""
DELETE
FROM
	experiment_tags
WHERE
	experiment_id = ANY(
	SELECT
		experiment_id
	FROM
		experiments
	WHERE
		lifecycle_stage = 'deleted');

DELETE
FROM
	latest_metrics
WHERE
	run_uuid = ANY(
	SELECT
		run_uuid
	FROM
		runs
	WHERE
		lifecycle_stage = 'deleted');
	
DELETE
FROM
	metrics
WHERE
	run_uuid = ANY(
	SELECT
		run_uuid
	FROM
		runs
	WHERE
		lifecycle_stage = 'deleted');
	
DELETE
FROM
	params
WHERE
	run_uuid = ANY(
	SELECT
		run_uuid
	FROM
		runs
	WHERE
		lifecycle_stage = 'deleted');

DELETE
FROM
	tags
WHERE
	run_uuid = ANY(
	SELECT
		run_uuid
	FROM
		runs
	WHERE
		lifecycle_stage = 'deleted');
	
DELETE 
FROM 
	runs
WHERE 
	lifecycle_stage = 'deleted';

DELETE 
FROM 
	experiments
WHERE 
	lifecycle_stage = 'deleted';
"""
with engine.begin() as conn:
    conn.execute(text(query))

for i, row in df_runs_raw.iterrows():
    run_id = row.run_id
    model_name = row['params.model_name']
    with mlflow.start_run(run_id) as run:
        mlflow.log_param('model_nickname', model_name)    

In [None]:
# Display summary information about the metrics dataframe
print("Dataframe shape:", df_metrics.shape)
print("\nColumns:", df_metrics.columns.tolist())
print("\nIndex levels:", df_metrics.index.names)
print("\nFirst few rows:")
print(df_metrics.head(10))
print("\nData types:")
print(df_metrics.dtypes)
print("\nNon-null counts:")
print(df_metrics.count())

Dataframe shape: (335, 3)

Columns: ['ARI', 'AMI', 'Calinski']

Index levels: ['Dataset', 'Model']

First few rows:
                                                       ARI       AMI  \
Dataset          Model                                                 
alizadeh-2000-v2 AffinityPropagation              0.362816       NaN   
                 AverageAgglomerativeClustering   0.809591  0.676228   
                 BatchCoHiRF-SC-SRGF              0.794763  0.671212   
                 CoHiRF                           0.864606  0.757990   
                 CoHiRF-KernelRBF                 0.046212  0.085768   
                 CompleteAgglomerativeClustering  0.382805  0.612835   
                 DBSCAN                           0.000000  0.000000   
                 HDBSCAN                          0.165928  0.267066   
                 IRFLLRR                          0.512283       NaN   
                 KMeans                           0.830673  0.750678   

                   

The following will provide the latex code for a clean table, we only need to make a little adjustement in the first line to delete the "key" and have only one header. For the longtable environment (full data) we need to add the "\*" at the end of lines we dont want to have a page break. We also should replace the entire begin{table} ... end{table} by begin{longtable} ... end{longtable} in the latex file.


In [43]:
datasets_ids = [39, 61, 46779]
model_names = {
    "AffinityPropagation": "Affinity Propagation",
    "AverageAgglomerativeClustering": "Average Agglomerative",
    # "BatchCoHiRF": "Batch CoHiRF",
    # "BatchCoHiRF-1iter": "Batch CoHiRF-1iter",
    # "BatchCoHiRF-DBSCAN": "Batch CoHiRF-DBSCAN",
    # "BatchCoHiRF-DBSCAN-1iter": "Batch CoHiRF-DBSCAN-1iter",
    "CoHiRF-50": "CoHiRF",
    "CoHiRF-DBSCAN-50": "CoHiRF-DBSCAN",
    "CoHiRF-KernelRBF-50": "CoHiRF-KernelRBF",
    # "CoHiRF-100": "CoHiRF",
    # "CoHiRF-DBSCAN-100": "CoHiRF-DBSCAN",
    # "CoHiRF-KernelRBF-100": "CoHiRF-KernelRBF",
    "CompleteAgglomerativeClustering": "Complete Agglomerative",
    "DBSCAN": "DBSCAN",
    "HDBSCAN": "HDBSCAN",
    "IRFLLRR": "IRFLLRR",
    "KMeans": "K-Means",
    "MeanShift": "Mean Shift",
    "OPTICS": "OPTICS",
    "Proclus": "Proclus",
    "SingleAgglomerativeClustering": "Single Agglomerative",
    "SpectralClustering": "Spectral Clustering",
    "SpectralSubspaceRandomization": "SC-SRGF",
    "WardAgglomerativeClustering": "Ward's Method",
}
df = df_runs_parents.copy()
# get time of the child run
df = df.loc[df['dataset_id'].isin(datasets_ids)]
df = df.loc[df["model"].isin(model_names.keys())]
# df = df.loc[df['hpo_metric'] == 'adjusted_rand']
df = df.loc[df['standardize'] == True]
df['Parameters'] = df.apply(get_parameters_string, axis=1)
df = df[['model', 'dataset_name', 'best/adjusted_rand', 'Parameters', 'best/elapsed_time']]
df = df.replace({"model": model_names})
df = df.rename(columns={'best/adjusted_rand': 'ARI', 'model': 'Model', 'dataset_name': 'Dataset', 'best/elapsed_time': 'Time (s)'})
df = df.groupby(['Dataset', 'Model']).agg({'ARI': 'mean', 'Time (s)': 'first', 'Parameters': 'first'})

In [44]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ARI,Time (s),Parameters
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ecoli,Affinity Propagation,0.248155,0.053642,$\lambda=0.58$
ecoli,Average Agglomerative,0.72761,0.007877,$C=11$
ecoli,CoHiRF,0.756428,0.143971,$C=4$; $q=0.99$; $R=6$
ecoli,CoHiRF-DBSCAN,0.454527,0.051831,$\epsilon=0.43$; $n_{\text{min}}=13$; $q=0.42$...
ecoli,CoHiRF-KernelRBF,0.70477,0.121266,$C=2$; $q=0.27$; $R=9$; $\gamma=1.12$
ecoli,Complete Agglomerative,0.785384,0.006143,$C=7$
ecoli,DBSCAN,0.573312,0.015961,$\epsilon=1.24$; $n_{\text{min}}=32$
ecoli,HDBSCAN,0.397572,0.019754,$C_{\text{min}}=10$
ecoli,IRFLLRR,0.452451,0.282542,$\alpha=1$; $c=100$; $\lambda=0.10$; $p=0.88$;...
ecoli,K-Means,0.723973,0.003528,$C=6$


In [45]:
# with time, appendix
df_latex = df.copy()
highlight_max_ari = partial(highlight_max, column_name='ARI')
highlight_max_ari_index = partial(highlight_max_index, df_column=df_latex['ARI'])
underline_2nd_max_ari = partial(underline_2nd_max, column_name='ARI')
underline_2nd_max_ari_index = partial(underline_2nd_max_index, df_column=df_latex['ARI'])
print(df_latex.style.apply(highlight_max_ari, subset='ARI', axis=None).apply_index(highlight_max_ari_index,'index', level=1).apply(underline_2nd_max_ari, subset='ARI', axis=None).apply_index(underline_2nd_max_ari_index, 'index', level=1).format(precision=3, na_rep='No Run', subset='ARI').format(formatter='{:4.3f}',subset='Time (s)').to_latex(hrules=True, clines='skip-last;data', convert_css=True, column_format='llrrl'))

\begin{tabular}{llrrl}
\toprule
 &  & ARI & Time (s) & Parameters \\
Dataset & Model &  &  &  \\
\midrule
\multirow[c]{17}{*}{ecoli} & Affinity Propagation & 0.248 & 0.054 & $\lambda=0.58$ \\
 & Average Agglomerative & 0.728 & 0.008 & $C=11$ \\
 & \underline{CoHiRF} & \underline{0.756} & 0.144 & $C=4$; $q=0.99$; $R=6$ \\
 & CoHiRF-DBSCAN & 0.455 & 0.052 & $\epsilon=0.43$; $n_{\text{min}}=13$; $q=0.42$; $R=8$ \\
 & CoHiRF-KernelRBF & 0.705 & 0.121 & $C=2$; $q=0.27$; $R=9$; $\gamma=1.12$ \\
 & \bfseries Complete Agglomerative & \bfseries 0.785 & 0.006 & $C=7$ \\
 & DBSCAN & 0.573 & 0.016 & $\epsilon=1.24$; $n_{\text{min}}=32$ \\
 & HDBSCAN & 0.398 & 0.020 & $C_{\text{min}}=10$ \\
 & IRFLLRR & 0.452 & 0.283 & $\alpha=1$; $c=100$; $\lambda=0.10$; $p=0.88$; $C=8$ \\
 & K-Means & 0.724 & 0.004 & $C=6$ \\
 & Mean Shift & 0.038 & 2.281 & $bin_{\text{min}}=1$ \\
 & OPTICS & 0.314 & 5.559 & $n_{\text{min}}=10$ \\
 & Proclus & 0.429 & 0.019 & $d=0.58$; $C=2$ \\
 & SC-SRGF & 0.725 & 2.121 & $m=21$

In [None]:
# no time, main text
df_latex = df.copy()[['ARI', 'Parameters']]
highlight_max_ari = partial(highlight_max, column_name='ARI')
highlight_max_ari_index = partial(highlight_max_index, df_column=df_latex['ARI'])
underline_2nd_max_ari = partial(underline_2nd_max, column_name='ARI')
underline_2nd_max_ari_index = partial(underline_2nd_max_index, df_column=df_latex['ARI'])
print(df_latex.style.apply(highlight_max_ari, subset='ARI', axis=None).apply_index(highlight_max_ari_index,'index', level=1).apply(underline_2nd_max_ari, subset='ARI', axis=None).apply_index(underline_2nd_max_ari_index, 'index', level=1).format(precision=3, na_rep='No Run', subset='ARI').to_latex(hrules=True, clines='skip-last;data', convert_css=True, column_format='p{0.95cm}lp{0.5cm}l'))

\begin{tabular}{llrrl}
\toprule
 & key & ARI & Parameters \\
Dataset & Model &  &  \\
\midrule
\multirow[c]{9}{*}{ecoli} & Affinity Propagation & 0.248 & $\lambda=0.58$ \\
 & \bfseries CoHiRF & \bfseries 0.758 & $C=7$; $q=11$; $R=10$ \\
 & \underline{CoHiRF-RBF} & \underline{0.742} & $C=7$; $q=25$; $R=4$ \\
 & DBSCAN & 0.345 & $n_{\text{min}}=7$; $\epsilon=0.78$ \\
 & HDBSCAN & 0.398 & $C_{\text{min}}=10$ \\
 & K-Means & 0.719 & $C=6$ \\
 & OPTICS & 0.314 & $n_{\text{min}}=10$ \\
 & SC-SRGF & 0.723 & $m=15$; $r=0.80$; $C=4$ \\
 & Ward's Method & 0.735 & $C=7$ \\
\cline{1-4}
\multirow[c]{10}{*}{har} & Affinity Propagation & 0.313 & $\lambda=1.00$ \\
 & CoHiRF & 0.491 & $C=4$; $q=11$; $R=8$ \\
 & CoHiRF-1000 & 0.341 & $C=4$; $q=18$; $R=3$ \\
 & CoHiRF-RBF & 0.495 & $C=6$; $q=13$; $R=4$ \\
 & DBSCAN & 0.302 & $n_{\text{min}}=3$; $\epsilon=13.91$ \\
 & HDBSCAN & 0.287 & $C_{\text{min}}=6$ \\
 & K-Means & 0.438 & $C=9$ \\
 & OPTICS & 0.001 & $n_{\text{min}}=4$ \\
 & \bfseries SC-SRGF & \bfs

# Debug and explore

In [45]:
df = df_runs_raw_parents.copy()

In [46]:
df = df.loc[df["hpo_metric"] == "davies_bouldin_score"]
df = df.loc[df["direction"] == "maximize"]

In [47]:
df

Unnamed: 0_level_0,status,start_time,end_time,best/alpha,best/avg_dims,best/base_model_kwargs/eps,best/base_model_kwargs/min_samples,best/base_model_kwargs/n_clusters,best/c,best/child_run_id,...,max_memory_used_after_fit,EXCEPTION,mlflow.parentRunId,raised_exception,dataset,openml_id,n_instances,n_features,n_classes,n_categorical
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00ec4cb302944fb1a641dc197deb818d,FAILED,1752585884991,1.752586e+12,,,,,,,,...,492.784,Best metric davies_bouldin_score not found in ...,,True,alizadeh-2000-v3,46774.0,62.0,2092.0,4.0,0.0
01601cdc6a27435db99653f8667590aa,FINISHED,1752963868547,1.752964e+12,,,,,,,f5b246a05fb940808546f5c78f6c47d7,...,490.080,,,False,alizadeh-2000-v2,46773.0,62.0,2094.0,3.0,0.0
020290c5b0a64b9aa40b4ef13728d88f,FINISHED,1753589429231,1.753590e+12,,,,,2.0,,3daf3d3c4d244730a95b456098d5a930,...,379.748,,,False,ecoli,39.0,336.0,8.0,8.0,1.0
028da96c62be4cff9241678600d2cd80,FAILED,1752583209869,1.752583e+12,,,,,,,,...,436.112,Best metric davies_bouldin_score not found in ...,,True,armstrong-2002-v1,46775.0,72.0,1082.0,2.0,0.0
0306fbf2e2704da883e3529e3f2a567a,FAILED,1752596519716,1.752613e+12,,,,,,,,...,1791.276,Best metric davies_bouldin_score not found in ...,,True,har,1478.0,10299.0,562.0,6.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ff3357d9115046cb802f4999a6ad4809,FINISHED,1753561098211,1.753561e+12,,,,,,,1ac9fe461b224d5982b934cdb3f1dcba,...,378.080,,,False,iris,61.0,150.0,5.0,3.0,1.0
ff6f42ff199f45e481823c0c34b25f80,FAILED,1752605850216,1.752606e+12,,,,,,,,...,393.812,Best metric davies_bouldin_score not found in ...,,True,chowdary-2006,46778.0,104.0,183.0,2.0,0.0
ff8f2d88f3264b72a3d86299cdc17615,FINISHED,1753598803119,1.753600e+12,,,,,3.0,,5b6c9cb0730f4771bbdc8f2d348eada5,...,16970.668,,,False,shuttle,40685.0,58000.0,10.0,7.0,1.0
ffba87034de64847a3a3b00caa87f8cf,FINISHED,1753639879137,1.753641e+12,,,,,,,844a53dd50974e9aa9c92ed86ebb789e,...,21629.400,,,False,,,,,,


In [48]:
runs_to_delete_parents = list(df.index)

In [49]:
df = df_runs_raw.copy()
df = df.loc[df["mlflow.parentRunId"].isin(runs_to_delete_parents)]

In [51]:
runs_to_delete_children = list(df.index)

In [52]:
runs_to_delete = runs_to_delete_children + runs_to_delete_parents

In [53]:
run_uuid_query = [f"'{run_id}'" for run_id in runs_to_delete]
run_uuid_query = ', '.join(run_uuid_query)

In [54]:
run_uuid_query

"'0003d08fba2c4388b8be59cc755052a0', '000d4aeceb984735b5228ce74ebf32e1', '000d7928bf86411ab50a8dee455cbaca', '000fd3e176bb4d50bc8663a21982f9ac', '00102c6b6ab040b1914b0e5d512d7626', '00185345f08b426b81c131a44c3127c2', '00196ac22b3d4b56b35308d3d832d2d2', '00197632f2394a52a05d6889ad4b1805', '001a719144884c6ba21d1d5c60ac5805', '001ae32e4a5548f1a337f44efbfea2a5', '001b0434c90a4e448cc29e08147e1ae4', '001d8b2a73844021b2fb3744e4097c5a', '001f0fd2284140cbaa06b14b13b6b2ba', '001fb8de3064482da4dc0b9bfc650801', '001fc4324a0343cb91fd06c03fe4a06b', '00208e8fb4754f718ab0f55cedd2109f', '0021d9a3008349d280d7c87a8a00732d', '0022618b0ade41db9a33ae0455edb0f9', '0022893f7af84daa857d72204c700427', '0026fcbeb25a49caac761e01724eb69c', '002997e7e844400d8999601880826eaf', '002f34e29f714405aae3520779845dac', '002fdd8a995b41fe81fabc8ffdb153de', '003154b784dd45eb98b6e8d0a92fa13f', '00356743477542159e468b9e2c696d4c', '0035c2af7d124c5498e63762b2096f0c', '0037b031de544f2382165860d0f9e0d0', '003a07bff835484e978ff7c2a0

In [55]:
query = f"""
UPDATE runs
SET lifecycle_stage = 'deleted'
WHERE run_uuid IN ({run_uuid_query}) 
"""
with engine.begin() as conn:
    conn.execute(text(query))

In [56]:
query = f"""
DELETE
FROM
	experiment_tags
WHERE
	experiment_id = ANY(
	SELECT
		experiment_id
	FROM
		experiments
	WHERE
		lifecycle_stage = 'deleted');

DELETE
FROM
	latest_metrics
WHERE
	run_uuid = ANY(
	SELECT
		run_uuid
	FROM
		runs
	WHERE
		lifecycle_stage = 'deleted');
	
DELETE
FROM
	metrics
WHERE
	run_uuid = ANY(
	SELECT
		run_uuid
	FROM
		runs
	WHERE
		lifecycle_stage = 'deleted');
	
DELETE
FROM
	params
WHERE
	run_uuid = ANY(
	SELECT
		run_uuid
	FROM
		runs
	WHERE
		lifecycle_stage = 'deleted');

DELETE
FROM
	tags
WHERE
	run_uuid = ANY(
	SELECT
		run_uuid
	FROM
		runs
	WHERE
		lifecycle_stage = 'deleted');
	
DELETE 
FROM 
	runs
WHERE 
	lifecycle_stage = 'deleted';

DELETE 
FROM 
	experiments
WHERE 
	lifecycle_stage = 'deleted';
"""
with engine.begin() as conn:
    conn.execute(text(query))

for i, row in df_runs_raw.iterrows():
    run_id = row.run_id
    model_name = row['params.model_name']
    with mlflow.start_run(run_id) as run:
        mlflow.log_param('model_nickname', model_name)    

In [40]:
# Display summary information about the metrics dataframe
print("Dataframe shape:", df_metrics.shape)
print("\nColumns:", df_metrics.columns.tolist())
print("\nIndex levels:", df_metrics.index.names)
print("\nFirst few rows:")
print(df_metrics.head(10))
print("\nData types:")
print(df_metrics.dtypes)
print("\nNon-null counts:")
print(df_metrics.count())

Dataframe shape: (335, 3)

Columns: ['ARI', 'AMI', 'Calinski']

Index levels: ['Dataset', 'Model']

First few rows:
                                                       ARI       AMI  \
Dataset          Model                                                 
alizadeh-2000-v2 AffinityPropagation              0.362816       NaN   
                 AverageAgglomerativeClustering   0.809591  0.676228   
                 BatchCoHiRF-SC-SRGF              0.794763  0.671212   
                 CoHiRF                           0.864606  0.757990   
                 CoHiRF-KernelRBF                 0.046212  0.085768   
                 CompleteAgglomerativeClustering  0.382805  0.612835   
                 DBSCAN                           0.000000  0.000000   
                 HDBSCAN                          0.165928  0.267066   
                 IRFLLRR                          0.512283       NaN   
                 KMeans                           0.830673  0.750678   

                   