In [1]:
from sqlalchemy import create_engine, text
import pandas as pd
from ml_experiments.analyze import get_df_runs_from_mlflow_sql, get_missing_entries, get_common_combinations, get_df_with_combinations
import plotly.express as px
from itertools import product
import time
import numpy as np

# Save Results

## Load mlflow runs

In [2]:
db_port = 5003
db_name = 'recursive_clustering'
w = 'clust9'
# url = f'postgresql://{w}.ceremade.dauphine.lan:{db_port}/{db_name}'
url = f'postgresql://belucci@localhost:{db_port}/{db_name}'
engine = create_engine(url)
query = 'SELECT experiments.name from experiments'
experiment_names = pd.read_sql(query, engine)['name'].tolist()
# results_dir = Path('~/tab_benchmark/results')
# os.makedirs(results_dir, exist_ok=True)

In [3]:
experiment_names

['Default',
 'blob_experiment',
 'hpo_classif_experiment',
 'hpo_openml_experiment',
 'hpo_gaussian_experiment',
 'hpo_openml_final',
 'outlier_hc',
 'time_hc']

In [4]:
experiments_names = [
    'hpo_openml_final',
]

In [5]:
params_columns = [
    'model_nickname',
    'n_samples',
    'n_features',
    'n_classes',
    'seed_model',
    'dataset_id',
    'dataset_name',
    # 'model_params/normalization',
    'direction',
    'hpo_metric',
    'standardize'
]

In [6]:
latest_metrics_columns = [
    'fit_model_return_elapsed_time',
    'max_memory_used',
    'n_clusters_',
    'rand_score',
    'adjusted_rand',
    'mutual_info',
    'adjusted_mutual_info',
    'normalized_mutual_info',
    'homogeneity',
    'completeness',
    'v_measure',
    'silhouette',
    'calinski_harabasz_score',
    'davies_bouldin_score',
    'inertia_score',
    'best_n_clusters_',
    'best_rand_score',
    'best_adjusted_rand',
    'best_mutual_info',
    'best_adjusted_mutual_info',
    'best_normalized_mutual_info',
    'best_homogeneity_completeness_v_measure',
    'best_silhouette',
    'best_calinski_harabasz_score',
    'best_davies_bouldin_score',
    'best_inertia_score',
    'best_homogeneity',
    'best_completeness',
    'best_v_measure',
]

In [7]:
tags_columns = [
    'raised_exception',
    'EXCEPTION',
    'parent_run_id',
    'best_child_run_id',
]

In [8]:
# parameters of each model, they are saved as tags for the parent run
parameters = {
    'RecursiveClustering': ['components_size', 'repetitions', 'kmeans_n_clusters'],
    'KMeans': ['n_clusters'],
    'HDBSCAN': ['min_cluster_size'],
    'DBSCAN': ['eps', 'min_samples'],
    'AffinityPropagation': ['damping'],
    'OPTICS': ['min_samples'],
    'SpectralSubspaceRandomization': ['n_similarities', 'sampling_ratio', 'sc_n_clusters'],
    'WardAgglomerativeClustering': ['n_clusters'],
}
all_model_parameters = [p for model, params in parameters.items() for p in params]
all_model_parameters = list(set(all_model_parameters))
for param in all_model_parameters:
    tags_columns.append(param)

In [9]:
runs_columns = ['run_uuid', 'status', 'start_time', 'end_time']
experiments_columns = []
other_table = 'params'
other_table_keys = params_columns
df_params = get_df_runs_from_mlflow_sql(engine, runs_columns=runs_columns, experiments_columns=experiments_columns, experiments_names=experiments_names, other_table=other_table, other_table_keys=other_table_keys)
df_latest_metrics = get_df_runs_from_mlflow_sql(engine, runs_columns=['run_uuid'], experiments_columns=experiments_columns, experiments_names=experiments_names, other_table='latest_metrics', other_table_keys=latest_metrics_columns)
df_tags = get_df_runs_from_mlflow_sql(engine, runs_columns=['run_uuid'], experiments_columns=experiments_columns, experiments_names=experiments_names, other_table='tags', other_table_keys=tags_columns)

In [10]:
df_runs_raw = df_params.join(df_latest_metrics)
df_runs_raw = df_runs_raw.join(df_tags)

In [11]:
df_runs_raw_parents = df_runs_raw.copy()
df_runs_raw_parents = df_runs_raw_parents.loc[df_runs_raw_parents['parent_run_id'].isna()]

In [12]:
# get all metrics beside best or last
metrics_columns = [
    'adjusted_rand',
    'silhouette',
    'calinski_harabasz_score',
    'davies_bouldin_score',
    'inertia_score',
]
metrics_columns_query = [f"'{m}'" for m in metrics_columns]
metrics_columns_query = ', '.join(metrics_columns_query)
experiments_names_query = [f"'{name}'" for name in experiments_names]
experiments_names_query = ', '.join(experiments_names_query)
query = f"""
SELECT
	runs.run_uuid,
	metrics."key",
	metrics.value,
	metrics.step 
FROM
	runs
LEFT JOIN metrics ON
	metrics.run_uuid = runs.run_uuid
LEFT JOIN experiments ON 
	experiments.experiment_id = runs.experiment_id  
WHERE 
	metrics."key" IN ({metrics_columns_query})
	AND experiments."name" IN ({experiments_names_query})
"""
df_metrics = pd.read_sql(query, engine)
df_metrics = df_metrics.pivot_table(index=['run_uuid', 'step'], columns='key', values='value')
df_metrics

Unnamed: 0_level_0,key,adjusted_rand,calinski_harabasz_score,davies_bouldin_score,inertia_score,silhouette
run_uuid,step,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
000109ef351e4cf7a088ad85ad7e0643,0,0.000000,-1.000000,1000.000000,1.279360e+05,-1.000000
00032b51730847ebb72e762407701b9b,0,0.082347,558.383684,1.100218,9.153871e+03,0.371600
00058d452fdd4128810e0b4bbf070246,0,0.113154,71.531337,1.431907,4.009946e+03,0.192575
0008dd178961435e884e90714ac29054,0,0.146457,4.302437,5.825635,9.054871e+05,-0.150969
001070aa390c40369818a820ccda66c3,0,0.140046,8.696977,3.885626,3.376744e+05,-0.051281
...,...,...,...,...,...,...
fff60e5db53e4316bde81a7a5e0aa3c5,0,0.347694,35.930376,3.407080,3.757653e+04,0.051182
fff7d9ffe83c41eb9c296012f06989cc,0,0.356844,16.279657,2.048616,2.497868e+05,-0.051084
fff95330848845f3a40af5ff9487fce9,0,-0.011991,79.069922,2.133947,4.063221e+05,-0.143267
fffbb4067ec14b42a455f64144044d98,0,0.175813,4658.172273,2.949616,6.914248e+06,0.055392


In [13]:
df_runs_raw_parents

key,status,start_time,end_time,dataset_id,dataset_name,direction,hpo_metric,model_nickname,n_classes,n_features,...,kmeans_n_clusters,min_cluster_size,min_samples,n_clusters,n_similarities,parent_run_id,raised_exception,repetitions,sampling_ratio,sc_n_clusters
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0018714386f44f95ab7141d480ce09f0,FINISHED,1737557036103,1.737557e+12,1044,eye_movements,maximize,adjusted_rand,HDBSCAN,,,...,,10,,,,,False,,,
00502ff54da4466191d5fadd2ab0f945,FINISHED,1737556950053,1.737557e+12,7,audiology,maximize,adjusted_rand,HDBSCAN,,,...,,5,,,,,False,,,
00b6f8e09ce1419e84214bb9734a9bff,FINISHED,1737589285408,1.737590e+12,23380,cjs,maximize,adjusted_rand,HDBSCAN,,,...,,3,,,,,False,,,
00b7b74b49c2418e8cd9108326d76f54,FINISHED,1737504019181,1.737512e+12,46335,primary-tumor_clean,maximize,adjusted_rand,RecursiveClustering,,,...,8,,,,,,False,6,,
00be057a1db84ac1b41fd40f1cbc4889,FAILED,1737643267392,1.737643e+12,1567,poker-hand,maximize,adjusted_rand,HDBSCAN,,,...,,,,,,,True,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fdb13063ebc24ee887da97c3d7b67523,FINISHED,1737643758563,1.737646e+12,458,analcatdata_authorship,maximize,adjusted_rand,OPTICS,,,...,,,2,,,,False,,,
fdc6bb9a534743c7be6ce026fd67252a,FINISHED,1737556999587,1.737557e+12,61,iris,maximize,adjusted_rand,DBSCAN,,,...,,,5,,,,False,,,
feca472c0d2149368332abf98bab6bd7,FINISHED,1737516203798,1.737519e+12,42,soybean,maximize,adjusted_rand,AffinityPropagation,,,...,,,,,,,False,,,
ff30261b4f8743808513ede3863765bb,FINISHED,1737651977550,1.737664e+12,377,synthetic_control,maximize,adjusted_rand,WardAgglomerativeClustering,,,...,,,,7,,,False,,,


## Delete duplicate runs (if any) and complete some models that cannot run with some datasets

In [14]:
non_duplicate_columns = [
    'model_nickname',
    'dataset_id',
    # 'model_params/normalization',
    'direction',
    'hpo_metric',
    'standardize',
]
df_runs_parents = df_runs_raw_parents.dropna(axis=0, how='all', subset=['best_adjusted_rand']).copy()
df_runs_parents = df_runs_parents.loc[(~df_runs_parents.duplicated(non_duplicate_columns))]

no_run_time = 4*3600
df_runs_timed_out = df_runs_raw.copy()
df_runs_timed_out = df_runs_timed_out.loc[df_runs_timed_out.EXCEPTION == 'FunctionTimedOut']
df_runs_timed_out = df_runs_timed_out.loc[(~df_runs_timed_out.duplicated(non_duplicate_columns))]
df_runs_timed_out['fit_model_return_elapsed_time'] = no_run_time
df_runs = pd.concat([df_runs, df_runs_timed_out])

status = 'FAILED'
no_run_memory = 2*120000
raised_exception = True
EXCEPTION = 'NoRun'
start_time = time.time() * 1000
end_time = time.time() * 1000

# IRFLLRR do not run with n_samples > 14427
model_nickname = 'IRFLLRR'
n_samples = ['14427', '50000']
n_features = ['100', '347', '1202', '4163', '14427', '50000']
combinations = product(n_samples, n_features)
df_cat = []
indexes = []
for n_sample, n_feature in combinations:
    df = {}
    index = f'{model_nickname}_{n_sample}_{n_feature}'
    df['status'] = status
    df['start_time'] = start_time
    df['end_time'] = end_time
    df['model_nickname'] = model_nickname
    df['n_features'] = n_feature
    df['n_samples'] = n_sample
    df['fit_model_return_elapsed_time'] = no_run_time
    df['max_memory_used'] = no_run_memory
    df['EXCEPTION'] = EXCEPTION
    df['raised_exception'] = raised_exception
    df_cat.append(df)
    indexes.append(index)
df_no_run = pd.DataFrame(df_cat, index=indexes)
df_runs = pd.concat([df_runs, df_no_run])

# KMeansProj do not run with (14427,14427), (14427,50000), (50000,14427), (50000,50000)
model_nickname = 'KMeansProj'
n_samples = ['14427', '50000']
n_features = ['14427', '50000']
combinations = product(n_samples, n_features)
df_cat = []
indexes = []
for n_sample, n_feature in combinations:
    df = {}
    index = f'{model_nickname}_{n_sample}_{n_feature}'
    df['status'] = status
    df['start_time'] = start_time
    df['end_time'] = end_time
    df['model_nickname'] = model_nickname
    df['n_features'] = n_feature
    df['n_samples'] = n_sample
    df['fit_model_return_elapsed_time'] = no_run_time
    df['max_memory_used'] = no_run_memory
    df['EXCEPTION'] = EXCEPTION
    df['raised_exception'] = raised_exception
    df_cat.append(df)
    indexes.append(index)
df_no_run = pd.DataFrame(df_cat, index=indexes)
df_runs = pd.concat([df_runs, df_no_run])

# HDBSCAN do not run with (14427,14427), (14427,50000), and n_samples = 50000 n_features > 347
model_nickname = 'HDBSCAN'
n_samples = ['50000']
n_features = ['347', '1202', '4163', '14427', '50000']
combinations = list(product(n_samples, n_features))
combinations += [('14427', '14427'), ('14427', '50000')]
df_cat = []
indexes = []
for n_sample, n_feature in combinations:
    df = {}
    index = f'{model_nickname}_{n_sample}_{n_feature}'
    df['status'] = status
    df['start_time'] = start_time
    df['end_time'] = end_time
    df['model_nickname'] = model_nickname
    df['n_features'] = n_feature
    df['n_samples'] = n_sample
    df['fit_model_return_elapsed_time'] = no_run_time
    df['max_memory_used'] = no_run_memory
    df['EXCEPTION'] = EXCEPTION
    df['raised_exception'] = raised_exception
    df_cat.append(df)
    indexes.append(index)
df_no_run = pd.DataFrame(df_cat, index=indexes)
df_runs = pd.concat([df_runs, df_no_run])

# SpectralSubspaceRandomization do not run with n_samples > 50000
model_nickname = 'SpectralSubspaceRandomization'
n_samples = ['50000']
n_features = ['100', '347', '1202', '4163', '14427', '50000']
combinations = list(product(n_samples, n_features))
df_cat = []
indexes = []
for n_sample, n_feature in combinations:
    df = {}
    index = f'{model_nickname}_{n_sample}_{n_feature}'
    df['status'] = status
    df['start_time'] = start_time
    df['end_time'] = end_time
    df['model_nickname'] = model_nickname
    df['n_features'] = n_feature
    df['n_samples'] = n_sample
    df['fit_model_return_elapsed_time'] = no_run_time
    df['max_memory_used'] = no_run_memory
    df['EXCEPTION'] = EXCEPTION
    df['raised_exception'] = raised_exception
    df_cat.append(df)
    indexes.append(index)
df_no_run = pd.DataFrame(df_cat, index=indexes)
df_runs = pd.concat([df_runs, df_no_run])

# ensure no duplicates
df_runs = df_runs.loc[(~df_runs.duplicated(non_duplicate_columns))]

# Missing

In [15]:
df_runs_parents['model_nickname'].unique().tolist()

['HDBSCAN',
 'RecursiveClustering',
 'OPTICS',
 'SpectralSubspaceRandomization',
 'DBSCAN',
 'WardAgglomerativeClustering',
 'KMeans',
 'AffinityPropagation']

In [16]:
non_duplicate_columns = [
    'model_nickname',
    'dataset_id',
    # 'model_params/normalization',
    'direction',
    'hpo_metric',
    'standardize',
]

In [17]:
datasets = pd.read_csv('/home/bbelucci/code/recursive_clustering/recursive_clustering/openml_datasets.csv')

In [18]:
dataset_id = [40685, 39, 61, 182, 40984, 1478, 1568]
df = datasets.loc[datasets['dataset_id'].isin(dataset_id)].copy()
df['n_categorical_features'] = df['n_categorical_features'] - 1  # remove the target column
df['n_instances'] = df['n_instances'].astype(int)
df['n_features'] = df['n_features'].astype(int)
df['n_classes'] = df['n_classes'].astype(int)
df['n_categorical_features'] = df['n_categorical_features'].astype(int)
df = df[['dataset_name', 'dataset_id', 'n_instances', 'n_features', 'n_categorical_features', 'n_classes', 'task_id']]
# df = df.rename(columns={'dataset_name': 'Dataset', 'dataset_id': 'OpenML ID', 'n_instances': 'N. Samples', 'n_features': 'N. Features', 'n_classes': 'N. Classes', 'n_categorical_features': 'N. Categorical Features'})
df = df.rename(columns={'dataset_name': 'Dataset', 'dataset_id': 'OpenML ID', 'n_instances': '$\samplesize$', 'n_features': '$\dimsize$', 'n_classes': '$\clusternum$', 'n_categorical_features': '$\categoricalnum$'})

In [19]:
df

Unnamed: 0,Dataset,OpenML ID,$\samplesize$,$\dimsize$,$\categoricalnum$,$\clusternum$,task_id
26,ecoli,39,336,8,0,8,145977
34,har,1478,10299,562,0,6,14970
38,iris,61,150,5,0,3,59
55,nursery,1568,12958,9,8,4,9892
62,satimage,182,6430,37,0,6,2074
63,segment,40984,2310,20,0,7,146822
65,shuttle,40685,58000,10,0,7,146212


In [20]:
print(df.style.hide().to_latex(hrules=True))

\begin{tabular}{lrrrrrr}
\toprule
Dataset & OpenML ID & $\samplesize$ & $\dimsize$ & $\categoricalnum$ & $\clusternum$ & task_id \\
\midrule
ecoli & 39 & 336 & 8 & 0 & 8 & 145977 \\
har & 1478 & 10299 & 562 & 0 & 6 & 14970 \\
iris & 61 & 150 & 5 & 0 & 3 & 59 \\
nursery & 1568 & 12958 & 9 & 8 & 4 & 9892 \\
satimage & 182 & 6430 & 37 & 0 & 6 & 2074 \\
segment & 40984 & 2310 & 20 & 0 & 7 & 146822 \\
shuttle & 40685 & 58000 & 10 & 0 & 7 & 146212 \\
\bottomrule
\end{tabular}



In [21]:
dataset_id = datasets['dataset_id'].tolist()
# too long datasets 42855, 1596, 40927, 40996, 41168, 1483, 1459, 554
dataset_id = [40685, 39, 61, 182, 40984, 1478, 1568]
dataset_id = [str(d) for d in dataset_id if d not in [42855, 1596, 40927, 40996, 41168, 1483, 1459, 554, 1567, 41165, 1509, 40668, 1118]]
direction = ['maximize']
metric = ['adjusted_rand']
standardize = ['True']
model_nickname = [
    'AffinityPropagation',
    'HDBSCAN',
    'WardAgglomerativeClustering',
    'SpectralSubspaceRandomization',
    'RecursiveClustering',
    'KMeans',
    'DBSCAN',
    'OPTICS',
]
columns_names = non_duplicate_columns
should_contain_values = [model_nickname, dataset_id, direction, metric, standardize]
df_missing = get_missing_entries(df_runs_parents, columns_names, should_contain_values)

In [22]:
df_missing

Unnamed: 0,model_nickname,dataset_id,direction,hpo_metric,standardize
0,AffinityPropagation,40685,maximize,adjusted_rand,True
1,SpectralSubspaceRandomization,40685,maximize,adjusted_rand,True
2,SpectralSubspaceRandomization,1568,maximize,adjusted_rand,True


In [23]:
print(*df_missing['dataset_id'].unique().tolist())

40685 1568


In [24]:
df = df_runs_raw_parents.copy()
df = df.loc[df['model_nickname'] == 'DBSCAN']
df = df.loc[df['dataset_id'] == '1459']
df

key,status,start_time,end_time,dataset_id,dataset_name,direction,hpo_metric,model_nickname,n_classes,n_features,...,kmeans_n_clusters,min_cluster_size,min_samples,n_clusters,n_similarities,parent_run_id,raised_exception,repetitions,sampling_ratio,sc_n_clusters
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4dbb7e53d52f46548b57fee10b285c21,FINISHED,1737589030899,1737589000000.0,1459,artificial-characters,maximize,adjusted_rand,DBSCAN,,,...,,,7,,,,False,,,


In [25]:
df = df_runs_parents.loc[df_runs_parents['model_nickname'] == 'DBSCAN']
df = df.loc[df['dataset_id'] == '1118']
df

key,status,start_time,end_time,dataset_id,dataset_name,direction,hpo_metric,model_nickname,n_classes,n_features,...,kmeans_n_clusters,min_cluster_size,min_samples,n_clusters,n_similarities,parent_run_id,raised_exception,repetitions,sampling_ratio,sc_n_clusters
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ea59887cdd6f470189caf89611190b5c,FINISHED,1737589058694,1737590000000.0,1118,chess,maximize,adjusted_rand,DBSCAN,,,...,,,5,,,,False,,,


# Get common combinations

In [84]:
model_nickname = [
    # 'KMeansProj',
    # 'IRFLLRR',
    # 'Clique',
    'HDBSCAN',
    'SpectralSubspaceRandomization',
    'SingleAgglomerativeClustering',
    'SpectralClustering',
    'RecursiveClustering',
    'OPTICS',
    'Proclus',
    'WardAgglomerativeClustering',
    'KMeans',
    'DBSCAN',
    'CompleteAgglomerativeClustering',
    'AverageAgglomerativeClustering',
    'AffinityPropagation'
]
df = df_runs_parents.copy()
df = df.loc[df['model_nickname'].isin(model_nickname)]
column = 'model_nickname'
combination_columns = [
    'n_samples',
    'n_features',
    'pct_random',
    'seed_unified',
    'class_sep',
]
common_combinations = get_common_combinations(df, column, combination_columns)

In [85]:
df_common = get_df_with_combinations(df, combination_columns, common_combinations)

In [86]:
df_common['n_samples'] = df_common['n_samples'].astype(int)
df_common['n_features'] = df_common['n_features'].astype(int)
df_common['pct_random'] = df_common['pct_random'].astype(float)
df_common['class_sep'] = df_common['class_sep'].astype(float)
df_common['seed_unified'] = df_common['seed_unified'].astype(int)
df_common['n_classes'] = df_common['n_classes'].astype(int)

In [87]:
df_common

key,status,start_time,end_time,class_sep,model_nickname,n_classes,n_features,n_informative,n_random,n_samples,...,max_memory_used,mutual_info,n_clusters_,normalized_mutual_info,rand_score,silhouette,EXCEPTION,best_child_run_id,parent_run_id,raised_exception
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0018167e4ff1486a8a909f25fdaa5d76,FINISHED,1734289233443,1.734364e+12,100.0,RecursiveClustering,5,100,2,16,1000,...,2077.516,1.609438,5.0,1.000000,1.000000,0.936638,,0fc46cf1ca434f86841b195a2b92e37d,,False
002193f658094383b0f627c470ba56ae,FINISHED,1734314061235,1.734333e+12,100.0,SpectralSubspaceRandomization,5,100,2,16,100,...,412.472,1.609438,7.0,0.921757,0.960404,0.615130,,bedee9589bea483ca7379588f1db31b8,,False
0023abb5ee2140a7b29b46776f01d1cb,FINISHED,1734334144613,1.734467e+12,10.0,Proclus,5,10000,2,16,1000,...,656.852,0.017358,8.0,0.010126,0.686326,-0.034658,,d75026e4d65a4841b4f3efb2cee88fad,,False
002ff3e2dd904189a04b1568c2a4cfcb,FINISHED,1734288888094,1.734326e+12,10.0,WardAgglomerativeClustering,5,100,2,16,1000,...,2062.552,1.609438,12.0,0.800623,0.895393,0.105404,,7009338b4aac4a1b81815335fffe138d,,False
003d9c7cb27d4db18cfd570de01c35dd,FINISHED,1734292682676,1.734321e+12,100.0,KMeans,5,1000,2,16,1000,...,2069.860,1.609438,5.0,1.000000,1.000000,0.881070,,f4587824429f4f848e6be406124a2de5,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffbdad9588f0455480ca59645773da0f,FINISHED,1734295865271,1.734410e+12,100.0,AverageAgglomerativeClustering,5,1000,2,16,1000,...,2141.648,1.609438,5.0,1.000000,1.000000,0.787634,,5e6823c34f3749a3832cb34c2cde2b52,,False
ffbea3f03bf94cb89131832f6d2df628,FINISHED,1734290981889,1.734316e+12,10.0,KMeans,5,1000,2,16,1000,...,2065.080,0.784140,10.0,0.402668,0.812697,0.064731,,b0dddd5d266048b7baaa2f5b7b7bce5f,,False
ffd07194432a4f399e2677dec7e98d62,FINISHED,1734299196992,1.734435e+12,100.0,SingleAgglomerativeClustering,5,1000,2,16,100,...,2137.760,1.609438,7.0,0.968492,0.988889,0.563874,,9cf711ec5db346c29a64c10a156f825d,,False
ffe8865703464b1d96376e2320ed96bd,FINISHED,1734296998050,1.734414e+12,100.0,AverageAgglomerativeClustering,5,1000,2,16,1000,...,2139.360,1.609438,12.0,0.976104,0.994142,0.358206,,2ce9b9ce81364285a5d1ea2b9bd15162,,False


# Plots

In [26]:
datasets = pd.read_csv('/home/bbelucci/code/recursive_clustering/recursive_clustering/openml_datasets.csv')

In [27]:
df = df_runs_parents.copy()
df['dataset_id'] = df['dataset_id'].astype(int)
df = df.join(datasets.set_index('dataset_id')[['n_instances', 'n_features', 'n_classes']], on='dataset_id', rsuffix='_dataset')

In [28]:
df = df.loc[df['model_nickname'] == 'RecursiveClustering']
df = df.sort_values('best_adjusted_rand', ascending=False)

In [29]:
df[['dataset_id', 'dataset_name', 'n_instances', 'n_features_dataset', 'n_classes_dataset', 'best_adjusted_rand']]

Unnamed: 0_level_0,dataset_id,dataset_name,n_instances,n_features_dataset,n_classes_dataset,best_adjusted_rand
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
da0fe18112cf4676b2d2e58c6fc2ad28,458,analcatdata_authorship,841.0,71.0,4,0.873840
10ce6db39cd94cce8804face72448da4,1466,cardiotocography,2126.0,36.0,10,0.813451
74a2b0d1e1cc414383e6feba490704ca,39,ecoli,336.0,8.0,8,0.758326
698087e04b1b4eb5bf54caf1e4d628f4,377,synthetic_control,600.0,61.0,6,0.667671
98c93373a4224f4cb4315e6359715842,40685,shuttle,58000.0,10.0,7,0.651919
...,...,...,...,...,...,...
437497276aa54cd3b7b4c41b17a03f42,23,cmc,1473.0,10.0,3,0.029548
df7d45c2bdd64ca2853738f487bac5eb,40927,CIFAR_10,60000.0,3073.0,10,0.014253
562841d9791d44a9a7e5422ec3ff2eae,469,analcatdata_dmft,797.0,5.0,6,0.013220
ca88e4c96f414adebcffb0c76a9615e5,41164,fabert,8237.0,801.0,7,0.010549


In [114]:
def get_parameters_string(row):
    parameter_names = {
        'components_size': 'q',
        'repetitions': 'R',
        'kmeans_n_clusters': 'C',
        'n_clusters': 'C',
        'min_cluster_size': 'C_{\\text{min}}',
        'eps': '\epsilon',
        'min_samples': 'n_{\\text{min}}',
        'damping': '\lambda',
        'n_similarities': 'm',
        'sampling_ratio': 'r',
        'sc_n_clusters': 'C',
    }
    first = True
    str = ''
    for p in all_model_parameters:
        if not pd.isna(row[p]):
            if not first:
                str += '; '
            else:
                first = False
            value = float(row[p])
            if value.is_integer():
                value = int(value)
                str += f'${parameter_names[p]}={value}$'
            else:
                str += f'${parameter_names[p]}={value:0.2f}$'
    return str        

In [115]:
def highlight_max(df):
    max_values = df.groupby(level=0).transform('max')
    is_highlighted = df == max_values
    df_css = df.copy().astype(str)
    df_css.loc[:, :] = ''
    df_css[is_highlighted] = 'font-weight: bold'
    return df_css

In [116]:
datasets_ids = [40685, 39, 61, 182, 40984, 1478, 1568]
datasets_ids = [str(d) for d in datasets_ids]
models_nicknames = ['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering']
df = df_runs_parents.copy()
df = df.loc[df['dataset_id'].isin(datasets_ids)]
df = df.loc[df['model_nickname'].isin(models_nicknames)]
df['Parameters'] = df.apply(get_parameters_string, axis=1)
df = df[['model_nickname', 'dataset_name', 'best_adjusted_rand', 'Parameters']]
models_names = {
    'RecursiveClustering': 'CoHiRF',
    'KMeans': 'K-Means',
    'HDBSCAN': 'HDBSCAN',
    'DBSCAN': 'DBSCAN',
    'AffinityPropagation': 'Affinity Propagation',
    'OPTICS': 'OPTICS',
    'SpectralSubspaceRandomization': 'SC-SRGF',
    'WardAgglomerativeClustering': "Ward's Method",
}
parameters = {
    'RecursiveClustering': ['components_size', 'repetitions', 'kmeans_n_clusters'],
    'KMeans': ['n_clusters'],
    'HDBSCAN': ['min_cluster_size'],
    'DBSCAN': ['eps', 'min_samples'],
    'AffinityPropagation': ['damping'],
    'OPTICS': ['min_samples'],
    'SpectralSubspaceRandomization': ['n_similarities', 'sampling_ratio', 'sc_n_clusters'],
    'WardAgglomerativeClustering': ['n_clusters'],
}
df = df.replace({'model_nickname': models_names})
df = df.rename(columns={'best_adjusted_rand': 'ARI', 'model_nickname': 'Model', 'dataset_name': 'Dataset'})
df = df.groupby(['Dataset', 'Model']).agg({'ARI': 'mean', 'Parameters': 'first'})

In [117]:
df

Unnamed: 0_level_0,key,ARI,Parameters
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1
ecoli,Affinity Propagation,0.248155,$\delta=0.58$
ecoli,CoHiRF,0.758326,$C=7$; $R=10$; $q=11$
ecoli,DBSCAN,0.345131,$n_{min}=7$; $\epsilon=0.78$
ecoli,HDBSCAN,0.397572,$C_{min}=10$
ecoli,K-Means,0.718701,$C=6$
ecoli,OPTICS,0.313694,$n_{min}=10$
ecoli,SC-SRGF,0.723193,$\rho=0.80$; $n_{sim}=15$; $C=4$
ecoli,Ward's Method,0.734778,$C=7$
har,Affinity Propagation,0.312771,$\delta=1.00$
har,CoHiRF,0.491249,$C=4$; $R=8$; $q=11$


In [118]:
print(df.style.apply(highlight_max, subset='ARI', axis=None).format(precision=3, na_rep='No Run').to_latex(hrules=True, clines='skip-last;data', convert_css=True, column_format='p{1cm}lp{0.5cm}l'))

\begin{tabular}{p{1cm}lp{0.5cm}l}
\toprule
 & key & ARI & Parameters \\
Dataset & Model &  &  \\
\midrule
\multirow[c]{8}{*}{ecoli} & Affinity Propagation & 0.248 & $\delta=0.58$ \\
 & CoHiRF & \bfseries 0.758 & $C=7$; $R=10$; $q=11$ \\
 & DBSCAN & 0.345 & $n_{min}=7$; $\epsilon=0.78$ \\
 & HDBSCAN & 0.398 & $C_{min}=10$ \\
 & K-Means & 0.719 & $C=6$ \\
 & OPTICS & 0.314 & $n_{min}=10$ \\
 & SC-SRGF & 0.723 & $\rho=0.80$; $n_{sim}=15$; $C=4$ \\
 & Ward's Method & 0.735 & $C=7$ \\
\cline{1-4}
\multirow[c]{8}{*}{har} & Affinity Propagation & 0.313 & $\delta=1.00$ \\
 & CoHiRF & 0.491 & $C=4$; $R=8$; $q=11$ \\
 & DBSCAN & 0.302 & $n_{min}=3$; $\epsilon=13.91$ \\
 & HDBSCAN & 0.287 & $C_{min}=6$ \\
 & K-Means & 0.438 & $C=9$ \\
 & OPTICS & 0.001 & $n_{min}=4$ \\
 & SC-SRGF & \bfseries 0.546 & $\rho=0.45$; $n_{sim}=21$; $C=20$ \\
 & Ward's Method & 0.511 & $C=4$ \\
\cline{1-4}
\multirow[c]{8}{*}{iris} & Affinity Propagation & 0.477 & $\delta=0.98$ \\
 & CoHiRF & 0.631 & $C=4$; $R=7$; $q=26$

In [80]:
df

key,status,start_time,end_time,dataset_id,dataset_name,direction,hpo_metric,model_nickname,n_classes,n_features,...,min_cluster_size,min_samples,n_clusters,n_similarities,parent_run_id,raised_exception,repetitions,sampling_ratio,sc_n_clusters,parameters
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0d3f0312fdfb4b4eb909bcaf25b1703e,FINISHED,1737516206511,1737518000000.0,40984,segment,maximize,adjusted_rand,AffinityPropagation,,,...,,,,,,False,,,,damping: 0.9294026686605588
10ac2ec99d13499a8b4d80afbbbe05a3,FINISHED,1737568999692,1737569000000.0,1568,nursery,maximize,adjusted_rand,WardAgglomerativeClustering,,,...,,,3.0,,,False,,,,n_clusters: 3
1577a524c14344f7a86903084f02949e,FINISHED,1737504126260,1737504000000.0,40984,segment,maximize,adjusted_rand,KMeans,,,...,,,10.0,,,False,,,,n_clusters: 10
1c48c1907f1447e89c724333a731af26,FINISHED,1737504067203,1737515000000.0,1478,har,maximize,adjusted_rand,RecursiveClustering,,,...,,,,,,False,8.0,,,"components_size: 11, kmeans_n_clusters: 4, rep..."
1c8422d391974a038aaea93cffd7c20f,FINISHED,1737569000346,1737569000000.0,40984,segment,maximize,adjusted_rand,WardAgglomerativeClustering,,,...,,,8.0,,,False,,,,n_clusters: 8
27b16ca431d14bf295265e04a20572f0,FINISHED,1737557104228,1737588000000.0,40984,segment,maximize,adjusted_rand,SpectralSubspaceRandomization,,,...,,,,23.0,,False,,0.6533337717772364,14.0,"n_similarities: 23, sampling_ratio: 0.65333377..."
294b4441caa04a7587a00051e738b19a,FINISHED,1737557013902,1737557000000.0,40984,segment,maximize,adjusted_rand,DBSCAN,,,...,,7.0,,,,False,,,,"eps: 0.5374182452826163, min_samples: 7"
2fcb935e2cb0474dafc839d7069548f9,FINISHED,1737556959605,1737558000000.0,182,satimage,maximize,adjusted_rand,HDBSCAN,,,...,6.0,,,,,False,,,,min_cluster_size: 6
39be46ef4bcc4230b354dd8e14c8bffb,FINISHED,1737675145917,1737676000000.0,40685,shuttle,maximize,adjusted_rand,HDBSCAN,,,...,2.0,,,,,False,,,,min_cluster_size: 2
3a05d05ac49d44e98c3c754f8133e5f6,FINISHED,1737617061584,1737619000000.0,40984,segment,maximize,adjusted_rand,OPTICS,,,...,,10.0,,,,False,,,,min_samples: 10


In [50]:
df

key,status,start_time,end_time,dataset_id,dataset_name,direction,hpo_metric,model_nickname,n_classes,n_features,...,min_cluster_size,min_samples,n_clusters,n_similarities,parent_run_id,raised_exception,repetitions,sampling_ratio,sc_n_clusters,parameters
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0d3f0312fdfb4b4eb909bcaf25b1703e,FINISHED,1737516206511,1737518000000.0,40984,segment,maximize,adjusted_rand,AffinityPropagation,,,...,,,,,,False,,,,
10ac2ec99d13499a8b4d80afbbbe05a3,FINISHED,1737568999692,1737569000000.0,1568,nursery,maximize,adjusted_rand,WardAgglomerativeClustering,,,...,,,3.0,,,False,,,,
1577a524c14344f7a86903084f02949e,FINISHED,1737504126260,1737504000000.0,40984,segment,maximize,adjusted_rand,KMeans,,,...,,,10.0,,,False,,,,
1c48c1907f1447e89c724333a731af26,FINISHED,1737504067203,1737515000000.0,1478,har,maximize,adjusted_rand,RecursiveClustering,,,...,,,,,,False,8.0,,,
1c8422d391974a038aaea93cffd7c20f,FINISHED,1737569000346,1737569000000.0,40984,segment,maximize,adjusted_rand,WardAgglomerativeClustering,,,...,,,8.0,,,False,,,,
27b16ca431d14bf295265e04a20572f0,FINISHED,1737557104228,1737588000000.0,40984,segment,maximize,adjusted_rand,SpectralSubspaceRandomization,,,...,,,,23.0,,False,,0.6533337717772364,14.0,
294b4441caa04a7587a00051e738b19a,FINISHED,1737557013902,1737557000000.0,40984,segment,maximize,adjusted_rand,DBSCAN,,,...,,7.0,,,,False,,,,
2fcb935e2cb0474dafc839d7069548f9,FINISHED,1737556959605,1737558000000.0,182,satimage,maximize,adjusted_rand,HDBSCAN,,,...,6.0,,,,,False,,,,
39be46ef4bcc4230b354dd8e14c8bffb,FINISHED,1737675145917,1737676000000.0,40685,shuttle,maximize,adjusted_rand,HDBSCAN,,,...,2.0,,,,,False,,,,
3a05d05ac49d44e98c3c754f8133e5f6,FINISHED,1737617061584,1737619000000.0,40984,segment,maximize,adjusted_rand,OPTICS,,,...,,10.0,,,,False,,,,


In [29]:
# parameters of each model:
parameters = {
    'RecursiveClustering': ['components_size', 'repetitions', 'kmeans_n_clusters'],
    'KMeans': ['n_clusters'],
    'HDBSCAN': ['min_cluster_size'],
    'DBSCAN': ['eps', 'min_samples'],
    'AffinityPropagation': ['damping'],
    'OPTICS': ['min_samples'],
    'SpectralSubspaceRandomization': ['n_similarities', 'sampling_ratio', 'sc_n_clusters'],
    'WardAgglomerativeClustering': ['n_clusters'],
}

key,status,start_time,end_time,dataset_id,dataset_name,direction,hpo_metric,model_nickname,n_classes,n_features,...,mutual_info,n_clusters_,normalized_mutual_info,rand_score,silhouette,v_measure,EXCEPTION,best_child_run_id,parent_run_id,raised_exception
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0d3f0312fdfb4b4eb909bcaf25b1703e,FINISHED,1737516206511,1737518000000.0,40984,segment,maximize,adjusted_rand,AffinityPropagation,,,...,0.030963,9.0,0.028502,0.206498,0.371367,0.028502,,0d7f660a5a6145a0a35ff6d18a47cae3,,False
10ac2ec99d13499a8b4d80afbbbe05a3,FINISHED,1737568999692,1737569000000.0,1568,nursery,maximize,adjusted_rand,WardAgglomerativeClustering,,,...,0.412775,14.0,0.218916,0.695642,0.01422,0.218916,,424acca7d3f24ecd86ec6594ea25151b,,False
1577a524c14344f7a86903084f02949e,FINISHED,1737504126260,1737504000000.0,40984,segment,maximize,adjusted_rand,KMeans,,,...,1.179973,14.0,0.564665,0.861749,0.39477,0.564665,,a1f609411ea347128e103bddba078b07,,False
1c48c1907f1447e89c724333a731af26,FINISHED,1737504067203,1737515000000.0,1478,har,maximize,adjusted_rand,RecursiveClustering,,,...,0.775495,14.0,0.449194,0.772594,-0.018323,0.449194,,de36a2227c364abfa8d85049c6e3d264,,False
1c8422d391974a038aaea93cffd7c20f,FINISHED,1737569000346,1737569000000.0,40984,segment,maximize,adjusted_rand,WardAgglomerativeClustering,,,...,1.117544,14.0,0.541587,0.858334,0.431105,0.541587,,de1a40e3265c4dd29984c438e265d288,,False
27b16ca431d14bf295265e04a20572f0,FINISHED,1737557104228,1737588000000.0,40984,segment,maximize,adjusted_rand,SpectralSubspaceRandomization,,,...,1.461261,17.0,0.649277,0.89134,0.198541,0.649277,,9443a3e882944fc79af9afdc12702716,,False
294b4441caa04a7587a00051e738b19a,FINISHED,1737557013902,1737557000000.0,40984,segment,maximize,adjusted_rand,DBSCAN,,,...,1.130986,41.0,0.517286,0.805215,-0.060018,0.517286,,b85977a952834338891e067174ce5576,,False
2fcb935e2cb0474dafc839d7069548f9,FINISHED,1737556959605,1737558000000.0,182,satimage,maximize,adjusted_rand,HDBSCAN,,,...,0.292927,5.0,0.254284,0.437044,0.035195,0.254284,,34778f18c700414da3f4be2f0ddfc780,,False
39be46ef4bcc4230b354dd8e14c8bffb,FINISHED,1737675145917,1737676000000.0,40685,shuttle,maximize,adjusted_rand,HDBSCAN,,,...,0.55715,1825.0,0.152159,0.36106,0.045138,0.152159,,5253a2ea464d433fb53b9e10ef74f257,,False
3a05d05ac49d44e98c3c754f8133e5f6,FINISHED,1737617061584,1737619000000.0,40984,segment,maximize,adjusted_rand,OPTICS,,,...,0.524235,52.0,0.283064,0.515303,-0.349834,0.283064,,aa1d7221ef6b42748ee3e39af4e7ee93,,False


In [27]:
df

Dataset,ecoli,har,iris,nursery,satimage,segment,shuttle
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AffinityPropagation,0.248155,,0.477171,,,0.279673,
DBSCAN,0.345131,0.302146,0.558371,0.0,0.296792,0.250797,0.685962
HDBSCAN,0.397572,0.286958,0.563751,0.014181,0.308123,0.390071,0.001018
KMeans,0.718701,0.43831,0.592333,0.150396,0.566324,0.512024,0.608404
OPTICS,0.313694,0.000694,0.396136,0.0,0.022515,0.09727,0.016784
RecursiveClustering,0.758326,0.491249,0.631048,0.440254,0.583496,0.539809,0.651919
SpectralSubspaceRandomization,,,,,,0.481409,
WardAgglomerativeClustering,0.734778,0.510766,0.615323,0.253798,0.484501,0.446081,0.478068


In [52]:
df = df_runs_raw_parents.copy()
df = df.loc[df['model_nickname'] == 'RecursiveClustering']
df = df.loc[df['metric'] == 'adjusted_rand']
df = df.loc[(df['model_params/normalization'] == '1') | (df['standardize'] == 'True')]
df = df[['model_nickname', 'dataset_name', 'dataset_id', 'model_params/normalization', 'standardize', 'best_adjusted_rand']]

In [56]:
df = df.loc[df.duplicated(subset='dataset_id', keep=False)]
df = df.sort_values(['dataset_id', 'best_adjusted_rand'])

In [57]:
df

key,model_nickname,dataset_name,dataset_id,model_params/normalization,standardize,best_adjusted_rand
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
864ffa472636448f8c818a03dac0e102,RecursiveClustering,mfeat-factors,12,1.0,False,0.512992
48bf194dc53e40e39258a1b6ad6ef4fa,RecursiveClustering,mfeat-factors,12,,True,0.541314
3636b6d774d3490eb5564cc679270785,RecursiveClustering,mfeat-morphological,18,1.0,False,0.56285
37fc83ca94d644208788c721ad681af7,RecursiveClustering,mfeat-morphological,18,,True,0.573092
f0e8851a821345bdab9b6527e0d3cc36,RecursiveClustering,satimage,182,1.0,False,0.55699
50f3fd8149324f6da7314c1d27cc06cf,RecursiveClustering,satimage,182,,True,0.583496
4b37b3fcbfb9414ab3b59a49e1696de2,RecursiveClustering,page-blocks,30,,True,0.572026
eec8aab4384b4a6f8e0aea1e2af25d2e,RecursiveClustering,page-blocks,30,1.0,False,0.603016
d1bb6eb83a6943ea8be2497c99fba63c,RecursiveClustering,pendigits,32,1.0,False,0.615024
00938023ec844e0eb4bb73a6d161ff3b,RecursiveClustering,pendigits,32,,True,0.646296


In [75]:
df = df_metrics.copy()
df = df.loc[df.index.get_level_values('run_uuid') == 'eec8aab4384b4a6f8e0aea1e2af25d2e']

In [80]:
x_metric = 'silhouette'
df = df.sort_values(x_metric)
px.scatter(df, x=x_metric, y='adjusted_rand')

In [109]:
df = df_common.copy()
df = df.loc[df['n_samples'] == 1000]
df = df.loc[df['n_features'] == 1000]
df = df.loc[df['pct_random'] == 0.5]
df = df.loc[df['class_sep'] == 100.0]
df = df.sort_values('model_nickname')
fig = px.box(df, x='model_nickname', y='best_adjusted_rand', color='model_nickname')
fig.show()

In [169]:
df = df_common.copy()
df = df.loc[df['n_samples'] == 1000]
df = df.loc[df['n_features'] == 1000]
# df = df.loc[df['pct_random'] == 0.7]
df = df.loc[df['class_sep'] == 100.0]
df = df.loc[df['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
df = df.sort_values('model_nickname')
df = df.rename(columns={'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'best_adjusted_rand': 'Adjusted Rand', 'best_n_clusters_': 'Number of Clusters Found', 'n_classes': 'True Number of Clusters', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'seed_unified': 'Seed'})
df = df[['Model', 'Adjusted Rand', 'Time (s)', 'Memory (MB)', 'Number of samples', 'Number of features', 'Class Separation', '% Random Features', 'Seed', 'Number of Clusters Found', 'True Number of Clusters']]
df = df.groupby('Model').agg({'Adjusted Rand': ['mean', 'std'], 'Number of Clusters Found': ['median', ('Q25', lambda x: x.quantile(0.25)), ('Q75', lambda x: x.quantile(0.75))], 'Time (s)': ['mean', 'std'], 'Memory (MB)': ['mean', 'std'], 'Number of samples': ['mean', 'nunique'], 'Number of features': ['mean', 'nunique'], 'Class Separation': ['mean', 'nunique'], '% Random Features': ['mean', 'nunique'], 'True Number of Clusters': ['mean', 'nunique'], 'Seed': ['mean', 'nunique']})
df.to_csv('simulated_data_easy.csv')
df

key,Adjusted Rand,Adjusted Rand,Number of Clusters Found,Number of Clusters Found,Number of Clusters Found,Time (s),Time (s),Memory (MB),Memory (MB),Number of samples,...,Number of features,Number of features,Class Separation,Class Separation,% Random Features,% Random Features,True Number of Clusters,True Number of Clusters,Seed,Seed
Unnamed: 0_level_1,mean,std,median,Q25,Q75,mean,std,mean,std,mean,...,mean,nunique,mean,nunique,mean,nunique,mean,nunique,mean,nunique
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AffinityPropagation,1.0,0.0,5.0,5.0,5.0,537.226421,39.111352,2059.4429,1.779558,1000.0,...,1000.0,1,100.0,1,0.35,4,5.0,1,4.5,10
DBSCAN,0.0,0.0,1.0,1.0,1.0,495.724737,24.503844,2034.5569,1.282687,1000.0,...,1000.0,1,100.0,1,0.35,4,5.0,1,4.5,10
HDBSCAN,1.0,0.0,5.0,5.0,5.0,156.379071,16.678345,1951.1594,228.016285,1000.0,...,1000.0,1,100.0,1,0.35,4,5.0,1,4.5,10
KMeans,1.0,0.0,5.0,5.0,5.0,455.211977,39.756185,2024.6738,151.542389,1000.0,...,1000.0,1,100.0,1,0.35,4,5.0,1,4.5,10
OPTICS,1.0,0.0,5.0,5.0,5.0,186.481377,12.323363,2067.3815,0.839997,1000.0,...,1000.0,1,100.0,1,0.35,4,5.0,1,4.5,10
RecursiveClustering,1.0,0.0,5.0,5.0,5.0,2264.785293,217.413369,2080.029,2.354198,1000.0,...,1000.0,1,100.0,1,0.35,4,5.0,1,4.5,10
SpectralSubspaceRandomization,1.0,0.0,5.0,5.0,5.0,3496.919557,733.693875,1108.6247,27.026635,1000.0,...,1000.0,1,100.0,1,0.35,4,5.0,1,4.5,10
WardAgglomerativeClustering,1.0,0.0,5.0,5.0,5.0,267.337123,30.720733,2059.133,1.695819,1000.0,...,1000.0,1,100.0,1,0.35,4,5.0,1,4.5,10


In [108]:
df = df_common.copy()
df = df.loc[df['n_samples'] == 1000]
df = df.loc[df['n_features'] == 1000]
df = df.loc[df['pct_random'] == 0.5]
df = df.loc[df['class_sep'] == 10.0]
df = df.loc[df['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
df = df.sort_values('model_nickname')
df = df.rename(columns={'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'best_adjusted_rand': 'Adjusted Rand', 'best_n_clusters_': 'Number of Clusters Found', 'n_classes': 'True Number of Clusters', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'seed_unified': 'Seed'})
df = df[['Model', 'Adjusted Rand', 'Time (s)', 'Memory (MB)', 'Number of samples', 'Number of features', 'Class Separation', '% Random Features', 'Seed', 'Number of Clusters Found', 'True Number of Clusters']]
df = df.groupby('Model').agg({'Adjusted Rand': ['mean', 'std'], 'Number of Clusters Found': ['median', ('Q25', lambda x: x.quantile(0.25)), ('Q75', lambda x: x.quantile(0.75))], 'Time (s)': ['mean', 'std'], 'Memory (MB)': ['mean', 'std'], 'Number of samples': ['mean', 'nunique'], 'Number of features': ['mean', 'nunique'], 'Class Separation': ['mean', 'nunique'], '% Random Features': ['mean', 'nunique'], 'True Number of Clusters': ['mean', 'nunique'], 'Seed': ['mean', 'nunique']})
df.to_csv('simulated_data_hard.csv')
df

key,Adjusted Rand,Adjusted Rand,Number of Clusters Found,Number of Clusters Found,Number of Clusters Found,Time (s),Time (s),Memory (MB),Memory (MB),Number of samples,...,Number of features,Number of features,Class Separation,Class Separation,% Random Features,% Random Features,True Number of Clusters,True Number of Clusters,Seed,Seed
Unnamed: 0_level_1,mean,std,median,Q25,Q75,mean,std,mean,std,mean,...,mean,nunique,mean,nunique,mean,nunique,mean,nunique,mean,nunique
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AffinityPropagation,0.189093,0.017298,21.5,2.0,42.0,529.730182,10.693666,2059.49,2.024502,1000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,4.5,10
DBSCAN,0.0,0.0,1.0,1.0,1.0,478.425069,8.144741,2034.9944,1.190787,1000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,4.5,10
HDBSCAN,0.993713,0.003036,6.0,6.0,6.0,157.498777,5.076115,2008.3772,178.319054,1000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,4.5,10
KMeans,0.200988,0.060978,2.0,2.0,2.0,421.863799,23.231615,2065.8152,4.504178,1000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,4.5,10
OPTICS,0.399327,0.344323,2.5,2.0,4.5,193.6702,4.669891,2067.3468,1.081793,1000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,4.5,10
RecursiveClustering,0.433655,0.230195,4.0,3.0,4.75,2390.756949,168.434082,2078.184,2.627123,1000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,4.5,10
SpectralSubspaceRandomization,0.915287,0.202475,5.0,5.0,5.0,3650.157271,473.980725,1103.11,4.184572,1000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,4.5,10
WardAgglomerativeClustering,0.494884,0.296377,2.0,2.0,4.25,250.402186,13.8455,2058.1964,2.105722,1000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,4.5,10


# With filling values

In [132]:
non_duplicate_columns = [
    'model_nickname',
    'n_samples',
    'n_features',
    'pct_random',
    'seed_unified',
    'class_sep',
    'n_random',
    'n_informative',
    'n_classes'
]
df_runs_parents_fill = df_runs_raw_parents.dropna(axis=0, how='all', subset=['best_adjusted_rand']).copy()
df_runs_parents_fill = df_runs_parents_fill.loc[(~df_runs_parents_fill.duplicated(non_duplicate_columns))]

In [133]:
# for the moment, I did not manage to run HDBSCAN and SpectralSubspaceRandomization with samples >= 10000, so we will fill these values for the moment as "no run" and we will see later
status = 'FAILED'
no_run_time = np.inf
no_run_memory = np.inf
no_run_metric = np.nan
raised_exception = True
EXCEPTION = 'NoRun'
start_time = time.time() * 1000
end_time = time.time() * 1000

In [134]:
model_nickname = 'HDBSCAN'
n_samples = ['10000']
n_features = ['100', '1000', '10000']
pct_random = ['0.0', '0.2', '0.5', '0.7']
seed_unified = [f'{i}' for i in range(10)]
class_sep = ['10.0', '100.0', '50.0']
n_classes = 5
combinations = list(product(n_samples, n_features, pct_random, seed_unified, class_sep))
df_cat = []
indexes = []
for n_sample, n_feature, pct_r, seed, class_s in combinations:
    df = {}
    index = f'{model_nickname}_{n_sample}_{n_feature}'
    df['status'] = status
    df['start_time'] = start_time
    df['end_time'] = end_time
    df['model_nickname'] = model_nickname
    df['n_classes'] = n_classes
    df['n_features'] = n_feature
    df['n_samples'] = n_sample
    df['pct_random'] = pct_r
    df['seed_unified'] = seed
    df['class_sep'] = class_s
    df['fit_model_return_elapsed_time'] = no_run_time
    df['max_memory_used'] = no_run_memory
    df['best_adjusted_rand'] = no_run_metric
    df['best_n_clusters_'] = no_run_metric
    df['EXCEPTION'] = EXCEPTION
    df['raised_exception'] = raised_exception
    df_cat.append(df)
    indexes.append(index)
df_no_run = pd.DataFrame(df_cat, index=indexes)
df_runs_parents_fill = pd.concat([df_runs_parents_fill, df_no_run])

In [135]:
model_nickname = 'SpectralSubspaceRandomization'
n_samples = ['10000']
n_features = ['100', '1000', '10000']
pct_random = ['0.0', '0.2', '0.5', '0.7']
seed_unified = [f'{i}' for i in range(10)]
class_sep = ['10.0', '100.0', '50.0']
n_classes = 5
combinations = list(product(n_samples, n_features, pct_random, seed_unified, class_sep))
df_cat = []
indexes = []
for n_sample, n_feature, pct_r, seed, class_s in combinations:
    df = {}
    index = f'{model_nickname}_{n_sample}_{n_feature}'
    df['status'] = status
    df['start_time'] = start_time
    df['end_time'] = end_time
    df['model_nickname'] = model_nickname
    df['n_classes'] = n_classes
    df['n_features'] = n_feature
    df['n_samples'] = n_sample
    df['pct_random'] = pct_r
    df['seed_unified'] = seed
    df['class_sep'] = class_s
    df['fit_model_return_elapsed_time'] = no_run_time
    df['max_memory_used'] = no_run_memory
    df['best_adjusted_rand'] = no_run_metric
    df['best_n_clusters_'] = no_run_metric
    df['EXCEPTION'] = EXCEPTION
    df['raised_exception'] = raised_exception
    df_cat.append(df)
    indexes.append(index)
df_no_run = pd.DataFrame(df_cat, index=indexes)
df_runs_parents_fill = pd.concat([df_runs_parents_fill, df_no_run])

In [136]:
# ensure no duplicates
df_runs_parents_fill = df_runs_parents_fill.loc[(~df_runs_parents_fill.duplicated(non_duplicate_columns))]

# Missing

In [137]:
df_runs_parents_fill['model_nickname'].unique().tolist()

['RecursiveClustering',
 'AverageAgglomerativeClustering',
 'KMeansProj',
 'HDBSCAN',
 'OPTICS',
 'SpectralSubspaceRandomization',
 'Proclus',
 'DBSCAN',
 'WardAgglomerativeClustering',
 'KMeans',
 'SingleAgglomerativeClustering',
 'SpectralClustering',
 'CompleteAgglomerativeClustering',
 'IRFLLRR',
 'Clique',
 'AffinityPropagation']

In [138]:
non_duplicate_columns = [
    'model_nickname',
    'n_samples',
    'n_features',
    'pct_random',
    'seed_unified',
    'class_sep',
]

In [139]:
df_runs_parents_fill

Unnamed: 0,status,start_time,end_time,class_sep,model_nickname,n_classes,n_features,n_informative,n_random,n_samples,...,max_memory_used,mutual_info,n_clusters_,normalized_mutual_info,rand_score,silhouette,EXCEPTION,best_child_run_id,parent_run_id,raised_exception
000b291a142341289479933ddb447488,FINISHED,1.736872e+12,1.736877e+12,50.0,RecursiveClustering,5,10000,2,16,1000,...,1156.568,1.609438,577.0,0.425927,0.806024,-0.104928,,e62023cb27d941dd8b9683eaef947ccc,,False
000f29a52641434a9a03ec3951ba12ba,FINISHED,1.736890e+12,1.736911e+12,50.0,AverageAgglomerativeClustering,5,10000,2,16,10000,...,1941.040,1.608578,8.0,0.997419,0.999561,0.283838,,8e0a1045c9474ca5925d553aa6030f20,,False
0015c6675ce048ec99ded8242b2df23e,FINISHED,1.734337e+12,1.734515e+12,100.0,KMeansProj,5,100,2,16,1000,...,473.880,1.079833,12.0,0.539120,0.835267,-0.002984,,d4a718cc5d92496ba63ebf72357e77db,,False
0018167e4ff1486a8a909f25fdaa5d76,FINISHED,1.734289e+12,1.734364e+12,100.0,RecursiveClustering,5,100,2,16,1000,...,2077.516,1.609438,5.0,1.000000,1.000000,0.936638,,0fc46cf1ca434f86841b195a2b92e37d,,False
00187fd8f9df4282ba536e067e9b6881,FINISHED,1.736687e+12,1.736691e+12,100.0,HDBSCAN,5,100,2,16,10000,...,1212.672,1.609438,5.0,1.000000,1.000000,0.939268,,c25dc85141e84c21b389b9e9cfced842,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SpectralSubspaceRandomization_10000_10000,FAILED,1.736947e+12,1.736947e+12,100.0,SpectralSubspaceRandomization,5,10000,,,10000,...,inf,,,,,,NoRun,,,True
SpectralSubspaceRandomization_10000_10000,FAILED,1.736947e+12,1.736947e+12,50.0,SpectralSubspaceRandomization,5,10000,,,10000,...,inf,,,,,,NoRun,,,True
SpectralSubspaceRandomization_10000_10000,FAILED,1.736947e+12,1.736947e+12,10.0,SpectralSubspaceRandomization,5,10000,,,10000,...,inf,,,,,,NoRun,,,True
SpectralSubspaceRandomization_10000_10000,FAILED,1.736947e+12,1.736947e+12,100.0,SpectralSubspaceRandomization,5,10000,,,10000,...,inf,,,,,,NoRun,,,True


In [144]:
n_samples = ['100', '1000', '10000']
n_features = ['100', '1000', '10000']
pct_random = ['0.0', '0.2', '0.5', '0.7']
seed_unified = [f'{i}' for i in range(1)]
class_sep = ['10.0', '100.0']
model_nickname = [
    # 'KMeansProj',
    # 'IRFLLRR',
    # 'Clique',
    'HDBSCAN',
    'SpectralSubspaceRandomization',
    'SingleAgglomerativeClustering',
    'SpectralClustering',
    'RecursiveClustering',
    'OPTICS',
    'Proclus',
    'WardAgglomerativeClustering',
    'KMeans',
    'DBSCAN',
    'CompleteAgglomerativeClustering',
    'AverageAgglomerativeClustering',
    'AffinityPropagation'
]
columns_names = non_duplicate_columns
should_contain_values = [model_nickname, n_samples, n_features, pct_random, seed_unified, class_sep]
df_missing = get_missing_entries(df_runs_parents_fill, columns_names, should_contain_values)

In [145]:
df_missing

Unnamed: 0,model_nickname,n_samples,n_features,pct_random,seed_unified,class_sep
0,SingleAgglomerativeClustering,10000,10000,0.0,0,10.0
1,SingleAgglomerativeClustering,10000,10000,0.0,0,100.0
2,SingleAgglomerativeClustering,10000,10000,0.2,0,10.0
3,SingleAgglomerativeClustering,10000,10000,0.2,0,100.0
4,SingleAgglomerativeClustering,10000,10000,0.5,0,10.0
5,SingleAgglomerativeClustering,10000,10000,0.5,0,100.0
6,SingleAgglomerativeClustering,10000,10000,0.7,0,10.0
7,SingleAgglomerativeClustering,10000,10000,0.7,0,100.0
8,SpectralClustering,10000,100,0.5,0,10.0
9,SpectralClustering,10000,100,0.5,0,100.0


# Get common combinations

In [146]:
model_nickname = [
    # 'KMeansProj',
    # 'IRFLLRR',
    # 'Clique',
    'HDBSCAN',
    'SpectralSubspaceRandomization',
    # 'SingleAgglomerativeClustering',
    # 'SpectralClustering',
    'RecursiveClustering',
    'OPTICS',
    'Proclus',
    'WardAgglomerativeClustering',
    'KMeans',
    'DBSCAN',
    'CompleteAgglomerativeClustering',
    'AverageAgglomerativeClustering',
    'AffinityPropagation'
]
df = df_runs_parents_fill.copy()
df = df.loc[df['model_nickname'].isin(model_nickname)]
column = 'model_nickname'
combination_columns = [
    'n_samples',
    'n_features',
    'pct_random',
    'seed_unified',
    'class_sep',
]
common_combinations = get_common_combinations(df, column, combination_columns)

In [147]:
df_common_fill = get_df_with_combinations(df, combination_columns, common_combinations)

In [148]:
df_common_fill['n_samples'] = df_common_fill['n_samples'].astype(int)
df_common_fill['n_features'] = df_common_fill['n_features'].astype(int)
df_common_fill['pct_random'] = df_common_fill['pct_random'].astype(float)
df_common_fill['class_sep'] = df_common_fill['class_sep'].astype(float)
df_common_fill['seed_unified'] = df_common_fill['seed_unified'].astype(int)
df_common_fill['n_classes'] = df_common_fill['n_classes'].astype(int)

## Plot

In [154]:
df

Unnamed: 0,status,start_time,end_time,class_sep,model_nickname,n_classes,n_features,n_informative,n_random,n_samples,...,max_memory_used,mutual_info,n_clusters_,normalized_mutual_info,rand_score,silhouette,EXCEPTION,best_child_run_id,parent_run_id,raised_exception
9b5af53d2adb4e66b6c299cc9dd03f84,FINISHED,1736864000000.0,1736871000000.0,100.0,AffinityPropagation,5,1000,2.0,16.0,10000,...,4457.724,1.609438,18.0,0.719116,0.858192,0.084554,,8b29db04c53947fb8a6298af4012597f,,False
6e13b2c209784d15ad55255aff27e0fd,FINISHED,1736864000000.0,1736868000000.0,100.0,AverageAgglomerativeClustering,5,1000,2.0,16.0,10000,...,1231.704,1.609438,14.0,0.963381,0.989579,0.261144,,2a961b3635994d84a2494e22d57f7e76,,False
6fc65459261041fda9074a00019fee06,FINISHED,1736688000000.0,1736808000000.0,100.0,CompleteAgglomerativeClustering,5,1000,2.0,16.0,10000,...,3575.068,1.609438,14.0,0.779245,0.88911,0.079613,,61cc5cae6000412bbb61cdaf2bf7fec7,,False
37261cab80634ea7b30a76a55b63d541,FINISHED,1736686000000.0,1736694000000.0,100.0,DBSCAN,5,1000,2.0,16.0,10000,...,1918.316,0.0,1.0,0.0,0.19992,-1.0,,f93e17338fdb4361a1ab9ca7acb61d9f,,False
a88089c38f8548eab4a4f5a687b268c5,FINISHED,1736687000000.0,1736701000000.0,100.0,HDBSCAN,5,1000,2.0,16.0,10000,...,1301.976,1.609438,5.0,1.0,1.0,0.839192,,d07a9c27fc094cd0b10e2291625f6a04,,False
HDBSCAN_10000_1000,FAILED,1736947000000.0,1736947000000.0,100.0,HDBSCAN,5,1000,,,10000,...,inf,,,,,,NoRun,,,True
b5e3972f272d47199e54bb7d9a1ae019,FINISHED,1736866000000.0,1736866000000.0,100.0,KMeans,5,1000,2.0,16.0,10000,...,1190.328,1.609438,14.0,0.760001,0.873464,0.11079,,339825b5dad64b748b8ba9fec4b5dc3d,,False
4d07fcc980ec4f939220cb15718a0d46,FINISHED,1736707000000.0,1736746000000.0,100.0,OPTICS,5,1000,2.0,16.0,10000,...,1287.588,1.609438,5.0,1.0,1.0,0.839192,,482d10afeb4e44a2b3a78a35966e8f00,,False
ffaaa267a0634c8786d45d623e7a5a6a,FINISHED,1736708000000.0,1736767000000.0,100.0,Proclus,5,1000,2.0,16.0,10000,...,1697.296,0.000755,4.0,0.000523,0.617884,-0.007934,,489a115c31194f1f878d8961d7888b3a,,False
9bc581d07a254b95a7c77a44afee74a2,FINISHED,1736791000000.0,1736824000000.0,100.0,RecursiveClustering,5,1000,2.0,16.0,10000,...,2186.084,1.609438,7.0,0.999242,0.99988,0.623656,,f161eb41e12e4109a0a29037aa0e7659,,False


In [192]:
df = df_common_fill.copy()
df = df.loc[df['n_samples'] == 10000]
df = df.loc[df['n_features'] == 10000]
# df = df.loc[df['pct_random'] == 0.5]
df = df.loc[df['class_sep'] == 50.0]
df = df.sort_values('model_nickname')
fig = px.box(df, x='model_nickname', y='best_adjusted_rand', color='model_nickname')
fig.show()

In [160]:
df = df_common_fill.copy()
df = df.loc[df['n_samples'] == 10000]
df = df.loc[df['n_features'] == 10000]
# df = df.loc[df['pct_random'] == 0.5]
df = df.loc[df['class_sep'] == 100.0]
df = df.loc[df['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
df = df.sort_values('model_nickname')
df = df.rename(columns={'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'best_adjusted_rand': 'Adjusted Rand', 'best_n_clusters_': 'Number of Clusters Found', 'n_classes': 'True Number of Clusters', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'seed_unified': 'Seed'})
df = df[['Model', 'Adjusted Rand', 'Time (s)', 'Memory (MB)', 'Number of samples', 'Number of features', 'Class Separation', '% Random Features', 'Seed', 'Number of Clusters Found', 'True Number of Clusters']]
df = df.groupby('Model').agg({'Adjusted Rand': ['mean', 'std'], 'Number of Clusters Found': ['median', ('Q25', lambda x: x.quantile(0.25)), ('Q75', lambda x: x.quantile(0.75))], 'Time (s)': ['mean', 'std'], 'Memory (MB)': ['mean', 'std'], 'Number of samples': ['mean', 'nunique'], 'Number of features': ['mean', 'nunique'], 'Class Separation': ['mean', 'nunique'], '% Random Features': ['mean', 'nunique'], 'True Number of Clusters': ['mean', 'nunique'], 'Seed': ['mean', 'nunique']})
df.to_csv('simulated_data_easy_more_samples.csv')
df

Unnamed: 0_level_0,Adjusted Rand,Adjusted Rand,Number of Clusters Found,Number of Clusters Found,Number of Clusters Found,Time (s),Time (s),Memory (MB),Memory (MB),Number of samples,...,Number of features,Number of features,Class Separation,Class Separation,% Random Features,% Random Features,True Number of Clusters,True Number of Clusters,Seed,Seed
Unnamed: 0_level_1,mean,std,median,Q25,Q75,mean,std,mean,std,mean,...,mean,nunique,mean,nunique,mean,nunique,mean,nunique,mean,nunique
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AffinityPropagation,0.160395,0.040795,57.0,47.75,68.5,6950.536954,455.081972,4258.068,15.686925,10000.0,...,10000.0,1,100.0,1,0.35,4,5.0,1,0.0,1
DBSCAN,0.0,0.0,1.0,1.0,1.0,1035.232506,75.605361,3573.426,1.666874,10000.0,...,10000.0,1,100.0,1,0.35,4,5.0,1,0.0,1
HDBSCAN,,,,,,inf,,inf,,10000.0,...,10000.0,1,100.0,1,0.35,4,5.0,1,0.0,1
KMeans,1.0,0.0,5.0,5.0,5.0,749.472146,169.602759,1938.543,863.966482,10000.0,...,10000.0,1,100.0,1,0.35,4,5.0,1,0.0,1
OPTICS,1.0,0.0,5.0,5.0,5.0,9990.183728,510.976754,1981.357,6.291208,10000.0,...,10000.0,1,100.0,1,0.35,4,5.0,1,0.0,1
RecursiveClustering,1.0,0.0,5.0,5.0,5.0,3492.044976,772.109737,2571.013,1067.58143,10000.0,...,10000.0,1,100.0,1,0.35,4,5.0,1,0.0,1
SpectralSubspaceRandomization,,,,,,inf,,inf,,10000.0,...,10000.0,1,100.0,1,0.35,4,5.0,1,0.0,1
WardAgglomerativeClustering,0.782541,0.0,4.0,4.0,4.0,7430.75458,155.17805,3571.057,2.912283,10000.0,...,10000.0,1,100.0,1,0.35,4,5.0,1,0.0,1


In [177]:
df = df_common_fill.copy()
df = df.loc[df['n_samples'] == 10000]
df = df.loc[df['n_features'] == 1000]
df = df.loc[df['pct_random'] == 0.5]
df = df.loc[df['class_sep'] == 10.0]
df = df.loc[df['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
df = df.sort_values('model_nickname')
df = df.rename(columns={'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'best_adjusted_rand': 'Adjusted Rand', 'best_n_clusters_': 'Number of Clusters Found', 'n_classes': 'True Number of Clusters', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'seed_unified': 'Seed'})
df = df[['Model', 'Adjusted Rand', 'Time (s)', 'Memory (MB)', 'Number of samples', 'Number of features', 'Class Separation', '% Random Features', 'Seed', 'Number of Clusters Found', 'True Number of Clusters']]
df = df.groupby('Model').agg({'Adjusted Rand': ['mean', 'std'], 'Number of Clusters Found': ['median', ('Q25', lambda x: x.quantile(0.25)), ('Q75', lambda x: x.quantile(0.75))], 'Time (s)': ['mean', 'std'], 'Memory (MB)': ['mean', 'std'], 'Number of samples': ['mean', 'nunique'], 'Number of features': ['mean', 'nunique'], 'Class Separation': ['mean', 'nunique'], '% Random Features': ['mean', 'nunique'], 'True Number of Clusters': ['mean', 'nunique'], 'Seed': ['mean', 'nunique']})
df.to_csv('simulated_data_hard_more_samples.csv')
df

Unnamed: 0_level_0,Adjusted Rand,Adjusted Rand,Number of Clusters Found,Number of Clusters Found,Number of Clusters Found,Time (s),Time (s),Memory (MB),Memory (MB),Number of samples,...,Number of features,Number of features,Class Separation,Class Separation,% Random Features,% Random Features,True Number of Clusters,True Number of Clusters,Seed,Seed
Unnamed: 0_level_1,mean,std,median,Q25,Q75,mean,std,mean,std,mean,...,mean,nunique,mean,nunique,mean,nunique,mean,nunique,mean,nunique
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AffinityPropagation,0.408197,,5.0,5.0,5.0,7348.131929,,3590.168,,10000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,0.0,1
DBSCAN,0.0,,1.0,1.0,1.0,406.390341,,2165.288,,10000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,0.0,1
HDBSCAN,0.9993746,,6.0,6.0,6.0,inf,,inf,,10000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,0.0,1
KMeans,0.1383373,,2.0,2.0,2.0,269.914718,,1187.536,,10000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,0.0,1
OPTICS,9.270328e-07,,18.0,18.0,18.0,4420.699471,,1287.016,,10000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,0.0,1
RecursiveClustering,0.409462,,3.0,3.0,3.0,7003.752386,,2194.076,,10000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,0.0,1
SpectralSubspaceRandomization,,,,,,inf,,inf,,10000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,0.0,1
WardAgglomerativeClustering,0.9457308,,5.0,5.0,5.0,1533.686058,,1308.08,,10000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,0.0,1


In [195]:
df = df_common_fill.copy()
n_samples = 100
df = df.loc[df['n_samples'] == n_samples]
df = df.loc[df['n_features'] == 10000]
# df = df.loc[df['pct_random'] == 0.5]
df = df.loc[df['class_sep'] == 50.0]
df = df.loc[df['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
df = df.sort_values('model_nickname')
df = df.rename(columns={'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'best_adjusted_rand': 'Adjusted Rand', 'best_n_clusters_': 'Number of Clusters Found', 'n_classes': 'True Number of Clusters', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'seed_unified': 'Seed'})
df = df[['Model', 'Adjusted Rand', 'Time (s)', 'Memory (MB)', 'Number of samples', 'Number of features', 'Class Separation', '% Random Features', 'Seed', 'Number of Clusters Found', 'True Number of Clusters']]
df = df.groupby('Model').agg({'Adjusted Rand': ['mean', 'std'], 'Number of Clusters Found': ['median', ('Q25', lambda x: x.quantile(0.25)), ('Q75', lambda x: x.quantile(0.75))], 'Time (s)': ['mean', 'std'], 'Memory (MB)': ['mean', 'std'], 'Number of samples': ['mean', 'nunique'], 'Number of features': ['mean', 'nunique'], 'Class Separation': ['mean', 'nunique'], '% Random Features': ['mean', 'nunique'], 'True Number of Clusters': ['mean', 'nunique'], 'Seed': ['mean', 'nunique']})
df.to_csv(f'simulated_data_medium_{n_samples}samples.csv')
df

Unnamed: 0_level_0,Adjusted Rand,Adjusted Rand,Number of Clusters Found,Number of Clusters Found,Number of Clusters Found,Time (s),Time (s),Memory (MB),Memory (MB),Number of samples,...,Number of features,Number of features,Class Separation,Class Separation,% Random Features,% Random Features,True Number of Clusters,True Number of Clusters,Seed,Seed
Unnamed: 0_level_1,mean,std,median,Q25,Q75,mean,std,mean,std,mean,...,mean,nunique,mean,nunique,mean,nunique,mean,nunique,mean,nunique
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AffinityPropagation,0.705632,0.222672,10.0,7.25,12.25,291.665205,6.030363,1070.73,640.08979,100.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
DBSCAN,0.0,0.0,1.0,1.0,1.0,468.850985,56.704983,1331.866,480.067282,100.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
HDBSCAN,1.0,0.0,5.0,5.0,5.0,245.753154,2.265686,1118.75,644.658505,100.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
KMeans,0.734719,0.386622,5.0,4.25,5.5,228.895834,17.110452,328.976,8.565886,100.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
OPTICS,0.996882,0.006235,5.0,5.0,5.25,81.335244,1.056877,1073.542,643.367891,100.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
RecursiveClustering,0.977885,0.036395,5.0,5.0,5.25,1223.913159,185.241066,1469.911,561.160744,100.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
SpectralSubspaceRandomization,1.0,0.0,5.0,5.0,5.0,1566.598713,63.137136,585.495,351.864159,100.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
WardAgglomerativeClustering,0.924978,0.090899,5.0,5.0,5.25,264.170412,5.403925,1106.073,640.410057,100.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1


In [196]:
df = df_common_fill.copy()
n_samples = 1000
df = df.loc[df['n_samples'] == n_samples]
df = df.loc[df['n_features'] == 10000]
# df = df.loc[df['pct_random'] == 0.5]
df = df.loc[df['class_sep'] == 50.0]
df = df.loc[df['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
df = df.sort_values('model_nickname')
df = df.rename(columns={'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'best_adjusted_rand': 'Adjusted Rand', 'best_n_clusters_': 'Number of Clusters Found', 'n_classes': 'True Number of Clusters', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'seed_unified': 'Seed'})
df = df[['Model', 'Adjusted Rand', 'Time (s)', 'Memory (MB)', 'Number of samples', 'Number of features', 'Class Separation', '% Random Features', 'Seed', 'Number of Clusters Found', 'True Number of Clusters']]
df = df.groupby('Model').agg({'Adjusted Rand': ['mean', 'std'], 'Number of Clusters Found': ['median', ('Q25', lambda x: x.quantile(0.25)), ('Q75', lambda x: x.quantile(0.75))], 'Time (s)': ['mean', 'std'], 'Memory (MB)': ['mean', 'std'], 'Number of samples': ['mean', 'nunique'], 'Number of features': ['mean', 'nunique'], 'Class Separation': ['mean', 'nunique'], '% Random Features': ['mean', 'nunique'], 'True Number of Clusters': ['mean', 'nunique'], 'Seed': ['mean', 'nunique']})
df.to_csv(f'simulated_data_medium_{n_samples}samples.csv')
df

Unnamed: 0_level_0,Adjusted Rand,Adjusted Rand,Number of Clusters Found,Number of Clusters Found,Number of Clusters Found,Time (s),Time (s),Memory (MB),Memory (MB),Number of samples,...,Number of features,Number of features,Class Separation,Class Separation,% Random Features,% Random Features,True Number of Clusters,True Number of Clusters,Seed,Seed
Unnamed: 0_level_1,mean,std,median,Q25,Q75,mean,std,mean,std,mean,...,mean,nunique,mean,nunique,mean,nunique,mean,nunique,mean,nunique
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AffinityPropagation,0.394753,0.171944,3.0,2.5,5.5,212.604738,43.821401,1863.784,1419.461313,1000.0,...,10000.0,1,50.0,1,0.233333,3,5.0,1,0.0,1
DBSCAN,0.0,0.0,1.0,1.0,1.0,433.483653,12.986188,1807.408,331.777796,1000.0,...,10000.0,1,50.0,1,0.233333,3,5.0,1,0.0,1
HDBSCAN,1.0,0.0,5.0,5.0,5.0,223.830181,16.12429,1411.584,604.404744,1000.0,...,10000.0,1,50.0,1,0.233333,3,5.0,1,0.0,1
KMeans,0.705713,0.436123,5.0,3.5,5.0,194.259393,25.739411,725.601333,442.063026,1000.0,...,10000.0,1,50.0,1,0.233333,3,5.0,1,0.0,1
OPTICS,1.0,0.0,5.0,5.0,5.0,435.720034,5.135288,1349.556,604.088144,1000.0,...,10000.0,1,50.0,1,0.233333,3,5.0,1,0.0,1
RecursiveClustering,0.960138,0.058236,5.0,5.0,5.5,1211.743927,99.746887,1532.3,403.019641,1000.0,...,10000.0,1,50.0,1,0.233333,3,5.0,1,0.0,1
SpectralSubspaceRandomization,1.0,0.0,5.0,5.0,5.0,4635.317239,440.833632,1483.517333,401.51956,1000.0,...,10000.0,1,50.0,1,0.233333,3,5.0,1,0.0,1
WardAgglomerativeClustering,0.999165,0.001446,5.0,5.0,5.0,279.317189,27.937345,1514.278667,427.108678,1000.0,...,10000.0,1,50.0,1,0.233333,3,5.0,1,0.0,1


In [197]:
df = df_common_fill.copy()
n_samples = 10000
df = df.loc[df['n_samples'] == n_samples]
df = df.loc[df['n_features'] == 10000]
# df = df.loc[df['pct_random'] == 0.5]
df = df.loc[df['class_sep'] == 50.0]
df = df.loc[df['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
df = df.sort_values('model_nickname')
df = df.rename(columns={'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'best_adjusted_rand': 'Adjusted Rand', 'best_n_clusters_': 'Number of Clusters Found', 'n_classes': 'True Number of Clusters', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'seed_unified': 'Seed'})
df = df[['Model', 'Adjusted Rand', 'Time (s)', 'Memory (MB)', 'Number of samples', 'Number of features', 'Class Separation', '% Random Features', 'Seed', 'Number of Clusters Found', 'True Number of Clusters']]
df = df.groupby('Model').agg({'Adjusted Rand': ['mean', 'std'], 'Number of Clusters Found': ['median', ('Q25', lambda x: x.quantile(0.25)), ('Q75', lambda x: x.quantile(0.75))], 'Time (s)': ['mean', 'std'], 'Memory (MB)': ['mean', 'std'], 'Number of samples': ['mean', 'nunique'], 'Number of features': ['mean', 'nunique'], 'Class Separation': ['mean', 'nunique'], '% Random Features': ['mean', 'nunique'], 'True Number of Clusters': ['mean', 'nunique'], 'Seed': ['mean', 'nunique']})
df.to_csv(f'simulated_data_medium_{n_samples}samples.csv')
df

Unnamed: 0_level_0,Adjusted Rand,Adjusted Rand,Number of Clusters Found,Number of Clusters Found,Number of Clusters Found,Time (s),Time (s),Memory (MB),Memory (MB),Number of samples,...,Number of features,Number of features,Class Separation,Class Separation,% Random Features,% Random Features,True Number of Clusters,True Number of Clusters,Seed,Seed
Unnamed: 0_level_1,mean,std,median,Q25,Q75,mean,std,mean,std,mean,...,mean,nunique,mean,nunique,mean,nunique,mean,nunique,mean,nunique
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AffinityPropagation,0.087754,0.019162,112.5,93.5,133.75,5300.751929,457.882468,4447.989,183.783045,10000.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
DBSCAN,0.0,0.0,1.0,1.0,1.0,1079.975262,40.682919,3297.037,295.117361,10000.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
HDBSCAN,,,,,,inf,,inf,,10000.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
KMeans,0.793508,0.3334,5.0,4.25,5.0,563.6524,31.476729,2326.306,740.989262,10000.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
OPTICS,1.0,0.0,5.0,5.0,5.0,8207.799477,992.601355,3084.931,320.679048,10000.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
RecursiveClustering,0.929387,0.130033,5.0,4.75,5.0,3845.434804,570.223215,3136.996,294.056798,10000.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
SpectralSubspaceRandomization,,,,,,inf,,inf,,10000.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
WardAgglomerativeClustering,0.740728,0.083625,4.0,3.75,4.0,7515.465591,141.255042,3099.448,320.918731,10000.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1


# Debug and explore

In [106]:
df = df_runs_raw_parents.copy()

In [107]:
df

key,status,start_time,end_time,dataset_id,dataset_name,direction,hpo_metric,model_nickname,n_classes,n_features,...,max_memory_used,mutual_info,n_clusters_,normalized_mutual_info,rand_score,silhouette,EXCEPTION,best_child_run_id,parent_run_id,raised_exception
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0018714386f44f95ab7141d480ce09f0,FINISHED,1737557036103,1.737557e+12,1044,eye_movements,maximize,adjusted_rand,HDBSCAN,,,...,422.740,0.003246,6.0,0.005303,0.358073,0.527841,,756cc869acc94fed89376ae563e075a2,,False
00502ff54da4466191d5fadd2ab0f945,FINISHED,1737556950053,1.737557e+12,7,audiology,maximize,adjusted_rand,HDBSCAN,,,...,361.448,0.078387,3.0,0.058583,0.213176,-0.124492,,3d80243180a847ceb7542ac29e31edd8,,False
00b6f8e09ce1419e84214bb9734a9bff,FINISHED,1737589285408,1.737590e+12,23380,cjs,maximize,adjusted_rand,HDBSCAN,,,...,491.640,0.271687,51.0,0.125215,0.739143,0.441531,,f7b70e6394ce4793bd5d1924df8d7cf0,,False
00b7b74b49c2418e8cd9108326d76f54,FINISHED,1737504019181,1.737512e+12,46335,primary-tumor_clean,maximize,adjusted_rand,RecursiveClustering,,,...,541.000,2.169609,216.0,0.563600,0.891047,0.359055,,f2af19c022474cc587b3a26c1c1518b7,,False
0200fd7653474a6ab77e2fa4085e294f,FINISHED,1737617015722,1.737619e+12,16,mfeat-karhunen,maximize,adjusted_rand,OPTICS,,,...,1005.424,0.028445,4.0,0.023843,0.120347,-0.058768,,e1c3ab245a3e47d0813cd19cb56802d0,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fb946510558746d29d122b87ee9869da,FINISHED,1737617084498,1.737620e+12,30,page-blocks,maximize,adjusted_rand,OPTICS,,,...,1005.424,0.081613,80.0,0.099852,0.634365,-0.584304,,6578c7710d3d445eb88dd432d6c88b91,,False
fd61b8a547c54e11b898653ebc8769db,FINISHED,1737557019786,1.737557e+12,40984,segment,maximize,adjusted_rand,HDBSCAN,,,...,410.420,1.356621,33.0,0.598839,0.856482,0.196516,,d81f797b53d142e68e400b8143cb2586,,False
fdc6bb9a534743c7be6ce026fd67252a,FINISHED,1737556999587,1.737557e+12,61,iris,maximize,adjusted_rand,DBSCAN,,,...,7826.080,0.638874,3.0,0.685822,0.772975,0.507009,,abfd2a33adaf48689df0ef0a86e211b6,,False
feca472c0d2149368332abf98bab6bd7,FINISHED,1737516203798,1.737519e+12,42,soybean,maximize,adjusted_rand,AffinityPropagation,,,...,308.308,2.379024,76.0,0.691180,0.921723,0.160309,,9ab6f55ea3df48e589d4358bbc15688b,,False


In [108]:
df = df.loc[df['status'] == 'RUNNING']

In [109]:
df

key,status,start_time,end_time,dataset_id,dataset_name,direction,hpo_metric,model_nickname,n_classes,n_features,...,max_memory_used,mutual_info,n_clusters_,normalized_mutual_info,rand_score,silhouette,EXCEPTION,best_child_run_id,parent_run_id,raised_exception
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0289c581d4a74c6998008fd8c6a23a8b,RUNNING,1737633122174,,41168,,maximize,adjusted_rand,WardAgglomerativeClustering,,,...,,0.145176,25.0,0.073103,0.630174,-0.003434,,,,
0417c1059c264cdf8399c22c311b9876,RUNNING,1737514361699,,41168,,maximize,adjusted_rand,WardAgglomerativeClustering,,,...,,,,,,,,,,
043d0cbd906747819040f8979d7f40d1,RUNNING,1737626372810,,40996,,maximize,adjusted_rand,HDBSCAN,,,...,,,,,,,,,,
08ca8fb95ff84adcb2b8c6567a29cc34,RUNNING,1737626671949,,41168,,maximize,adjusted_rand,WardAgglomerativeClustering,,,...,,0.145176,25.0,0.073103,0.630174,-0.003434,,,,
16a5d69973d3475eae567d57d9241d3d,RUNNING,1737516197385,,41027,,maximize,adjusted_rand,AffinityPropagation,,,...,,,,,,,,,,
1798d892c9b54192ac736b8013d9229e,RUNNING,1737617191351,,1596,,maximize,adjusted_rand,OPTICS,,,...,,,,,,,,,,
1e3053ee2da046f8aaf7a0e0c70ba9dc,RUNNING,1737589047902,,184,,maximize,adjusted_rand,DBSCAN,,,...,,0.0,1.0,0.0,0.104322,-1.0,,,,
230e310a3ff640f6865d1bda85845caf,RUNNING,1737504059995,,1501,,maximize,adjusted_rand,RecursiveClustering,,,...,,0.94991,64.0,0.335039,0.880739,-0.074361,,,,
2490402188d94a7ca9f499eee32a52d3,RUNNING,1737590230603,,1501,,maximize,adjusted_rand,RecursiveClustering,,,...,,0.272477,7.0,0.140823,0.686752,-0.005928,,,,
2545cfaa4ce44d4ba25a4c57945f8f4a,RUNNING,1737591088527,,40927,,maximize,adjusted_rand,HDBSCAN,,,...,,,,,,,,,,


In [110]:
parent_run_ids = list(df.index)

In [111]:
parent_run_ids

['0289c581d4a74c6998008fd8c6a23a8b',
 '0417c1059c264cdf8399c22c311b9876',
 '043d0cbd906747819040f8979d7f40d1',
 '08ca8fb95ff84adcb2b8c6567a29cc34',
 '16a5d69973d3475eae567d57d9241d3d',
 '1798d892c9b54192ac736b8013d9229e',
 '1e3053ee2da046f8aaf7a0e0c70ba9dc',
 '230e310a3ff640f6865d1bda85845caf',
 '2490402188d94a7ca9f499eee32a52d3',
 '2545cfaa4ce44d4ba25a4c57945f8f4a',
 '26ab3cc84c4a420b8857b1a95ed6f767',
 '2ec4852314ed431f96d3841991fdc743',
 '338ba54bce1345a2a924b657bd5993bf',
 '3692c78f34fd4c3c885c21c7a9071eaa',
 '3eb48652d7dd4c87ac58ddd07b3c5175',
 '4ea354c749a844fea7fa84bbb49cb447',
 '54433df700ed4a78801c8425094c326f',
 '571eb50796254ba09de64dc30acdebc1',
 '6a48f613362447c78e3359b5c9dcc2f3',
 '6cc3d2d840764cbea8c488d14e14b443',
 '6e847cd448bd4b8184b831772de744bd',
 '71d0891f6ac041e38d423fe3328f130e',
 '7239cc6680ec46a880e51cd9e02c3b62',
 '72528d7e02ad4ba1b51ba2ca3335ae83',
 '72cbb804b0d44f4e83b8d6e723bde1d2',
 '7354437c98ce4de9af3822771725e288',
 '7e33df0102fb40968e05fb8ce3e3c909',
 

In [112]:
df = df_runs_raw.copy()
df = df.loc[df['parent_run_id'].isin(parent_run_ids)]

In [113]:
child_run_ids = list(df.index)

In [114]:
runs_to_delete = parent_run_ids + child_run_ids

In [115]:
runs_to_delete

['0289c581d4a74c6998008fd8c6a23a8b',
 '0417c1059c264cdf8399c22c311b9876',
 '043d0cbd906747819040f8979d7f40d1',
 '08ca8fb95ff84adcb2b8c6567a29cc34',
 '16a5d69973d3475eae567d57d9241d3d',
 '1798d892c9b54192ac736b8013d9229e',
 '1e3053ee2da046f8aaf7a0e0c70ba9dc',
 '230e310a3ff640f6865d1bda85845caf',
 '2490402188d94a7ca9f499eee32a52d3',
 '2545cfaa4ce44d4ba25a4c57945f8f4a',
 '26ab3cc84c4a420b8857b1a95ed6f767',
 '2ec4852314ed431f96d3841991fdc743',
 '338ba54bce1345a2a924b657bd5993bf',
 '3692c78f34fd4c3c885c21c7a9071eaa',
 '3eb48652d7dd4c87ac58ddd07b3c5175',
 '4ea354c749a844fea7fa84bbb49cb447',
 '54433df700ed4a78801c8425094c326f',
 '571eb50796254ba09de64dc30acdebc1',
 '6a48f613362447c78e3359b5c9dcc2f3',
 '6cc3d2d840764cbea8c488d14e14b443',
 '6e847cd448bd4b8184b831772de744bd',
 '71d0891f6ac041e38d423fe3328f130e',
 '7239cc6680ec46a880e51cd9e02c3b62',
 '72528d7e02ad4ba1b51ba2ca3335ae83',
 '72cbb804b0d44f4e83b8d6e723bde1d2',
 '7354437c98ce4de9af3822771725e288',
 '7e33df0102fb40968e05fb8ce3e3c909',
 

In [116]:
run_uuid_query = [f"'{run_id}'" for run_id in runs_to_delete]
run_uuid_query = ', '.join(run_uuid_query)

In [117]:
query = f"""
UPDATE runs
SET lifecycle_stage = 'deleted'
WHERE run_uuid IN ({run_uuid_query}) 
"""
with engine.begin() as conn:
    conn.execute(text(query))

for i, row in df_runs_raw.iterrows():
    run_id = row.run_id
    model_name = row['params.model_name']
    with mlflow.start_run(run_id) as run:
        mlflow.log_param('model_nickname', model_name)    