In [1]:
from sqlalchemy import create_engine, text
import pandas as pd
from ml_experiments.analyze import get_df_runs_from_mlflow_sql, get_missing_entries, get_common_combinations, get_df_with_combinations
import plotly.express as px
from itertools import product
import time
import numpy as np

# Save Results

## Load mlflow runs

In [2]:
db_port = 5003
db_name = 'recursive_clustering'
w = 'clust9'
# url = f'postgresql://{w}.ceremade.dauphine.lan:{db_port}/{db_name}'
url = f'postgresql://belucci@localhost:{db_port}/{db_name}'
engine = create_engine(url)
query = 'SELECT experiments.name from experiments'
experiment_names = pd.read_sql(query, engine)['name'].tolist()
# results_dir = Path('~/tab_benchmark/results')
# os.makedirs(results_dir, exist_ok=True)

In [3]:
experiment_names

['Default',
 'blob_experiment',
 'hpo_classif_experiment',
 'hpo_openml_experiment']

In [4]:
experiments_names = [
    'hpo_openml_experiment',
]

In [5]:
params_columns = [
    'model_nickname',
    'n_samples',
    'n_features',
    'n_classes',
    'seed_model',
    'dataset_id',
    'dataset_name',
]

In [6]:
latest_metrics_columns = [
    'fit_model_return_elapsed_time',
    'max_memory_used',
    'n_clusters_',
    'rand_score',
    'adjusted_rand',
    'mutual_info',
    'adjusted_mutual_info',
    'normalized_mutual_info',
    'homogeneity_completeness_v_measure',
    'silhouette',
    'best_n_clusters_',
    'best_rand_score',
    'best_adjusted_rand',
    'best_mutual_info',
    'best_adjusted_mutual_info',
    'best_normalized_mutual_info',
    'best_homogeneity_completeness_v_measure',
    'best_silhouette',
]

In [7]:
tags_columns = [
    'raised_exception',
    'EXCEPTION',
    'parent_run_id',
    'best_child_run_id',
]

In [8]:
runs_columns = ['run_uuid', 'status', 'start_time', 'end_time']
experiments_columns = []
other_table = 'params'
other_table_keys = params_columns
df_params = get_df_runs_from_mlflow_sql(engine, runs_columns=runs_columns, experiments_columns=experiments_columns, experiments_names=experiments_names, other_table=other_table, other_table_keys=other_table_keys)
df_latest_metrics = get_df_runs_from_mlflow_sql(engine, runs_columns=['run_uuid'], experiments_columns=experiments_columns, experiments_names=experiments_names, other_table='latest_metrics', other_table_keys=latest_metrics_columns)
df_tags = get_df_runs_from_mlflow_sql(engine, runs_columns=['run_uuid'], experiments_columns=experiments_columns, experiments_names=experiments_names, other_table='tags', other_table_keys=tags_columns)

In [9]:
df_runs_raw = df_params.join(df_latest_metrics)
df_runs_raw = df_runs_raw.join(df_tags)

In [10]:
df_runs_raw_parents = df_runs_raw.copy()
df_runs_raw_parents = df_runs_raw_parents.loc[df_runs_raw_parents['parent_run_id'].isna()]

In [11]:
df_runs_raw_parents

key,status,start_time,end_time,dataset_id,dataset_name,model_nickname,n_classes,n_features,n_samples,seed_model,...,max_memory_used,mutual_info,n_clusters_,normalized_mutual_info,rand_score,silhouette,EXCEPTION,best_child_run_id,parent_run_id,raised_exception
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00632cf3ad074f87864622092ea3e0ca,FINISHED,1736959653571,1.736970e+12,46334,glass_clean,RecursiveClustering,,,,0,...,2579.736,0.373914,7.0,0.344745,0.566671,0.308996,,91a5ee697c9b46f1a9b88a8e05622a27,,False
007692590c5149fa8d32213242942963,FINISHED,1736960595210,1.736961e+12,42,soybean,HDBSCAN,,,,0,...,1470.908,1.658535,16.0,0.672717,0.868104,0.165824,,1d0c0b2c2d8d4448a25e83a5b865a743,,False
0591ac6c5b9d4afe995a029336c9b8ad,FINISHED,1736960625477,1.736961e+12,32,pendigits,HDBSCAN,,,,0,...,1405.836,1.779204,31.0,0.729315,0.912854,0.125539,,99ae7115761c4fa2837d9a78146dcc07,,False
05adbef95fbe4b8188283c434be472f1,FINISHED,1736960682070,1.736961e+12,40979,mfeat-pixel,HDBSCAN,,,,0,...,1343.916,1.206990,7.0,0.589807,0.828319,0.054027,,8394064e9dee447db08f0a830a4b3e19,,False
06f57fd295d04990b8f6447b45f555d1,FINISHED,1736960680758,1.736961e+12,40984,segment,HDBSCAN,,,,0,...,1440.508,1.356621,33.0,0.598839,0.856482,0.184424,,78114fa3548e452e9b9c58a7b8e90b13,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
f5f582058bc74857abf7e48700587f07,FINISHED,1736959672774,1.736964e+12,46,splice,RecursiveClustering,,,,0,...,1435.284,1.025107,2907.0,0.229220,0.615352,0.105033,,f9585de39c8d4463a50b40ac3d3817ce,,False
f630cdd7b0b040d6822d3f875f7533dd,FINISHED,1736959653533,1.736970e+12,46336,hayes-roth_clean,RecursiveClustering,,,,0,...,7873.792,0.742270,62.0,0.296902,0.646069,0.606133,,5f77c88cb26b44c7a5faca0f1436c4a8,,False
f701b22bf9ac44d79f966c2821575301,FINISHED,1736959649535,1.736973e+12,300,isolet,RecursiveClustering,,,,0,...,10748.124,0.840302,4.0,0.378747,0.680685,0.056059,,f387c890143a4bec808ffbb6fd80d94b,,False
f78ac5d4c11d41f3954d0cb1d4d23a76,FINISHED,1736960511980,1.736961e+12,14,mfeat-fourier,HDBSCAN,,,,0,...,377.220,0.198594,3.0,0.136019,0.440904,0.088917,,7d1af63219f54fc5b4c580290618d59e,,False


## Delete duplicate runs (if any) and complete some models that cannot run with some datasets

In [12]:
non_duplicate_columns = [
    'model_nickname',
    'n_samples',
    'n_features',
    'pct_random',
    'seed_unified',
    'class_sep',
    'n_random',
    'n_informative',
    'n_classes'
]
df_runs_parents = df_runs_raw_parents.dropna(axis=0, how='all', subset=['best_adjusted_rand']).copy()
df_runs_parents = df_runs_parents.loc[(~df_runs_parents.duplicated(non_duplicate_columns))]

no_run_time = 4*3600
df_runs_timed_out = df_runs_raw.copy()
df_runs_timed_out = df_runs_timed_out.loc[df_runs_timed_out.EXCEPTION == 'FunctionTimedOut']
df_runs_timed_out = df_runs_timed_out.loc[(~df_runs_timed_out.duplicated(non_duplicate_columns))]
df_runs_timed_out['fit_model_return_elapsed_time'] = no_run_time
df_runs = pd.concat([df_runs, df_runs_timed_out])

status = 'FAILED'
no_run_memory = 2*120000
raised_exception = True
EXCEPTION = 'NoRun'
start_time = time.time() * 1000
end_time = time.time() * 1000

# IRFLLRR do not run with n_samples > 14427
model_nickname = 'IRFLLRR'
n_samples = ['14427', '50000']
n_features = ['100', '347', '1202', '4163', '14427', '50000']
combinations = product(n_samples, n_features)
df_cat = []
indexes = []
for n_sample, n_feature in combinations:
    df = {}
    index = f'{model_nickname}_{n_sample}_{n_feature}'
    df['status'] = status
    df['start_time'] = start_time
    df['end_time'] = end_time
    df['model_nickname'] = model_nickname
    df['n_features'] = n_feature
    df['n_samples'] = n_sample
    df['fit_model_return_elapsed_time'] = no_run_time
    df['max_memory_used'] = no_run_memory
    df['EXCEPTION'] = EXCEPTION
    df['raised_exception'] = raised_exception
    df_cat.append(df)
    indexes.append(index)
df_no_run = pd.DataFrame(df_cat, index=indexes)
df_runs = pd.concat([df_runs, df_no_run])

# KMeansProj do not run with (14427,14427), (14427,50000), (50000,14427), (50000,50000)
model_nickname = 'KMeansProj'
n_samples = ['14427', '50000']
n_features = ['14427', '50000']
combinations = product(n_samples, n_features)
df_cat = []
indexes = []
for n_sample, n_feature in combinations:
    df = {}
    index = f'{model_nickname}_{n_sample}_{n_feature}'
    df['status'] = status
    df['start_time'] = start_time
    df['end_time'] = end_time
    df['model_nickname'] = model_nickname
    df['n_features'] = n_feature
    df['n_samples'] = n_sample
    df['fit_model_return_elapsed_time'] = no_run_time
    df['max_memory_used'] = no_run_memory
    df['EXCEPTION'] = EXCEPTION
    df['raised_exception'] = raised_exception
    df_cat.append(df)
    indexes.append(index)
df_no_run = pd.DataFrame(df_cat, index=indexes)
df_runs = pd.concat([df_runs, df_no_run])

# HDBSCAN do not run with (14427,14427), (14427,50000), and n_samples = 50000 n_features > 347
model_nickname = 'HDBSCAN'
n_samples = ['50000']
n_features = ['347', '1202', '4163', '14427', '50000']
combinations = list(product(n_samples, n_features))
combinations += [('14427', '14427'), ('14427', '50000')]
df_cat = []
indexes = []
for n_sample, n_feature in combinations:
    df = {}
    index = f'{model_nickname}_{n_sample}_{n_feature}'
    df['status'] = status
    df['start_time'] = start_time
    df['end_time'] = end_time
    df['model_nickname'] = model_nickname
    df['n_features'] = n_feature
    df['n_samples'] = n_sample
    df['fit_model_return_elapsed_time'] = no_run_time
    df['max_memory_used'] = no_run_memory
    df['EXCEPTION'] = EXCEPTION
    df['raised_exception'] = raised_exception
    df_cat.append(df)
    indexes.append(index)
df_no_run = pd.DataFrame(df_cat, index=indexes)
df_runs = pd.concat([df_runs, df_no_run])

# SpectralSubspaceRandomization do not run with n_samples > 50000
model_nickname = 'SpectralSubspaceRandomization'
n_samples = ['50000']
n_features = ['100', '347', '1202', '4163', '14427', '50000']
combinations = list(product(n_samples, n_features))
df_cat = []
indexes = []
for n_sample, n_feature in combinations:
    df = {}
    index = f'{model_nickname}_{n_sample}_{n_feature}'
    df['status'] = status
    df['start_time'] = start_time
    df['end_time'] = end_time
    df['model_nickname'] = model_nickname
    df['n_features'] = n_feature
    df['n_samples'] = n_sample
    df['fit_model_return_elapsed_time'] = no_run_time
    df['max_memory_used'] = no_run_memory
    df['EXCEPTION'] = EXCEPTION
    df['raised_exception'] = raised_exception
    df_cat.append(df)
    indexes.append(index)
df_no_run = pd.DataFrame(df_cat, index=indexes)
df_runs = pd.concat([df_runs, df_no_run])

# ensure no duplicates
df_runs = df_runs.loc[(~df_runs.duplicated(non_duplicate_columns))]

# Missing

In [13]:
df_runs_parents['model_nickname'].unique().tolist()

['RecursiveClustering',
 'AverageAgglomerativeClustering',
 'KMeansProj',
 'HDBSCAN',
 'OPTICS',
 'SpectralSubspaceRandomization',
 'Proclus',
 'DBSCAN',
 'WardAgglomerativeClustering',
 'KMeans',
 'SingleAgglomerativeClustering',
 'SpectralClustering',
 'CompleteAgglomerativeClustering',
 'IRFLLRR',
 'Clique',
 'AffinityPropagation']

In [14]:
non_duplicate_columns = [
    'model_nickname',
    'n_samples',
    'n_features',
    'pct_random',
    'seed_unified',
    'class_sep',
]

In [120]:
n_samples = ['100', '1000', '10000']
n_features = ['100', '1000', '10000']
pct_random = ['0.0', '0.2', '0.5', '0.7']
seed_unified = [f'{i}' for i in range(1)]
class_sep = ['10.0', '100.0']
model_nickname = [
    # 'KMeansProj',
    # 'IRFLLRR',
    # 'Clique',
    'HDBSCAN',
    'SpectralSubspaceRandomization',
    'SingleAgglomerativeClustering',
    'SpectralClustering',
    'RecursiveClustering',
    'OPTICS',
    'Proclus',
    'WardAgglomerativeClustering',
    'KMeans',
    'DBSCAN',
    'CompleteAgglomerativeClustering',
    'AverageAgglomerativeClustering',
    'AffinityPropagation'
]
columns_names = non_duplicate_columns
should_contain_values = [model_nickname, n_samples, n_features, pct_random, seed_unified, class_sep]
df_missing = get_missing_entries(df_runs_parents, columns_names, should_contain_values)

In [121]:
df_missing

Unnamed: 0,model_nickname,n_samples,n_features,pct_random,seed_unified,class_sep
0,HDBSCAN,10000,10000,0.0,0,10.0
1,HDBSCAN,10000,10000,0.0,0,100.0
2,HDBSCAN,10000,10000,0.2,0,10.0
3,HDBSCAN,10000,10000,0.2,0,100.0
4,HDBSCAN,10000,10000,0.5,0,10.0
5,HDBSCAN,10000,10000,0.5,0,100.0
6,HDBSCAN,10000,10000,0.7,0,10.0
7,HDBSCAN,10000,10000,0.7,0,100.0
8,SpectralSubspaceRandomization,10000,100,0.0,0,10.0
9,SpectralSubspaceRandomization,10000,100,0.0,0,100.0


# Get common combinations

In [84]:
model_nickname = [
    # 'KMeansProj',
    # 'IRFLLRR',
    # 'Clique',
    'HDBSCAN',
    'SpectralSubspaceRandomization',
    'SingleAgglomerativeClustering',
    'SpectralClustering',
    'RecursiveClustering',
    'OPTICS',
    'Proclus',
    'WardAgglomerativeClustering',
    'KMeans',
    'DBSCAN',
    'CompleteAgglomerativeClustering',
    'AverageAgglomerativeClustering',
    'AffinityPropagation'
]
df = df_runs_parents.copy()
df = df.loc[df['model_nickname'].isin(model_nickname)]
column = 'model_nickname'
combination_columns = [
    'n_samples',
    'n_features',
    'pct_random',
    'seed_unified',
    'class_sep',
]
common_combinations = get_common_combinations(df, column, combination_columns)

In [85]:
df_common = get_df_with_combinations(df, combination_columns, common_combinations)

In [86]:
df_common['n_samples'] = df_common['n_samples'].astype(int)
df_common['n_features'] = df_common['n_features'].astype(int)
df_common['pct_random'] = df_common['pct_random'].astype(float)
df_common['class_sep'] = df_common['class_sep'].astype(float)
df_common['seed_unified'] = df_common['seed_unified'].astype(int)
df_common['n_classes'] = df_common['n_classes'].astype(int)

In [87]:
df_common

key,status,start_time,end_time,class_sep,model_nickname,n_classes,n_features,n_informative,n_random,n_samples,...,max_memory_used,mutual_info,n_clusters_,normalized_mutual_info,rand_score,silhouette,EXCEPTION,best_child_run_id,parent_run_id,raised_exception
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0018167e4ff1486a8a909f25fdaa5d76,FINISHED,1734289233443,1.734364e+12,100.0,RecursiveClustering,5,100,2,16,1000,...,2077.516,1.609438,5.0,1.000000,1.000000,0.936638,,0fc46cf1ca434f86841b195a2b92e37d,,False
002193f658094383b0f627c470ba56ae,FINISHED,1734314061235,1.734333e+12,100.0,SpectralSubspaceRandomization,5,100,2,16,100,...,412.472,1.609438,7.0,0.921757,0.960404,0.615130,,bedee9589bea483ca7379588f1db31b8,,False
0023abb5ee2140a7b29b46776f01d1cb,FINISHED,1734334144613,1.734467e+12,10.0,Proclus,5,10000,2,16,1000,...,656.852,0.017358,8.0,0.010126,0.686326,-0.034658,,d75026e4d65a4841b4f3efb2cee88fad,,False
002ff3e2dd904189a04b1568c2a4cfcb,FINISHED,1734288888094,1.734326e+12,10.0,WardAgglomerativeClustering,5,100,2,16,1000,...,2062.552,1.609438,12.0,0.800623,0.895393,0.105404,,7009338b4aac4a1b81815335fffe138d,,False
003d9c7cb27d4db18cfd570de01c35dd,FINISHED,1734292682676,1.734321e+12,100.0,KMeans,5,1000,2,16,1000,...,2069.860,1.609438,5.0,1.000000,1.000000,0.881070,,f4587824429f4f848e6be406124a2de5,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffbdad9588f0455480ca59645773da0f,FINISHED,1734295865271,1.734410e+12,100.0,AverageAgglomerativeClustering,5,1000,2,16,1000,...,2141.648,1.609438,5.0,1.000000,1.000000,0.787634,,5e6823c34f3749a3832cb34c2cde2b52,,False
ffbea3f03bf94cb89131832f6d2df628,FINISHED,1734290981889,1.734316e+12,10.0,KMeans,5,1000,2,16,1000,...,2065.080,0.784140,10.0,0.402668,0.812697,0.064731,,b0dddd5d266048b7baaa2f5b7b7bce5f,,False
ffd07194432a4f399e2677dec7e98d62,FINISHED,1734299196992,1.734435e+12,100.0,SingleAgglomerativeClustering,5,1000,2,16,100,...,2137.760,1.609438,7.0,0.968492,0.988889,0.563874,,9cf711ec5db346c29a64c10a156f825d,,False
ffe8865703464b1d96376e2320ed96bd,FINISHED,1734296998050,1.734414e+12,100.0,AverageAgglomerativeClustering,5,1000,2,16,1000,...,2139.360,1.609438,12.0,0.976104,0.994142,0.358206,,2ce9b9ce81364285a5d1ea2b9bd15162,,False


# Plots

In [109]:
df = df_common.copy()
df = df.loc[df['n_samples'] == 1000]
df = df.loc[df['n_features'] == 1000]
df = df.loc[df['pct_random'] == 0.5]
df = df.loc[df['class_sep'] == 100.0]
df = df.sort_values('model_nickname')
fig = px.box(df, x='model_nickname', y='best_adjusted_rand', color='model_nickname')
fig.show()

In [169]:
df = df_common.copy()
df = df.loc[df['n_samples'] == 1000]
df = df.loc[df['n_features'] == 1000]
# df = df.loc[df['pct_random'] == 0.7]
df = df.loc[df['class_sep'] == 100.0]
df = df.loc[df['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
df = df.sort_values('model_nickname')
df = df.rename(columns={'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'best_adjusted_rand': 'Adjusted Rand', 'best_n_clusters_': 'Number of Clusters Found', 'n_classes': 'True Number of Clusters', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'seed_unified': 'Seed'})
df = df[['Model', 'Adjusted Rand', 'Time (s)', 'Memory (MB)', 'Number of samples', 'Number of features', 'Class Separation', '% Random Features', 'Seed', 'Number of Clusters Found', 'True Number of Clusters']]
df = df.groupby('Model').agg({'Adjusted Rand': ['mean', 'std'], 'Number of Clusters Found': ['median', ('Q25', lambda x: x.quantile(0.25)), ('Q75', lambda x: x.quantile(0.75))], 'Time (s)': ['mean', 'std'], 'Memory (MB)': ['mean', 'std'], 'Number of samples': ['mean', 'nunique'], 'Number of features': ['mean', 'nunique'], 'Class Separation': ['mean', 'nunique'], '% Random Features': ['mean', 'nunique'], 'True Number of Clusters': ['mean', 'nunique'], 'Seed': ['mean', 'nunique']})
df.to_csv('simulated_data_easy.csv')
df

key,Adjusted Rand,Adjusted Rand,Number of Clusters Found,Number of Clusters Found,Number of Clusters Found,Time (s),Time (s),Memory (MB),Memory (MB),Number of samples,...,Number of features,Number of features,Class Separation,Class Separation,% Random Features,% Random Features,True Number of Clusters,True Number of Clusters,Seed,Seed
Unnamed: 0_level_1,mean,std,median,Q25,Q75,mean,std,mean,std,mean,...,mean,nunique,mean,nunique,mean,nunique,mean,nunique,mean,nunique
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AffinityPropagation,1.0,0.0,5.0,5.0,5.0,537.226421,39.111352,2059.4429,1.779558,1000.0,...,1000.0,1,100.0,1,0.35,4,5.0,1,4.5,10
DBSCAN,0.0,0.0,1.0,1.0,1.0,495.724737,24.503844,2034.5569,1.282687,1000.0,...,1000.0,1,100.0,1,0.35,4,5.0,1,4.5,10
HDBSCAN,1.0,0.0,5.0,5.0,5.0,156.379071,16.678345,1951.1594,228.016285,1000.0,...,1000.0,1,100.0,1,0.35,4,5.0,1,4.5,10
KMeans,1.0,0.0,5.0,5.0,5.0,455.211977,39.756185,2024.6738,151.542389,1000.0,...,1000.0,1,100.0,1,0.35,4,5.0,1,4.5,10
OPTICS,1.0,0.0,5.0,5.0,5.0,186.481377,12.323363,2067.3815,0.839997,1000.0,...,1000.0,1,100.0,1,0.35,4,5.0,1,4.5,10
RecursiveClustering,1.0,0.0,5.0,5.0,5.0,2264.785293,217.413369,2080.029,2.354198,1000.0,...,1000.0,1,100.0,1,0.35,4,5.0,1,4.5,10
SpectralSubspaceRandomization,1.0,0.0,5.0,5.0,5.0,3496.919557,733.693875,1108.6247,27.026635,1000.0,...,1000.0,1,100.0,1,0.35,4,5.0,1,4.5,10
WardAgglomerativeClustering,1.0,0.0,5.0,5.0,5.0,267.337123,30.720733,2059.133,1.695819,1000.0,...,1000.0,1,100.0,1,0.35,4,5.0,1,4.5,10


In [108]:
df = df_common.copy()
df = df.loc[df['n_samples'] == 1000]
df = df.loc[df['n_features'] == 1000]
df = df.loc[df['pct_random'] == 0.5]
df = df.loc[df['class_sep'] == 10.0]
df = df.loc[df['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
df = df.sort_values('model_nickname')
df = df.rename(columns={'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'best_adjusted_rand': 'Adjusted Rand', 'best_n_clusters_': 'Number of Clusters Found', 'n_classes': 'True Number of Clusters', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'seed_unified': 'Seed'})
df = df[['Model', 'Adjusted Rand', 'Time (s)', 'Memory (MB)', 'Number of samples', 'Number of features', 'Class Separation', '% Random Features', 'Seed', 'Number of Clusters Found', 'True Number of Clusters']]
df = df.groupby('Model').agg({'Adjusted Rand': ['mean', 'std'], 'Number of Clusters Found': ['median', ('Q25', lambda x: x.quantile(0.25)), ('Q75', lambda x: x.quantile(0.75))], 'Time (s)': ['mean', 'std'], 'Memory (MB)': ['mean', 'std'], 'Number of samples': ['mean', 'nunique'], 'Number of features': ['mean', 'nunique'], 'Class Separation': ['mean', 'nunique'], '% Random Features': ['mean', 'nunique'], 'True Number of Clusters': ['mean', 'nunique'], 'Seed': ['mean', 'nunique']})
df.to_csv('simulated_data_hard.csv')
df

key,Adjusted Rand,Adjusted Rand,Number of Clusters Found,Number of Clusters Found,Number of Clusters Found,Time (s),Time (s),Memory (MB),Memory (MB),Number of samples,...,Number of features,Number of features,Class Separation,Class Separation,% Random Features,% Random Features,True Number of Clusters,True Number of Clusters,Seed,Seed
Unnamed: 0_level_1,mean,std,median,Q25,Q75,mean,std,mean,std,mean,...,mean,nunique,mean,nunique,mean,nunique,mean,nunique,mean,nunique
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AffinityPropagation,0.189093,0.017298,21.5,2.0,42.0,529.730182,10.693666,2059.49,2.024502,1000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,4.5,10
DBSCAN,0.0,0.0,1.0,1.0,1.0,478.425069,8.144741,2034.9944,1.190787,1000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,4.5,10
HDBSCAN,0.993713,0.003036,6.0,6.0,6.0,157.498777,5.076115,2008.3772,178.319054,1000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,4.5,10
KMeans,0.200988,0.060978,2.0,2.0,2.0,421.863799,23.231615,2065.8152,4.504178,1000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,4.5,10
OPTICS,0.399327,0.344323,2.5,2.0,4.5,193.6702,4.669891,2067.3468,1.081793,1000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,4.5,10
RecursiveClustering,0.433655,0.230195,4.0,3.0,4.75,2390.756949,168.434082,2078.184,2.627123,1000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,4.5,10
SpectralSubspaceRandomization,0.915287,0.202475,5.0,5.0,5.0,3650.157271,473.980725,1103.11,4.184572,1000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,4.5,10
WardAgglomerativeClustering,0.494884,0.296377,2.0,2.0,4.25,250.402186,13.8455,2058.1964,2.105722,1000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,4.5,10


# With filling values

In [132]:
non_duplicate_columns = [
    'model_nickname',
    'n_samples',
    'n_features',
    'pct_random',
    'seed_unified',
    'class_sep',
    'n_random',
    'n_informative',
    'n_classes'
]
df_runs_parents_fill = df_runs_raw_parents.dropna(axis=0, how='all', subset=['best_adjusted_rand']).copy()
df_runs_parents_fill = df_runs_parents_fill.loc[(~df_runs_parents_fill.duplicated(non_duplicate_columns))]

In [133]:
# for the moment, I did not manage to run HDBSCAN and SpectralSubspaceRandomization with samples >= 10000, so we will fill these values for the moment as "no run" and we will see later
status = 'FAILED'
no_run_time = np.inf
no_run_memory = np.inf
no_run_metric = np.nan
raised_exception = True
EXCEPTION = 'NoRun'
start_time = time.time() * 1000
end_time = time.time() * 1000

In [134]:
model_nickname = 'HDBSCAN'
n_samples = ['10000']
n_features = ['100', '1000', '10000']
pct_random = ['0.0', '0.2', '0.5', '0.7']
seed_unified = [f'{i}' for i in range(10)]
class_sep = ['10.0', '100.0', '50.0']
n_classes = 5
combinations = list(product(n_samples, n_features, pct_random, seed_unified, class_sep))
df_cat = []
indexes = []
for n_sample, n_feature, pct_r, seed, class_s in combinations:
    df = {}
    index = f'{model_nickname}_{n_sample}_{n_feature}'
    df['status'] = status
    df['start_time'] = start_time
    df['end_time'] = end_time
    df['model_nickname'] = model_nickname
    df['n_classes'] = n_classes
    df['n_features'] = n_feature
    df['n_samples'] = n_sample
    df['pct_random'] = pct_r
    df['seed_unified'] = seed
    df['class_sep'] = class_s
    df['fit_model_return_elapsed_time'] = no_run_time
    df['max_memory_used'] = no_run_memory
    df['best_adjusted_rand'] = no_run_metric
    df['best_n_clusters_'] = no_run_metric
    df['EXCEPTION'] = EXCEPTION
    df['raised_exception'] = raised_exception
    df_cat.append(df)
    indexes.append(index)
df_no_run = pd.DataFrame(df_cat, index=indexes)
df_runs_parents_fill = pd.concat([df_runs_parents_fill, df_no_run])

In [135]:
model_nickname = 'SpectralSubspaceRandomization'
n_samples = ['10000']
n_features = ['100', '1000', '10000']
pct_random = ['0.0', '0.2', '0.5', '0.7']
seed_unified = [f'{i}' for i in range(10)]
class_sep = ['10.0', '100.0', '50.0']
n_classes = 5
combinations = list(product(n_samples, n_features, pct_random, seed_unified, class_sep))
df_cat = []
indexes = []
for n_sample, n_feature, pct_r, seed, class_s in combinations:
    df = {}
    index = f'{model_nickname}_{n_sample}_{n_feature}'
    df['status'] = status
    df['start_time'] = start_time
    df['end_time'] = end_time
    df['model_nickname'] = model_nickname
    df['n_classes'] = n_classes
    df['n_features'] = n_feature
    df['n_samples'] = n_sample
    df['pct_random'] = pct_r
    df['seed_unified'] = seed
    df['class_sep'] = class_s
    df['fit_model_return_elapsed_time'] = no_run_time
    df['max_memory_used'] = no_run_memory
    df['best_adjusted_rand'] = no_run_metric
    df['best_n_clusters_'] = no_run_metric
    df['EXCEPTION'] = EXCEPTION
    df['raised_exception'] = raised_exception
    df_cat.append(df)
    indexes.append(index)
df_no_run = pd.DataFrame(df_cat, index=indexes)
df_runs_parents_fill = pd.concat([df_runs_parents_fill, df_no_run])

In [136]:
# ensure no duplicates
df_runs_parents_fill = df_runs_parents_fill.loc[(~df_runs_parents_fill.duplicated(non_duplicate_columns))]

# Missing

In [137]:
df_runs_parents_fill['model_nickname'].unique().tolist()

['RecursiveClustering',
 'AverageAgglomerativeClustering',
 'KMeansProj',
 'HDBSCAN',
 'OPTICS',
 'SpectralSubspaceRandomization',
 'Proclus',
 'DBSCAN',
 'WardAgglomerativeClustering',
 'KMeans',
 'SingleAgglomerativeClustering',
 'SpectralClustering',
 'CompleteAgglomerativeClustering',
 'IRFLLRR',
 'Clique',
 'AffinityPropagation']

In [138]:
non_duplicate_columns = [
    'model_nickname',
    'n_samples',
    'n_features',
    'pct_random',
    'seed_unified',
    'class_sep',
]

In [139]:
df_runs_parents_fill

Unnamed: 0,status,start_time,end_time,class_sep,model_nickname,n_classes,n_features,n_informative,n_random,n_samples,...,max_memory_used,mutual_info,n_clusters_,normalized_mutual_info,rand_score,silhouette,EXCEPTION,best_child_run_id,parent_run_id,raised_exception
000b291a142341289479933ddb447488,FINISHED,1.736872e+12,1.736877e+12,50.0,RecursiveClustering,5,10000,2,16,1000,...,1156.568,1.609438,577.0,0.425927,0.806024,-0.104928,,e62023cb27d941dd8b9683eaef947ccc,,False
000f29a52641434a9a03ec3951ba12ba,FINISHED,1.736890e+12,1.736911e+12,50.0,AverageAgglomerativeClustering,5,10000,2,16,10000,...,1941.040,1.608578,8.0,0.997419,0.999561,0.283838,,8e0a1045c9474ca5925d553aa6030f20,,False
0015c6675ce048ec99ded8242b2df23e,FINISHED,1.734337e+12,1.734515e+12,100.0,KMeansProj,5,100,2,16,1000,...,473.880,1.079833,12.0,0.539120,0.835267,-0.002984,,d4a718cc5d92496ba63ebf72357e77db,,False
0018167e4ff1486a8a909f25fdaa5d76,FINISHED,1.734289e+12,1.734364e+12,100.0,RecursiveClustering,5,100,2,16,1000,...,2077.516,1.609438,5.0,1.000000,1.000000,0.936638,,0fc46cf1ca434f86841b195a2b92e37d,,False
00187fd8f9df4282ba536e067e9b6881,FINISHED,1.736687e+12,1.736691e+12,100.0,HDBSCAN,5,100,2,16,10000,...,1212.672,1.609438,5.0,1.000000,1.000000,0.939268,,c25dc85141e84c21b389b9e9cfced842,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SpectralSubspaceRandomization_10000_10000,FAILED,1.736947e+12,1.736947e+12,100.0,SpectralSubspaceRandomization,5,10000,,,10000,...,inf,,,,,,NoRun,,,True
SpectralSubspaceRandomization_10000_10000,FAILED,1.736947e+12,1.736947e+12,50.0,SpectralSubspaceRandomization,5,10000,,,10000,...,inf,,,,,,NoRun,,,True
SpectralSubspaceRandomization_10000_10000,FAILED,1.736947e+12,1.736947e+12,10.0,SpectralSubspaceRandomization,5,10000,,,10000,...,inf,,,,,,NoRun,,,True
SpectralSubspaceRandomization_10000_10000,FAILED,1.736947e+12,1.736947e+12,100.0,SpectralSubspaceRandomization,5,10000,,,10000,...,inf,,,,,,NoRun,,,True


In [144]:
n_samples = ['100', '1000', '10000']
n_features = ['100', '1000', '10000']
pct_random = ['0.0', '0.2', '0.5', '0.7']
seed_unified = [f'{i}' for i in range(1)]
class_sep = ['10.0', '100.0']
model_nickname = [
    # 'KMeansProj',
    # 'IRFLLRR',
    # 'Clique',
    'HDBSCAN',
    'SpectralSubspaceRandomization',
    'SingleAgglomerativeClustering',
    'SpectralClustering',
    'RecursiveClustering',
    'OPTICS',
    'Proclus',
    'WardAgglomerativeClustering',
    'KMeans',
    'DBSCAN',
    'CompleteAgglomerativeClustering',
    'AverageAgglomerativeClustering',
    'AffinityPropagation'
]
columns_names = non_duplicate_columns
should_contain_values = [model_nickname, n_samples, n_features, pct_random, seed_unified, class_sep]
df_missing = get_missing_entries(df_runs_parents_fill, columns_names, should_contain_values)

In [145]:
df_missing

Unnamed: 0,model_nickname,n_samples,n_features,pct_random,seed_unified,class_sep
0,SingleAgglomerativeClustering,10000,10000,0.0,0,10.0
1,SingleAgglomerativeClustering,10000,10000,0.0,0,100.0
2,SingleAgglomerativeClustering,10000,10000,0.2,0,10.0
3,SingleAgglomerativeClustering,10000,10000,0.2,0,100.0
4,SingleAgglomerativeClustering,10000,10000,0.5,0,10.0
5,SingleAgglomerativeClustering,10000,10000,0.5,0,100.0
6,SingleAgglomerativeClustering,10000,10000,0.7,0,10.0
7,SingleAgglomerativeClustering,10000,10000,0.7,0,100.0
8,SpectralClustering,10000,100,0.5,0,10.0
9,SpectralClustering,10000,100,0.5,0,100.0


# Get common combinations

In [146]:
model_nickname = [
    # 'KMeansProj',
    # 'IRFLLRR',
    # 'Clique',
    'HDBSCAN',
    'SpectralSubspaceRandomization',
    # 'SingleAgglomerativeClustering',
    # 'SpectralClustering',
    'RecursiveClustering',
    'OPTICS',
    'Proclus',
    'WardAgglomerativeClustering',
    'KMeans',
    'DBSCAN',
    'CompleteAgglomerativeClustering',
    'AverageAgglomerativeClustering',
    'AffinityPropagation'
]
df = df_runs_parents_fill.copy()
df = df.loc[df['model_nickname'].isin(model_nickname)]
column = 'model_nickname'
combination_columns = [
    'n_samples',
    'n_features',
    'pct_random',
    'seed_unified',
    'class_sep',
]
common_combinations = get_common_combinations(df, column, combination_columns)

In [147]:
df_common_fill = get_df_with_combinations(df, combination_columns, common_combinations)

In [148]:
df_common_fill['n_samples'] = df_common_fill['n_samples'].astype(int)
df_common_fill['n_features'] = df_common_fill['n_features'].astype(int)
df_common_fill['pct_random'] = df_common_fill['pct_random'].astype(float)
df_common_fill['class_sep'] = df_common_fill['class_sep'].astype(float)
df_common_fill['seed_unified'] = df_common_fill['seed_unified'].astype(int)
df_common_fill['n_classes'] = df_common_fill['n_classes'].astype(int)

## Plot

In [154]:
df

Unnamed: 0,status,start_time,end_time,class_sep,model_nickname,n_classes,n_features,n_informative,n_random,n_samples,...,max_memory_used,mutual_info,n_clusters_,normalized_mutual_info,rand_score,silhouette,EXCEPTION,best_child_run_id,parent_run_id,raised_exception
9b5af53d2adb4e66b6c299cc9dd03f84,FINISHED,1736864000000.0,1736871000000.0,100.0,AffinityPropagation,5,1000,2.0,16.0,10000,...,4457.724,1.609438,18.0,0.719116,0.858192,0.084554,,8b29db04c53947fb8a6298af4012597f,,False
6e13b2c209784d15ad55255aff27e0fd,FINISHED,1736864000000.0,1736868000000.0,100.0,AverageAgglomerativeClustering,5,1000,2.0,16.0,10000,...,1231.704,1.609438,14.0,0.963381,0.989579,0.261144,,2a961b3635994d84a2494e22d57f7e76,,False
6fc65459261041fda9074a00019fee06,FINISHED,1736688000000.0,1736808000000.0,100.0,CompleteAgglomerativeClustering,5,1000,2.0,16.0,10000,...,3575.068,1.609438,14.0,0.779245,0.88911,0.079613,,61cc5cae6000412bbb61cdaf2bf7fec7,,False
37261cab80634ea7b30a76a55b63d541,FINISHED,1736686000000.0,1736694000000.0,100.0,DBSCAN,5,1000,2.0,16.0,10000,...,1918.316,0.0,1.0,0.0,0.19992,-1.0,,f93e17338fdb4361a1ab9ca7acb61d9f,,False
a88089c38f8548eab4a4f5a687b268c5,FINISHED,1736687000000.0,1736701000000.0,100.0,HDBSCAN,5,1000,2.0,16.0,10000,...,1301.976,1.609438,5.0,1.0,1.0,0.839192,,d07a9c27fc094cd0b10e2291625f6a04,,False
HDBSCAN_10000_1000,FAILED,1736947000000.0,1736947000000.0,100.0,HDBSCAN,5,1000,,,10000,...,inf,,,,,,NoRun,,,True
b5e3972f272d47199e54bb7d9a1ae019,FINISHED,1736866000000.0,1736866000000.0,100.0,KMeans,5,1000,2.0,16.0,10000,...,1190.328,1.609438,14.0,0.760001,0.873464,0.11079,,339825b5dad64b748b8ba9fec4b5dc3d,,False
4d07fcc980ec4f939220cb15718a0d46,FINISHED,1736707000000.0,1736746000000.0,100.0,OPTICS,5,1000,2.0,16.0,10000,...,1287.588,1.609438,5.0,1.0,1.0,0.839192,,482d10afeb4e44a2b3a78a35966e8f00,,False
ffaaa267a0634c8786d45d623e7a5a6a,FINISHED,1736708000000.0,1736767000000.0,100.0,Proclus,5,1000,2.0,16.0,10000,...,1697.296,0.000755,4.0,0.000523,0.617884,-0.007934,,489a115c31194f1f878d8961d7888b3a,,False
9bc581d07a254b95a7c77a44afee74a2,FINISHED,1736791000000.0,1736824000000.0,100.0,RecursiveClustering,5,1000,2.0,16.0,10000,...,2186.084,1.609438,7.0,0.999242,0.99988,0.623656,,f161eb41e12e4109a0a29037aa0e7659,,False


In [192]:
df = df_common_fill.copy()
df = df.loc[df['n_samples'] == 10000]
df = df.loc[df['n_features'] == 10000]
# df = df.loc[df['pct_random'] == 0.5]
df = df.loc[df['class_sep'] == 50.0]
df = df.sort_values('model_nickname')
fig = px.box(df, x='model_nickname', y='best_adjusted_rand', color='model_nickname')
fig.show()

In [160]:
df = df_common_fill.copy()
df = df.loc[df['n_samples'] == 10000]
df = df.loc[df['n_features'] == 10000]
# df = df.loc[df['pct_random'] == 0.5]
df = df.loc[df['class_sep'] == 100.0]
df = df.loc[df['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
df = df.sort_values('model_nickname')
df = df.rename(columns={'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'best_adjusted_rand': 'Adjusted Rand', 'best_n_clusters_': 'Number of Clusters Found', 'n_classes': 'True Number of Clusters', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'seed_unified': 'Seed'})
df = df[['Model', 'Adjusted Rand', 'Time (s)', 'Memory (MB)', 'Number of samples', 'Number of features', 'Class Separation', '% Random Features', 'Seed', 'Number of Clusters Found', 'True Number of Clusters']]
df = df.groupby('Model').agg({'Adjusted Rand': ['mean', 'std'], 'Number of Clusters Found': ['median', ('Q25', lambda x: x.quantile(0.25)), ('Q75', lambda x: x.quantile(0.75))], 'Time (s)': ['mean', 'std'], 'Memory (MB)': ['mean', 'std'], 'Number of samples': ['mean', 'nunique'], 'Number of features': ['mean', 'nunique'], 'Class Separation': ['mean', 'nunique'], '% Random Features': ['mean', 'nunique'], 'True Number of Clusters': ['mean', 'nunique'], 'Seed': ['mean', 'nunique']})
df.to_csv('simulated_data_easy_more_samples.csv')
df

Unnamed: 0_level_0,Adjusted Rand,Adjusted Rand,Number of Clusters Found,Number of Clusters Found,Number of Clusters Found,Time (s),Time (s),Memory (MB),Memory (MB),Number of samples,...,Number of features,Number of features,Class Separation,Class Separation,% Random Features,% Random Features,True Number of Clusters,True Number of Clusters,Seed,Seed
Unnamed: 0_level_1,mean,std,median,Q25,Q75,mean,std,mean,std,mean,...,mean,nunique,mean,nunique,mean,nunique,mean,nunique,mean,nunique
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AffinityPropagation,0.160395,0.040795,57.0,47.75,68.5,6950.536954,455.081972,4258.068,15.686925,10000.0,...,10000.0,1,100.0,1,0.35,4,5.0,1,0.0,1
DBSCAN,0.0,0.0,1.0,1.0,1.0,1035.232506,75.605361,3573.426,1.666874,10000.0,...,10000.0,1,100.0,1,0.35,4,5.0,1,0.0,1
HDBSCAN,,,,,,inf,,inf,,10000.0,...,10000.0,1,100.0,1,0.35,4,5.0,1,0.0,1
KMeans,1.0,0.0,5.0,5.0,5.0,749.472146,169.602759,1938.543,863.966482,10000.0,...,10000.0,1,100.0,1,0.35,4,5.0,1,0.0,1
OPTICS,1.0,0.0,5.0,5.0,5.0,9990.183728,510.976754,1981.357,6.291208,10000.0,...,10000.0,1,100.0,1,0.35,4,5.0,1,0.0,1
RecursiveClustering,1.0,0.0,5.0,5.0,5.0,3492.044976,772.109737,2571.013,1067.58143,10000.0,...,10000.0,1,100.0,1,0.35,4,5.0,1,0.0,1
SpectralSubspaceRandomization,,,,,,inf,,inf,,10000.0,...,10000.0,1,100.0,1,0.35,4,5.0,1,0.0,1
WardAgglomerativeClustering,0.782541,0.0,4.0,4.0,4.0,7430.75458,155.17805,3571.057,2.912283,10000.0,...,10000.0,1,100.0,1,0.35,4,5.0,1,0.0,1


In [177]:
df = df_common_fill.copy()
df = df.loc[df['n_samples'] == 10000]
df = df.loc[df['n_features'] == 1000]
df = df.loc[df['pct_random'] == 0.5]
df = df.loc[df['class_sep'] == 10.0]
df = df.loc[df['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
df = df.sort_values('model_nickname')
df = df.rename(columns={'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'best_adjusted_rand': 'Adjusted Rand', 'best_n_clusters_': 'Number of Clusters Found', 'n_classes': 'True Number of Clusters', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'seed_unified': 'Seed'})
df = df[['Model', 'Adjusted Rand', 'Time (s)', 'Memory (MB)', 'Number of samples', 'Number of features', 'Class Separation', '% Random Features', 'Seed', 'Number of Clusters Found', 'True Number of Clusters']]
df = df.groupby('Model').agg({'Adjusted Rand': ['mean', 'std'], 'Number of Clusters Found': ['median', ('Q25', lambda x: x.quantile(0.25)), ('Q75', lambda x: x.quantile(0.75))], 'Time (s)': ['mean', 'std'], 'Memory (MB)': ['mean', 'std'], 'Number of samples': ['mean', 'nunique'], 'Number of features': ['mean', 'nunique'], 'Class Separation': ['mean', 'nunique'], '% Random Features': ['mean', 'nunique'], 'True Number of Clusters': ['mean', 'nunique'], 'Seed': ['mean', 'nunique']})
df.to_csv('simulated_data_hard_more_samples.csv')
df

Unnamed: 0_level_0,Adjusted Rand,Adjusted Rand,Number of Clusters Found,Number of Clusters Found,Number of Clusters Found,Time (s),Time (s),Memory (MB),Memory (MB),Number of samples,...,Number of features,Number of features,Class Separation,Class Separation,% Random Features,% Random Features,True Number of Clusters,True Number of Clusters,Seed,Seed
Unnamed: 0_level_1,mean,std,median,Q25,Q75,mean,std,mean,std,mean,...,mean,nunique,mean,nunique,mean,nunique,mean,nunique,mean,nunique
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AffinityPropagation,0.408197,,5.0,5.0,5.0,7348.131929,,3590.168,,10000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,0.0,1
DBSCAN,0.0,,1.0,1.0,1.0,406.390341,,2165.288,,10000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,0.0,1
HDBSCAN,0.9993746,,6.0,6.0,6.0,inf,,inf,,10000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,0.0,1
KMeans,0.1383373,,2.0,2.0,2.0,269.914718,,1187.536,,10000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,0.0,1
OPTICS,9.270328e-07,,18.0,18.0,18.0,4420.699471,,1287.016,,10000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,0.0,1
RecursiveClustering,0.409462,,3.0,3.0,3.0,7003.752386,,2194.076,,10000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,0.0,1
SpectralSubspaceRandomization,,,,,,inf,,inf,,10000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,0.0,1
WardAgglomerativeClustering,0.9457308,,5.0,5.0,5.0,1533.686058,,1308.08,,10000.0,...,1000.0,1,10.0,1,0.5,1,5.0,1,0.0,1


In [195]:
df = df_common_fill.copy()
n_samples = 100
df = df.loc[df['n_samples'] == n_samples]
df = df.loc[df['n_features'] == 10000]
# df = df.loc[df['pct_random'] == 0.5]
df = df.loc[df['class_sep'] == 50.0]
df = df.loc[df['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
df = df.sort_values('model_nickname')
df = df.rename(columns={'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'best_adjusted_rand': 'Adjusted Rand', 'best_n_clusters_': 'Number of Clusters Found', 'n_classes': 'True Number of Clusters', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'seed_unified': 'Seed'})
df = df[['Model', 'Adjusted Rand', 'Time (s)', 'Memory (MB)', 'Number of samples', 'Number of features', 'Class Separation', '% Random Features', 'Seed', 'Number of Clusters Found', 'True Number of Clusters']]
df = df.groupby('Model').agg({'Adjusted Rand': ['mean', 'std'], 'Number of Clusters Found': ['median', ('Q25', lambda x: x.quantile(0.25)), ('Q75', lambda x: x.quantile(0.75))], 'Time (s)': ['mean', 'std'], 'Memory (MB)': ['mean', 'std'], 'Number of samples': ['mean', 'nunique'], 'Number of features': ['mean', 'nunique'], 'Class Separation': ['mean', 'nunique'], '% Random Features': ['mean', 'nunique'], 'True Number of Clusters': ['mean', 'nunique'], 'Seed': ['mean', 'nunique']})
df.to_csv(f'simulated_data_medium_{n_samples}samples.csv')
df

Unnamed: 0_level_0,Adjusted Rand,Adjusted Rand,Number of Clusters Found,Number of Clusters Found,Number of Clusters Found,Time (s),Time (s),Memory (MB),Memory (MB),Number of samples,...,Number of features,Number of features,Class Separation,Class Separation,% Random Features,% Random Features,True Number of Clusters,True Number of Clusters,Seed,Seed
Unnamed: 0_level_1,mean,std,median,Q25,Q75,mean,std,mean,std,mean,...,mean,nunique,mean,nunique,mean,nunique,mean,nunique,mean,nunique
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AffinityPropagation,0.705632,0.222672,10.0,7.25,12.25,291.665205,6.030363,1070.73,640.08979,100.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
DBSCAN,0.0,0.0,1.0,1.0,1.0,468.850985,56.704983,1331.866,480.067282,100.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
HDBSCAN,1.0,0.0,5.0,5.0,5.0,245.753154,2.265686,1118.75,644.658505,100.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
KMeans,0.734719,0.386622,5.0,4.25,5.5,228.895834,17.110452,328.976,8.565886,100.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
OPTICS,0.996882,0.006235,5.0,5.0,5.25,81.335244,1.056877,1073.542,643.367891,100.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
RecursiveClustering,0.977885,0.036395,5.0,5.0,5.25,1223.913159,185.241066,1469.911,561.160744,100.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
SpectralSubspaceRandomization,1.0,0.0,5.0,5.0,5.0,1566.598713,63.137136,585.495,351.864159,100.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
WardAgglomerativeClustering,0.924978,0.090899,5.0,5.0,5.25,264.170412,5.403925,1106.073,640.410057,100.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1


In [196]:
df = df_common_fill.copy()
n_samples = 1000
df = df.loc[df['n_samples'] == n_samples]
df = df.loc[df['n_features'] == 10000]
# df = df.loc[df['pct_random'] == 0.5]
df = df.loc[df['class_sep'] == 50.0]
df = df.loc[df['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
df = df.sort_values('model_nickname')
df = df.rename(columns={'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'best_adjusted_rand': 'Adjusted Rand', 'best_n_clusters_': 'Number of Clusters Found', 'n_classes': 'True Number of Clusters', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'seed_unified': 'Seed'})
df = df[['Model', 'Adjusted Rand', 'Time (s)', 'Memory (MB)', 'Number of samples', 'Number of features', 'Class Separation', '% Random Features', 'Seed', 'Number of Clusters Found', 'True Number of Clusters']]
df = df.groupby('Model').agg({'Adjusted Rand': ['mean', 'std'], 'Number of Clusters Found': ['median', ('Q25', lambda x: x.quantile(0.25)), ('Q75', lambda x: x.quantile(0.75))], 'Time (s)': ['mean', 'std'], 'Memory (MB)': ['mean', 'std'], 'Number of samples': ['mean', 'nunique'], 'Number of features': ['mean', 'nunique'], 'Class Separation': ['mean', 'nunique'], '% Random Features': ['mean', 'nunique'], 'True Number of Clusters': ['mean', 'nunique'], 'Seed': ['mean', 'nunique']})
df.to_csv(f'simulated_data_medium_{n_samples}samples.csv')
df

Unnamed: 0_level_0,Adjusted Rand,Adjusted Rand,Number of Clusters Found,Number of Clusters Found,Number of Clusters Found,Time (s),Time (s),Memory (MB),Memory (MB),Number of samples,...,Number of features,Number of features,Class Separation,Class Separation,% Random Features,% Random Features,True Number of Clusters,True Number of Clusters,Seed,Seed
Unnamed: 0_level_1,mean,std,median,Q25,Q75,mean,std,mean,std,mean,...,mean,nunique,mean,nunique,mean,nunique,mean,nunique,mean,nunique
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AffinityPropagation,0.394753,0.171944,3.0,2.5,5.5,212.604738,43.821401,1863.784,1419.461313,1000.0,...,10000.0,1,50.0,1,0.233333,3,5.0,1,0.0,1
DBSCAN,0.0,0.0,1.0,1.0,1.0,433.483653,12.986188,1807.408,331.777796,1000.0,...,10000.0,1,50.0,1,0.233333,3,5.0,1,0.0,1
HDBSCAN,1.0,0.0,5.0,5.0,5.0,223.830181,16.12429,1411.584,604.404744,1000.0,...,10000.0,1,50.0,1,0.233333,3,5.0,1,0.0,1
KMeans,0.705713,0.436123,5.0,3.5,5.0,194.259393,25.739411,725.601333,442.063026,1000.0,...,10000.0,1,50.0,1,0.233333,3,5.0,1,0.0,1
OPTICS,1.0,0.0,5.0,5.0,5.0,435.720034,5.135288,1349.556,604.088144,1000.0,...,10000.0,1,50.0,1,0.233333,3,5.0,1,0.0,1
RecursiveClustering,0.960138,0.058236,5.0,5.0,5.5,1211.743927,99.746887,1532.3,403.019641,1000.0,...,10000.0,1,50.0,1,0.233333,3,5.0,1,0.0,1
SpectralSubspaceRandomization,1.0,0.0,5.0,5.0,5.0,4635.317239,440.833632,1483.517333,401.51956,1000.0,...,10000.0,1,50.0,1,0.233333,3,5.0,1,0.0,1
WardAgglomerativeClustering,0.999165,0.001446,5.0,5.0,5.0,279.317189,27.937345,1514.278667,427.108678,1000.0,...,10000.0,1,50.0,1,0.233333,3,5.0,1,0.0,1


In [197]:
df = df_common_fill.copy()
n_samples = 10000
df = df.loc[df['n_samples'] == n_samples]
df = df.loc[df['n_features'] == 10000]
# df = df.loc[df['pct_random'] == 0.5]
df = df.loc[df['class_sep'] == 50.0]
df = df.loc[df['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
df = df.sort_values('model_nickname')
df = df.rename(columns={'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'best_adjusted_rand': 'Adjusted Rand', 'best_n_clusters_': 'Number of Clusters Found', 'n_classes': 'True Number of Clusters', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'seed_unified': 'Seed'})
df = df[['Model', 'Adjusted Rand', 'Time (s)', 'Memory (MB)', 'Number of samples', 'Number of features', 'Class Separation', '% Random Features', 'Seed', 'Number of Clusters Found', 'True Number of Clusters']]
df = df.groupby('Model').agg({'Adjusted Rand': ['mean', 'std'], 'Number of Clusters Found': ['median', ('Q25', lambda x: x.quantile(0.25)), ('Q75', lambda x: x.quantile(0.75))], 'Time (s)': ['mean', 'std'], 'Memory (MB)': ['mean', 'std'], 'Number of samples': ['mean', 'nunique'], 'Number of features': ['mean', 'nunique'], 'Class Separation': ['mean', 'nunique'], '% Random Features': ['mean', 'nunique'], 'True Number of Clusters': ['mean', 'nunique'], 'Seed': ['mean', 'nunique']})
df.to_csv(f'simulated_data_medium_{n_samples}samples.csv')
df

Unnamed: 0_level_0,Adjusted Rand,Adjusted Rand,Number of Clusters Found,Number of Clusters Found,Number of Clusters Found,Time (s),Time (s),Memory (MB),Memory (MB),Number of samples,...,Number of features,Number of features,Class Separation,Class Separation,% Random Features,% Random Features,True Number of Clusters,True Number of Clusters,Seed,Seed
Unnamed: 0_level_1,mean,std,median,Q25,Q75,mean,std,mean,std,mean,...,mean,nunique,mean,nunique,mean,nunique,mean,nunique,mean,nunique
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AffinityPropagation,0.087754,0.019162,112.5,93.5,133.75,5300.751929,457.882468,4447.989,183.783045,10000.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
DBSCAN,0.0,0.0,1.0,1.0,1.0,1079.975262,40.682919,3297.037,295.117361,10000.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
HDBSCAN,,,,,,inf,,inf,,10000.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
KMeans,0.793508,0.3334,5.0,4.25,5.0,563.6524,31.476729,2326.306,740.989262,10000.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
OPTICS,1.0,0.0,5.0,5.0,5.0,8207.799477,992.601355,3084.931,320.679048,10000.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
RecursiveClustering,0.929387,0.130033,5.0,4.75,5.0,3845.434804,570.223215,3136.996,294.056798,10000.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
SpectralSubspaceRandomization,,,,,,inf,,inf,,10000.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1
WardAgglomerativeClustering,0.740728,0.083625,4.0,3.75,4.0,7515.465591,141.255042,3099.448,320.918731,10000.0,...,10000.0,1,50.0,1,0.35,4,5.0,1,0.0,1


# Debug and explore

In [12]:
df = df_runs_raw_parents.copy()

In [14]:
df = df.loc[df['model_nickname'] == 'RecursiveClustering']

In [16]:
parent_run_ids = list(df.index)

In [17]:
parent_run_ids

['00632cf3ad074f87864622092ea3e0ca',
 '07b4844fc23843be8c04006dccf40000',
 '0c2298db42b04144872b743f49b079c9',
 '130c56acae814efe87af52250feba181',
 '1672fa8e1b8c4347a50c9a9edf3163c4',
 '20530da3f18948fe9acc10624a6836d8',
 '22014d5dda56412a9183599deb9e2b61',
 '2985710c23e44408b32e4af09674c512',
 '2b2966618d7d421cb1547362bdd5805d',
 '2ca549456d2242cdaeaf7bf702c7792c',
 '2d7bfd760fe74b00b4718b7afe45d8c7',
 '2dd7d98d4a71456f9cfa041b6e38ee78',
 '2ee55abec496486a89ad3b7aa26b3725',
 '3158a7facea94724a927ff1f65e4292c',
 '33c7ce3fc5ef407db5cbd89c064fd7e8',
 '348bd5da26c145aa9b9e158b640fea29',
 '38383dd42bc340a7872cb97610dbd07b',
 '3c1a8f173c88439a975930483c1ce3ee',
 '4644c1272758473db4cfab2972a09b7d',
 '46c76db43d6a4db4b84bdaa0aa918b55',
 '49d77a5124e04671b7549a5cd5fc8a4d',
 '4a44737dbc724d16a69cd5e35ed32a1c',
 '54cb7a39f63048c5934f628da37f770b',
 '58f2f945c7cc4dcd8bea58457166c3e6',
 '59cf1a91b39b4bd196a78cef506a2d28',
 '5a5bb69e60d64a4d988af643048e8525',
 '5d805a20cfb146e582eae473afd0eaa5',
 

In [18]:
df = df_runs_raw.copy()
df = df.loc[df['parent_run_id'].isin(parent_run_ids)]

In [19]:
child_run_ids = list(df.index)

In [20]:
runs_to_delete = parent_run_ids + child_run_ids

In [21]:
runs_to_delete

['00632cf3ad074f87864622092ea3e0ca',
 '07b4844fc23843be8c04006dccf40000',
 '0c2298db42b04144872b743f49b079c9',
 '130c56acae814efe87af52250feba181',
 '1672fa8e1b8c4347a50c9a9edf3163c4',
 '20530da3f18948fe9acc10624a6836d8',
 '22014d5dda56412a9183599deb9e2b61',
 '2985710c23e44408b32e4af09674c512',
 '2b2966618d7d421cb1547362bdd5805d',
 '2ca549456d2242cdaeaf7bf702c7792c',
 '2d7bfd760fe74b00b4718b7afe45d8c7',
 '2dd7d98d4a71456f9cfa041b6e38ee78',
 '2ee55abec496486a89ad3b7aa26b3725',
 '3158a7facea94724a927ff1f65e4292c',
 '33c7ce3fc5ef407db5cbd89c064fd7e8',
 '348bd5da26c145aa9b9e158b640fea29',
 '38383dd42bc340a7872cb97610dbd07b',
 '3c1a8f173c88439a975930483c1ce3ee',
 '4644c1272758473db4cfab2972a09b7d',
 '46c76db43d6a4db4b84bdaa0aa918b55',
 '49d77a5124e04671b7549a5cd5fc8a4d',
 '4a44737dbc724d16a69cd5e35ed32a1c',
 '54cb7a39f63048c5934f628da37f770b',
 '58f2f945c7cc4dcd8bea58457166c3e6',
 '59cf1a91b39b4bd196a78cef506a2d28',
 '5a5bb69e60d64a4d988af643048e8525',
 '5d805a20cfb146e582eae473afd0eaa5',
 

In [22]:
run_uuid_query = [f"'{run_id}'" for run_id in runs_to_delete]
run_uuid_query = ', '.join(run_uuid_query)

In [23]:
query = f"""
UPDATE runs
SET lifecycle_stage = 'deleted'
WHERE run_uuid IN ({run_uuid_query}) 
"""
with engine.begin() as conn:
    conn.execute(text(query))

for i, row in df_runs_raw.iterrows():
    run_id = row.run_id
    model_name = row['params.model_name']
    with mlflow.start_run(run_id) as run:
        mlflow.log_param('model_nickname', model_name)    