In [2]:
from sqlalchemy import create_engine, text
import pandas as pd
from ml_experiments.analyze import get_df_runs_from_mlflow_sql, get_missing_entries, get_common_combinations, get_df_with_combinations
import plotly.express as px
from itertools import product
import time
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# Save Results

## Load mlflow runs

In [3]:
# Connection settings for the PostgreSQL database that stores the MLflow runs.
db_port = 5001
db_name = 'recursive_clustering'
w = 'clust9'  # only referenced by the alternative (commented-out) URL below
# url = f'postgresql://{w}.ceremade.dauphine.lan:{db_port}/{db_name}'
url = f'postgresql://belucci@localhost:{db_port}/{db_name}'
engine = create_engine(url)

# Read the name of every experiment recorded in the `experiments` table.
query = 'SELECT experiments.name from experiments'
df_experiment_names = pd.read_sql(query, engine)
experiment_names = df_experiment_names['name'].tolist()

In [4]:
# Display the experiment names read from the `experiments` table.
experiment_names

['Default',
 'blob_experiment',
 'hpo_classif_experiment',
 'hpo_openml_experiment',
 'hpo_gaussian_experiment',
 'hpo_openml_final',
 'outlier_hc',
 'time_hc',
 'hpo_n_clusters']

In [5]:
# Experiments whose runs are fetched by the loading cells below.
experiments_names = ['hpo_classif_experiment']

In [6]:
# Keys requested from the MLflow `params` table for every run.
params_columns = [
    'model_nickname',
    'n_samples', 'n_features',
    'pct_random', 'seed_unified', 'class_sep',
    'n_random', 'n_informative', 'n_classes',
    'n_trials',
]

In [7]:
# Keys requested from the MLflow `latest_metrics` table for every run:
# first the un-prefixed metrics, then the ones logged with a `best_` prefix.
latest_metrics_columns = [
    # resource usage
    'fit_model_return_elapsed_time', 'max_memory_used',
    # un-prefixed clustering metrics
    'n_clusters_',
    'rand_score', 'adjusted_rand',
    'mutual_info', 'adjusted_mutual_info', 'normalized_mutual_info',
    'homogeneity', 'completeness', 'v_measure',
    'silhouette', 'calinski_harabasz_score', 'davies_bouldin_score', 'inertia_score',
    # `best_`-prefixed clustering metrics
    'best_n_clusters_',
    'best_rand_score', 'best_adjusted_rand',
    'best_mutual_info', 'best_adjusted_mutual_info', 'best_normalized_mutual_info',
    'best_homogeneity_completeness_v_measure',
    'best_silhouette', 'best_calinski_harabasz_score', 'best_davies_bouldin_score', 'best_inertia_score',
    'best_homogeneity', 'best_completeness', 'best_v_measure',
]

In [8]:
# Keys requested from the MLflow `tags` table for every run.
# The model hyperparameter names are appended to this list by the next cell.
tags_columns = ['raised_exception', 'EXCEPTION', 'parent_run_id', 'best_child_run_id']

In [9]:
# Hyperparameters of each model; they are saved as tags on the parent run.
parameters = {
    'RecursiveClustering': ['components_size', 'repetitions', 'kmeans_n_clusters'],
    'KMeans': ['n_clusters'],
    'HDBSCAN': ['min_cluster_size'],
    'DBSCAN': ['eps', 'min_samples'],
    'AffinityPropagation': ['damping'],
    'OPTICS': ['min_samples'],
    'SpectralSubspaceRandomization': ['n_similarities', 'sampling_ratio', 'sc_n_clusters'],
    'WardAgglomerativeClustering': ['n_clusters'],
}
# Flatten to a de-duplicated list. dict.fromkeys keeps first-seen order, so the
# result is the same on every kernel start (the iteration order of a set of
# strings is not).
all_model_parameters = list(dict.fromkeys(p for params in parameters.values() for p in params))
# Register the hyperparameters as tag keys to fetch. Names already present are
# skipped so that re-running this cell does not append duplicates to tags_columns.
for param in all_model_parameters:
    if param not in tags_columns:
        tags_columns.append(param)

In [10]:
# Arguments shared by the three queries: same experiments, no experiment-level columns.
experiments_columns = []
shared_query_args = dict(experiments_columns=experiments_columns, experiments_names=experiments_names)

# Run-level columns together with the requested params.
runs_columns = ['run_uuid', 'status', 'start_time', 'end_time']
other_table = 'params'
other_table_keys = params_columns
df_params = get_df_runs_from_mlflow_sql(
    engine,
    runs_columns=runs_columns,
    other_table=other_table,
    other_table_keys=other_table_keys,
    **shared_query_args,
)

# Requested metrics, keyed by run_uuid only.
df_latest_metrics = get_df_runs_from_mlflow_sql(
    engine,
    runs_columns=['run_uuid'],
    other_table='latest_metrics',
    other_table_keys=latest_metrics_columns,
    **shared_query_args,
)

# Requested tags, keyed by run_uuid only.
df_tags = get_df_runs_from_mlflow_sql(
    engine,
    runs_columns=['run_uuid'],
    other_table='tags',
    other_table_keys=tags_columns,
    **shared_query_args,
)

In [11]:
# One wide table per run: params, then metrics, then tags, joined on the index.
df_runs_raw = df_params.join(df_latest_metrics).join(df_tags)

In [12]:
# Parent runs are the rows that have no parent_run_id tag.
has_no_parent = df_runs_raw['parent_run_id'].isna()
df_runs_raw_parents = df_runs_raw.loc[has_no_parent].copy()

In [13]:
# Display the parent runs (rows of df_runs_raw with no parent_run_id).
df_runs_raw_parents

key,status,start_time,end_time,class_sep,model_nickname,n_classes,n_features,n_informative,n_random,n_samples,...,kmeans_n_clusters,min_cluster_size,min_samples,n_clusters,n_similarities,parent_run_id,raised_exception,repetitions,sampling_ratio,sc_n_clusters
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000aaf76d56747d59a35d53ba0fde4a4,FINISHED,1736980797626,1.736985e+12,50.0,AffinityPropagation,5,10000,2,16,100,...,,,,,,,False,,,
000b291a142341289479933ddb447488,FINISHED,1736872145638,1.736877e+12,50.0,RecursiveClustering,5,10000,2,16,1000,...,2,,,,,,False,10,,
000f29a52641434a9a03ec3951ba12ba,FINISHED,1736889529482,1.736911e+12,50.0,AverageAgglomerativeClustering,5,10000,2,16,10000,...,,,,8,,,False,,,
00142f7d57c244c2b33da0cfa8480cbf,FINISHED,1737110379379,1.737119e+12,50.0,HDBSCAN,5,10000,2,16,100,...,,6,,,,,False,,,
00151ab7956742b391a46ebbc769a9ff,FINISHED,1737069092293,1.737078e+12,50.0,DBSCAN,5,100,2,16,10000,...,,,10,,,,False,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffefb39686434d8c8b9c8916e427b0fc,FINISHED,1734336072705,1.734523e+12,10.0,KMeansProj,5,100,2,16,1000,...,,,,2,,,False,,,
fff03a5c0e4147b6a0177fdc3ff06b93,FINISHED,1734323313209,1.734576e+12,10.0,Clique,5,100,2,16,1000,...,,,,,,,False,,,
fff18439dc90490186780bf945fad9d7,FINISHED,1736889583158,1.736899e+12,50.0,AverageAgglomerativeClustering,5,1000,2,16,1000,...,,,,5,,,False,,,
fffb0b0c2daa4098a83b9be7f16f7b4d,FINISHED,1736980833830,1.736984e+12,50.0,AffinityPropagation,5,1000,2,16,100,...,,,,,,,False,,,


In [14]:
# Give RecursiveClustering runs with n_trials == '20' their own model nickname
# so they are treated as a separate model from here on.
df_runs_parents = df_runs_raw_parents.copy()
is_recursive_clustering = df_runs_parents['model_nickname'] == 'RecursiveClustering'
has_20_trials = df_runs_parents['n_trials'] == '20'
df_runs_parents.loc[is_recursive_clustering & has_20_trials, 'model_nickname'] = 'RecursiveClustering_20'

## Delete duplicate runs (if any) and complete some models that cannot run with some datasets

In [15]:
# Columns that together identify one run configuration.
non_duplicate_columns = [
    'model_nickname',
    'n_samples', 'n_features',
    'pct_random', 'seed_unified', 'class_sep',
    'n_random', 'n_informative', 'n_classes',
]
# Keep only runs that logged best_adjusted_rand ...
has_best_ari = df_runs_parents['best_adjusted_rand'].notna()
df_runs_parents = df_runs_parents.loc[has_best_ari].copy()
# ... and, for each configuration, only the first such run.
df_runs_parents = df_runs_parents.drop_duplicates(subset=non_duplicate_columns)

# Missing runs

In [16]:
# Model nicknames present after cleaning, in order of first appearance.
df_runs_parents['model_nickname'].drop_duplicates().tolist()

['AffinityPropagation',
 'RecursiveClustering',
 'AverageAgglomerativeClustering',
 'HDBSCAN',
 'DBSCAN',
 'KMeansProj',
 'OPTICS',
 'SpectralSubspaceRandomization',
 'Proclus',
 'WardAgglomerativeClustering',
 'KMeans',
 'SingleAgglomerativeClustering',
 'SpectralClustering',
 'CompleteAgglomerativeClustering',
 'IRFLLRR',
 'Clique',
 'RecursiveClustering_20']

In [17]:
# Reduced key (model + dataset configuration) used by the missing-runs check below.
non_duplicate_columns = [
    'model_nickname',
    'n_samples', 'n_features',
    'pct_random', 'seed_unified', 'class_sep',
]

In [18]:
# Expected grid of configurations; every combination should have a run in
# df_runs_parents. Values are strings, matching how the params columns are
# compared elsewhere in this notebook (e.g. df['pct_random'] == '0.0').
# Previously used grids, kept for reference:
# n_samples = ['100', '1000', '10000']
# n_features = ['100', '1000', '10000']
# pct_random = ['0.0', '0.2', '0.5', '0.7']
# seed_unified = [f'{i}' for i in range(10)]
# class_sep = ['10.0', '50.0', '100.0']
# n_samples = ['1000']
# n_features = ['1000', '10000']
n_samples = ['100000']
n_features = ['10000']
pct_random = ['0.0']
# At least one seed must be listed: range(0) produced an empty list, which leaves
# the expected grid empty (assuming get_missing_entries takes the Cartesian
# product of these lists), so no missing run could ever be reported.
# NOTE(review): a single seed '0' is assumed for this setting — confirm.
seed_unified = [f'{i}' for i in range(1)]
class_sep = ['10.0', '20.0', '30.0', '40.0', '50.0', '100.0']
model_nickname = [
    # 'WardAgglomerativeClustering',
    # 'KMeans',
    # 'DBSCAN',
    # 'HDBSCAN',
    # 'SpectralSubspaceRandomization',
    # 'RecursiveClustering',
    'RecursiveClustering_20',
    # 'OPTICS',
    # 'KMeansProj',
    # 'IRFLLRR',
    # 'Clique',
    # 'SingleAgglomerativeClustering',
    # 'SpectralClustering',
    # 'Proclus',
    # 'CompleteAgglomerativeClustering',
    # 'AverageAgglomerativeClustering',
    # 'AffinityPropagation'
]
# The value lists are given in the same order as the column names.
columns_names = non_duplicate_columns
should_contain_values = [model_nickname, n_samples, n_features, pct_random, seed_unified, class_sep]
df_missing = get_missing_entries(df_runs_parents, columns_names, should_contain_values)

In [20]:
# Display the result of get_missing_entries for the grid defined above.
df_missing

Unnamed: 0,model_nickname,n_samples,n_features,pct_random,seed_unified,class_sep


In [None]:
# Mean / std of the fit time of each parent's best child run, per model and
# n_features, for n_samples == 1000, pct_random == '0.0', class_sep == '100.0'.
df_child = df_runs_raw[['fit_model_return_elapsed_time']]
df = df_runs_parents.join(df_child, on='best_child_run_id', rsuffix='_child')
df = df.astype({'n_samples': int, 'n_features': int})
is_selected = (df['n_samples'] == 1000) & (df['pct_random'] == '0.0') & (df['class_sep'] == '100.0')
df = (
    df.loc[is_selected]
    .groupby(['model_nickname', 'n_features'])
    .agg({'fit_model_return_elapsed_time_child': ('mean', 'std')})
    .reset_index()
    .sort_values(by=['n_features', 'model_nickname'])
)
# Flatten the two-level column labels into '<column> <statistic>'.
df.columns = [' '.join(col).strip() for col in df.columns.values]

In [None]:
# Log-log plot of mean fit time against n_features, one line per model, std as error bars.
fig = px.line(
    df,
    x='n_features',
    y='fit_model_return_elapsed_time_child mean',
    error_y='fit_model_return_elapsed_time_child std',
    color='model_nickname',
    line_dash='model_nickname',
    markers=True,
    log_x=True,
    log_y=True,
)
fig.show()

In [None]:
# Mean / std of the fit time of each parent's best child run, per model and
# n_samples, for n_features == 1000, pct_random == '0.0', class_sep == '100.0'.
df_child = df_runs_raw[['fit_model_return_elapsed_time']]
df = df_runs_parents.join(df_child, on='best_child_run_id', rsuffix='_child')
df = df.astype({'n_samples': int, 'n_features': int})
is_selected = (df['n_features'] == 1000) & (df['pct_random'] == '0.0') & (df['class_sep'] == '100.0')
df = (
    df.loc[is_selected]
    .groupby(['model_nickname', 'n_samples'])
    .agg({'fit_model_return_elapsed_time_child': ('mean', 'std')})
    .reset_index()
    .sort_values(by=['n_samples', 'model_nickname'])
)
# Flatten the two-level column labels into '<column> <statistic>'.
df.columns = [' '.join(col).strip() for col in df.columns.values]

In [None]:
# Log-log plot of mean fit time against n_samples, one line per model, std as error bars.
fig = px.line(
    df,
    x='n_samples',
    y='fit_model_return_elapsed_time_child mean',
    error_y='fit_model_return_elapsed_time_child std',
    color='model_nickname',
    line_dash='model_nickname',
    markers=True,
    log_x=True,
    log_y=True,
)
fig.show()

In [21]:
# Inspect the RecursiveClustering_20 runs for n_samples == '100000', n_features == '10000'.
is_selected = (
    (df_runs_parents['model_nickname'] == 'RecursiveClustering_20')
    & (df_runs_parents['n_samples'] == '100000')
    & (df_runs_parents['n_features'] == '10000')
)
df = df_runs_parents.loc[is_selected].copy()
df

key,status,start_time,end_time,class_sep,model_nickname,n_classes,n_features,n_informative,n_random,n_samples,...,kmeans_n_clusters,min_cluster_size,min_samples,n_clusters,n_similarities,parent_run_id,raised_exception,repetitions,sampling_ratio,sc_n_clusters
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1278d11bcee144828176f33d217a7186,FINISHED,1740665625519,1740699000000.0,40.0,RecursiveClustering_20,5,10000,,,100000,...,2,,,,,,False,10,,
1af66524551046418432b9c498fd947a,FINISHED,1740665599174,1740681000000.0,20.0,RecursiveClustering_20,5,10000,,,100000,...,3,,,,,,False,10,,
2b9bec0dd50148a2ab2bff096576c820,FINISHED,1740665756789,1740746000000.0,100.0,RecursiveClustering_20,5,10000,,,100000,...,2,,,,,,False,10,,
66ecae4442d34200b7149029c2a6a2a5,FINISHED,1740665650940,1740714000000.0,10.0,RecursiveClustering_20,5,10000,,,100000,...,3,,,,,,False,10,,
6e6b98048e2249c2ac077edf92606887,FINISHED,1740665638108,1740703000000.0,50.0,RecursiveClustering_20,5,10000,,,100000,...,2,,,,,,False,4,,
8607186b4ba34c22b85dad96995a7032,FINISHED,1740665612007,1740691000000.0,30.0,RecursiveClustering_20,5,10000,,,100000,...,7,,,,,,False,8,,


# Get common combinations

In [None]:
# Dataset-configuration columns whose value combinations are compared across models.
combination_columns = [
    'n_samples',
    'n_features',
    'pct_random',
    'seed_unified',
    'class_sep',
]
column = 'model_nickname'
# Models taken into account (commented entries are excluded).
model_nickname = [
    'WardAgglomerativeClustering',
    'KMeans',
    'DBSCAN',
    'HDBSCAN',
    'SpectralSubspaceRandomization',
    'RecursiveClustering',
    'RecursiveClustering_20',
    'OPTICS',
    # 'KMeansProj',
    # 'IRFLLRR',
    # 'Clique',
    # 'SingleAgglomerativeClustering',
    # 'SpectralClustering',
    # 'Proclus',
    # 'CompleteAgglomerativeClustering',
    # 'AverageAgglomerativeClustering',
    'AffinityPropagation'
]
is_selected_model = df_runs_parents['model_nickname'].isin(model_nickname)
df = df_runs_parents.loc[is_selected_model].copy()
# NOTE(review): presumably the combinations present for every value of `column` — confirm in ml_experiments.analyze.
common_combinations = get_common_combinations(df, column, combination_columns)

In [None]:
# Build df_common from df, combination_columns and common_combinations.
# NOTE(review): presumably keeps the rows of df whose combination_columns values are
# in common_combinations — inferred from the helper's name; confirm in ml_experiments.analyze.
df_common = get_df_with_combinations(df, combination_columns, common_combinations)

In [None]:
# Cast the dataset-configuration columns to numeric dtypes.
dataset_dtypes = {
    'n_samples': int,
    'n_features': int,
    'pct_random': float,
    'class_sep': float,
    'seed_unified': int,
    'n_classes': int,
}
for column_name, dtype in dataset_dtypes.items():
    df_common[column_name] = df_common[column_name].astype(dtype)
# Model hyperparameters are cast to float.
for param in all_model_parameters:
    df_common[param] = df_common[param].astype(float)

In [None]:
# Display df_common after the dtype casts above.
df_common

# Plots

In [None]:
# Mean / std of the SpectralSubspaceRandomization hyperparameters for
# n_samples == 1000, n_features == 1000, pct_random == 0.0, class_sep == 50.0.
df = df_runs_parents.astype({
    'n_samples': int,
    'n_features': int,
    'pct_random': float,
    'class_sep': float,
    'seed_unified': int,
    'n_classes': int,
})
for param in all_model_parameters:
    df[param] = df[param].astype(float)
is_selected = (
    (df['n_samples'] == 1000)
    & (df['n_features'] == 1000)
    & (df['pct_random'] == 0.0)
    & (df['class_sep'] == 50.0)
    & df['model_nickname'].isin(['SpectralSubspaceRandomization',])
)
ssr_parameters = ['n_similarities', 'sampling_ratio', 'sc_n_clusters']
df = df.loc[is_selected, ['model_nickname'] + ssr_parameters]
df = df.groupby('model_nickname').agg({name: ['mean', 'std'] for name in ssr_parameters})

In [None]:
# Display the hyperparameter summary computed in the previous cell.
df

In [None]:
# Hyperparameter names of each model (same mapping as defined in the loading section).
parameters = dict(
    RecursiveClustering=['components_size', 'repetitions', 'kmeans_n_clusters'],
    KMeans=['n_clusters'],
    HDBSCAN=['min_cluster_size'],
    DBSCAN=['eps', 'min_samples'],
    AffinityPropagation=['damping'],
    OPTICS=['min_samples'],
    SpectralSubspaceRandomization=['n_similarities', 'sampling_ratio', 'sc_n_clusters'],
    WardAgglomerativeClustering=['n_clusters'],
)

In [None]:
# Box plot of best_adjusted_rand per model for
# n_samples == 1000, n_features == 10000, pct_random == 0.0, class_sep == 50.0.
is_selected = (
    (df_common['n_samples'] == 1000)
    & (df_common['n_features'] == 10000)
    & (df_common['pct_random'] == 0.0)
    & (df_common['class_sep'] == 50.0)
)
df = df_common.loc[is_selected].sort_values('model_nickname')
fig = px.box(df, x='model_nickname', y='best_adjusted_rand', color='model_nickname')
fig.show()

In [None]:
# ARI against class separation, one panel per (n_samples, n_features) setting,
# saved as a single PDF.
df = df_common.copy()
n_samples_n_features = [(1000, 1000), (1000, 10000)]
pct_random = 0.0
# Display names used in the legend.
models_names = {
    'RecursiveClustering': 'CoHiRF',
    'RecursiveClustering_20': 'CoHiRF (20 trials)',
    'KMeans': 'K-Means',
    'HDBSCAN': 'HDBSCAN',
    'DBSCAN': 'DBSCAN',
    'AffinityPropagation': 'Affinity Propagation',
    'OPTICS': 'OPTICS',
    'SpectralSubspaceRandomization': 'SC-SRGF',
    'WardAgglomerativeClustering': "Ward's Method",
}
df = df.loc[df['pct_random']==pct_random]
df = df.loc[df['model_nickname'].isin(['RecursiveClustering', 'RecursiveClustering_20', 'KMeans', 'HDBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
df = df.replace({'model_nickname': models_names})
df = df.sort_values(by='model_nickname')
df = df.rename(columns={'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'best_adjusted_rand' : 'ARI'})
plt.style.use('default')
# plt.style.use('seaborn-v0_8-paper')
with mpl.rc_context(rc={
    'figure.constrained_layout.use': True,
    'savefig.bbox': 'tight',
    'figure.figsize': (10, 6),
    'legend.loc': 'upper left',
    'legend.frameon': False,
    'font.size': 12,
    # 'font.family': 'serif',
    # 'font.serif': 'Times',
    'text.color': 'black',
    'grid.color': 'black',
    'grid.alpha': 0.5,
}):
    fig, axs = plt.subplots(2, 1)
    axs = axs.flatten()
    for i, (n_samples, n_features) in enumerate(n_samples_n_features):
        df_plot = df.copy()
        df_plot = df_plot.loc[df_plot['Number of features']==n_features]
        df_plot = df_plot.loc[df_plot['Number of samples']==n_samples]
        ax = sns.lineplot(data=df_plot, x='Class Separation', y='ARI', hue='Model', style='Model', markers=True, errorbar='ci', ax=axs[i])
        # Hide the per-panel legend; a single legend is drawn on the first panel below.
        ax.legend().set_visible(False)
    # These calls must stay inside the rc_context: 'legend.loc' / 'legend.frameon'
    # are read when the legend is created and 'savefig.bbox' when the figure is
    # saved, and the overrides are reverted as soon as the `with` block exits.
    axs[0].legend(title=None, fontsize=12)
    plt.savefig(f'hypercube_n-samples-n-features{n_samples_n_features}_pct-random{pct_random}.pdf')
    plt.show()

In [None]:
# Two stacked panels (fit time on a log scale, then ARI) against class separation
# for n_samples == 1000 and n_features == 10000, saved as a PDF.
n_features = 10000
n_samples = 1000
pct_random = 0.0
# Display names used in the legend.
models_names = {
    # 'RecursiveClustering': 'CoHiRF',
    'RecursiveClustering_20': 'CoHiRF',
    'KMeans': 'K-Means',
    'HDBSCAN': 'HDBSCAN',
    'DBSCAN': 'DBSCAN',
    'AffinityPropagation': 'Affinity Propagation',
    'OPTICS': 'OPTICS',
    'SpectralSubspaceRandomization': 'SC-SRGF',
    'WardAgglomerativeClustering': "Ward's Method",
}
plotted_models = ['RecursiveClustering_20', 'KMeans', 'HDBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering']
column_labels = {'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'best_adjusted_rand' : 'ARI'}
# Class separations 10 and 20 are left out of this figure.
is_selected = (
    (df_common['pct_random'] == pct_random)
    & ~df_common['class_sep'].isin([10, 20])
    & df_common['model_nickname'].isin(plotted_models)
)
df = (
    df_common.loc[is_selected]
    .replace({'model_nickname': models_names})
    .sort_values(by='model_nickname')
    .rename(columns=column_labels)
)
plt.style.use('default')
plot_rc = {
    'figure.constrained_layout.use': True,
    'savefig.bbox': 'tight',
    'figure.figsize': (10, 6),
    'legend.loc': 'upper left',
    'legend.frameon': False,
    'font.size': 12,
    'text.color': 'black',
    'grid.color': 'black',
    'grid.alpha': 0.5,
}
with mpl.rc_context(rc=plot_rc):
    fig, axs = plt.subplots(2, 1)
    axs = axs.flatten()
    in_setting = (df['Number of features'] == n_features) & (df['Number of samples'] == n_samples)
    df_plot = df.loc[in_setting]
    # Top panel: fit time, log-scaled.
    ax = sns.lineplot(data=df_plot, x='Class Separation', y='Time (s)', hue='Model', style='Model', markers=True, errorbar='ci', ax=axs[0])
    ax.legend().set_visible(False)
    ax.set_yscale('log')
    # Bottom panel: ARI.
    ax2 = sns.lineplot(data=df_plot, x='Class Separation', y='ARI', hue='Model', style='Model', markers=True, errorbar='ci', ax=axs[1])
    ax2.legend().set_visible(False)
    # Single shared legend, placed above the top panel.
    ax.legend(title=None, fontsize=12, bbox_to_anchor=(0.0, 1.3), ncols=4)
    plt.savefig(f'hypercube_n-samples{n_samples}_n-features{n_features}_pct-random{pct_random}_with_time.pdf')
    plt.show()

In [None]:
# Parent runs for n_samples == n_features == 10000 with pct_random == 0.0.
n_features = 10000
n_samples = 10000
df = df_runs_parents.astype({'n_samples': int, 'n_features': int, 'pct_random': float})
is_selected = (
    (df['n_samples'] == n_samples)
    & (df['n_features'] == n_features)
    & (df['pct_random'] == 0.0)
)
df = df.loc[is_selected]

In [None]:
# Display the runs selected in the previous cell.
df

In [None]:
# Two stacked panels (fit time on a log scale, then ARI) against class separation
# for n_samples == n_features == 10000, built from df_runs_parents; the figure
# is shown but not saved.
n_features = 10000
n_samples = 10000
pct_random = 0.0
# Display names used in the legend.
models_names = {
    # 'RecursiveClustering': 'CoHiRF',
    'RecursiveClustering_20': 'CoHiRF',
    'KMeans': 'K-Means',
    'HDBSCAN': 'HDBSCAN',
    'DBSCAN': 'DBSCAN',
    'AffinityPropagation': 'Affinity Propagation',
    'OPTICS': 'OPTICS',
    'SpectralSubspaceRandomization': 'SC-SRGF',
    'WardAgglomerativeClustering': "Ward's Method",
}
plotted_models = ['RecursiveClustering_20', 'KMeans', 'HDBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering']
column_labels = {'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'best_adjusted_rand' : 'ARI'}
df = df_runs_parents.astype({'n_samples': int, 'n_features': int, 'pct_random': float, 'class_sep': float})
is_selected = (df['pct_random'] == pct_random) & df['model_nickname'].isin(plotted_models)
df = (
    df.loc[is_selected]
    .replace({'model_nickname': models_names})
    .sort_values(by='model_nickname')
    .rename(columns=column_labels)
)
plt.style.use('default')
plot_rc = {
    'figure.constrained_layout.use': True,
    'savefig.bbox': 'tight',
    'figure.figsize': (10, 6),
    'legend.loc': 'upper left',
    'legend.frameon': False,
    'font.size': 12,
    'text.color': 'black',
    'grid.color': 'black',
    'grid.alpha': 0.5,
}
with mpl.rc_context(rc=plot_rc):
    fig, axs = plt.subplots(2, 1)
    axs = axs.flatten()
    in_setting = (df['Number of features'] == n_features) & (df['Number of samples'] == n_samples)
    df_plot = df.loc[in_setting]
    # Top panel: fit time, log-scaled.
    ax = sns.lineplot(data=df_plot, x='Class Separation', y='Time (s)', hue='Model', style='Model', markers=True, errorbar='ci', ax=axs[0])
    ax.legend().set_visible(False)
    ax.set_yscale('log')
    # Bottom panel: ARI.
    ax2 = sns.lineplot(data=df_plot, x='Class Separation', y='ARI', hue='Model', style='Model', markers=True, errorbar='ci', ax=axs[1])
    ax2.legend().set_visible(False)
    # Single shared legend, placed above the top panel.
    ax.legend(title=None, fontsize=12, bbox_to_anchor=(0.0, 1.3), ncols=4)
    # plt.savefig(f'hypercube_n-samples{n_samples}_n-features{n_features}_pct-random{pct_random}_with_time.pdf')
    plt.show()

In [None]:
# Fit time against class separation for n_samples == 1000 and n_features == 10000,
# drawn with the current matplotlib settings (no rc overrides, nothing saved).
n_features = 10000
n_samples = 1000
pct_random = 0.0
# Display names used in the legend.
models_names = {
    'RecursiveClustering': 'CoHiRF',
    'RecursiveClustering_20': 'CoHiRF (20 trials)',
    'KMeans': 'K-Means',
    'HDBSCAN': 'HDBSCAN',
    'DBSCAN': 'DBSCAN',
    'AffinityPropagation': 'Affinity Propagation',
    'OPTICS': 'OPTICS',
    'SpectralSubspaceRandomization': 'SC-SRGF',
    'WardAgglomerativeClustering': "Ward's Method",
}
plotted_models = ['RecursiveClustering', 'RecursiveClustering_20', 'KMeans', 'HDBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering']
column_labels = {'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'best_adjusted_rand' : 'ARI'}
is_selected = (df_common['pct_random'] == pct_random) & df_common['model_nickname'].isin(plotted_models)
df = (
    df_common.loc[is_selected]
    .replace({'model_nickname': models_names})
    .sort_values(by='model_nickname')
    .rename(columns=column_labels)
)
in_setting = (df['Number of features'] == n_features) & (df['Number of samples'] == n_samples)
df_plot = df.loc[in_setting]
sns.lineplot(data=df_plot, x='Class Separation', y='Time (s)', hue='Model', style='Model', markers=True, errorbar='ci')
plt.show()

In [None]:
# Display the rows that were plotted in the previous cell.
df_plot

In [None]:
# Per-model summary table for n_samples == 1000, n_features == 1000,
# class_sep == 100.0 (no filter on pct_random), written to simulated_data_easy.csv.
table_models = ['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering']
column_labels = {'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'best_adjusted_rand': 'Adjusted Rand', 'best_n_clusters_': 'Number of Clusters Found', 'n_classes': 'True Number of Clusters', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'seed_unified': 'Seed'}
table_columns = ['Model', 'Adjusted Rand', 'Time (s)', 'Memory (MB)', 'Number of samples', 'Number of features', 'Class Separation', '% Random Features', 'Seed', 'Number of Clusters Found', 'True Number of Clusters']
# Results get mean/std; configuration columns get mean/nunique so that mixed
# settings inside a group are visible in the table.
aggregations = {
    'Adjusted Rand': ['mean', 'std'],
    'Number of Clusters Found': ['median', ('Q25', lambda x: x.quantile(0.25)), ('Q75', lambda x: x.quantile(0.75))],
    'Time (s)': ['mean', 'std'],
    'Memory (MB)': ['mean', 'std'],
    'Number of samples': ['mean', 'nunique'],
    'Number of features': ['mean', 'nunique'],
    'Class Separation': ['mean', 'nunique'],
    '% Random Features': ['mean', 'nunique'],
    'True Number of Clusters': ['mean', 'nunique'],
    'Seed': ['mean', 'nunique'],
}
is_selected = (
    (df_common['n_samples'] == 1000)
    & (df_common['n_features'] == 1000)
    & (df_common['class_sep'] == 100.0)
    & df_common['model_nickname'].isin(table_models)
)
df = df_common.loc[is_selected].sort_values('model_nickname').rename(columns=column_labels)
df = df[table_columns].groupby('Model').agg(aggregations)
df.to_csv('simulated_data_easy.csv')
df

In [None]:
# Per-model summary table for n_samples == 1000, n_features == 1000,
# pct_random == 0.5, class_sep == 10.0, written to simulated_data_hard.csv.
table_models = ['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering']
column_labels = {'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'best_adjusted_rand': 'Adjusted Rand', 'best_n_clusters_': 'Number of Clusters Found', 'n_classes': 'True Number of Clusters', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'seed_unified': 'Seed'}
table_columns = ['Model', 'Adjusted Rand', 'Time (s)', 'Memory (MB)', 'Number of samples', 'Number of features', 'Class Separation', '% Random Features', 'Seed', 'Number of Clusters Found', 'True Number of Clusters']
# Results get mean/std; configuration columns get mean/nunique so that mixed
# settings inside a group are visible in the table.
aggregations = {
    'Adjusted Rand': ['mean', 'std'],
    'Number of Clusters Found': ['median', ('Q25', lambda x: x.quantile(0.25)), ('Q75', lambda x: x.quantile(0.75))],
    'Time (s)': ['mean', 'std'],
    'Memory (MB)': ['mean', 'std'],
    'Number of samples': ['mean', 'nunique'],
    'Number of features': ['mean', 'nunique'],
    'Class Separation': ['mean', 'nunique'],
    '% Random Features': ['mean', 'nunique'],
    'True Number of Clusters': ['mean', 'nunique'],
    'Seed': ['mean', 'nunique'],
}
is_selected = (
    (df_common['n_samples'] == 1000)
    & (df_common['n_features'] == 1000)
    & (df_common['pct_random'] == 0.5)
    & (df_common['class_sep'] == 10.0)
    & df_common['model_nickname'].isin(table_models)
)
df = df_common.loc[is_selected].sort_values('model_nickname').rename(columns=column_labels)
df = df[table_columns].groupby('Model').agg(aggregations)
df.to_csv('simulated_data_hard.csv')
df

# With filled values

In [None]:
# Columns that together identify one run configuration.
non_duplicate_columns = [
    'model_nickname',
    'n_samples', 'n_features',
    'pct_random', 'seed_unified', 'class_sep',
    'n_random', 'n_informative', 'n_classes',
]
# Start from the raw parent runs: keep those that logged best_adjusted_rand and,
# for each configuration, only the first one.
has_best_ari = df_runs_raw_parents['best_adjusted_rand'].notna()
df_runs_parents_fill = df_runs_raw_parents.loc[has_best_ari].copy()
df_runs_parents_fill = df_runs_parents_fill.drop_duplicates(subset=non_duplicate_columns)

In [None]:
# HDBSCAN and SpectralSubspaceRandomization could not yet be run with
# samples >= 10000, so those configurations are filled in as "no run" for now.
# Placeholder values written into every filled-in row:
status = 'FAILED'
raised_exception = True
EXCEPTION = 'NoRun'
no_run_time = np.inf
no_run_memory = np.inf
no_run_metric = np.nan
# Current time in milliseconds.
start_time = 1000 * time.time()
end_time = 1000 * time.time()

In [None]:
# Append one "no run" placeholder row per HDBSCAN configuration with n_samples == 10000.
model_nickname = 'HDBSCAN'
n_samples = ['10000']
n_features = ['100', '1000', '10000']
pct_random = ['0.0', '0.2', '0.5', '0.7']
seed_unified = [f'{i}' for i in range(10)]
class_sep = ['10.0', '100.0', '50.0']
n_classes = 5
combinations = list(product(n_samples, n_features, pct_random, seed_unified, class_sep))
df_cat = []
indexes = []
for n_sample, n_feature, pct_r, seed, class_s in combinations:
    # The label contains every varying field so that each placeholder row gets a
    # unique index; with only model/n_samples/n_features, all pct_random / seed /
    # class_sep variants of a setting shared one label.
    indexes.append(f'{model_nickname}_{n_sample}_{n_feature}_{pct_r}_{seed}_{class_s}')
    df_cat.append({
        'status': status,
        'start_time': start_time,
        'end_time': end_time,
        'model_nickname': model_nickname,
        'n_classes': n_classes,
        'n_features': n_feature,
        'n_samples': n_sample,
        'pct_random': pct_r,
        'seed_unified': seed,
        'class_sep': class_s,
        'fit_model_return_elapsed_time': no_run_time,
        'max_memory_used': no_run_memory,
        'best_adjusted_rand': no_run_metric,
        'best_n_clusters_': no_run_metric,
        'EXCEPTION': EXCEPTION,
        'raised_exception': raised_exception,
    })
df_no_run = pd.DataFrame(df_cat, index=indexes)
df_runs_parents_fill = pd.concat([df_runs_parents_fill, df_no_run])

In [None]:
# Append one "no run" placeholder row per SpectralSubspaceRandomization
# configuration with n_samples == 10000.
model_nickname = 'SpectralSubspaceRandomization'
n_samples = ['10000']
n_features = ['100', '1000', '10000']
pct_random = ['0.0', '0.2', '0.5', '0.7']
seed_unified = [f'{i}' for i in range(10)]
class_sep = ['10.0', '100.0', '50.0']
n_classes = 5
combinations = list(product(n_samples, n_features, pct_random, seed_unified, class_sep))
df_cat = []
indexes = []
for n_sample, n_feature, pct_r, seed, class_s in combinations:
    # The label contains every varying field so that each placeholder row gets a
    # unique index; with only model/n_samples/n_features, all pct_random / seed /
    # class_sep variants of a setting shared one label.
    indexes.append(f'{model_nickname}_{n_sample}_{n_feature}_{pct_r}_{seed}_{class_s}')
    df_cat.append({
        'status': status,
        'start_time': start_time,
        'end_time': end_time,
        'model_nickname': model_nickname,
        'n_classes': n_classes,
        'n_features': n_feature,
        'n_samples': n_sample,
        'pct_random': pct_r,
        'seed_unified': seed,
        'class_sep': class_s,
        'fit_model_return_elapsed_time': no_run_time,
        'max_memory_used': no_run_memory,
        'best_adjusted_rand': no_run_metric,
        'best_n_clusters_': no_run_metric,
        'EXCEPTION': EXCEPTION,
        'raised_exception': raised_exception,
    })
df_no_run = pd.DataFrame(df_cat, index=indexes)
df_runs_parents_fill = pd.concat([df_runs_parents_fill, df_no_run])

In [None]:
# Ensure no duplicates: for each combination of the non_duplicate_columns
# values only the first row encountered is kept.
df_runs_parents_fill = df_runs_parents_fill.drop_duplicates(subset=non_duplicate_columns, keep='first')

# Missing entries

In [None]:
# List the distinct model nicknames present in df_runs_parents_fill.
df_runs_parents_fill['model_nickname'].unique().tolist()

In [None]:
# Parameter columns used below as the column names handed to
# get_missing_entries, in the same order as the lists of expected values.
# NOTE(review): non_duplicate_columns is already used by the de-duplication
# steps earlier in the notebook, so it must also be defined before this cell;
# confirm that rebinding it here is intended.
non_duplicate_columns = [
    'model_nickname',
    'n_samples',
    'n_features',
    'pct_random',
    'seed_unified',
    'class_sep',
]

In [None]:
# Inspect the parent-run table, including the "no run" placeholder rows.
df_runs_parents_fill

In [None]:
# Grid of configurations that each listed model is expected to have a run for.
# Models currently left out of the check: KMeansProj, IRFLLRR, Clique,
# CompleteAgglomerativeClustering, AverageAgglomerativeClustering,
# SingleAgglomerativeClustering, SpectralClustering, Proclus.
model_nickname = [
    'HDBSCAN',
    'SpectralSubspaceRandomization',
    'RecursiveClustering',
    'OPTICS',
    'WardAgglomerativeClustering',
    'KMeans',
    'DBSCAN',
    'AffinityPropagation',
]
n_samples = ['1000']
n_features = ['100', '1000', '10000']
pct_random = ['0.0']
seed_unified = [str(seed) for seed in range(5)]
class_sep = ['10.0', '100.0']

# The expected values are listed in the same order as the column names.
columns_names = non_duplicate_columns
should_contain_values = [
    model_nickname,
    n_samples,
    n_features,
    pct_random,
    seed_unified,
    class_sep,
]
df_missing = get_missing_entries(df_runs_parents_fill, columns_names, should_contain_values)

In [None]:
# Inspect the result of get_missing_entries for the grid defined above.
df_missing

# Get common combinations

In [None]:
# Restrict the runs to the models being compared, then ask
# get_common_combinations for the parameter combinations to keep
# (presumably those available for every model -- confirm against
# ml_experiments.analyze).
# Models currently left out: KMeansProj, IRFLLRR, Clique,
# CompleteAgglomerativeClustering, AverageAgglomerativeClustering,
# SingleAgglomerativeClustering, SpectralClustering, Proclus.
column = 'model_nickname'
combination_columns = ['n_samples', 'n_features', 'pct_random', 'seed_unified', 'class_sep']
model_nickname = [
    'HDBSCAN',
    'SpectralSubspaceRandomization',
    'RecursiveClustering',
    'OPTICS',
    'WardAgglomerativeClustering',
    'KMeans',
    'DBSCAN',
    'AffinityPropagation',
]
df = df_runs_parents_fill.copy()
is_compared_model = df[column].isin(model_nickname)
df = df.loc[is_compared_model]
common_combinations = get_common_combinations(df, column, combination_columns)

In [None]:
# Build the comparison table from df, combination_columns and common_combinations
# (presumably the rows of df whose combination is one of the common ones --
# confirm against ml_experiments.analyze).
df_common_fill = get_df_with_combinations(df, combination_columns, common_combinations)

In [None]:
# Cast the parameter columns to numeric dtypes in a single pass. astype
# returns a new frame, so the object returned by get_df_with_combinations is
# not modified column by column (which can trigger SettingWithCopyWarning when
# that object is a slice of another frame).
df_common_fill = df_common_fill.astype({
    'n_samples': int,
    'n_features': int,
    'pct_random': float,
    'class_sep': float,
    'seed_unified': int,
    'n_classes': int,
})

## Plot

In [None]:
# Inspect the table that the plots below are built from.
df_common_fill

In [None]:
# List the matplotlib style sheets that can be passed to plt.style.use.
print(plt.style.available)

In [None]:
# ARI versus class separation for runs without random features: one panel per
# (n_samples, n_features) pair, one line per model, saved as a PDF.
df = df_common_fill.copy()
n_samples_n_features = [(1000, 1000), (1000, 10000)]
pct_random = 0.0
df = df.loc[df['pct_random']==pct_random]
df = df.loc[df['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
df = df.sort_values(by='model_nickname')
df = df.rename(columns={'fit_model_return_elapsed_time': 'Time (s)', 'max_memory_used': 'Memory (MB)', 'n_samples': 'Number of samples', 'n_features': 'Number of features', 'model_nickname': 'Model', 'class_sep': 'Class Separation', 'pct_random': '% Random Features', 'best_adjusted_rand' : 'ARI'})
plt.style.use('default')
# plt.style.use('seaborn-v0_8-paper')
with mpl.rc_context(rc={
    'figure.constrained_layout.use': True,
    'savefig.bbox': 'tight',
    'figure.figsize': (10, 6),
    'legend.loc': 'upper left',
    'legend.frameon': False,
    'font.size': 12,
    # 'font.family': 'serif',
    # 'font.serif': 'Times',
    'text.color': 'black',
    'grid.color': 'black',
    'grid.alpha': 0.5,
}):
    # One row per requested pair; squeeze=False keeps a 2-D array of axes even
    # for a single pair, so flatten() always works.
    fig, axs = plt.subplots(len(n_samples_n_features), 1, squeeze=False)
    axs = axs.flatten()
    for i, (n_samples, n_features) in enumerate(n_samples_n_features):
        df_plot = df.copy()
        df_plot = df_plot.loc[df_plot['Number of features']==n_features]
        df_plot = df_plot.loc[df_plot['Number of samples']==n_samples]
        ax = sns.lineplot(data=df_plot, x='Class Separation', y='ARI', hue='Model', style='Model', markers=True, errorbar='ci', ax=axs[i])
        # Hide the per-panel legend; a single legend is drawn on the first panel below.
        ax.legend().set_visible(False)
    # The legend and the saved file must be produced while the rc_context is
    # still active: 'legend.loc', 'legend.frameon' and 'savefig.bbox' are read
    # when legend() / savefig() are called, so calling them after the `with`
    # block would silently fall back to the global defaults.
    axs[0].legend(title=None, fontsize=12)
    plt.savefig(f'hypercube_n-samples-n-features{n_samples_n_features}_pct-random{pct_random}.pdf')
    plt.show()

In [None]:
# Inspect the array of axes created by the plotting cell.
axs

In [None]:
# Inspect the frame currently bound to df.
df

In [None]:
# Keep only the model and its best ARI. The preceding plotting cell renames
# these columns to 'Model' / 'ARI', in which case selecting the raw names
# raises a KeyError; map the display names back first (rename ignores labels
# that are absent), so the cell works on either naming and can be re-run.
df = df.rename(columns={'Model': 'model_nickname', 'ARI': 'best_adjusted_rand'})[['model_nickname', 'best_adjusted_rand']]

In [None]:
# Plain-text dump of df.
print(df)

In [None]:
# ARI as a function of class separation for a single (n_samples, n_features)
# setting without random features, drawn with plotly and exported to PDF.
n_features = 1000
n_samples = 1000
pct_random = 0.0
models_to_plot = [
    'RecursiveClustering',
    'KMeans',
    'HDBSCAN',
    'DBSCAN',
    'AffinityPropagation',
    'OPTICS',
    'SpectralSubspaceRandomization',
    'WardAgglomerativeClustering',
]
display_names = {
    'fit_model_return_elapsed_time': 'Time (s)',
    'max_memory_used': 'Memory (MB)',
    'n_samples': 'Number of samples',
    'n_features': 'Number of features',
    'model_nickname': 'Model',
    'class_sep': 'Class Separation',
    'pct_random': '% Random Features',
    'best_adjusted_rand': 'ARI',
}

df = df_common_fill.copy()
in_scope = (
    (df['n_features'] == n_features)
    & (df['n_samples'] == n_samples)
    & (df['pct_random'] == pct_random)
    & df['model_nickname'].isin(models_to_plot)
)
df = df.loc[in_scope].sort_values(by='class_sep').rename(columns=display_names)

fig = px.line(
    df,
    x='Class Separation',
    y='ARI',
    color='Model',
    line_dash='Model',
    markers=True,
    template='presentation',
    # Fix the legend / colour order to the order of the model list.
    category_orders={'Model': models_to_plot},
)
# Black fonts and axes, tight margins, horizontal legend above the plot area.
fig.update_layout(
    font={'color': 'black', 'size': 15},
    xaxis={'color': 'black', 'title': {'standoff': 15}},
    yaxis={'color': 'black'},
    margin={'t': 10, 'b': 50, 'l': 50, 'r': 30},
    legend={
        'orientation': 'h',
        'yanchor': 'bottom',
        'y': 1.02,
        'xanchor': 'left',
        'x': 0.01,
        'title': {'text': ''},
    },
)
fig.show()
fig.write_image(f'time_n-samples_n-features{n_features}.pdf')

In [None]:
# Inspect the frame that was passed to the plotly figure.
df

In [None]:
# Box plot of the best ARI per model for n_samples = 10000, n_features = 10000
# and class_sep = 50.0 (pct_random is left unfiltered).
df = df_common_fill.copy()
is_selected_case = (
    (df['n_samples'] == 10000)
    & (df['n_features'] == 10000)
    & (df['class_sep'] == 50.0)
)
df = df.loc[is_selected_case].sort_values('model_nickname')
fig = px.box(df, x='model_nickname', y='best_adjusted_rand', color='model_nickname')
fig.show()

In [None]:
# Mean / std of the best ARI per (configuration, model), ordered by class
# separation, number of samples, number of features and share of random features.
df = df_common_fill.loc[df_common_fill['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
# LaTeX label of a configuration. A raw string is used because '\p' is not a
# valid escape sequence in a normal string literal (the resulting text is unchanged).
config_label = (
    r'\parbox{1.5cm}{l=' + df['class_sep'].astype(str)
    + '; n=' + df['n_samples'].astype(str)
    + '; p=' + df['n_features'].astype(str)
    + '; r=' + df['pct_random'].astype(str) + ';}'
)
# assign builds a new frame instead of writing a column into a filtered frame.
df = df.assign(**{'Config.': config_label})
df = df.rename(columns={'model_nickname': 'Model'})
df = df.groupby(['Config.', 'Model']).agg({'best_adjusted_rand': ['mean', 'std'], 'fit_model_return_elapsed_time': ['mean', 'std'], 'max_memory_used': ['mean', 'std'], 'seed_unified': 'count', 'class_sep': 'first', 'n_samples': 'first', 'n_features': 'first', 'pct_random': 'first'})
df = df.sort_values([('class_sep', 'first'), ('n_samples', 'first'), ('n_features', 'first'), ('pct_random', 'first')])
# Only the ARI statistics are kept for the final table.
df = df[['best_adjusted_rand']]
df = df.rename(columns={'best_adjusted_rand': 'ARI'})

In [None]:
# "Interesting cases" stacked into one table (filters as applied below):
# easy:              1000 samples,  1000 features,  no random features, class sep 100
# easy more samples: 10000 samples, 10000 features, no random features, class sep 100
# hard:              1000 samples,  1000 features,  any share of random features, class sep 10
# medium:            100 samples,   10000 features, no random features, class sep 50
# (a "hard more samples" case -- 10000 samples, 1000 features, class sep 10 -- is currently disabled)

df = df_common_fill.loc[df_common_fill['model_nickname'].isin(['RecursiveClustering', 'KMeans', 'HDBSCAN', 'DBSCAN', 'AffinityPropagation', 'OPTICS', 'SpectralSubspaceRandomization', 'WardAgglomerativeClustering'])]
# LaTeX label of a configuration. A raw string is used because '\p' is not a
# valid escape sequence in a normal string literal (the resulting text is unchanged).
config_label = (
    r'\parbox{1.5cm}{l=' + df['class_sep'].astype(str)
    + '; n=' + df['n_samples'].astype(str)
    + '; p=' + df['n_features'].astype(str)
    + '; r=' + df['pct_random'].astype(str) + ';}'
)
# assign builds a new frame instead of writing a column into a filtered frame.
df = df.assign(**{'Config.': config_label})
df = df.rename(columns={'model_nickname': 'Model'})
df = df.groupby(['Config.', 'Model']).agg({'best_adjusted_rand': ['mean', 'std'], 'fit_model_return_elapsed_time': ['mean', 'std'], 'max_memory_used': ['mean', 'std'], 'seed_unified': 'count', 'class_sep': 'first', 'n_samples': 'first', 'n_features': 'first', 'pct_random': 'first'})
df = df.sort_values([('class_sep', 'first'), ('n_samples', 'first'), ('n_features', 'first'), ('pct_random', 'first')])
df = df.rename(columns={'best_adjusted_rand': 'ARI'})

# Per-group parameter values, named once so each case below reads as a plain condition.
n_samples_first = df[('n_samples', 'first')]
n_features_first = df[('n_features', 'first')]
class_sep_first = df[('class_sep', 'first')]
no_random_features = df[('pct_random', 'first')] == 0.0

df_easy = df.loc[(n_samples_first == 1000) & (n_features_first == 1000) & (class_sep_first == 100.0) & no_random_features].copy()
df_easy = df_easy[['ARI']]

df_easy_more_samples = df.loc[(n_samples_first == 10000) & (n_features_first == 10000) & (class_sep_first == 100.0) & no_random_features].copy()
df_easy_more_samples = df_easy_more_samples[['ARI']]

# No pct_random filter here: every share of random features is kept for the hard case.
df_hard = df.loc[(n_samples_first == 1000) & (n_features_first == 1000) & (class_sep_first == 10.0)].copy()
df_hard = df_hard[['ARI']]

# NOTE(review): n_samples == 100 does not appear in any of the grids defined
# in this part of the notebook; confirm such runs exist, otherwise this
# selection is empty.
df_medium = df.loc[(n_samples_first == 100) & (n_features_first == 10000) & (class_sep_first == 50.0) & no_random_features].copy()
df_medium = df_medium[['ARI']]

df = pd.concat([df_easy, df_easy_more_samples, df_hard, df_medium], axis=0)

In [None]:
def highlight_max(df):
    """Return a frame of CSS strings that bolds the per-group maxima of ``df``.

    Groups are formed from the first index level. A cell equal to the maximum
    of its column within its group gets ``'font-weight: bold'``; every other
    cell gets an empty string. The result has the same index and columns as
    ``df``, as required by ``Styler.apply(..., axis=None)``.
    """
    group_best = df.groupby(level=0).transform('max')
    is_group_best = df.eq(group_best)
    css = np.where(is_group_best, 'font-weight: bold', '')
    return pd.DataFrame(css, index=df.index, columns=df.columns)

In [None]:
# Print the table as LaTeX: highlight_max is applied to the ('ARI', 'mean')
# column as a whole (axis=None), numbers are formatted with 3 decimals and
# missing values are written as 'No Run'.
print(df.style.apply(highlight_max, subset=[('ARI', 'mean')], axis=None).format(precision=3, na_rep='No Run').to_latex(hrules=True, clines='skip-last;data', convert_css=True))

In [None]:
# Show the styled table without truncating cell contents. Note that this uses
# the built-in Styler.highlight_max on ('ARI', 'mean') with axis=0, not the
# highlight_max function defined in this notebook.
with pd.option_context('display.max_colwidth', None):
    display(df.style.highlight_max(subset=[('ARI', 'mean')], axis=0))

# Debug and explore

In [None]:
# Rows of df_runs_raw whose status is 'FAILED'.
df = df_runs_raw.copy()
has_failed = df['status'].eq('FAILED')
df = df[has_failed]

In [None]:
# Distinct values of the 'EXCEPTION' column among the selected rows.
df['EXCEPTION'].unique()

In [None]:
# Rows of df_runs_raw_parents whose status is 'FAILED'.
df = df_runs_raw_parents.copy()
has_failed = df['status'].eq('FAILED')
df = df[has_failed]

In [None]:
# Inspect the rows selected in the previous cell.
df

In [None]:
# Collect the index labels of the selected rows; they are matched against
# 'parent_run_id' below (presumably the index holds mlflow run ids -- confirm).
parent_run_ids = list(df.index)

In [None]:
# Inspect the collected parent run ids.
parent_run_ids

In [None]:
# Rows of df_runs_raw whose 'parent_run_id' is one of the collected parent ids.
df = df_runs_raw.copy()
has_selected_parent = df['parent_run_id'].isin(parent_run_ids)
df = df[has_selected_parent]

In [None]:
# Index labels of the child rows selected in the previous cell.
child_run_ids = list(df.index)

In [None]:
# Only the child runs are queued for deletion (this is an alias of child_run_ids, not a copy).
runs_to_delete = child_run_ids

In [None]:
# Check which runs, and how many, are about to be marked as deleted.
print(runs_to_delete, len(runs_to_delete))

In [None]:
# Comma-separated list of single-quoted run ids, ready to be placed inside a SQL IN (...) clause.
run_uuid_query = ', '.join("'" + f'{run_id}' + "'" for run_id in runs_to_delete)

In [None]:
from sqlalchemy import bindparam

# Mark the queued runs as deleted by setting their lifecycle_stage in the runs table.
# The ids are passed as an expanding bind parameter instead of being formatted
# into the SQL text, and nothing is executed when there is nothing to delete
# (an empty "IN ()" list is a SQL syntax error).
if runs_to_delete:
    query = text(
        "UPDATE runs SET lifecycle_stage = 'deleted' WHERE run_uuid IN :run_uuids"
    ).bindparams(bindparam('run_uuids', expanding=True))
    # engine.begin() commits on success and rolls back if the statement fails.
    with engine.begin() as conn:
        conn.execute(query, {'run_uuids': list(runs_to_delete)})

# Re-open every run listed in df_runs_raw and log its 'params.model_name'
# value under the 'model_nickname' parameter.
# NOTE(review): `mlflow` is not among the imports in the notebook's first cell;
# confirm it is imported (and pointed at the right tracking store) before
# running this loop.
for _, run_row in df_runs_raw.iterrows():
    target_run_id = run_row.run_id
    nickname = run_row['params.model_name']
    with mlflow.start_run(target_run_id):
        mlflow.log_param('model_nickname', nickname)