In [1]:
from sqlalchemy import create_engine, text
import pandas as pd
from ml_experiments.analyze import get_df_runs_from_mlflow_sql, get_missing_entries, get_common_combinations, get_df_with_combinations
import plotly.express as px
from itertools import product
import time
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# Save Results

## Load MLflow runs

In [2]:
# Connection settings for the MLflow tracking database (PostgreSQL).
w = 'clust9'  # host prefix, only used by the alternative URL in the comment below
db_name = 'recursive_clustering'
db_port = 5001
# Alternative URL: f'postgresql://{w}.ceremade.dauphine.lan:{db_port}/{db_name}'
url = f'postgresql://belucci@localhost:{db_port}/{db_name}'
engine = create_engine(url)

# Read the name of every experiment stored in the database.
query = 'SELECT experiments.name from experiments'
experiments_table = pd.read_sql(query, engine)
experiment_names = experiments_table['name'].tolist()

In [3]:
# Show every experiment name returned by the query above.
experiment_names

['Default',
 'blob_experiment',
 'hpo_classif_experiment',
 'hpo_openml_experiment',
 'hpo_gaussian_experiment',
 'hpo_openml_final',
 'outlier_hc',
 'time_hc',
 'hpo_n_clusters',
 'hpo_classif_huge']

In [4]:
# Experiments whose runs are loaded below (note the plural: distinct from `experiment_names`).
experiments_names = ["hpo_n_clusters"]

In [5]:
# Keys requested from the MLflow `params` table for each run.
params_columns = [
    "model_nickname",
    "n_samples", "n_features", "pct_random", "seed_unified",
    "class_sep", "n_random", "n_informative", "n_classes",
    "n_trials",
]

In [6]:
# Keys requested from the MLflow `latest_metrics` table for each run.
latest_metrics_columns = [
    # un-prefixed metric keys
    "fit_model_return_elapsed_time", "max_memory_used", "n_clusters_",
    "rand_score", "adjusted_rand",
    "mutual_info", "adjusted_mutual_info", "normalized_mutual_info",
    "homogeneity", "completeness", "v_measure",
    "silhouette", "calinski_harabasz_score", "davies_bouldin_score", "inertia_score",
] + [
    # `best_`-prefixed metric keys
    "best_n_clusters_",
    "best_rand_score", "best_adjusted_rand",
    "best_mutual_info", "best_adjusted_mutual_info", "best_normalized_mutual_info",
    "best_homogeneity_completeness_v_measure",
    "best_silhouette", "best_calinski_harabasz_score", "best_davies_bouldin_score", "best_inertia_score",
    "best_homogeneity", "best_completeness", "best_v_measure",
]

In [7]:
# Keys requested from the MLflow `tags` table for each run.
# The next cell appends the model hyper-parameter names to this list.
tags_columns = ["raised_exception", "EXCEPTION", "parent_run_id", "best_child_run_id"]

In [8]:
# Hyper-parameters of each model; they are saved as tags on the parent run.
parameters = {
    'RecursiveClustering': ['components_size', 'repetitions', 'kmeans_n_clusters'],
    'KMeans': ['n_clusters'],
    'HDBSCAN': ['min_cluster_size'],
    'DBSCAN': ['eps', 'min_samples'],
    'AffinityPropagation': ['damping'],
    'OPTICS': ['min_samples'],
    'SpectralSubspaceRandomization': ['n_similarities', 'sampling_ratio', 'sc_n_clusters'],
    'WardAgglomerativeClustering': ['n_clusters'],
}
# Unique parameter names in first-seen order. Several models share a name
# (e.g. 'n_clusters', 'min_samples'); dict.fromkeys de-duplicates while keeping a
# deterministic order, whereas a set's order can change between kernel sessions.
all_model_parameters = list(
    dict.fromkeys(p for params in parameters.values() for p in params)
)
# Append only the names that are not present yet, so re-running this cell does
# not add duplicate entries to `tags_columns`.
for param in all_model_parameters:
    if param not in tags_columns:
        tags_columns.append(param)

In [9]:
# Load the params, latest metrics and tags of the selected experiments, one frame per MLflow table.
runs_columns = ['run_uuid', 'status', 'start_time', 'end_time']
experiments_columns = []
other_table = 'params'
other_table_keys = params_columns

# Run status and timestamps are requested only here, together with the params.
df_params = get_df_runs_from_mlflow_sql(
    engine,
    runs_columns=runs_columns,
    experiments_columns=experiments_columns,
    experiments_names=experiments_names,
    other_table=other_table,
    other_table_keys=other_table_keys,
)
df_latest_metrics = get_df_runs_from_mlflow_sql(
    engine,
    runs_columns=['run_uuid'],
    experiments_columns=experiments_columns,
    experiments_names=experiments_names,
    other_table='latest_metrics',
    other_table_keys=latest_metrics_columns,
)
df_tags = get_df_runs_from_mlflow_sql(
    engine,
    runs_columns=['run_uuid'],
    experiments_columns=experiments_columns,
    experiments_names=experiments_names,
    other_table='tags',
    other_table_keys=tags_columns,
)

In [10]:
# Combine params, metrics and tags into one wide frame (index-aligned joins).
df_runs_raw = df_params.join(df_latest_metrics).join(df_tags)

In [11]:
# Parent runs are the rows that carry no `parent_run_id` tag.
is_parent_run = df_runs_raw['parent_run_id'].isna()
df_runs_raw_parents = df_runs_raw.loc[is_parent_run].copy()

In [12]:
# Inspect the parent runs (rows without a parent_run_id).
df_runs_raw_parents

key,status,start_time,end_time,class_sep,model_nickname,n_classes,n_features,n_informative,n_random,n_samples,...,silhouette,v_measure,EXCEPTION,best_child_run_id,components_size,kmeans_n_clusters,n_clusters,parent_run_id,raised_exception,repetitions
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00182d12bb17486d877981bc2f490589,FINISHED,1740659603558,1.740722e+12,100.0,KMeans,12,100,,,10000,...,0.181287,0.866055,,de1911a57904428e85be763b748ddb84,,,12,,False,
002dfa4ca2fa40098b82095f49eb29f2,FINISHED,1740659031041,1.740713e+12,10.0,KMeans,5,100,,,10000,...,0.298534,0.920810,,7452155edd294836add1c894e9f05b12,,,5,,False,
0059b9d861ab413db316f7debef66a05,FINISHED,1740659073869,1.740713e+12,100.0,KMeans,5,10000,,,1000,...,-0.000909,0.684588,,1af398163b2d4cfdafef751b4a83ae79,,,5,,False,
006177dac22b4611b298d14335515701,FINISHED,1740658172682,1.740683e+12,50.0,KMeans,5,10000,,,100,...,0.145530,0.928715,,95cd5ae8f504403a8d90d29314b9873f,,,5,,False,
006c38a1e2c74a118ea1c36f14348851,FINISHED,1739805439032,1.740066e+12,10.0,RecursiveClustering,20,100,,,10000,...,0.441788,0.999183,,99721d0b967e48a885d910b28c8008a5,30,10,,,False,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ff53dff04d68443f91389bff77915d2f,FINISHED,1740659752745,1.740731e+12,100.0,KMeans,5,1000,,,10000,...,0.155419,0.783468,,c843d105c2b64d1b8ad7b6c1ce3f0416,,,5,,False,
ff5c84efc7b34584b09226825ead72e4,FINISHED,1740658268593,1.740690e+12,30.0,KMeans,5,100,,,1000,...,0.017918,0.667665,,1192ce6fd5e24832861c3f85db783812,,,5,,False,
ff6dc7d243f64486a415494e395f4872,FINISHED,1739804626988,1.739878e+12,10.0,RecursiveClustering,8,10000,,,100,...,-1.000000,0.621993,,d969900ce4f246989954ac8136232d57,25,4,,,False,8
ffa26f2b9d00441880edc3732da3da4c,FINISHED,1740657186999,1.740663e+12,20.0,KMeans,8,100,,,100,...,0.022911,0.798587,,b2db827e25d748c9b61372e9a847e6eb,,,8,,False,


In [13]:
# RecursiveClustering runs logged with n_trials == '20' get their own nickname,
# so they are handled as a separate model from here on.
df_runs_parents = df_runs_raw_parents.copy()
is_recursive_20_trials = (
    df_runs_parents['model_nickname'].eq('RecursiveClustering')
    & df_runs_parents['n_trials'].eq('20')
)
df_runs_parents.loc[is_recursive_20_trials, 'model_nickname'] = 'RecursiveClustering_20'

## Delete duplicate runs (if any) and complete some models that cannot run with some datasets

In [14]:
# Columns that together identify one run configuration.
non_duplicate_columns = [
    "model_nickname",
    "n_samples", "n_features", "pct_random", "seed_unified",
    "class_sep", "n_random", "n_informative", "n_classes",
]

# Keep parent runs that have a best_adjusted_rand value, then keep only the
# first occurrence of every configuration.
has_best_score = df_runs_parents['best_adjusted_rand'].notna()
df_runs_parents = (
    df_runs_parents
    .loc[has_best_score]
    .drop_duplicates(subset=non_duplicate_columns)
)
run_uuid_parents = df_runs_parents.index.tolist()

# Child runs that belong to one of the retained parents.
has_retained_parent = df_runs_raw['parent_run_id'].isin(run_uuid_parents)
df_runs = df_runs_raw.loc[has_retained_parent].copy()

# Missing runs

In [15]:
# List the model nicknames present among the retained parent runs.
df_runs_parents['model_nickname'].unique().tolist()

['KMeans', 'RecursiveClustering']

In [16]:
# Columns used as the grid dimensions for the missing-run check in the next cell.
non_duplicate_columns = [
    "model_nickname", "n_samples", "n_features",
    "seed_unified", "class_sep", "n_classes",
]

In [17]:
# Expected values for every grid dimension, given as strings.
model_nickname = ['RecursiveClustering', 'KMeans']
n_samples = ['100', '1000', '10000']
n_features = ['100', '1000', '10000']
# pct_random is not part of this grid (previously ['0.0', '0.2', '0.5', '0.7']).
seed_unified = [str(seed) for seed in range(5)]
class_sep = ['10.0', '20.0', '30.0', '40.0', '50.0', '100.0']
n_classes = ['2', '5', '8', '12', '20']

# The value lists must follow the same order as the column names.
columns_names = non_duplicate_columns
should_contain_values = [
    model_nickname,
    n_samples,
    n_features,
    seed_unified,
    class_sep,
    n_classes,
]
df_missing = get_missing_entries(df_runs_parents, columns_names, should_contain_values)

In [18]:
# Display the result of the missing-entries check.
df_missing

Unnamed: 0,model_nickname,n_samples,n_features,seed_unified,class_sep,n_classes


# Get common combinations

In [33]:
# Restrict the parent runs to the models being compared.
model_nickname = ['RecursiveClustering', 'KMeans']
is_selected_model = df_runs_parents['model_nickname'].isin(model_nickname)
df = df_runs_parents.loc[is_selected_model].copy()

# Arguments for the helper: the column that distinguishes the models and the
# columns whose value combinations are looked up.
column = 'model_nickname'
combination_columns = ['n_samples', 'n_features', 'seed_unified', 'class_sep', 'n_classes']
common_combinations = get_common_combinations(df, column, combination_columns)

In [35]:
# Build the frame used for the plots from `df` and the combinations computed above.
# NOTE(review): presumably keeps only the rows whose `combination_columns` values
# appear in `common_combinations` — confirm against ml_experiments.analyze.
df_common = get_df_with_combinations(df, combination_columns, common_combinations)

In [36]:
# Cast the configuration columns to numeric dtypes, one column at a time.
column_dtypes = {
    'n_samples': int,
    'n_features': int,
    'pct_random': float,
    'class_sep': float,
    'seed_unified': int,
    'n_classes': int,
}
for column_name, target_dtype in column_dtypes.items():
    df_common[column_name] = df_common[column_name].astype(target_dtype)

In [39]:
# Inspect the frame after the dtype conversion.
df_common

key,status,start_time,end_time,class_sep,model_nickname,n_classes,n_features,n_informative,n_random,n_samples,...,silhouette,v_measure,EXCEPTION,best_child_run_id,components_size,kmeans_n_clusters,n_clusters,parent_run_id,raised_exception,repetitions
run_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00182d12bb17486d877981bc2f490589,FINISHED,1740659603558,1.740722e+12,100.0,KMeans,12,100,,,10000,...,0.181287,0.866055,,de1911a57904428e85be763b748ddb84,,,12,,False,
002dfa4ca2fa40098b82095f49eb29f2,FINISHED,1740659031041,1.740713e+12,10.0,KMeans,5,100,,,10000,...,0.298534,0.920810,,7452155edd294836add1c894e9f05b12,,,5,,False,
0059b9d861ab413db316f7debef66a05,FINISHED,1740659073869,1.740713e+12,100.0,KMeans,5,10000,,,1000,...,-0.000909,0.684588,,1af398163b2d4cfdafef751b4a83ae79,,,5,,False,
006177dac22b4611b298d14335515701,FINISHED,1740658172682,1.740683e+12,50.0,KMeans,5,10000,,,100,...,0.145530,0.928715,,95cd5ae8f504403a8d90d29314b9873f,,,5,,False,
006c38a1e2c74a118ea1c36f14348851,FINISHED,1739805439032,1.740066e+12,10.0,RecursiveClustering,20,100,,,10000,...,0.441788,0.999183,,99721d0b967e48a885d910b28c8008a5,30,10,,,False,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ff53dff04d68443f91389bff77915d2f,FINISHED,1740659752745,1.740731e+12,100.0,KMeans,5,1000,,,10000,...,0.155419,0.783468,,c843d105c2b64d1b8ad7b6c1ce3f0416,,,5,,False,
ff5c84efc7b34584b09226825ead72e4,FINISHED,1740658268593,1.740690e+12,30.0,KMeans,5,100,,,1000,...,0.017918,0.667665,,1192ce6fd5e24832861c3f85db783812,,,5,,False,
ff6dc7d243f64486a415494e395f4872,FINISHED,1739804626988,1.739878e+12,10.0,RecursiveClustering,8,10000,,,100,...,-1.000000,0.621993,,d969900ce4f246989954ac8136232d57,25,4,,,False,8
ffa26f2b9d00441880edc3732da3da4c,FINISHED,1740657186999,1.740663e+12,20.0,KMeans,8,100,,,100,...,0.022911,0.798587,,b2db827e25d748c9b61372e9a847e6eb,,,8,,False,


# Plots

In [78]:
# For each (model, dataset configuration), aggregate over the rows of the group:
# mean/std of best_n_clusters_ and the number of rows where it equals n_classes.
group_keys = ['model_nickname', 'n_samples', 'n_features', 'class_sep', 'n_classes']
df = (
    df_common[['model_nickname', 'best_n_clusters_', 'n_samples', 'n_features', 'class_sep', 'n_classes', 'seed_unified']]
    .assign(right_n_clusters=lambda frame: frame['n_classes'] == frame['best_n_clusters_'])
    .groupby(group_keys)
    .agg({'best_n_clusters_': ['mean', 'std'], 'right_n_clusters': ['sum']})
    .reset_index()
)
# Flatten the two-level column labels, e.g. ('right_n_clusters', 'sum') -> 'right_n_clusters sum'.
df.columns = [' '.join(col).strip() for col in df.columns.values]
# Plot a single n_features slice (a filter on n_samples could be added the same way).
keep_rows = df['n_features'] == 10000
df = df.loc[keep_rows]
px.scatter(
    df,
    x='class_sep',
    y='right_n_clusters sum',
    color='model_nickname',
    facet_row='n_classes',
    facet_col='n_samples',
    width=1000,
    height=1000,
)

In [49]:
# Show the aggregated frame.
# NOTE(review): execution counts around here are out of order (78, 49, 46), so the
# stored output may not match what a fresh top-to-bottom run produces — confirm.
df

key,model_nickname,n_samples,n_features,class_sep,n_classes,best_n_clusters_,best_n_clusters_,right_n_clusters
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,mean,std,sum
0,KMeans,100,100,10.0,2,2.0,0.0,5
1,KMeans,100,100,10.0,5,5.0,0.0,5
2,KMeans,100,100,10.0,8,8.0,0.0,5
3,KMeans,100,100,10.0,12,12.2,0.447214,4
4,KMeans,100,100,10.0,20,20.4,0.547723,3
5,KMeans,100,100,20.0,2,2.0,0.0,5
6,KMeans,100,100,20.0,5,5.0,0.0,5
7,KMeans,100,100,20.0,8,8.0,0.0,5
8,KMeans,100,100,20.0,12,12.0,0.0,5
9,KMeans,100,100,20.0,20,20.0,0.0,5


In [46]:
# Per model, count the rows where best_n_clusters_ equals n_classes.
df = (
    df_common[['model_nickname', 'best_n_clusters_', 'n_samples', 'n_features', 'class_sep', 'n_classes', 'seed_unified']]
    .assign(right_n_clusters=lambda frame: frame['n_classes'] == frame['best_n_clusters_'])
    .groupby(['model_nickname'])
    .agg({'right_n_clusters': ['sum']})
    .reset_index()
)
df

key,model_nickname,right_n_clusters
Unnamed: 0_level_1,Unnamed: 1_level_1,sum
0,KMeans,1020
1,RecursiveClustering,881


# Debug and explore

In [None]:
# All runs (parents and children) whose status is FAILED.
is_failed = df_runs_raw['status'].eq('FAILED')
df = df_runs_raw.loc[is_failed].copy()

In [None]:
# Distinct values of the EXCEPTION tag in the current frame.
df['EXCEPTION'].unique()

In [None]:
# Parent runs whose status is FAILED.
is_failed_parent = df_runs_raw_parents['status'].eq('FAILED')
df = df_runs_raw_parents.loc[is_failed_parent].copy()

In [None]:
# Inspect the current frame.
df

In [None]:
# Collect the index values of the current frame as the parent run ids.
parent_run_ids = list(df.index)

In [None]:
# Show the collected parent run ids.
parent_run_ids

In [None]:
# Runs whose parent_run_id is one of the collected parent run ids.
has_selected_parent = df_runs_raw['parent_run_id'].isin(parent_run_ids)
df = df_runs_raw.loc[has_selected_parent].copy()

In [None]:
# Collect the index values of the current frame as the child run ids.
child_run_ids = list(df.index)

In [None]:
# Runs that the UPDATE statement below will mark as deleted.
# NOTE(review): only the child run ids are included here; the parent run ids
# collected above are not — confirm this is intended.
runs_to_delete = child_run_ids

In [None]:
# Show the ids selected for deletion and how many there are, as a check before the UPDATE.
print(runs_to_delete, len(runs_to_delete))

In [None]:
# Render the ids as a comma-separated list of single-quoted SQL string literals.
run_uuid_query = ', '.join(f"'{run_id}'" for run_id in runs_to_delete)

In [None]:
# Mark the selected runs as deleted by setting lifecycle_stage = 'deleted' in
# the `runs` table. The ids interpolated into the statement come from the
# frames loaded above, not from user input.
query = f"""
UPDATE runs
SET lifecycle_stage = 'deleted'
WHERE run_uuid IN ({run_uuid_query}) 
"""
# Guard against an empty selection: it would render `IN ()`, which is a SQL
# syntax error, so the statement is only executed when there is something to update.
if run_uuid_query:
    # engine.begin() opens a transaction that is committed when the block exits
    # normally and rolled back if the statement raises.
    with engine.begin() as conn:
        conn.execute(text(query))
else:
    print('No runs selected for deletion; nothing to update.')

# For every row, re-open the run given by `row.run_id` and log the value of its
# 'params.model_name' entry under the param name 'model_nickname'.
# NOTE(review): this loop looks like a leftover from a different workflow — confirm before running:
#   - `mlflow` is not imported in the visible import cell;
#   - `run_id` and 'params.model_name' are not among the keys requested when
#     `df_runs_raw` is built in this notebook, so these lookups presumably fail on that frame.
for i, row in df_runs_raw.iterrows():
    run_id = row.run_id
    model_name = row['params.model_name']
    with mlflow.start_run(run_id) as run:
        mlflow.log_param('model_nickname', model_name)    