# Grid Search for Best Epsilon Configuration
This notebook performs a grid search over multiple epsilon values for HDBSCAN clustering, evaluating each configuration on two datasets and saving the results.

In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import json

# Set working directory (adjust if needed)
working_dir = '/home/gpinon/more_europa/clean_rdc_experiments/projects/P05_refine_dedup'
os.chdir(working_dir)
print(f'Changed working directory to {working_dir}')
from src.p05_refine_dedup import config
from src.p05_refine_dedup.utils.utils import (
    is_noise,
    run_hdbscan,
    apply_predictions,
    compute_metrics,
)
from src.p05_refine_dedup.utils.s3_io_functions import (
    load_parquet_from_s3,
)

output_dir = Path("data/W03/from_notebooks/R06_additional_grid_search/v3")
output_dir.mkdir(parents=True, exist_ok=True)
results_xlsx = output_dir / 'grid_search_results.xlsx'
best_config_json = output_dir / 'best_config.json'

Changed working directory to /home/gpinon/more_europa/clean_rdc_experiments/projects/P05_refine_dedup


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load embeddings
s3_input_embeddings = 'registry_data_catalog_experiments/P05_refine_dedup/registry_names_embeddings.parquet'
bucket_name = config.BUCKET_NAME_DEV
folder_path = s3_input_embeddings.rsplit('/', 1)[0]
file_name = s3_input_embeddings.rsplit('/', 1)[-1]
embeddings_df = load_parquet_from_s3(
    bucket_name=bucket_name,
    folder_path=folder_path,
    file_name=file_name,
)
clusters_table_xlsx = 'data/W02/R02_evaluate_model_performance/clusters_table.xlsx'
clusters_df = pd.read_excel(clusters_table_xlsx)
clusters_df = clusters_df.merge(
    embeddings_df[['full_name', 'full_name_embedding']],
    on='full_name',
    how='left'
)
clusters_df.rename(columns={'Final_Cluster': 'cluster_0'}, inplace=True)

evaluation_dataset_any = 'data/W01/R03_eval_pairs_similarity_assessment_with_llm/gpt4_1_openai/assessed_pairs_v1.xlsx'
evaluation_dataset_famous = 'data/W01/R03_eval_pairs_similarity_assessment_with_llm/gpt4_1_openai/famous_close_assessed_pairs_v1.xlsx'
eval_df_any = pd.read_excel(evaluation_dataset_any)
eval_df_famous = pd.read_excel(evaluation_dataset_famous)



In [3]:
# Define epsilon ranges for each cluster type
eps_grid = {
    # '0_0': np.arange(0.44, 0.51, 0.01),
    'start_0_': np.arange(0.34, 0.44, 0.01),
    # 'end__0': np.arange(0.28, 0.33, 0.01),
    'other': np.arange(0.19, 0.27, 0.01),
}
# convert array to list of str()
for key in eps_grid:
    eps_grid[key] = [str(round(x, 2)) for x in eps_grid[key]]

In [4]:
# Helper to get epsilon for a cluster given the current config
def get_epsilon(cluster_id, eps_config):
    if cluster_id == '0_0':
        return eps_config['0_0']
    elif cluster_id.startswith('0_'):
        return eps_config['start_0_']
    elif cluster_id.endswith('_0'):
        return eps_config['end__0']
    else:
        return eps_config['other']

In [5]:
# create one dictionary of new cluseters ids for each cluster type
# dict_0_0 = {
#     0.44:[
#         {'full_name': 'name_1',
#          'cluster_0': '0_0',
#          'subcluster': '0',
#          'cluster_1': '0_0_0'
#          },
#         {'full_name': 'name_1',
#          'cluster_0': '0_0',
#          'subcluster': '1',
#          'cluster_1': '0_0_1'
#          },
#          {'full_name': 'name_2',
#           'cluster_0': '0_0',
#           'subcluster': '2',
#           'cluster_1': '0_0_2'
#           }
#           ...
#          ],
#     0.45:[
#         {'full_name': 'name_1',
#          'cluster_0': '0_0',
#          'subcluster': '0',
#          'cluster_1': '0_0_0'
#          },
#         {'full_name': 'name_1',
#          'cluster_0': '0_0',
#          'subcluster': '1',
#          'cluster_1': '0_0_1'
#          },
#          {'full_name': 'name_2',
#           'cluster_0': '0_0',
#           'subcluster': '2',
#           'cluster_1': '0_0_2'
#           }
#           ...
#          ],
#     ...
# }
def filter_on_cluster_type(clusters_df, cluster_type):
    if cluster_type == '0_0':
        return clusters_df[clusters_df['cluster_0'] == cluster_type]
    elif cluster_type == 'start_0_':
        # starting with '0_' and not ending with '_0'
        return clusters_df[(clusters_df['cluster_0'].str.startswith('0_')) &
                           (~clusters_df['cluster_0'].str.endswith('_0'))]
    elif cluster_type == 'end__0':
        # ending with '_0' and not starting with '0_'
        return clusters_df[clusters_df['cluster_0'].str.endswith('_0') &
                           (~clusters_df['cluster_0'].str.startswith('0_'))]
    elif cluster_type == 'other':
        return clusters_df[~clusters_df['cluster_0'].str.startswith('0_') &
                           ~clusters_df['cluster_0'].str.endswith('_0')]
    else:
        raise ValueError(f"Unknown cluster type: {cluster_type}")

# Initiate dict_0_0 with all espilons and full_name and cluster_0 only
def create_initial_dict(clusters_df, cluster_type):
    initial_dict = {}
    for eps in eps_grid[cluster_type]:
        clusters_df_filtered = filter_on_cluster_type(clusters_df, cluster_type)
        initial_dict[eps] = clusters_df_filtered[['full_name', 'cluster_0']].copy()
        # set subcluster and cluster_1 to None
        initial_dict[eps]['subcluster'] = None
        initial_dict[eps]['cluster_1'] = None
    return initial_dict

In [6]:
n_max = 20
large_clusters = clusters_df['cluster_0'].value_counts()[clusters_df['cluster_0'].value_counts() >= n_max].index.tolist()
# filter on large clusters
large_clusters_df = clusters_df[clusters_df['cluster_0'].isin(large_clusters)]

In [7]:
# initiate dictionaries for each cluster type
# dict_0_0 = create_initial_dict(large_clusters_df, '0_0')
dict_start_0_ = create_initial_dict(large_clusters_df, 'start_0_')
# dict_end__0 = create_initial_dict(large_clusters_df, 'end__0')
dict_other = create_initial_dict(large_clusters_df, 'other')

# prepare dicts

In [8]:
min_cluster_size=1
min_samples=1
# cluster_selection_epsilon=0.0
max_cluster_size=30
metric="euclidean"
n_jobs=-1
cluster_selection_method="leaf"
store_centers="medoid"

In [9]:
def process_clusters(clusters_df, cluster_type, results_dict):
    # first filter on cluster type
    clusters_df_filtered = filter_on_cluster_type(clusters_df, cluster_type)
    # # test on 100 data points
    # clusters_df_filtered = clusters_df_filtered.head(1000) # For testing, remove this line for full dataset
    # retrive the list of clusters in cluster_0
    clusters = clusters_df_filtered['cluster_0'].unique().tolist()

    for eps in tqdm(eps_grid[cluster_type], desc=f"Processing {cluster_type} clusters"):
        for cluster in clusters:
            # first filter on this cluster
            df = clusters_df_filtered[clusters_df_filtered['cluster_0'] == cluster].copy()
            embeddings = np.vstack(df['full_name_embedding'].values)

            # Add subcluster and cluster_1 columns
            df['subcluster'] = None
            df['cluster_1'] = None
            
            # Apply HDBSCAN clustering
            labels, comp_time = run_hdbscan(
                embeddings,
                min_cluster_size=min_cluster_size,
                min_samples=min_samples,
                cluster_selection_epsilon=float(eps),
                max_cluster_size=max_cluster_size,
                metric=metric,
                n_jobs=n_jobs,
                cluster_selection_method=cluster_selection_method,
                store_centers=store_centers,
            )
            
            df['subcluster'] = labels.astype(str)  # Convert labels to string for subcluster
            df['cluster_1'] = df['cluster_0'] + '_' + df['subcluster']

            # Vectorized update: set full_name as index for both DataFrames, then update
            updates = df[['full_name', 'subcluster', 'cluster_1']].set_index('full_name')
            res_df = results_dict[eps].set_index('full_name')
            res_df.update(updates)
            results_dict[eps] = res_df.reset_index()

    return results_dict

In [None]:
# # test on 100 data points on '0_0' clusters
# dict_0_0 = process_clusters(large_clusters_df, '0_0', dict_0_0)

Processing 0_0 clusters: 100%|██████████| 8/8 [18:44<00:00, 140.58s/it]


In [13]:
dict_start_0_ = process_clusters(large_clusters_df, 'start_0_', dict_start_0_)

Processing start_0_ clusters:   0%|          | 0/10 [00:00<?, ?it/s]

Processing start_0_ clusters: 100%|██████████| 10/10 [04:12<00:00, 25.24s/it]


In [14]:
dict_end__0 = process_clusters(large_clusters_df, 'end__0', dict_end__0)

Processing end__0 clusters: 100%|██████████| 5/5 [00:06<00:00,  1.22s/it]


In [15]:
dict_other = process_clusters(large_clusters_df, 'other', dict_other)

Processing other clusters: 100%|██████████| 9/9 [00:23<00:00,  2.67s/it]


In [None]:
# save each dictionary to a json file dict_0_0, dict_start_0_, dict_end__0, dict_other
# not looping on eps values, save direclty all eps values in on single json file
def save_dict_to_json(data_dict, file_path):
    with open(file_path, 'w') as f:
        json.dump(data_dict, f, indent=4)
# # transform each object (dataframe) in the dict to a dict
# dict_0_0 = {eps: df.to_dict(orient='records') for eps, df in dict_0_0.items()}
dict_start_0_ = {eps: df.to_dict(orient='records') for eps, df in dict_start_0_.items()}
dict_end__0 = {eps: df.to_dict(orient='records') for eps, df in dict_end__0.items()}
dict_other = {eps: df.to_dict(orient='records') for eps, df in dict_other.items()}

# Save dictionaries to json files
# save_dict_to_json(dict_0_0, output_dir / 'dict_0_0.json')
save_dict_to_json(dict_start_0_, output_dir / 'dict_start_0_.json')
# save_dict_to_json(dict_end__0, output_dir / 'dict_end__0.json')
save_dict_to_json(dict_other, output_dir / 'dict_other.json')

# compute original metrics

In [10]:
clusters_df[f"corrected_cluster"] = clusters_df[f"corrected_cluster"].apply(
    lambda x: None if is_noise(x) else x
)
# recompute current performance metrics
cluster_map = dict(zip(clusters_df["full_name"], clusters_df[f"corrected_cluster"]))
# Apply predictions based on cluster mapping
eval_df_any = apply_predictions(
    eval_df_any, cluster_map, col_el_1="full_name", col_el_2="alias"
)
# Compute metrics (assuming ground truth is in column "final_label")
metrics_any = compute_metrics(eval_df_any["final_label"], eval_df_any["prediction"])
# log the metrics with 2 decimal precision
metrics_any_to_print = {
    k: round(v, 2) if isinstance(v, float) else v for k, v in metrics_any.items()
}
print(f"Metrics for any pairs: {metrics_any_to_print}")

# Apply predictions
eval_df_famous = apply_predictions(
    eval_df_famous, cluster_map, col_el_1="full_name", col_el_2="alias"
)
metrics_famous = compute_metrics(
    eval_df_famous["final_label"], eval_df_famous["prediction"]
)
# log the metrics with 2 decimal precision
metrics_famous_to_print = {
    k: round(v, 2) if isinstance(v, float) else v for k, v in metrics_famous.items()
}
print(f"Metrics for famous pairs: {metrics_famous_to_print}")


Metrics for any pairs: {'precision': 0.72, 'recall': 0.79, 'f1': 0.76, 'accuracy': 0.81}
Metrics for famous pairs: {'precision': 0.81, 'recall': 0.77, 'f1': 0.79, 'accuracy': 0.86}


# reload an compute metrics

In [11]:
# reload the dictionaries from json files
def load_dict_from_json(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)
output_dir_original = Path("data/W03/from_notebooks/R06_additional_grid_search/v2")
# dict_0_0 = load_dict_from_json(output_dir_original / 'dict_0_0.json')
dict_start_0_ = load_dict_from_json(output_dir_original / 'dict_start_0_.json')
# dict_end__0 = load_dict_from_json(output_dir_original / 'dict_end__0.json')
dict_other = load_dict_from_json(output_dir_original / 'dict_other.json')

# transform each object (dict) in the dict to a dataframe
def transform_dict_to_df(data_dict):
    return {eps: pd.DataFrame(records) for eps, records in data_dict.items()}
# dict_0_0 = transform_dict_to_df(dict_0_0)
dict_start_0_ = transform_dict_to_df(dict_start_0_)
# dict_end__0 = transform_dict_to_df(dict_end__0)
dict_other = transform_dict_to_df(dict_other)

In [16]:
# initiate grid_search_results with columns eps_0_0, eps_start_0_, eps_end__0, eps_other, f1_any, precision_any, recall_any, f1_famous, precision_famous, recall_famous, f1_mean, precision_mean, recall_mean,
grid_search_results = pd.DataFrame(
    columns=[
        # "eps_0_0",
        "eps_start_0_",
        # "eps_end__0",
        "eps_other",
        "f1_any",
        "precision_any",
        "recall_any",
        "f1_famous",
        "precision_famous",
        "recall_famous",
        "f1_mean",
        "precision_mean",
        "recall_mean",
    ]
)
# finally loop on all combiinations of epsilons in all dictionaries
from itertools import product

# Create a list of all combinations of epsilons
eps_combinations = list(
    product(
        eps_grid["start_0_"], eps_grid["other"]
    )
) # eps_grid["0_0"], eps_grid["end__0"]
best_f1_mean = -1
best_precision_mean= -1
best_config = None
clusters_df["subcluster"] = None
clusters_df["cluster_1"] = clusters_df["cluster_0"]
clusters_best_config_df = clusters_df.copy()
# Loop through each combination of epsilons
for eps_combination in tqdm(eps_combinations, desc="Processing epsilon combinations"):
    clusters_copy = clusters_df.copy()
    clusters_copy = clusters_copy.set_index("full_name")

    eps_start_0_, eps_other = eps_combination # eps_0_0, eps_end__0
    # Create a new row for the results DataFrame
    new_row = {
        # "eps_0_0": eps_0_0,
        "eps_start_0_": eps_start_0_,
        # "eps_end__0": eps_end__0,
        "eps_other": eps_other,
    }

    # Update clusters_df with the new cluster_1 and subcluster for all dictionaries matching on 'full_name'
    for cluster_type, dict_data, eps in zip(
        ["start_0_", "other"], # "0_0", "end__0", 
        [dict_start_0_,dict_other], # dict_0_0,  dict_end__0, 
        [eps_start_0_, eps_other], # eps_0_0, eps_end__0, 
    ):
        df = dict_data[eps].copy().set_index("full_name")
        clusters_copy.update(df[["subcluster", "cluster_1"]])

    # reset normal index
    clusters_copy.reset_index(inplace=True)
    
    clusters_copy["cluster_1"] = clusters_copy["cluster_1"].apply(
        lambda x: None if is_noise(x) else x
    )
    cluster_map = dict(zip(clusters_copy["full_name"], clusters_copy["cluster_1"]))

    # Compute metrics on the updated clusters_df
    # Apply predictions
    eval_df_any = apply_predictions(
        eval_df_any, cluster_map, col_el_1="full_name", col_el_2="alias"
    )
    metrics_any = compute_metrics(eval_df_any["final_label"], eval_df_any["prediction"])
    eval_df_famous = apply_predictions(
        eval_df_famous, cluster_map, col_el_1="full_name", col_el_2="alias"
    )
    metrics_famous = compute_metrics(
        eval_df_famous["final_label"], eval_df_famous["prediction"]
    )

    # Add metrics to the new row
    new_row.update(
        {
            "f1_any": metrics_any["f1"],
            "precision_any": metrics_any["precision"],
            "recall_any": metrics_any["recall"],
            "f1_famous": metrics_famous["f1"],
            "precision_famous": metrics_famous["precision"],
            "recall_famous": metrics_famous["recall"],
            "f1_mean": (metrics_any["f1"] + metrics_famous["f1"]) / 2,
            "precision_mean": (metrics_any["precision"] + metrics_famous["precision"])
            / 2,
            "recall_mean": (metrics_any["recall"] + metrics_famous["recall"]) / 2,
        }
    )

    # Check if this is the best configuration so far. if yes, then update best_config and best_f1_mean
    if new_row["precision_mean"] > best_precision_mean:
        print(f"--- New best precision_mean: {round(new_row['precision_mean'],2)} with eps: {eps_combination}")
        best_precision_mean = new_row["precision_mean"]
        
    if new_row["f1_mean"] > best_f1_mean:
        print(f"--- New best f1_mean: {round(new_row['f1_mean'],2)} with eps: {eps_combination}")
        clusters_best_config_df = clusters_copy.copy()
        best_f1_mean = new_row["f1_mean"]
        best_config = {
            # "eps_0_0": eps_0_0,
            "eps_start_0_": eps_start_0_,
            # "eps_end__0": eps_end__0,
            "eps_other": eps_other,
            "f1_mean": best_f1_mean,
            "precision_mean": new_row["precision_mean"],
            "recall_mean": new_row["recall_mean"],
            "f1_any": new_row["f1_any"],
            "precision_any": new_row["precision_any"],
            "recall_any": new_row["recall_any"],
            "f1_famous": new_row["f1_famous"],
            "precision_famous": new_row["precision_famous"],
            "recall_famous": new_row["recall_famous"],
        }
        # convert all values in best_config to float with 2 decimal places
        best_config = {k: round(float(v), 2) for k, v in best_config.items()}

    # Append the new row to the results DataFrame
    grid_search_results = pd.concat(
        [grid_search_results, pd.DataFrame([new_row])],
        ignore_index=True
    )

# Save grid search results to Excel
grid_search_results.to_excel(results_xlsx, index=False)
# print and save best configuration to JSON
print(f"Best configuration: {best_config}")
with open(best_config_json, 'w') as f:
    json.dump(best_config, f, indent=4)

Processing epsilon combinations:   0%|          | 0/90 [00:00<?, ?it/s]

  grid_search_results = pd.concat(
Processing epsilon combinations:   1%|          | 1/90 [00:00<00:49,  1.82it/s]

--- New best precision_mean: 0.8 with eps: ('0.34', '0.19')
--- New best f1_mean: 0.76 with eps: ('0.34', '0.19')


Processing epsilon combinations:   2%|▏         | 2/90 [00:01<00:48,  1.81it/s]

--- New best f1_mean: 0.76 with eps: ('0.34', '0.2')


Processing epsilon combinations:   3%|▎         | 3/90 [00:01<00:47,  1.82it/s]

--- New best f1_mean: 0.76 with eps: ('0.34', '0.21')


Processing epsilon combinations:   4%|▍         | 4/90 [00:02<00:47,  1.82it/s]

--- New best f1_mean: 0.77 with eps: ('0.34', '0.22')


Processing epsilon combinations:   6%|▌         | 5/90 [00:02<00:46,  1.83it/s]

--- New best f1_mean: 0.77 with eps: ('0.34', '0.23')


Processing epsilon combinations:   7%|▋         | 6/90 [00:03<00:46,  1.81it/s]

--- New best f1_mean: 0.77 with eps: ('0.34', '0.24')


Processing epsilon combinations:   8%|▊         | 7/90 [00:03<00:45,  1.82it/s]

--- New best f1_mean: 0.78 with eps: ('0.34', '0.25')


Processing epsilon combinations:   9%|▉         | 8/90 [00:04<00:44,  1.82it/s]

--- New best f1_mean: 0.78 with eps: ('0.34', '0.26')


Processing epsilon combinations: 100%|██████████| 90/90 [00:48<00:00,  1.86it/s]

Best configuration: {'eps_start_0_': 0.34, 'eps_other': 0.26, 'f1_mean': 0.78, 'precision_mean': 0.79, 'recall_mean': 0.77, 'f1_any': 0.77, 'precision_any': 0.75, 'recall_any': 0.79, 'f1_famous': 0.79, 'precision_famous': 0.83, 'recall_famous': 0.75}





In [15]:
# show the best configuration datframe
print("Best configuration DataFrame:")
display(clusters_best_config_df.head(50))

Best configuration DataFrame:


Unnamed: 0,full_name,number_of_occurrences,cluster_0,corrected_cluster,full_name_embedding,subcluster,cluster_1
0,Spanish ABPM Registry (ABPM),12,1_1,1_1,"[-0.035308838, -0.005203247, 0.026031494, -0.0...",,
1,Fasa Registry for Systolic Heart Failure (FARSH),1,2,2,"[-0.03074646, 0.028259277, 0.022598267, -0.019...",,
2,OnCovid Registry (OnCovid),12,3,3,"[-0.019241333, -0.008605957, 0.04559326, -0.01...",,
3,New York State Cancer Registry (NYSCR),62,4,4,"[-0.0053710938, 0.018249512, 0.044647217, -0.0...",,
4,China Liver Transplant Registry (CLTR),39,5_1,5_1,"[-0.03540039, 0.009819031, -0.0044784546, -0.0...",,
5,CMR-COVID (CMR-COVID),2,0_0,,"[-0.034179688, 0.0039596558, 0.05001831, -0.02...",,
6,SAR-COVID (SAR-COVID),3,0_0,,"[-0.0016880035, 0.014373779, 0.05230713, 0.000...",,
7,Swedish Neonatal Quality Register (SNQ),34,6_1,6_1,"[-0.045928955, 0.035369873, 0.01473999, 0.0049...",,
8,Persistent Pain Outcomes Collaboration (PPOC),1,7,7,"[-0.053222656, 0.016342163, 0.062927246, 0.006...",,
9,National Health Insurance claims data (NHI),3,8_1,8_1,"[-0.014968872, 0.0065956116, 0.048461914, -0.0...",1.0,8_1_1


## Grid search complete
- All results are saved in `grid_search_results.xlsx`.
- The best configuration is saved in `best_config.json`.
- You can now use the best configuration for further clustering and evaluation.