## Setup & Imports

In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import logging

# Run from the project root so that the `src.p05_refine_dedup` imports below and the
# relative `data/...` paths used later in this notebook resolve.
# The default is the original author's checkout; set the P05_WORKING_DIR environment
# variable to point at another checkout instead of editing this cell.
working_dir = os.environ.get(
    "P05_WORKING_DIR",
    '/home/gpinon/more_europa/clean_rdc_experiments/projects/P05_refine_dedup',
)
os.chdir(working_dir)
print(f'Changed working directory to {working_dir}')

from src.p05_refine_dedup import config
from src.p05_refine_dedup.utils.utils import (
    is_noise,
    run_hdbscan,
    apply_predictions,
    compute_metrics,
)
from src.p05_refine_dedup.utils.s3_io_functions import (
    load_parquet_from_s3,
)  # for loading embeddings

# Configure root logging first (INFO level, "timestamp level message" layout), then grab
# the logger this notebook writes to.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
logger = logging.getLogger(__name__)


Changed working directory to /home/gpinon/more_europa/clean_rdc_experiments/projects/P05_refine_dedup


  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [2]:
# Load the registry-name embeddings (a parquet file) from S3.
s3_input_embeddings = "registry_data_catalog_experiments/P05_refine_dedup/registry_names_embeddings.parquet"
logger.info(f"Loading embeddings from {s3_input_embeddings}")

# Split the key once on its last "/": the left part is the folder, the right part the file.
folder_path, file_name = s3_input_embeddings.rsplit("/", 1)
bucket_name = config.BUCKET_NAME_DEV

embeddings_df = load_parquet_from_s3(
    bucket_name=bucket_name, folder_path=folder_path, file_name=file_name
)

2025-08-07 22:38:18,845 INFO Loading embeddings from registry_data_catalog_experiments/P05_refine_dedup/registry_names_embeddings.parquet


In [3]:
# Both labelled pair workbooks sit in the same folder; only the file name differs.
_eval_pairs_dir = "data/W01/R03_eval_pairs_similarity_assessment_with_llm/gpt4_1_openai"
evaluation_dataset_any = f"{_eval_pairs_dir}/assessed_pairs_v1.xlsx"
evaluation_dataset_famous = f"{_eval_pairs_dir}/famous_close_assessed_pairs_v1.xlsx"

# best_config="etc/best_config.json", # this is the best config obtained from the previous step, in json format


In [4]:
# Read the two labelled pair workbooks in turn, announcing each path before it is loaded.
loaded_eval_frames = []
for pair_kind, xlsx_path in (
    ("any", evaluation_dataset_any),
    ("famous", evaluation_dataset_famous),
):
    print(f"Load evaluation dataset ({pair_kind} pairs) from {xlsx_path}")
    loaded_eval_frames.append(pd.read_excel(xlsx_path))
eval_df_any, eval_df_famous = loaded_eval_frames

Load evaluation dataset (any pairs) from data/W01/R03_eval_pairs_similarity_assessment_with_llm/gpt4_1_openai/assessed_pairs_v1.xlsx
Load evaluation dataset (famous pairs) from data/W01/R03_eval_pairs_similarity_assessment_with_llm/gpt4_1_openai/famous_close_assessed_pairs_v1.xlsx


# 1. Compute first global clustering with hdbscan

In [5]:
# Output locations for this notebook; `step` identifies the clustering pass in column names.
version = "v1"
step = 1

# Every artefact of this run is written to the same versioned folder.
results_root = f"data/W03/from_notebooks/R03_evaluate_hdbscan_eom_model_performance/{version}"
clusters_table_output = f"{results_root}/clusters_table.xlsx"
prediction_any_results_xlsx = f"{results_root}/prediction_results_any_reg.xlsx"
prediction_famous_results_xlsx = f"{results_root}/prediction_results_famous_reg.xlsx"

# Create that folder up front (no error if it is already there).
output_dir = Path(clusters_table_output).parent
output_dir.mkdir(parents=True, exist_ok=True)

## Apply HDBSCAN clustering

In [6]:
# # select top n rows for testing
# n_rows = 100
# embeddings_df = embeddings_df.head(n_rows)

In [7]:
# Settings for the first, global clustering pass; all of them are handed to run_hdbscan.
# Cluster-size and density controls.
min_samples = 2
min_cluster_size = 2
max_cluster_size = 30
cluster_selection_epsilon = 0.0
# Distance and cluster-selection strategy.
metric = "euclidean"
cluster_selection_method = "eom"
store_centers = "medoid"
# Parallelism setting (-1 conventionally means "all cores"; confirm against run_hdbscan).
n_jobs = -1

In [8]:
# Stack the per-row embedding vectors into a single array and cluster all rows at once.
embeddings = np.vstack(embeddings_df["full_name_embedding"].values)

hdbscan_kwargs = {
    "min_cluster_size": min_cluster_size,
    "min_samples": min_samples,
    "cluster_selection_epsilon": cluster_selection_epsilon,
    "max_cluster_size": max_cluster_size,
    "metric": metric,
    "n_jobs": n_jobs,
    "cluster_selection_method": cluster_selection_method,
    "store_centers": store_centers,
}
labels, comp_time = run_hdbscan(embeddings, **hdbscan_kwargs)

# Work on a copy so the loaded embeddings frame itself never gains the label column.
clusters_df = embeddings_df.copy()
clusters_df[f"cluster_{step}"] = labels

2025-08-07 23:22:49,054 INFO HDBSCAN completed in 2664.16 seconds with min_cluster_size=2, min_samples=2.


In [9]:
# Keep each label as-is unless `is_noise` flags it, in which case the row gets no cluster.
step_col = f"cluster_{step}"
clusters_df[step_col] = clusters_df[step_col].apply(
    lambda label: None if is_noise(label) else label
)

# Lookup from registry name to its (possibly missing) cluster label.
cluster_map = {
    name: label
    for name, label in zip(clusters_df["full_name"], clusters_df[step_col])
}

In [10]:
# Save the clusters table: every column of clusters_df is written (without the index) to
# clusters_table_output, which later cells read back with pd.read_excel.
# The column subset below is not applied; it is kept only as a reference.
# columns_to_keep = [
#     "object_id",
#     "full_name",
#     "number_of_occurrences",
#     f"cluster_{step}",
#]
clusters_df.to_excel(clusters_table_output, index=False)

## Assess performance

In [11]:
# Read the saved table back from disk and re-apply the noise rule to the reloaded labels.
clusters_df = pd.read_excel(clusters_table_output)
reloaded_col = f"cluster_{step}"
clusters_df[reloaded_col] = clusters_df[reloaded_col].apply(
    lambda label: None if is_noise(label) else label
)

In [12]:
# Summarise how many names (rows) and how many publications (occurrence counts) are left
# without a cluster at this step. Rows with a missing label are the noise rows.
stats_col = f"cluster_{step}"
noise_clusters = clusters_df[clusters_df[stats_col].isna()]

n_raw_names = len(clusters_df)
n_noise_names = len(noise_clusters)
n_consolidated = clusters_df[stats_col].nunique()  # missing labels are not counted
noise_percentage = n_noise_names / n_raw_names * 100

print(f"Number of raw extracted registry names: {n_raw_names}")
print(f"Number of Consolidated registry names: {n_consolidated}")
print(f"Number of un-consolidated/lost (noise) registry names: {n_noise_names}")
print(f"Percentage of Consolidated registry names: {(100-noise_percentage):.2f}%")
print(f"Percentage of un-consolidated/lost (noise) registry names: {noise_percentage:.2f}%")
print('---')

# Same breakdown, weighted by how often each name occurs.
total_occurrences = clusters_df['number_of_occurrences'].sum()
total_noise_occurrences = noise_clusters['number_of_occurrences'].sum()
consolidated_occurrences = total_occurrences - total_noise_occurrences
lost_percentage = total_noise_occurrences / total_occurrences * 100

print(f"Total number of publications with one of the 'official' extracted registry names: {total_occurrences}")
print(f"Total number of publications with one of the consolidated registry names: {consolidated_occurrences}")
print(f"Total number of publications lost (with one of the un-consolidated/lost (noise) registry names): {total_noise_occurrences}")
print(f"Percentage of lost publications: {lost_percentage:.2f}%")

Number of raw extracted registry names: 54335
Number of Consolidated registry names: 12462
Number of un-consolidated/lost (noise) registry names: 17870
Percentage of Consolidated registry names: 67.11%
Percentage of un-consolidated/lost (noise) registry names: 32.89%
---
Total number of publications with one of the 'official' extracted registry names: 163102
Total number of publications with one of the consolidated registry names: 134139
Total number of publications lost (with one of the un-consolidated/lost (noise) registry names): 28963
Percentage of lost publications: 17.76%


### a. Dataset 'Any'

In [13]:
# Rebuild the name -> cluster label lookup from the reloaded table.
cluster_map = {
    name: label
    for name, label in zip(clusters_df["full_name"], clusters_df[f"cluster_{step}"])
}

In [14]:
# Derive a prediction for each evaluated pair from the cluster lookup of its two names.
eval_df_any = apply_predictions(
    eval_df_any, cluster_map, col_el_1="full_name", col_el_2="alias"
)

# Score the predictions against the "final_label" column, used here as ground truth.
metrics_any = compute_metrics(eval_df_any["final_label"], eval_df_any["prediction"])

# For display, round float metrics to two decimals and leave anything else untouched.
metrics_any_to_print = {
    key: (round(value, 2) if isinstance(value, float) else value)
    for key, value in metrics_any.items()
}
print(f"Metrics for any pairs: {metrics_any_to_print}")

# Export the pairs together with their labels and predictions.
any_export_columns = [
    "full_name",
    "alias",
    "number_of_occurrences",
    "alias_number_of_occurrences",
    "similarity",
    "uncertain",
    "final_label",
    "prediction",
]
eval_df_any[any_export_columns].to_excel(prediction_any_results_xlsx, index=False)

Metrics for any pairs: {'precision': 0.71, 'recall': 0.75, 'f1': 0.73, 'accuracy': 0.8}


### b. Dataset 'famous'

In [15]:
# Derive a prediction for each 'famous' pair from the cluster lookup of its two names.
eval_df_famous = apply_predictions(
    eval_df_famous, cluster_map, col_el_1="full_name", col_el_2="alias"
)

# Score the predictions against the "final_label" column.
famous_truth = eval_df_famous["final_label"]
famous_predicted = eval_df_famous["prediction"]
metrics_famous = compute_metrics(famous_truth, famous_predicted)

# For display, round float metrics to two decimals and leave anything else untouched.
metrics_famous_to_print = {
    key: (round(value, 2) if isinstance(value, float) else value)
    for key, value in metrics_famous.items()
}
print(f"Metrics for famous pairs: {metrics_famous_to_print}")

# Export the pairs together with their labels and predictions.
famous_export_columns = [
    "full_name",
    "alias",
    "number_of_occurrences",
    "alias_number_of_occurrences",
    "similarity",
    "uncertain",
    "final_label",
    "prediction",
]
eval_df_famous[famous_export_columns].to_excel(
    prediction_famous_results_xlsx, index=False
)

Metrics for famous pairs: {'precision': 0.86, 'recall': 0.49, 'f1': 0.63, 'accuracy': 0.8}


# 2. Recompute a second step of clustering on noise and large clusters

In [16]:
# Start the second pass from the table saved on disk, keeping only the identifying columns
# and the labels of the previous step.
step = 2
prev_cluster_col = f"cluster_{step-1}"
kept_columns = ["object_id", "full_name", "number_of_occurrences", prev_cluster_col]
clusters_df = pd.read_excel(clusters_table_output)[kept_columns].copy()

# Missing previous-step labels become 0, then every label is turned into an integer string
# (e.g. "0", "10357") so it can later be used as a prefix of the refined label.
clusters_df[prev_cluster_col] = (
    clusters_df[prev_cluster_col].fillna(0).astype(int).astype(str)
)

In [17]:
# Count the rows per previous-step label once, and reuse the counts for both outputs below.
n_max = 20
prev_cluster_sizes = clusters_df[f"cluster_{step-1}"].value_counts()

# Labels with at least n_max rows are the ones selected for re-clustering.
large_clusters = prev_cluster_sizes[prev_cluster_sizes >= n_max].index.tolist()

# Size table covering all labels (not only the large ones); the first 10 rows are shown.
large_clusters_df = prev_cluster_sizes.reset_index()
large_clusters_df.columns = [f"cluster_{step-1}", "count"]

print(f"Clusters of size >={n_max}: {len(large_clusters)}")
display(large_clusters_df.head(10))

Clusters of size >=20: 6


Unnamed: 0,cluster_1,count
0,0,17870
1,10357,24
2,9379,23
3,12030,23
4,8818,23
5,9547,23
6,10385,19
7,8269,19
8,6257,18
9,11616,18


In [18]:
# Rebuild the full embedding array from the loaded frame so this section does not depend on
# the array created during the first pass.
embeddings = np.vstack(embeddings_df["full_name_embedding"].values)

# Settings for the second pass: only the two minimum-size values differ from the first pass.
min_samples = 3
min_cluster_size = 3
max_cluster_size = 30
cluster_selection_epsilon = 0.0
metric = "euclidean"
cluster_selection_method = "eom"
store_centers = "medoid"
n_jobs = -1

In [19]:
# Second pass: re-run HDBSCAN separately on the members of every label in `large_clusters`.
# For those rows:
#   - `subcluster_{step}` receives the label returned by the nested run, and
#   - `cluster_{step}` becomes "<previous label>_<sub label>".
# Every other row keeps its previous-step label in `cluster_{step}`.
prev_col = f"cluster_{step-1}"
new_col = f"cluster_{step}"
sub_col = f"subcluster_{step}"

# `embeddings` is addressed by row position while `clusters_df` is addressed by index label.
# Members are therefore selected with a positional boolean mask, which requires both objects
# to describe the same rows in the same order. At minimum they must have the same length.
if len(clusters_df) != len(embeddings):
    raise ValueError(
        f"clusters_df has {len(clusters_df)} rows but embeddings has {len(embeddings)}; "
        "they must be row-aligned before re-clustering."
    )

clusters_df[new_col] = clusters_df[prev_col]
for cluster in tqdm(large_clusters, desc="Processing large clusters"):
    # Positional mask of the rows currently carrying this label.
    member_mask = (clusters_df[prev_col] == cluster).to_numpy()
    cluster_embeddings = embeddings[member_mask]

    # Cluster only the members of the current label.
    sub_labels, _ = run_hdbscan(
        cluster_embeddings,
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_epsilon=cluster_selection_epsilon,
        max_cluster_size=max_cluster_size,
        metric=metric,
        n_jobs=n_jobs,
        cluster_selection_method=cluster_selection_method,
        store_centers=store_centers,
    )

    # Record the nested labels for these rows.
    clusters_df.loc[member_mask, sub_col] = sub_labels
    # Build the composite ids one by one: plain string formatting behaves the same on every
    # NumPy version, unlike adding a Python str to a NumPy string array.
    new_labels = [f"{cluster}_{sub_label}" for sub_label in sub_labels]
    clusters_df.loc[member_mask, new_col] = new_labels

Processing large clusters:   0%|          | 0/6 [00:00<?, ?it/s]2025-08-07 23:28:09,230 INFO HDBSCAN completed in 296.01 seconds with min_cluster_size=3, min_samples=3.
Processing large clusters:  17%|█▋        | 1/6 [04:56<24:40, 296.05s/it]2025-08-07 23:28:09,280 INFO HDBSCAN completed in 0.03 seconds with min_cluster_size=3, min_samples=3.
2025-08-07 23:28:09,310 INFO HDBSCAN completed in 0.02 seconds with min_cluster_size=3, min_samples=3.
2025-08-07 23:28:09,344 INFO HDBSCAN completed in 0.03 seconds with min_cluster_size=3, min_samples=3.
Processing large clusters:  67%|██████▋   | 4/6 [04:56<01:52, 56.04s/it] 2025-08-07 23:28:09,374 INFO HDBSCAN completed in 0.02 seconds with min_cluster_size=3, min_samples=3.
2025-08-07 23:28:09,403 INFO HDBSCAN completed in 0.02 seconds with min_cluster_size=3, min_samples=3.
Processing large clusters: 100%|██████████| 6/6 [04:56<00:00, 49.37s/it]


In [20]:
# step=2
# new_labels = str(cluster) + '_' + sub_labels.astype(str)
# # show top 5 labels
# print (f"New labels for step {step}: {new_labels[:5]}")
# clusters_df.loc[indices, f"cluster_{step}"] = new_labels
# # display(clusters_df.head(50))

In [21]:
# Count the rows per refined label and show the 10 most frequent ones.
step_label_col = f"cluster_{step}"
clusters_step_2 = (
    clusters_df[step_label_col]
    .value_counts()
    .rename_axis(step_label_col)
    .reset_index(name="count")
)
print(f"Number of clusters in step {step}: {len(clusters_step_2)}")
display(clusters_step_2.head(10))

Number of clusters in step 2: 12962


Unnamed: 0,cluster_2,count
0,0_0,14996
1,0_251,25
2,0_489,24
3,0_313,24
4,0_148,24
5,0_362,23
6,9379_0,23
7,0_340,23
8,0_342,22
9,0_343,22


In [22]:
# Keep each refined label as-is unless `is_noise` flags it, in which case the row gets no
# cluster.
refined_col = f"cluster_{step}"
clusters_df[refined_col] = clusters_df[refined_col].apply(
    lambda label: None if is_noise(label) else label
)

# Lookup from registry name to its (possibly missing) refined cluster label.
cluster_map = {
    name: label
    for name, label in zip(clusters_df["full_name"], clusters_df[refined_col])
}

In [23]:
# Save the clusters table: every column of clusters_df is written (without the index) to
# clusters_table_output. This is the same path the first-step table was saved to, so that
# file is replaced by the current table.
# The column subset below is not applied; it is kept only as a reference.
# columns_to_keep = [
#     "object_id",
#     "full_name",
#     "number_of_occurrences",
#     f"cluster_{step}",
#]
clusters_df.to_excel(clusters_table_output, index=False)

## Assess performance

In [24]:
# Summarise how many names (rows) and how many publications (occurrence counts) are left
# without a cluster after this step. Rows with a missing label are the noise rows.
stats_col = f"cluster_{step}"
noise_clusters = clusters_df[clusters_df[stats_col].isna()]

n_raw_names = len(clusters_df)
n_noise_names = len(noise_clusters)
n_consolidated = clusters_df[stats_col].nunique()  # missing labels are not counted
noise_percentage = n_noise_names / n_raw_names * 100

print(f"Number of raw extracted registry names: {n_raw_names}")
print(f"Number of Consolidated registry names: {n_consolidated}")
print(f"Number of un-consolidated/lost (noise) registry names: {n_noise_names}")
print(f"Percentage of Consolidated registry names: {(100-noise_percentage):.2f}%")
print(f"Percentage of un-consolidated/lost (noise) registry names: {noise_percentage:.2f}%")
print('---')

# Same breakdown, weighted by how often each name occurs.
total_occurrences = clusters_df['number_of_occurrences'].sum()
total_noise_occurrences = noise_clusters['number_of_occurrences'].sum()
consolidated_occurrences = total_occurrences - total_noise_occurrences
lost_percentage = total_noise_occurrences / total_occurrences * 100

print(f"Total number of publications with one of the 'official' extracted registry names: {total_occurrences}")
print(f"Total number of publications with one of the consolidated registry names: {consolidated_occurrences}")
print(f"Total number of publications lost (with one of the un-consolidated/lost (noise) registry names): {total_noise_occurrences}")
print(f"Percentage of lost publications: {lost_percentage:.2f}%")

Number of raw extracted registry names: 54335
Number of Consolidated registry names: 12956
Number of un-consolidated/lost (noise) registry names: 15054
Percentage of Consolidated registry names: 72.29%
Percentage of un-consolidated/lost (noise) registry names: 27.71%
---
Total number of publications with one of the 'official' extracted registry names: 163102
Total number of publications with one of the consolidated registry names: 139836
Total number of publications lost (with one of the un-consolidated/lost (noise) registry names): 23266
Percentage of lost publications: 14.26%


### a. Dataset 'Any'

In [25]:
# Derive a prediction for each evaluated pair from the refined cluster lookup.
eval_df_any = apply_predictions(
    eval_df_any, cluster_map, col_el_1="full_name", col_el_2="alias"
)

# Score the predictions against the "final_label" column, used here as ground truth.
metrics_any = compute_metrics(eval_df_any["final_label"], eval_df_any["prediction"])

# For display, round float metrics to two decimals and leave anything else untouched.
metrics_any_to_print = {
    key: (round(value, 2) if isinstance(value, float) else value)
    for key, value in metrics_any.items()
}
print(f"Metrics for any pairs: {metrics_any_to_print}")

# Export the pairs with labels and predictions. The target path is the one already used
# for the first-step export in this notebook, so that file is replaced.
any_export_columns = [
    "full_name",
    "alias",
    "number_of_occurrences",
    "alias_number_of_occurrences",
    "similarity",
    "uncertain",
    "final_label",
    "prediction",
]
eval_df_any[any_export_columns].to_excel(prediction_any_results_xlsx, index=False)

Metrics for any pairs: {'precision': 0.7, 'recall': 0.75, 'f1': 0.73, 'accuracy': 0.79}


### b. Dataset 'famous'

In [26]:
# Derive a prediction for each 'famous' pair from the refined cluster lookup.
eval_df_famous = apply_predictions(
    eval_df_famous, cluster_map, col_el_1="full_name", col_el_2="alias"
)

# Score the predictions against the "final_label" column.
famous_truth = eval_df_famous["final_label"]
famous_predicted = eval_df_famous["prediction"]
metrics_famous = compute_metrics(famous_truth, famous_predicted)

# For display, round float metrics to two decimals and leave anything else untouched.
metrics_famous_to_print = {
    key: (round(value, 2) if isinstance(value, float) else value)
    for key, value in metrics_famous.items()
}
print(f"Metrics for famous pairs: {metrics_famous_to_print}")

# Export the pairs with labels and predictions. The target path is the one already used
# for the first-step export in this notebook, so that file is replaced.
famous_export_columns = [
    "full_name",
    "alias",
    "number_of_occurrences",
    "alias_number_of_occurrences",
    "similarity",
    "uncertain",
    "final_label",
    "prediction",
]
eval_df_famous[famous_export_columns].to_excel(
    prediction_famous_results_xlsx, index=False
)

Metrics for famous pairs: {'precision': 0.85, 'recall': 0.49, 'f1': 0.62, 'accuracy': 0.8}
