# Iterative Grid Search on 'Other' Clusters
This notebook runs successive grid searches on the 'other' clusters — those whose ID neither starts with `0_` nor ends with `_0` — that contain more than 20 members, writing the refined assignment to a new cluster column at each iteration. The process stops when no such cluster remains or after 30 iterations.

In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm

# Switch the process working directory to the project root. This happens before the
# `src.` imports below (presumably so that the `src` package is importable from here),
# and the relative `data/...` paths used later in this notebook are resolved against it.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
working_dir = "/home/gpinon/more_europa/clean_rdc_experiments/projects/P05_refine_dedup"
os.chdir(working_dir)

from src.p05_refine_dedup import config
from src.p05_refine_dedup.utils.utils import (
    is_noise,
    run_hdbscan,
    apply_predictions,
    compute_metrics,
)
from src.p05_refine_dedup.utils.s3_io_functions import load_parquet_from_s3

# --- Embeddings -------------------------------------------------------------
# The embeddings are stored as one parquet object; the loader wants the S3 key
# split into its folder part and its file-name part.
s3_input_embeddings = "registry_data_catalog_experiments/P05_refine_dedup/registry_names_embeddings.parquet"
bucket_name = config.BUCKET_NAME_DEV
folder_path, file_name = s3_input_embeddings.rsplit("/", 1)
embeddings_df = load_parquet_from_s3(
    bucket_name=bucket_name, folder_path=folder_path, file_name=file_name
)

# --- Starting cluster table -------------------------------------------------
# clusters_table_xlsx = 'data/W03/from_notebooks/R06_additional_grid_search/v5/clusters_best_config.xlsx'
clusters_table_xlsx = "data/W02/R02_evaluate_model_performance/clusters_table.xlsx"
clusters_df = (
    pd.read_excel(clusters_table_xlsx)
    # Any embedding / subcluster column coming from the spreadsheet is discarded;
    # the embedding is re-attached from the parquet source just below.
    .drop(columns=["full_name_embedding", "subcluster"], errors="ignore")
    .merge(
        embeddings_df[["full_name", "full_name_embedding"]],
        on="full_name",
        how="left",
    )
    # The incoming assignment (when named 'Final_Cluster') becomes iteration 0.
    .rename(columns={"Final_Cluster": "cluster_0"})
)

# --- Evaluation pair datasets -----------------------------------------------
evaluation_dataset_any = "data/W01/R03_eval_pairs_similarity_assessment_with_llm/gpt4_1_openai/assessed_pairs_v1.xlsx"
evaluation_dataset_famous = "data/W01/R03_eval_pairs_similarity_assessment_with_llm/gpt4_1_openai/famous_close_assessed_pairs_v1.xlsx"
eval_df_any, eval_df_famous = (
    pd.read_excel(xlsx_path)
    for xlsx_path in (evaluation_dataset_any, evaluation_dataset_famous)
)

  from .autonotebook import tqdm as notebook_tqdm


## Parameters and Helper Functions

In [None]:
# Candidate values passed as `cluster_selection_epsilon` to run_hdbscan; every value
# is tried at each iteration of the main loop.
# NOTE(review): np.arange with a non-integer step may or may not include a value close
# to the stop (0.27) depending on floating-point rounding — confirm the intended grid
# (np.linspace makes the number of points explicit).
eps_values = np.arange(0.21, 0.27, 0.01)
# Settings forwarded unchanged to run_hdbscan.
min_cluster_size = 2
min_samples = 2
# Forwarded to run_hdbscan and also used as the size threshold above which an
# 'other' cluster is selected for another split.
max_cluster_size = 20
metric = 'euclidean'
n_jobs = -1
cluster_selection_method = 'eom'
store_centers = 'medoid'
# Bound used to build the iteration range of the main loop.
max_iterations = 30

def filter_other_clusters(df, cluster_col):
    """Return the rows of ``df`` that belong to an 'other' cluster.

    The ids in ``cluster_col`` are compared as strings. A row is dropped when
    its id starts with ``'0_'`` or ends with ``'_0'``; every other row is kept,
    with the original index and columns.
    """
    cluster_ids = df[cluster_col].astype(str)
    is_excluded = cluster_ids.str.startswith('0_') | cluster_ids.str.endswith('_0')
    return df[~is_excluded]
def get_large_clusters(df, cluster_col, n_max):
    """List the ids in ``cluster_col`` that occur strictly more than ``n_max`` times.

    The ids are returned in the order produced by ``value_counts`` on the column.
    """
    sizes = df[cluster_col].value_counts()
    too_big = (sizes > n_max).to_numpy()
    return sizes.index[too_big].tolist()

## Main Loop: Iterative Grid Search on 'Other' Clusters

In [13]:
# current_col = 'cluster_0'
# for iteration in range(1, max_iterations + 1):
#     print(f'\n=== Iteration {iteration} ===')
#     # Find large 'other' clusters
#     other_df = filter_other_clusters(clusters_df, current_col)
#     large_clusters = get_large_clusters(other_df, current_col, max_cluster_size)
#     if not large_clusters:
#             print('No more large "other" clusters to split. Stopping.')
#             break
#     print(f'Number of "other" clusters > {max_cluster_size}:', len(large_clusters))
#     # Prepare for grid search
#     best_f1_mean = -1
#     best_metrics = None
#     best_eps = None
#     best_new_col = f'cluster_{iteration+1}'
#     best_clusters_df = None
#     for eps in eps_values:
#         # Only process large clusters
#         temp_df = clusters_df.copy()
#         temp_df[best_new_col] = temp_df[current_col]
#         for cluster_id in large_clusters:
#             mask = temp_df[current_col] == cluster_id
#             sub_df = temp_df[mask]
#             if len(sub_df) <= max_cluster_size:
#                 continue
#             embeddings = np.vstack(sub_df['full_name_embedding'].values)
#             labels, _ = run_hdbscan(embeddings, min_cluster_size=min_cluster_size, min_samples=min_samples, cluster_selection_epsilon=eps, max_cluster_size=max_cluster_size, metric=metric, n_jobs=n_jobs, cluster_selection_method=cluster_selection_method, store_centers=store_centers)
#             subcluster_labels = labels.astype(str)
#             new_ids = [f'{cluster_id}_{lbl}' if lbl != '-1' else None for lbl in subcluster_labels]
#             temp_df.loc[mask, best_new_col] = new_ids
#         # Evaluate
#         cluster_map = dict(zip(temp_df['full_name'], temp_df[best_new_col]))
#         eval_any = apply_predictions(eval_df_any.copy(), cluster_map, col_el_1='full_name', col_el_2='alias')
#         eval_famous = apply_predictions(eval_df_famous.copy(), cluster_map, col_el_1='full_name', col_el_2='alias')
#         metrics_any = compute_metrics(eval_any['final_label'], eval_any['prediction'])
#         metrics_famous = compute_metrics(eval_famous['final_label'], eval_famous['prediction'])
#         f1_mean = (metrics_any['f1'] + metrics_famous['f1']) / 2
#         if f1_mean > best_f1_mean:
#             best_f1_mean = f1_mean
#             best_metrics = {
#                 'eps_other': eps,
#                 'f1_any': metrics_any['f1'],
#                 'precision_any': metrics_any['precision'],
#                 'recall_any': metrics_any['recall'],
#                 'f1_famous': metrics_famous['f1'],
#                 'precision_famous': metrics_famous['precision'],
#                 'recall_famous': metrics_famous['recall'],
#                 'f1_mean': f1_mean
#             }
#             best_eps = eps
#             best_clusters_df = temp_df.copy()
#     # Print best result for this iteration
#     print(f'Best eps_other: {best_eps}')
#     print(f'Best metrics: {best_metrics}')
#     # Show how many clusters > 20 and top 5 largest clusters
#     vc = best_clusters_df[best_new_col].value_counts()
#     n_large = (vc > max_cluster_size).sum()
#     print(f'Clusters > {max_cluster_size}: {n_large}')
#     print('Top 5 clusters:')
#     display(vc.head(5))
#     # Prepare for next iteration
#     clusters_df = best_clusters_df
#     current_col = best_new_col
# print('Loop finished.')

## Tracking and Saving Results
This section adds progress bars, tracks performance at each iteration, and saves results at the end.

In [None]:
from tqdm import tqdm
import json
from pathlib import Path
version = 'v2'
results_dir = Path(f'data/W03/from_notebooks/R07_loop_grid_searchs/{version}')
results_dir.mkdir(parents=True, exist_ok=True)

# One record per completed iteration: the selected epsilon, its metrics, the running
# maxima of mean precision / recall and the number of clusters that were split.
perf_tracking = []
best_precision_mean_so_far = -1
best_recall_mean_so_far = -1
current_col = 'cluster_0'
# Run at most `max_iterations` refinement passes (iterations 0 .. max_iterations - 1).
# The previous `range(0, max_iterations + 1)` allowed one pass more than the limit.
for iteration in tqdm(range(max_iterations), desc='Iterations'):
    print(f'\n=== Iteration {iteration} ===')
    # Find large 'other' clusters
    other_df = filter_other_clusters(clusters_df, current_col)
    large_clusters = get_large_clusters(other_df, current_col, max_cluster_size)
    if not large_clusters:
        print('No more large "other" clusters to split. Stopping.')
        break
    # Show how many large clusters and top 5 most popular
    print(f'Number of "other" clusters > {max_cluster_size}:', len(large_clusters))
    print('Top 5 clusters:')
    display(other_df[current_col].value_counts().head(5))
    # Prepare for grid search
    best_f1_mean = -1
    best_metrics = None
    best_eps = None
    best_new_col = f'cluster_{iteration+1}'
    best_clusters_df = None
    best_precision_mean = -1
    best_recall_mean = -1
    for eps in tqdm(eps_values, desc=f'Epsilons (iter {iteration})', leave=False):
        # Every epsilon starts from the same clustering: the result of the previous iteration.
        temp_df = clusters_df.copy()
        temp_df[best_new_col] = temp_df[current_col]
        for cluster_id in large_clusters:
            mask = temp_df[current_col] == cluster_id
            sub_df = temp_df[mask]
            if len(sub_df) <= max_cluster_size:
                continue
            embeddings = np.vstack(sub_df['full_name_embedding'].values)
            labels, _ = run_hdbscan(embeddings, min_cluster_size=min_cluster_size, min_samples=min_samples, cluster_selection_epsilon=eps, max_cluster_size=max_cluster_size, metric=metric, n_jobs=n_jobs, cluster_selection_method=cluster_selection_method, store_centers=store_centers)
            subcluster_labels = labels.astype(str)
            # Label '-1' is mapped to None (no cluster); any other label extends the parent id.
            # NOTE(review): if run_hdbscan returns a label 0, the resulting id ends with '_0'
            # and filter_other_clusters will never select it for another split — confirm intended.
            new_ids = [f'{cluster_id}_{lbl}' if lbl != '-1' else None for lbl in subcluster_labels]
            temp_df.loc[mask, best_new_col] = new_ids
        # Evaluate
        temp_df[best_new_col] = temp_df[best_new_col].apply(
            lambda x: None if is_noise(x) else x
        )
        cluster_map = dict(zip(temp_df['full_name'], temp_df[best_new_col]))
        eval_any = apply_predictions(eval_df_any.copy(), cluster_map, col_el_1='full_name', col_el_2='alias')
        eval_famous = apply_predictions(eval_df_famous.copy(), cluster_map, col_el_1='full_name', col_el_2='alias')
        metrics_any = compute_metrics(eval_any['final_label'], eval_any['prediction'])
        metrics_famous = compute_metrics(eval_famous['final_label'], eval_famous['prediction'])
        f1_mean = (metrics_any['f1'] + metrics_famous['f1']) / 2
        precision_mean = (metrics_any['precision'] + metrics_famous['precision']) / 2
        recall_mean = (metrics_any['recall'] + metrics_famous['recall']) / 2
        # Best precision / recall are tracked over all epsilons, independently of the
        # epsilon that is finally selected (selection is on mean F1 only).
        if precision_mean > best_precision_mean:
            best_precision_mean = precision_mean
        if recall_mean > best_recall_mean:
            best_recall_mean = recall_mean
        if f1_mean > best_f1_mean:
            best_f1_mean = f1_mean
            best_metrics = {
                'eps_other': eps,
                'f1_any': metrics_any['f1'],
                'precision_any': metrics_any['precision'],
                'recall_any': metrics_any['recall'],
                'f1_famous': metrics_famous['f1'],
                'precision_famous': metrics_famous['precision'],
                'recall_famous': metrics_famous['recall'],
                'f1_mean': f1_mean,
                'precision_mean': precision_mean,
                'recall_mean': recall_mean
            }
            best_eps = eps
            # temp_df is rebuilt from clusters_df for the next epsilon, so it can be kept as is.
            best_clusters_df = temp_df
    if best_metrics is None:
        # No epsilon gave a mean F1 comparable to the initial -1 (e.g. NaN metrics or an
        # empty grid): keep the previous clustering instead of failing on None below.
        print('No epsilon produced a valid mean F1; keeping the previous clustering. Stopping.')
        break
    # Print best result for this iteration
    print(f'Best eps_other: {best_eps:.3f}')
    metrics_rounded = {k: (round(v, 3) if isinstance(v, float) else v) for k, v in best_metrics.items()}
    print('Best metrics:')
    print(json.dumps(metrics_rounded, indent=2))
    best_precision_mean_so_far = max(best_precision_mean_so_far, best_precision_mean)
    best_recall_mean_so_far = max(best_recall_mean_so_far, best_recall_mean)
    perf_tracking.append({
        'iteration': iteration,
        'best_eps': best_eps,
        **best_metrics,
        'best_precision_mean_so_far': best_precision_mean_so_far,
        'best_recall_mean_so_far': best_recall_mean_so_far,
        'n_large_clusters': len(large_clusters),
    })
    # Prepare for next iteration
    clusters_df = best_clusters_df
    current_col = best_new_col
else:
    # The loop ran out of iterations without hitting a stopping condition.
    print(f'Reached the limit of {max_iterations} iterations. Stopping.')
# Whatever the exit path, expose the last accepted assignment as 'cluster_final'
# (previously this column was only created when the loop stopped early).
clusters_df['cluster_final'] = clusters_df[current_col]
print('Loop finished.')

Iterations:   0%|          | 0/31 [00:00<?, ?it/s]


=== Iteration 0 ===
Number of "other" clusters > 20: 68
Top 5 clusters:


cluster_0
6_3     690
23_1    101
8_1      76
38_1     69
6_2      68
Name: count, dtype: int64



Iterations:   3%|▎         | 1/31 [00:20<10:28, 20.95s/it]

Best eps_other: 0.260
Best metrics:
{
  "eps_other": 0.26,
  "f1_any": 0.756,
  "precision_any": 0.724,
  "recall_any": 0.791,
  "f1_famous": 0.788,
  "precision_famous": 0.825,
  "recall_famous": 0.754,
  "f1_mean": 0.772,
  "precision_mean": 0.774,
  "recall_mean": 0.773
}

=== Iteration 1 ===
Number of "other" clusters > 20: 43
Top 5 clusters:


cluster_1
6_3_17    453
6_3_2     118
23_1_2     95
8_1_1      74
38_1_1     65
Name: count, dtype: int64

Iterations:   6%|▋         | 2/31 [00:34<08:03, 16.68s/it]

Best eps_other: 0.260
Best metrics:
{
  "eps_other": 0.26,
  "f1_any": 0.754,
  "precision_any": 0.725,
  "recall_any": 0.786,
  "f1_famous": 0.783,
  "precision_famous": 0.827,
  "recall_famous": 0.743,
  "f1_mean": 0.769,
  "precision_mean": 0.776,
  "recall_mean": 0.765
}

=== Iteration 2 ===
Number of "other" clusters > 20: 27
Top 5 clusters:


cluster_2
6_3_17_1    439
6_3_2_1     110
23_1_2_1     88
8_1_1_1      64
38_1_1_1     62
Name: count, dtype: int64

Iterations:  10%|▉         | 3/31 [00:45<06:29, 13.91s/it]

Best eps_other: 0.250
Best metrics:
{
  "eps_other": 0.25,
  "f1_any": 0.752,
  "precision_any": 0.724,
  "recall_any": 0.782,
  "f1_famous": 0.778,
  "precision_famous": 0.829,
  "recall_famous": 0.734,
  "f1_mean": 0.765,
  "precision_mean": 0.777,
  "recall_mean": 0.758
}

=== Iteration 3 ===
Number of "other" clusters > 20: 19
Top 5 clusters:


cluster_3
6_3_17_1_16    326
23_1_2_1_1      72
6_3_2_1_3       64
38_1_1_1_1      56
356_1_1_1_1     50
Name: count, dtype: int64

Iterations:  13%|█▎        | 4/31 [00:53<05:15, 11.67s/it]

Best eps_other: 0.250
Best metrics:
{
  "eps_other": 0.25,
  "f1_any": 0.752,
  "precision_any": 0.725,
  "recall_any": 0.781,
  "f1_famous": 0.777,
  "precision_famous": 0.83,
  "recall_famous": 0.73,
  "f1_mean": 0.765,
  "precision_mean": 0.778,
  "recall_mean": 0.756
}

=== Iteration 4 ===
Number of "other" clusters > 20: 14
Top 5 clusters:


cluster_4
6_3_17_1_16_1    319
6_3_2_1_3_1       56
356_1_1_1_1_1     45
23_1_2_1_1_2      43
38_1_1_1_1_1      43
Name: count, dtype: int64

Iterations:  16%|█▌        | 5/31 [01:00<04:20, 10.03s/it]

Best eps_other: 0.250
Best metrics:
{
  "eps_other": 0.25,
  "f1_any": 0.752,
  "precision_any": 0.726,
  "recall_any": 0.78,
  "f1_famous": 0.772,
  "precision_famous": 0.829,
  "recall_famous": 0.723,
  "f1_mean": 0.762,
  "precision_mean": 0.777,
  "recall_mean": 0.751
}

=== Iteration 5 ===
Number of "other" clusters > 20: 10
Top 5 clusters:


cluster_5
6_3_17_1_16_1_1    317
6_3_2_1_3_1_1       53
38_1_1_1_1_1_1      36
6_45_1_1_1_1_1      34
23_1_2_1_1_2_1      33
Name: count, dtype: int64

Iterations:  19%|█▉        | 6/31 [01:06<03:39,  8.79s/it]

Best eps_other: 0.250
Best metrics:
{
  "eps_other": 0.25,
  "f1_any": 0.75,
  "precision_any": 0.725,
  "recall_any": 0.778,
  "f1_famous": 0.772,
  "precision_famous": 0.83,
  "recall_famous": 0.721,
  "f1_mean": 0.761,
  "precision_mean": 0.778,
  "recall_mean": 0.749
}

=== Iteration 6 ===
Number of "other" clusters > 20: 4
Top 5 clusters:


cluster_6
6_3_17_1_16_1_1_1    314
6_3_2_1_3_1_1_1       46
6_45_1_1_1_1_1_1      32
38_1_1_1_1_1_1_1      29
6_17_1_1              20
Name: count, dtype: int64

Iterations:  23%|██▎       | 7/31 [01:12<03:03,  7.63s/it]

Best eps_other: 0.250
Best metrics:
{
  "eps_other": 0.25,
  "f1_any": 0.75,
  "precision_any": 0.725,
  "recall_any": 0.778,
  "f1_famous": 0.771,
  "precision_famous": 0.83,
  "recall_famous": 0.719,
  "f1_mean": 0.76,
  "precision_mean": 0.777,
  "recall_mean": 0.748
}

=== Iteration 7 ===
Number of "other" clusters > 20: 3
Top 5 clusters:


cluster_7
6_3_17_1_16_1_1_1_1    311
6_3_2_1_3_1_1_1_1       44
6_45_1_1_1_1_1_1_1      28
38_1_1_1_1_1_1_1_1      20
504_1                   20
Name: count, dtype: int64

Iterations:  26%|██▌       | 8/31 [01:17<02:36,  6.80s/it]

Best eps_other: 0.240
Best metrics:
{
  "eps_other": 0.24,
  "f1_any": 0.75,
  "precision_any": 0.726,
  "recall_any": 0.777,
  "f1_famous": 0.77,
  "precision_famous": 0.831,
  "recall_famous": 0.717,
  "f1_mean": 0.76,
  "precision_mean": 0.778,
  "recall_mean": 0.747
}

=== Iteration 8 ===
Number of "other" clusters > 20: 4
Top 5 clusters:


cluster_8
6_3_17_1_16_1_1_1_1_7    267
6_3_2_1_3_1_1_1_1_1       41
6_45_1_1_1_1_1_1_1_1      21
6_3_17_1_16_1_1_1_1_8     21
248_5_3                   20
Name: count, dtype: int64

Iterations:  29%|██▉       | 9/31 [01:22<02:17,  6.26s/it]

Best eps_other: 0.240
Best metrics:
{
  "eps_other": 0.24,
  "f1_any": 0.75,
  "precision_any": 0.725,
  "recall_any": 0.776,
  "f1_famous": 0.769,
  "precision_famous": 0.83,
  "recall_famous": 0.716,
  "f1_mean": 0.759,
  "precision_mean": 0.778,
  "recall_mean": 0.746
}

=== Iteration 9 ===
Number of "other" clusters > 20: 2
Top 5 clusters:


cluster_9
6_3_17_1_16_1_1_1_1_7_1    260
6_3_2_1_3_1_1_1_1_1_1       34
38_1_1_1_1_1_1_1_1          20
340_2_1                     20
1094_1_1                    20
Name: count, dtype: int64

Iterations:  32%|███▏      | 10/31 [01:26<02:01,  5.76s/it]

Best eps_other: 0.240
Best metrics:
{
  "eps_other": 0.24,
  "f1_any": 0.75,
  "precision_any": 0.726,
  "recall_any": 0.776,
  "f1_famous": 0.769,
  "precision_famous": 0.831,
  "recall_famous": 0.715,
  "f1_mean": 0.759,
  "precision_mean": 0.778,
  "recall_mean": 0.746
}

=== Iteration 10 ===
Number of "other" clusters > 20: 2
Top 5 clusters:


cluster_10
6_3_17_1_16_1_1_1_1_7_1_1    251
6_3_2_1_3_1_1_1_1_1_1_1       29
919_1                         20
28_7_1                        20
28_25                         20
Name: count, dtype: int64

Iterations:  35%|███▌      | 11/31 [01:31<01:49,  5.45s/it]

Best eps_other: 0.240
Best metrics:
{
  "eps_other": 0.24,
  "f1_any": 0.75,
  "precision_any": 0.726,
  "recall_any": 0.775,
  "f1_famous": 0.769,
  "precision_famous": 0.831,
  "recall_famous": 0.715,
  "f1_mean": 0.759,
  "precision_mean": 0.778,
  "recall_mean": 0.745
}

=== Iteration 11 ===
Number of "other" clusters > 20: 1
Top 5 clusters:


cluster_11
6_3_17_1_16_1_1_1_1_7_1_1_1    244
6_3_2_1_3_1_1_1_1_1_1_1_1       20
6_25_2_1                        20
504_1                           20
919_1                           20
Name: count, dtype: int64

Iterations:  39%|███▊      | 12/31 [01:36<01:37,  5.14s/it]

Best eps_other: 0.240
Best metrics:
{
  "eps_other": 0.24,
  "f1_any": 0.75,
  "precision_any": 0.726,
  "recall_any": 0.775,
  "f1_famous": 0.768,
  "precision_famous": 0.831,
  "recall_famous": 0.714,
  "f1_mean": 0.759,
  "precision_mean": 0.778,
  "recall_mean": 0.745
}

=== Iteration 12 ===
Number of "other" clusters > 20: 1
Top 5 clusters:


cluster_12
6_3_17_1_16_1_1_1_1_7_1_1_1_1    231
6_2_2_1                           20
28_25                             20
441_2_2                           20
28_12                             20
Name: count, dtype: int64

Iterations:  42%|████▏     | 13/31 [01:40<01:28,  4.90s/it]

Best eps_other: 0.220
Best metrics:
{
  "eps_other": 0.22,
  "f1_any": 0.75,
  "precision_any": 0.726,
  "recall_any": 0.775,
  "f1_famous": 0.767,
  "precision_famous": 0.831,
  "recall_famous": 0.713,
  "f1_mean": 0.759,
  "precision_mean": 0.779,
  "recall_mean": 0.744
}

=== Iteration 13 ===
Number of "other" clusters > 20: 1
Top 5 clusters:


cluster_13
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7    176
28_25                               20
9_5_1_2                             20
1094_1_1                            20
38_1_1_1_1_1_1_1_1                  20
Name: count, dtype: int64

Iterations:  45%|████▌     | 14/31 [01:44<01:19,  4.68s/it]

Best eps_other: 0.220
Best metrics:
{
  "eps_other": 0.22,
  "f1_any": 0.75,
  "precision_any": 0.727,
  "recall_any": 0.775,
  "f1_famous": 0.767,
  "precision_famous": 0.831,
  "recall_famous": 0.712,
  "f1_mean": 0.758,
  "precision_mean": 0.779,
  "recall_mean": 0.744
}

=== Iteration 14 ===
Number of "other" clusters > 20: 1
Top 5 clusters:


cluster_14
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2    160
14_1                                  20
9_1_1_1_1                             20
6_17_1_1                              20
242_1_2_1                             20
Name: count, dtype: int64

Iterations:  48%|████▊     | 15/31 [01:48<01:12,  4.50s/it]

Best eps_other: 0.220
Best metrics:
{
  "eps_other": 0.22,
  "f1_any": 0.75,
  "precision_any": 0.727,
  "recall_any": 0.775,
  "f1_famous": 0.766,
  "precision_famous": 0.831,
  "recall_famous": 0.711,
  "f1_mean": 0.758,
  "precision_mean": 0.779,
  "recall_mean": 0.743
}

=== Iteration 15 ===
Number of "other" clusters > 20: 2
Top 5 clusters:


cluster_15
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1    118
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_2     40
9_1_1_1_1                               20
6_4_1_1_1_1                             20
6_17_1_1                                20
Name: count, dtype: int64

Iterations:  52%|█████▏    | 16/31 [01:53<01:06,  4.44s/it]

Best eps_other: 0.220
Best metrics:
{
  "eps_other": 0.22,
  "f1_any": 0.751,
  "precision_any": 0.727,
  "recall_any": 0.775,
  "f1_famous": 0.766,
  "precision_famous": 0.831,
  "recall_famous": 0.71,
  "f1_mean": 0.758,
  "precision_mean": 0.779,
  "recall_mean": 0.743
}

=== Iteration 16 ===
Number of "other" clusters > 20: 2
Top 5 clusters:


cluster_16
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1_1    114
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_2_1     31
6_116_1                                   20
38_1_1_1_1_1_1_1_1                        20
9_5_1_2                                   20
Name: count, dtype: int64

Iterations:  55%|█████▍    | 17/31 [01:57<01:01,  4.37s/it]

Best eps_other: 0.210
Best metrics:
{
  "eps_other": 0.21,
  "f1_any": 0.75,
  "precision_any": 0.727,
  "recall_any": 0.774,
  "f1_famous": 0.766,
  "precision_famous": 0.832,
  "recall_famous": 0.71,
  "f1_mean": 0.758,
  "precision_mean": 0.779,
  "recall_mean": 0.742
}

=== Iteration 17 ===
Number of "other" clusters > 20: 2
Top 5 clusters:


cluster_17
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1_1_1    111
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_2_1_1     24
6_116_1                                     20
6_4_1_1_1_1                                 20
6_25_2_1                                    20
Name: count, dtype: int64

Iterations:  58%|█████▊    | 18/31 [02:01<00:56,  4.32s/it]

Best eps_other: 0.210
Best metrics:
{
  "eps_other": 0.21,
  "f1_any": 0.749,
  "precision_any": 0.727,
  "recall_any": 0.773,
  "f1_famous": 0.766,
  "precision_famous": 0.831,
  "recall_famous": 0.71,
  "f1_mean": 0.758,
  "precision_mean": 0.779,
  "recall_mean": 0.742
}

=== Iteration 18 ===
Number of "other" clusters > 20: 1
Top 5 clusters:


cluster_18
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1_1_1_1    106
38_1_1_1_1_1_1_1_1                            20
6_25_2_1                                      20
1094_1_1                                      20
6_3_2_1_3_1_1_1_1_1_1_1_1                     20
Name: count, dtype: int64

Iterations:  61%|██████▏   | 19/31 [02:05<00:50,  4.22s/it]

Best eps_other: 0.210
Best metrics:
{
  "eps_other": 0.21,
  "f1_any": 0.749,
  "precision_any": 0.727,
  "recall_any": 0.773,
  "f1_famous": 0.765,
  "precision_famous": 0.831,
  "recall_famous": 0.709,
  "f1_mean": 0.757,
  "precision_mean": 0.779,
  "recall_mean": 0.741
}

=== Iteration 19 ===
Number of "other" clusters > 20: 1
Top 5 clusters:


cluster_19
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1_1_1_1_1    100
6_3_2_1_3_1_1_1_1_1_1_1_1                       20
242_1_2_1                                       20
9_1_1_1_1                                       20
28_25                                           20
Name: count, dtype: int64

Iterations:  65%|██████▍   | 20/31 [02:09<00:45,  4.16s/it]

Best eps_other: 0.210
Best metrics:
{
  "eps_other": 0.21,
  "f1_any": 0.749,
  "precision_any": 0.728,
  "recall_any": 0.772,
  "f1_famous": 0.765,
  "precision_famous": 0.831,
  "recall_famous": 0.709,
  "f1_mean": 0.757,
  "precision_mean": 0.779,
  "recall_mean": 0.74
}

=== Iteration 20 ===
Number of "other" clusters > 20: 2
Top 5 clusters:


cluster_20
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1_1_1_1_1_1    62
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1_1_1_1_1_2    35
9_5_1_2                                          20
28_12                                            20
9_1_1_1_1                                        20
Name: count, dtype: int64

Iterations:  68%|██████▊   | 21/31 [02:13<00:41,  4.17s/it]

Best eps_other: 0.210
Best metrics:
{
  "eps_other": 0.21,
  "f1_any": 0.749,
  "precision_any": 0.728,
  "recall_any": 0.772,
  "f1_famous": 0.765,
  "precision_famous": 0.831,
  "recall_famous": 0.709,
  "f1_mean": 0.757,
  "precision_mean": 0.779,
  "recall_mean": 0.74
}

=== Iteration 21 ===
Number of "other" clusters > 20: 2
Top 5 clusters:


cluster_21
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1_1_1_1_1_1_1    42
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1_1_1_1_1_2_1    33
541_1_1                                            20
6_116_1                                            20
504_1                                              20
Name: count, dtype: int64

Iterations:  71%|███████   | 22/31 [02:17<00:37,  4.17s/it]

Best eps_other: 0.210
Best metrics:
{
  "eps_other": 0.21,
  "f1_any": 0.749,
  "precision_any": 0.728,
  "recall_any": 0.772,
  "f1_famous": 0.765,
  "precision_famous": 0.832,
  "recall_famous": 0.708,
  "f1_mean": 0.757,
  "precision_mean": 0.78,
  "recall_mean": 0.74
}

=== Iteration 22 ===
Number of "other" clusters > 20: 2
Top 5 clusters:


cluster_22
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1_1_1_1_1_1_1_1    39
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1_1_1_1_1_2_1_2    24
9_1_1_1_1                                            20
6_4_1_1_1_1                                          20
523_1                                                20
Name: count, dtype: int64

Iterations:  74%|███████▍  | 23/31 [02:22<00:33,  4.17s/it]

Best eps_other: 0.210
Best metrics:
{
  "eps_other": 0.21,
  "f1_any": 0.749,
  "precision_any": 0.728,
  "recall_any": 0.772,
  "f1_famous": 0.765,
  "precision_famous": 0.832,
  "recall_famous": 0.707,
  "f1_mean": 0.757,
  "precision_mean": 0.78,
  "recall_mean": 0.74
}

=== Iteration 23 ===
Number of "other" clusters > 20: 2
Top 5 clusters:


cluster_23
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1_1_1_1_1_1_1_1_1    36
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1_1_1_1_1_2_1_2_2    22
6_3_2_1_3_1_1_1_1_1_1_1_1                              20
523_1                                                  20
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1_1_1_1_1_1_2        20
Name: count, dtype: int64

Iterations:  77%|███████▋  | 24/31 [02:26<00:29,  4.17s/it]

Best eps_other: 0.210
Best metrics:
{
  "eps_other": 0.21,
  "f1_any": 0.749,
  "precision_any": 0.728,
  "recall_any": 0.771,
  "f1_famous": 0.764,
  "precision_famous": 0.832,
  "recall_famous": 0.707,
  "f1_mean": 0.757,
  "precision_mean": 0.78,
  "recall_mean": 0.739
}

=== Iteration 24 ===
Number of "other" clusters > 20: 1
Top 5 clusters:


cluster_24
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1_1_1_1_1_1_1_1_1_2    31
6_4_1_1_1_1                                              20
441_2_2                                                  20
504_1                                                    20
38_1_1_1_1_1_1_1_1                                       20
Name: count, dtype: int64

Iterations:  81%|████████  | 25/31 [02:30<00:24,  4.11s/it]

Best eps_other: 0.210
Best metrics:
{
  "eps_other": 0.21,
  "f1_any": 0.749,
  "precision_any": 0.728,
  "recall_any": 0.771,
  "f1_famous": 0.764,
  "precision_famous": 0.832,
  "recall_famous": 0.706,
  "f1_mean": 0.756,
  "precision_mean": 0.78,
  "recall_mean": 0.739
}

=== Iteration 25 ===
Number of "other" clusters > 20: 1
Top 5 clusters:


cluster_25
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1_1_1_1_1_1_1_1_1_2_1    28
6_4_1_1_1_1                                                20
9_1_1_1_1                                                  20
6_25_2_1                                                   20
28_25                                                      20
Name: count, dtype: int64

Iterations:  84%|████████▍ | 26/31 [02:34<00:20,  4.07s/it]

Best eps_other: 0.210
Best metrics:
{
  "eps_other": 0.21,
  "f1_any": 0.749,
  "precision_any": 0.728,
  "recall_any": 0.771,
  "f1_famous": 0.764,
  "precision_famous": 0.832,
  "recall_famous": 0.706,
  "f1_mean": 0.756,
  "precision_mean": 0.78,
  "recall_mean": 0.739
}

=== Iteration 26 ===
Number of "other" clusters > 20: 1
Top 5 clusters:


cluster_26
6_3_17_1_16_1_1_1_1_7_1_1_1_1_7_2_1_1_1_1_1_1_1_1_1_2_1_1    25
6_17_1_1                                                     20
248_5_3                                                      20
14_1                                                         20
28_12                                                        20
Name: count, dtype: int64

Iterations:  87%|████████▋ | 27/31 [02:38<00:23,  5.86s/it]

Best eps_other: 0.210
Best metrics:
{
  "eps_other": 0.21,
  "f1_any": 0.749,
  "precision_any": 0.728,
  "recall_any": 0.771,
  "f1_famous": 0.764,
  "precision_famous": 0.832,
  "recall_famous": 0.706,
  "f1_mean": 0.756,
  "precision_mean": 0.78,
  "recall_mean": 0.738
}

=== Iteration 27 ===
No more large "other" clusters to split. Stopping.
Loop finished.





In [15]:
version = 'v2'
results_dir = Path(f'data/W03/from_notebooks/R07_loop_grid_searchs/{version}')
results_dir.mkdir(parents=True, exist_ok=True)
# Save the per-iteration tracking table.
perf_df = pd.DataFrame(perf_tracking)
perf_df.to_excel(results_dir / 'loop_grid_searchs.xlsx', index=False)
# Save the final cluster table. The embedding column is left out of the export: the
# loading cell of this notebook discards any 'full_name_embedding' column read from a
# spreadsheet and re-merges it from the parquet source, so writing it is redundant.
# `errors='ignore'` keeps the cell working when the column is absent.
clusters_df.drop(columns=['full_name_embedding'], errors='ignore').to_excel(
    results_dir / 'clusters_table.xlsx', index=False
)

In [16]:
import numpy as np

def convert_to_native(obj):
    """Recursively replace NumPy values in ``obj`` with built-in Python equivalents.

    Intended to make a nested structure acceptable to ``json.dump``.

    Parameters
    ----------
    obj : any
        A scalar or an arbitrarily nested combination of dicts, lists and tuples.

    Returns
    -------
    any
        A structure of the same shape in which
        * ``np.ndarray`` becomes a (nested) list,
        * ``np.bool_`` / ``np.integer`` / ``np.floating`` become ``bool`` / ``int`` / ``float``,
        * dict keys and values, list items and tuple items are converted recursively,
        * every other object is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        # tolist() already yields native scalars for numeric dtypes; recurse so that
        # object arrays holding NumPy scalars are covered as well.
        return convert_to_native(obj.tolist())
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, dict):
        # Keys are converted too: json rejects NumPy integer keys.
        return {convert_to_native(k): convert_to_native(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_native(v) for v in obj]
    if isinstance(obj, tuple):
        return tuple(convert_to_native(v) for v in obj)
    return obj

# Write the tracking record of the last completed iteration (perf_tracking[-1]) as
# JSON, after converting NumPy values to built-in Python types.
best_perf_path = results_dir / 'best_perf_metrics.json'
with best_perf_path.open('w') as handle:
    json.dump(convert_to_native(perf_tracking[-1]), handle, indent=4)

## Reload to save tables of predictions

In [5]:
# Redefine what is necessary to recompute the prediction tables from the saved files.
version = 'v2'
results_dir = Path(f'data/W03/from_notebooks/R07_loop_grid_searchs/{version}')
results_dir.mkdir(parents=True, exist_ok=True)
# Reload the cluster table written by the save cell.
final_clusters_df = pd.read_excel(results_dir / 'clusters_table.xlsx')
# Use the saved 'cluster_final' column when the table has one. Otherwise fall back to
# the highest-numbered 'cluster_<n>' column instead of a hard-coded 'cluster_27',
# which is only correct for a run that stopped at exactly that iteration.
if 'cluster_final' not in final_clusters_df.columns:
    iteration_cols = {
        int(col[len('cluster_'):]): col
        for col in final_clusters_df.columns
        if isinstance(col, str)
        and col.startswith('cluster_')
        and col[len('cluster_'):].isdigit()
    }
    if not iteration_cols:
        raise KeyError(
            "clusters_table.xlsx contains neither 'cluster_final' nor any 'cluster_<n>' column"
        )
    final_clusters_df['cluster_final'] = final_clusters_df[iteration_cols[max(iteration_cols)]]

In [6]:
# Score both evaluation datasets against the final assignment ('cluster_final') and
# export each result as prediction_results_any_reg.xlsx / prediction_results_famous_reg.xlsx.
prediction_any_results_xlsx = results_dir / 'prediction_results_any_reg.xlsx'
prediction_famous_results_xlsx = results_dir / 'prediction_results_famous_reg.xlsx'
# Columns retained in the exported tables, in this order.
cols_to_keep = [
    "full_name",
    "alias",
    "number_of_occurrences",
    "alias_number_of_occurrences",
    "similarity",
    "uncertain",
    "final_label",
    "prediction",
]

# full_name -> final cluster id; one mapping serves both datasets.
final_cluster_map = dict(
    zip(final_clusters_df['full_name'], final_clusters_df['cluster_final'])
)

eval_any_scored = apply_predictions(
    eval_df_any.copy(), final_cluster_map, col_el_1='full_name', col_el_2='alias'
)
eval_any_final = eval_any_scored[cols_to_keep]
eval_any_final.to_excel(prediction_any_results_xlsx, index=False)

eval_famous_scored = apply_predictions(
    eval_df_famous.copy(), final_cluster_map, col_el_1='full_name', col_el_2='alias'
)
eval_famous_final = eval_famous_scored[cols_to_keep]
eval_famous_final.to_excel(prediction_famous_results_xlsx, index=False)