# Iterative Grid Search on 'start_0' Clusters
This notebook reloads the clusters table produced by the 'other' loop, copies its `cluster_27` column into a new `cluster_after_other` column, and then runs an iterative grid search over the HDBSCAN `cluster_selection_epsilon` on the 'start_0' clusters (labels starting with '0_' but not ending with '_0') that contain more than 3 rows. Each iteration stores its best split in a new `cluster_start_0_<n>` column.

In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm

# Project root. Defaults to the original author's checkout; set the
# P05_REFINE_DEDUP_DIR environment variable to run the notebook from another
# location without editing this cell.
working_dir = os.environ.get(
    'P05_REFINE_DEDUP_DIR',
    '/home/gpinon/more_europa/clean_rdc_experiments/projects/P05_refine_dedup',
)
# The relative 'data/...' paths used throughout the notebook are resolved from
# this directory (presumably the `src.p05_refine_dedup` imports too — confirm).
os.chdir(working_dir)
from src.p05_refine_dedup import config
from src.p05_refine_dedup.utils.utils import (
    is_noise,
    run_hdbscan,
    apply_predictions,
    compute_metrics,
)
from src.p05_refine_dedup.utils.s3_io_functions import load_parquet_from_s3

# --- Cluster assignments written by the 'other' loop ---
clusters_table_xlsx = 'data/W03/from_notebooks/R07_loop_grid_searchs/v2/clusters_table.xlsx'
clusters_df = pd.read_excel(clusters_table_xlsx)
# Starting point for the additional splitting done in this notebook.
clusters_df['cluster_after_other'] = clusters_df['cluster_27']

# --- Name embeddings stored on S3 ---
s3_input_embeddings = 'registry_data_catalog_experiments/P05_refine_dedup/registry_names_embeddings.parquet'
bucket_name = config.BUCKET_NAME_DEV
# Split the key once into its folder part and its file part.
folder_path, file_name = s3_input_embeddings.rsplit('/', 1)
embeddings_df = load_parquet_from_s3(
    bucket_name=bucket_name,
    folder_path=folder_path,
    file_name=file_name,
)
# Discard any embedding column that came with the Excel file before attaching
# the one loaded from S3.
if 'full_name_embedding' in clusters_df.columns:
    clusters_df = clusters_df.drop(columns=['full_name_embedding'])
embedding_lookup = embeddings_df[['full_name', 'full_name_embedding']]
clusters_df = clusters_df.merge(embedding_lookup, on='full_name', how='left')

# --- Labelled evaluation pairs ---
evaluation_dataset_any = 'data/W01/R03_eval_pairs_similarity_assessment_with_llm/gpt4_1_openai/assessed_pairs_v1.xlsx'
evaluation_dataset_famous = 'data/W01/R03_eval_pairs_similarity_assessment_with_llm/gpt4_1_openai/famous_close_assessed_pairs_v1.xlsx'
eval_df_any, eval_df_famous = (
    pd.read_excel(eval_path)
    for eval_path in (evaluation_dataset_any, evaluation_dataset_famous)
)

  from .autonotebook import tqdm as notebook_tqdm


## Parameters and Helper Functions

In [2]:
# Candidate values tried at every iteration; each one is passed to run_hdbscan
# as `cluster_selection_epsilon`.
# NOTE(review): np.arange with a float step can include or omit the end point
# because of rounding — confirm the grid is the intended one.
eps_values = np.arange(0.34, 0.44, 0.01)
# The settings below are forwarded unchanged to run_hdbscan in the main loop.
min_cluster_size = 2
min_samples = 2
# Also used as the split threshold: clusters with more rows than this are
# selected by get_large_clusters and re-clustered.
max_cluster_size = 3
metric = 'euclidean'
n_jobs = -1
cluster_selection_method = 'eom'
store_centers = 'medoid'
# The main loop iterates over range(0, max_iterations + 1).
max_iterations = 3

def filter_start_0_clusters(df, cluster_col):
    """Select the 'start_0' rows of ``df``.

    A row qualifies when its ``cluster_col`` value, converted to a string,
    starts with '0_' and does not end with '_0'. The matching rows are
    returned with their original index.
    """
    labels = df[cluster_col].astype(str)
    has_zero_prefix = labels.str.startswith('0_')
    has_zero_suffix = labels.str.endswith('_0')
    return df[has_zero_prefix & ~has_zero_suffix]
def get_large_clusters(df, cluster_col, n_max):
    """List the ``cluster_col`` labels that occur in more than ``n_max`` rows.

    Labels come back in ``value_counts`` order (most frequent first).
    """
    sizes = df[cluster_col].value_counts()
    oversized = sizes.index[(sizes > n_max).to_numpy()]
    return oversized.tolist()

## Main Loop: Iterative Grid Search on 'start_0' Clusters

In [4]:
# Output folder for this run; `version` is part of the path, so changing it
# writes the results to a different folder.
version = 'v1'
results_dir = Path(f'data/W03/from_notebooks/R08_loop_grid_searchs_start_0/{version}')
# Create the folder (and any missing parents); an existing folder is reused.
results_dir.mkdir(parents=True, exist_ok=True)

In [None]:
from tqdm import tqdm
import json
from pathlib import Path

# Iterative refinement of oversized 'start_0' clusters.
# Every pass:
#   1. selects the 'start_0' clusters with more than max_cluster_size rows,
#   2. re-clusters each of them with run_hdbscan for every epsilon in eps_values,
#   3. scores the resulting assignment on both evaluation sets,
#   4. keeps the assignment with the highest mean F1 as a new column named
#      'cluster_start_0_<iteration + 1>'.
# Whatever way the loop ends, the last assignment is copied to 'cluster_final'.
perf_tracking = []  # one summary dict per completed pass
best_precision_mean_so_far = -1
best_recall_mean_so_far = -1
current_col = 'cluster_after_other'
# NOTE(review): range(0, max_iterations + 1) allows max_iterations + 1 passes —
# confirm that the extra pass is intended.
for iteration in tqdm(range(0, max_iterations + 1), desc='Iterations'):
    print(f'\n=== Iteration {iteration} ===')
    # Find large 'start_0' clusters
    start_0_df = filter_start_0_clusters(clusters_df, current_col)
    large_clusters = get_large_clusters(start_0_df, current_col, max_cluster_size)
    if not large_clusters:
        print('No more large clusters to split. Stopping.')
        clusters_df['cluster_final'] = clusters_df[current_col]
        break
    print(f'Number of clusters > {max_cluster_size}:', len(large_clusters))
    print('Top 5 clusters:')
    display(start_0_df[current_col].value_counts().head(5))
    best_f1_mean = -1
    best_metrics = None
    best_eps = None
    best_new_col = f'cluster_start_0_{iteration+1}'
    best_clusters_df = None
    best_precision_mean = -1
    best_recall_mean = -1
    for eps in tqdm(eps_values, desc=f'Epsilons (iter {iteration})', leave=False):
        # Work on a copy so that a rejected epsilon leaves clusters_df untouched.
        temp_df = clusters_df.copy()
        temp_df[best_new_col] = temp_df[current_col]
        for cluster_id in large_clusters:
            mask = temp_df[current_col] == cluster_id
            sub_df = temp_df[mask]
            if len(sub_df) <= max_cluster_size:
                continue
            embeddings = np.vstack(sub_df['full_name_embedding'].values)
            labels, _ = run_hdbscan(
                embeddings,
                min_cluster_size=min_cluster_size,
                min_samples=min_samples,
                cluster_selection_epsilon=eps,
                max_cluster_size=max_cluster_size,
                metric=metric,
                n_jobs=n_jobs,
                cluster_selection_method=cluster_selection_method,
                store_centers=store_centers,
            )
            subcluster_labels = labels.astype(str)
            # Sub-clusters get the parent id as prefix; label '-1' becomes None.
            new_ids = [f'{cluster_id}_{lbl}' if lbl != '-1' else None for lbl in subcluster_labels]
            temp_df.loc[mask, best_new_col] = new_ids
        temp_df[best_new_col] = temp_df[best_new_col].apply(lambda x: None if is_noise(x) else x)
        cluster_map = dict(zip(temp_df['full_name'], temp_df[best_new_col]))
        eval_any = apply_predictions(eval_df_any.copy(), cluster_map, col_el_1='full_name', col_el_2='alias')
        eval_famous = apply_predictions(eval_df_famous.copy(), cluster_map, col_el_1='full_name', col_el_2='alias')
        metrics_any = compute_metrics(eval_any['final_label'], eval_any['prediction'])
        metrics_famous = compute_metrics(eval_famous['final_label'], eval_famous['prediction'])
        f1_mean = (metrics_any['f1'] + metrics_famous['f1']) / 2
        precision_mean = (metrics_any['precision'] + metrics_famous['precision']) / 2
        recall_mean = (metrics_any['recall'] + metrics_famous['recall']) / 2
        # Best precision/recall are tracked over all epsilons, independently of
        # which epsilon wins on F1.
        if precision_mean > best_precision_mean:
            best_precision_mean = precision_mean
        if recall_mean > best_recall_mean:
            best_recall_mean = recall_mean
        if f1_mean > best_f1_mean:
            best_f1_mean = f1_mean
            best_metrics = {
                'eps_start_0': eps,
                'f1_any': metrics_any['f1'],
                'precision_any': metrics_any['precision'],
                'recall_any': metrics_any['recall'],
                'f1_famous': metrics_famous['f1'],
                'precision_famous': metrics_famous['precision'],
                'recall_famous': metrics_famous['recall'],
                'f1_mean': f1_mean,
                'precision_mean': precision_mean,
                'recall_mean': recall_mean
            }
            best_eps = eps
            best_clusters_df = temp_df.copy()
    if best_metrics is None:
        # Empty epsilon grid or no comparable F1 (e.g. NaN): keep the current
        # assignment instead of failing on a missing result.
        print('No epsilon produced a usable F1 score. Stopping.')
        clusters_df['cluster_final'] = clusters_df[current_col]
        break
    print(f'Best eps_start_0: {best_eps}')
    metrics_rounded = {k: (round(v, 3) if isinstance(v, float) else v) for k, v in best_metrics.items()}
    print('Best metrics:')
    print(json.dumps(metrics_rounded, indent=2))
    perf_tracking.append({
        'iteration': iteration,
        'best_eps': best_eps,
        **best_metrics,
        'best_precision_mean_so_far': max(best_precision_mean_so_far, best_precision_mean),
        'best_recall_mean_so_far': max(best_recall_mean_so_far, best_recall_mean),
        'n_large_clusters': len(large_clusters),
    })
    best_precision_mean_so_far = max(best_precision_mean_so_far, best_precision_mean)
    best_recall_mean_so_far = max(best_recall_mean_so_far, best_recall_mean)
    # Adopt the winning assignment as the input of the next pass.
    clusters_df = best_clusters_df
    current_col = best_new_col
else:
    # Iteration budget used up without hitting a stop condition: still expose
    # the last assignment as 'cluster_final', as the early-stop paths do.
    clusters_df['cluster_final'] = clusters_df[current_col]
print('Loop finished.')

Iterations:   0%|          | 0/4 [00:00<?, ?it/s]


=== Iteration 0 ===
Number of clusters > 3: 276
Top 5 clusters:


cluster_after_other
0_1      5256
0_40       39
0_16       34
0_557      19
0_52       18
Name: count, dtype: int64

Iterations:  25%|██▌       | 1/4 [05:21<16:05, 321.80s/it]

Best eps_start_0: 0.34
Best metrics:
{
  "eps_start_0": 0.34,
  "f1_any": 0.769,
  "precision_any": 0.776,
  "recall_any": 0.763,
  "f1_famous": 0.768,
  "precision_famous": 0.849,
  "recall_famous": 0.701,
  "f1_mean": 0.769,
  "precision_mean": 0.812,
  "recall_mean": 0.732
}

=== Iteration 1 ===
Number of clusters > 3: 1
Top 5 clusters:


cluster_start_0_1
0_1_538    11
0_1_251     3
0_1_932     3
0_1_104     3
0_1_95      3
Name: count, dtype: int64

Iterations:  50%|█████     | 2/4 [05:27<05:27, 163.83s/it]

Best eps_start_0: 0.34
Best metrics:
{
  "eps_start_0": 0.34,
  "f1_any": 0.77,
  "precision_any": 0.776,
  "recall_any": 0.763,
  "f1_famous": 0.768,
  "precision_famous": 0.849,
  "recall_famous": 0.701,
  "f1_mean": 0.769,
  "precision_mean": 0.813,
  "recall_mean": 0.732
}

=== Iteration 2 ===
No more large clusters to split. Stopping.
Loop finished.





## Save Results

In [7]:
# Per-iteration performance summary collected by the main loop.
perf_df = pd.DataFrame(perf_tracking)
perf_df.to_excel(results_dir / 'loop_grid_searchs_start_0.xlsx', index=False)
# Clusters table with every cluster column added so far.
# NOTE(review): clusters_df appears to still carry 'full_name_embedding'
# (merged in the first cell) — confirm it should be written to Excel.
clusters_df.to_excel(results_dir / 'clusters_table.xlsx', index=False)
# Save best metrics as JSON (convert numpy types to native)
import numpy as np
def convert_to_native(obj):
    """Recursively replace NumPy values with built-in Python equivalents.

    The result can be handed to ``json.dump``, which rejects NumPy scalars
    and arrays.

    Conversions:
        * ``np.bool_``    -> ``bool``
        * ``np.integer``  -> ``int``
        * ``np.floating`` -> ``float``
        * ``np.ndarray``  -> (nested) ``list``
        * ``dict`` / ``list`` / ``tuple`` -> same container type with every
          value converted (dict keys are left as they are)

    Any other object is returned unchanged.
    """
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        # tolist() yields Python scalars for numeric dtypes; recurse so that
        # object arrays holding NumPy values are converted as well.
        return convert_to_native(obj.tolist())
    if isinstance(obj, dict):
        return {k: convert_to_native(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_native(v) for v in obj]
    if isinstance(obj, tuple):
        return tuple(convert_to_native(v) for v in obj)
    return obj
# Persist the summary of the last recorded iteration. When the main loop
# stopped before recording any iteration, perf_tracking is empty: write an
# empty JSON object instead of failing with IndexError.
last_iteration_metrics = perf_tracking[-1] if perf_tracking else {}
with open(results_dir / 'best_perf_metrics.json', 'w') as f:
    json.dump(convert_to_native(last_iteration_metrics), f, indent=4)

## Reapply Predictions and Save

In [14]:
# Re-declares the output folder with the same value as before the main loop.
version = 'v1'
results_dir = Path(f'data/W03/from_notebooks/R08_loop_grid_searchs_start_0/{version}')
# No-op when the folder already exists.
results_dir.mkdir(parents=True, exist_ok=True)

In [15]:
# Reload the clusters table written to results_dir in the "Save Results" section.
final_clusters_df = pd.read_excel(results_dir / 'clusters_table.xlsx')

In [16]:
# The last 'cluster_start_0_*' column (in column order) becomes 'cluster_final'.
start_0_columns = [col for col in final_clusters_df.columns if col.startswith('cluster_start_0_')]
last_cluster_col = start_0_columns[-1]
final_clusters_df['cluster_final'] = final_clusters_df[last_cluster_col]

# Columns kept in the exported prediction files.
cols_to_keep = [
    'full_name',
    'alias',
    'number_of_occurrences',
    'alias_number_of_occurrences',
    'similarity',
    'uncertain',
    'final_label',
    'prediction',
]
prediction_any_results_xlsx = results_dir / 'prediction_results_any_reg.xlsx'
prediction_famous_results_xlsx = results_dir / 'prediction_results_famous_reg.xlsx'


def _predict_with_final_clusters(eval_df):
    """Run apply_predictions on a copy of ``eval_df`` with the full_name -> cluster_final mapping.

    Only the columns listed in ``cols_to_keep`` are returned.
    """
    name_to_cluster = dict(zip(final_clusters_df['full_name'], final_clusters_df['cluster_final']))
    predicted = apply_predictions(
        eval_df.copy(),
        name_to_cluster,
        col_el_1='full_name',
        col_el_2='alias'
    )
    return predicted[cols_to_keep]


eval_any_final = _predict_with_final_clusters(eval_df_any)
eval_any_final.to_excel(prediction_any_results_xlsx, index=False)
eval_famous_final = _predict_with_final_clusters(eval_df_famous)
eval_famous_final.to_excel(prediction_famous_results_xlsx, index=False)

# Some additional stats

## New results

In [17]:
# Give every unclustered row its own 'solo_<k>' label, provided it has at
# least n_solo occurrences.
n_solo = 1
final_clusters_df['processed_cluster_final'] = final_clusters_df['cluster_final']
is_unclustered = final_clusters_df['cluster_final'].isna()
has_enough_occurrences = final_clusters_df['number_of_occurrences'] >= n_solo
mask = is_unclustered & has_enough_occurrences
# Labels are numbered from 1 in row order.
solo_labels = [f"solo_{rank}" for rank in range(1, mask.sum() + 1)]
final_clusters_df.loc[mask, 'processed_cluster_final'] = solo_labels

In [18]:
# Column whose labels are used by the statistics cells that follow.
column_final_cluster = 'processed_cluster_final'
# Replace every label flagged by is_noise with None.
final_clusters_df[column_final_cluster] = final_clusters_df[column_final_cluster].apply(lambda x: None if is_noise(x) else x)
# Rows whose 'cluster_final' value is missing.
noise_clusters = final_clusters_df[final_clusters_df['cluster_final'].isna()]
# Preview those rows with both the raw and the processed label.
display(noise_clusters[['full_name', 'number_of_occurrences', 'cluster_final','processed_cluster_final']].head())

Unnamed: 0,full_name,number_of_occurrences,cluster_final,processed_cluster_final
5,CMR-COVID (CMR-COVID),2,,solo_1
6,SAR-COVID (SAR-COVID),3,,solo_2
10,Danish Fracture Database (DFD),8,,solo_3
26,Japanese Data Center for Hematopoietic Cell Tr...,4,,solo_4
28,Florida Cancer Registry (FCR),28,,solo_5


In [19]:
# --- Headline counts ---
print('---')
n_raw_names = len(final_clusters_df)
# Share of rows listed in noise_clusters; not printed by this cell.
noise_percentage = len(noise_clusters) / n_raw_names * 100
print(f"Number of raw extracted registry names: {n_raw_names}")
# One consolidated registry per distinct non-null label in column_final_cluster.
nb_consolidated_registries = len(final_clusters_df[column_final_cluster].dropna().unique())
print(f"Number of Consolidated registry names: {nb_consolidated_registries}")

# --- Multi-mention / multi-alias registries ---
print('---')
grouped_by_cluster = final_clusters_df.groupby(column_final_cluster)
# Multi-mention: summed number_of_occurrences of the cluster is above 1.
clusters_with_multiple_elements = grouped_by_cluster['number_of_occurrences'].sum().reset_index()
n_multi_mention = int((clusters_with_multiple_elements['number_of_occurrences'] > 1).sum())
print(f"Number of Multi-mention registries: {n_multi_mention}")
# Multi-alias: the cluster holds more than one distinct full_name.
clusters_with_multiple_aliases = grouped_by_cluster['full_name'].nunique().reset_index()
n_multi_alias = int((clusters_with_multiple_aliases['full_name'] > 1).sum())
print(f"Number of Multi-alias: {n_multi_alias}")
share_multi_mention = (n_multi_mention / nb_consolidated_registries) * 100
share_multi_alias = (n_multi_alias / nb_consolidated_registries) * 100
print(f'Percentage of Multi-mention registries (at least 2 mentions): {share_multi_mention:.2f}%')
print(f'Percentage of Multi-alias (at least 2 different aliases): {share_multi_alias:.2f}%')

# --- Share of publications ---
print('---')
total_occurrences = final_clusters_df['number_of_occurrences'].sum()
clusters_with_one_element = grouped_by_cluster['number_of_occurrences'].sum().reset_index()
# A cluster whose occurrences sum to 1 contributes exactly one occurrence, so
# counting such clusters also counts their occurrences.
total_final_noise_occurrences = int((clusters_with_one_element['number_of_occurrences'] == 1).sum())
print(f"Total number of publications with one of the 'official' raw extracted registry names: {total_occurrences}")
share_multi_publications = (total_occurrences - total_final_noise_occurrences) / total_occurrences * 100
share_single_publications = total_final_noise_occurrences / total_occurrences * 100
print(f"Percentage of publications related to Multi-mention registries : {share_multi_publications:.2f}%")
print(f"Percentage of publications related to Single-mention registries : {share_single_publications:.2f}%")

---
Number of raw extracted registry names: 54347
Number of Consolidated registry names: 31299
---
Number of Multi-mention registries: 14724
Number of Multi-alias: 10985
Percentage of Multi-mention registries (at least 2 mentions): 47.04%
Percentage of Multi-alias (at least 2 different aliases): 35.10%
---
Total number of publications with one of the 'official' raw extracted registry names: 163120
Percentage of publications related to Multi-mention registries : 89.84%
Percentage of publications related to Single-mention registries : 10.16%


In [22]:
# Count the registries (labels in column_final_cluster) that have more than
# `nmax` distinct aliases, then show the ten largest of them.
print('---')
nmax = 20
multi_aliases = final_clusters_df.groupby(column_final_cluster)['full_name'].nunique().reset_index()
above_threshold = multi_aliases[multi_aliases['full_name'] > nmax]
# The message interpolates nmax so that it always states the threshold that
# was actually applied (it used to say 10 while filtering on 20).
print(f"Number of registries with more than {nmax} aliases: {len(above_threshold)}")
top_multi_aliases = above_threshold.sort_values(by='full_name', ascending=False).head(10)
display(top_multi_aliases)

---
Number of registries with more than 10 aliases: 0


Unnamed: 0,processed_cluster_final,full_name


## Compare with original clusters

In [23]:
# Same solo-cluster labelling as in the previous section, applied to the
# 'cluster_0' column for comparison.
n_solo = 1
cluster_final = 'cluster_0'
# Labels flagged by is_noise are replaced with None before anything else.
final_clusters_df[cluster_final] = final_clusters_df[cluster_final].apply(
    lambda label: None if is_noise(label) else label
)
final_clusters_df['processed_cluster_final'] = final_clusters_df[cluster_final]
is_unclustered = final_clusters_df[cluster_final].isna()
has_enough_occurrences = final_clusters_df['number_of_occurrences'] >= n_solo
mask = is_unclustered & has_enough_occurrences
# Labels are numbered from 1 in row order.
solo_labels = [f"solo_{rank}" for rank in range(1, mask.sum() + 1)]
final_clusters_df.loc[mask, 'processed_cluster_final'] = solo_labels

In [24]:
# Column whose labels are used by the statistics cells that follow.
column_final_cluster = 'processed_cluster_final'
# Replace every label flagged by is_noise with None.
final_clusters_df[column_final_cluster] = final_clusters_df[column_final_cluster].apply(lambda x: None if is_noise(x) else x)
# Rows whose value is missing in the column named by `cluster_final`.
noise_clusters = final_clusters_df[final_clusters_df[cluster_final].isna()]
# Preview those rows with both the raw and the processed label.
display(noise_clusters[['full_name', 'number_of_occurrences', cluster_final,'processed_cluster_final']].head())

Unnamed: 0,full_name,number_of_occurrences,cluster_0,processed_cluster_final
5,CMR-COVID (CMR-COVID),2,,solo_1
6,SAR-COVID (SAR-COVID),3,,solo_2
33,National Registry of Cardiovascular Interventi...,3,,solo_3
45,Dutch Biologic Monitor (DBM),4,,solo_4
46,CORONOR (CORONOR),2,,solo_5


In [25]:
# --- Headline counts ---
print('---')
n_raw_names = len(final_clusters_df)
# Share of rows listed in noise_clusters; not printed by this cell.
noise_percentage = len(noise_clusters) / n_raw_names * 100
print(f"Number of raw extracted registry names: {n_raw_names}")
# One consolidated registry per distinct non-null label in column_final_cluster.
nb_consolidated_registries = len(final_clusters_df[column_final_cluster].dropna().unique())
print(f"Number of Consolidated registry names: {nb_consolidated_registries}")

# --- Multi-mention / multi-alias registries ---
print('---')
grouped_by_cluster = final_clusters_df.groupby(column_final_cluster)
# Multi-mention: summed number_of_occurrences of the cluster is above 1.
clusters_with_multiple_elements = grouped_by_cluster['number_of_occurrences'].sum().reset_index()
n_multi_mention = int((clusters_with_multiple_elements['number_of_occurrences'] > 1).sum())
print(f"Number of Multi-mention registries: {n_multi_mention}")
# Multi-alias: the cluster holds more than one distinct full_name.
clusters_with_multiple_aliases = grouped_by_cluster['full_name'].nunique().reset_index()
n_multi_alias = int((clusters_with_multiple_aliases['full_name'] > 1).sum())
print(f"Number of Multi-alias registries: {n_multi_alias}")
share_multi_mention = (n_multi_mention / nb_consolidated_registries) * 100
share_multi_alias = (n_multi_alias / nb_consolidated_registries) * 100
print(f'Percentage of Multi-mention registries (at least 2 mentions): {share_multi_mention:.2f}%')
print(f'Percentage of Multi-alias (at least 2 different aliases): {share_multi_alias:.2f}%')

# --- Share of publications ---
print('---')
total_occurrences = final_clusters_df['number_of_occurrences'].sum()
clusters_with_one_element = grouped_by_cluster['number_of_occurrences'].sum().reset_index()
# A cluster whose occurrences sum to 1 contributes exactly one occurrence, so
# counting such clusters also counts their occurrences.
total_final_noise_occurrences = int((clusters_with_one_element['number_of_occurrences'] == 1).sum())
print(f"Total number of publications with one of the 'official' raw extracted registry names: {total_occurrences}")
share_multi_publications = (total_occurrences - total_final_noise_occurrences) / total_occurrences * 100
share_single_publications = total_final_noise_occurrences / total_occurrences * 100
print(f"Percentage of publications related to Multi-mention registries : {share_multi_publications:.2f}%")
print(f"Percentage of publications related to Single-mention registries : {share_single_publications:.2f}%")

---
Number of raw extracted registry names: 54347
Number of Consolidated registry names: 25787
---
Number of Multi-mention registries: 12561
Number of Multi-alias registries: 9667
Percentage of Multi-mention registries (at least 2 mentions): 48.71%
Percentage of Multi-alias (at least 2 different aliases): 37.49%
---
Total number of publications with one of the 'official' raw extracted registry names: 163120
Percentage of publications related to Multi-mention registries : 91.89%
Percentage of publications related to Single-mention registries : 8.11%


In [26]:
# Count the registries (labels in column_final_cluster) that have more than
# `nmax` distinct aliases, then show the ten largest of them.
print('---')
nmax = 20
multi_aliases = final_clusters_df.groupby(column_final_cluster)['full_name'].nunique().reset_index()
above_threshold = multi_aliases[multi_aliases['full_name'] > nmax]
# The message interpolates nmax so that it always states the threshold that
# was actually applied (it used to say 10 while filtering on 20).
print(f"Number of registries with more than {nmax} aliases: {len(above_threshold)}")
top_multi_aliases = above_threshold.sort_values(by='full_name', ascending=False).head(10)
display(top_multi_aliases)

---
Number of registries with more than 10 aliases: 71


Unnamed: 0,processed_cluster_final,full_name
0,0_1,5256
8736,6_3,690
3785,23_1,101
9469,8_1,76
5788,38_1,69
8625,6_2,68
5394,356_1,65
9090,6_7,60
8903,6_45,58
8847,6_4,56


# Some additional checks

In [31]:
# Re-declares the output folder with the same value as before the main loop.
version = 'v1'
results_dir = Path(f'data/W03/from_notebooks/R08_loop_grid_searchs_start_0/{version}')
# No-op when the folder already exists.
results_dir.mkdir(parents=True, exist_ok=True)

In [32]:
# Reload the saved clusters table; this replaces the in-memory frame, including
# the 'processed_cluster_final' column added by the statistics cells above.
final_clusters_df = pd.read_excel(results_dir / 'clusters_table.xlsx')

In [33]:
# Take the last column (in column order) whose name starts with
# 'cluster_start_0_' and expose it as 'cluster_final'.
# Raises IndexError if the table has no such column.
last_cluster_col = [col for col in final_clusters_df.columns if col.startswith('cluster_start_0_')][-1]
final_clusters_df['cluster_final'] = final_clusters_df[last_cluster_col]

In [34]:
# Widen displayed columns so that long registry names are not truncated in
# the tables shown below.
pd.set_option('max_colwidth', 400)

In [36]:
# Sanity check: number of rows in the reloaded clusters table.
print(f"Number of rows in final_clusters_df: {len(final_clusters_df)}")

Number of rows in final_clusters_df: 54347


In [38]:
cluster_final = 'cluster_final'
# Number of distinct aliases (full_name values) per cluster.
alias_counts = final_clusters_df.groupby(cluster_final)['full_name'].nunique().reset_index()
n_aliases = alias_counts['full_name']
# Keep clusters with 10 to 20 aliases (both bounds included), largest first.
in_range = (n_aliases >= 10) & (n_aliases <= 20)
clusters_of_interest = alias_counts[in_range].sort_values(by='full_name', ascending=False)
print(f"Clusters of interest (size between 10 and 20): {len(clusters_of_interest)}")
# Show the members of the first three clusters in that ordering.
preview_labels = clusters_of_interest[cluster_final].head(3).unique()
for cluster_label in preview_labels:
    print(f"\nCluster: {cluster_label}")
    members = final_clusters_df[final_clusters_df[cluster_final] == cluster_label]
    display(members[['full_name', 'number_of_occurrences']])

Clusters of interest (size between 10 and 20): 327

Cluster: 9_1_1_1_1


Unnamed: 0,full_name,number_of_occurrences
12,Society of Thoracic Surgeons/American College of Cardiology Transcatheter Valve Therapy Registry (STS/ACC TVT Registry),92
1645,Society of Thoracic Surgeons/American College of Cardiology Transcatheter Valve Therapies Registry (STS/ACC TVT Registry),18
4043,American College of Cardiology Transcatheter Valve Therapy Registry (ACC TVT),1
4559,The Society of Thoracic Surgeons/American College of Cardiology Transcatheter Valve Therapy Registry (STS/ACC TVT),8
4841,Society of Thoracic Surgeons/Transcatheter Valve Therapy Registry (STS/TVT),5
10034,Society of Thoracic Surgery/American College of Cardiology Transvalvular Therapeutics Registry (STS/ACC TVT Registry),1
10077,American College of Cardiology/Society of Thoracic Surgeons Transcatheter Valve Therapy Registry (TVT Registry),1
15859,Society of Thoracic Surgeons and American College of Cardiology Transcatheter Valve Therapies Registry (TVT Registry),1
16192,Society of Thoracic Surgeons/American College of Cardiology/Transcatheter Valve Therapy Registry (STS/ACC/TVT),3
18751,Society of Thoracic Surgeons/American College of Cardiology Transcatheter Valve Registry (STS/ACC TVT Registry),1



Cluster: 9_5_1_2


Unnamed: 0,full_name,number_of_occurrences
2756,Optimised Catheter Valvular Intervention transcatheter aortic valve implantation (OCEAN-TAVI),1
3040,Optimized CathEter vAlvular iNtervention (OCEAN-TAVI),1
5819,Optimised transCathEter vAlvular interventioN-transcatheter aortic valve implantation registry (OCEAN-TAVI),1
6706,Optimized CathEter vAlvular iNtervention Registry (OCEAN-TAVI),7
7024,Optimized transCathEter vAlvular iNtervention Registry (OCEAN-TAVI),11
7317,Optimized Catheter Valvular Intervention Transcatheter Aortic Valve Implantation Registry (OCEAN-TAVI),2
7783,Optimized transCathEter vAlvular iNtervention (OCEAN-TAVI),3
8485,Optimized Catheter Valvular Intervention-Transcatheter Aortic Valve Implantation Registry (OCEAN-TAVI),3
10391,Optimized CathEter vAlvular iNtervention-TAVI registry (OCEAN-TAVI),4
14581,Optimized CathEter vAlvular iNtervention-Transcatheter Aortic Valve Implantation Japanese multicentre registry (OCEAN-TAVI),1



Cluster: 28_12


Unnamed: 0,full_name,number_of_occurrences
1702,Acute Myocardial Infarction in Switzerland Plus Registry (AMIS Plus),21
3292,Acute Myocardial Infarction in Switzerland (AMIS) Plus Registry (AMIS Plus),10
8112,Acute Myocardial Infarction in Switzerland Registry (AMIS Plus),1
8137,Acute Myocardial Infarction and unstable angina in Switzerland (AMIS Plus),4
9450,Acute Myocardial Infarction in Switzerland (AMIS Plus) (AMIS Plus),6
9833,Acute Myocardial Infarction Swiss registry (AMIS),1
10511,Acute Myocardial Infarction in Switzerland (AMIS Plus),6
11130,Swiss Acute Myocardial Infarction (AMIS) Plus Registry (AMIS Plus),6
12747,Swiss Registry of Acute Coronary Syndromes (AMIS Plus) (AMIS Plus),1
12862,Acute Myocardial Infarction in Switzerland (AMIS)-Plus ACS Registry (AMIS-Plus),1
