# EMA Registries: Nearest Neighbor Assignment (Option 1)

This notebook implements and analyzes the nearest neighbor assignment approach for landing EMA registries into existing clusters.

## 1. Setup & Imports

In [2]:
import os
import os
working_dir = "/home/gpinon/more_europa/clean_rdc_experiments/projects/P05_refine_dedup"
os.chdir(working_dir)
print(f"Changed working directory to {working_dir}")
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from tqdm import tqdm
from src.p05_refine_dedup import config
from src.p05_refine_dedup.utils.s3_io_functions import load_parquet_from_s3

Changed working directory to /home/gpinon/more_europa/clean_rdc_experiments/projects/P05_refine_dedup


## 2. Load Data

In [3]:
# a. Load registry embeddings from S3
s3_input_embeddings = "registry_data_catalog_experiments/P05_refine_dedup/registry_names_embeddings.parquet"
bucket_name = config.BUCKET_NAME_DEV
folder_path = s3_input_embeddings.rsplit('/', 1)[0]
file_name = s3_input_embeddings.rsplit('/', 1)[-1]
embeddings_df = load_parquet_from_s3(
    bucket_name=bucket_name,
    folder_path=folder_path,
    file_name=file_name,
)
print(f'Loaded registry embeddings: {embeddings_df.shape}')



Loaded registry embeddings: (54335, 4)


In [4]:
# b. Load cluster assignments from local Excel
clusters_table_xlsx = 'data/W02/R02_evaluate_model_performance/clusters_table.xlsx'
clusters_df = pd.read_excel(clusters_table_xlsx)
print(f'Loaded clusters table: {clusters_df.shape}')

Loaded clusters table: (54335, 3)


In [5]:
# c. Load EMA registry embeddings from S3
s3_ema_embeddings = 'registry_data_catalog_experiments/P05_refine_dedup/ema_registry_names_embeddings.parquet'
ema_folder_path = s3_ema_embeddings.rsplit('/', 1)[0]
ema_file_name = s3_ema_embeddings.rsplit('/', 1)[-1]
ema_embeddings_df = load_parquet_from_s3(
    bucket_name=bucket_name,
    folder_path=ema_folder_path,
    file_name=ema_file_name,
)
print(f'Loaded EMA registry embeddings: {ema_embeddings_df.shape}')



Loaded EMA registry embeddings: (237, 4)


## 3. Preprocessing
No normalization required. Ensure embeddings are in correct format.

In [6]:
# Ensure embeddings are numpy arrays
embeddings_df['full_name_embedding'] = embeddings_df['full_name_embedding'].apply(np.array)
ema_embeddings_df['full_name_embedding'] = ema_embeddings_df['full_name_embedding'].apply(np.array)
clusters_df['Final_Cluster'] = clusters_df['Final_Cluster'].astype(str)
print('Embeddings and clusters preprocessed.')

Embeddings and clusters preprocessed.


## 4. Nearest Neighbor Search and Results Table

In [7]:
from sklearn.metrics.pairwise import cosine_distances
# Prepare lookup tables
emb_matrix = np.vstack(embeddings_df['full_name_embedding'].values)
ema_matrix = np.vstack(ema_embeddings_df['full_name_embedding'].values)
# Add cluster info to embeddings_df
embeddings_with_clusters = embeddings_df.merge(clusters_df, on=['full_name', 'number_of_occurrences'], how='left')
# For each EMA registry, find nearest neighbor and cluster info
results = []
for idx, ema_row in tqdm(ema_embeddings_df.iterrows(), total=ema_embeddings_df.shape[0]):
    ema_emb = ema_row['full_name_embedding'].reshape(1, -1)
    dists = cosine_distances(ema_emb, emb_matrix)[0]
    min_idx = np.argmin(dists)
    closest_row = embeddings_with_clusters.iloc[min_idx]
    assigned_cluster = closest_row['Final_Cluster']
    # Get all aliases in the same cluster
    cluster_aliases = embeddings_with_clusters[embeddings_with_clusters['Final_Cluster'] == assigned_cluster]
    total_aliases = cluster_aliases.shape[0]
    total_occurrences = cluster_aliases['number_of_occurrences'].sum()
    # Most popular alias
    n1_row = cluster_aliases.sort_values('number_of_occurrences', ascending=False).iloc[0]
    results.append({
        'ema_full_name': ema_row['full_name'],
        'ema_object_id': ema_row['object_id'],
        'assigned_cluster': assigned_cluster,
        'distance_to_closest': dists[min_idx],
        'closest': closest_row['full_name'],
        'closest_nb_occ': closest_row['number_of_occurrences'],
        'total_aliases': total_aliases,
        'total_occurrences': total_occurrences,
        'N1_alias': n1_row['full_name'],
        'N1_alias_nb_occ': n1_row['number_of_occurrences'],
    })
results_df = pd.DataFrame(results)
display(results_df.head())

100%|██████████| 237/237 [00:49<00:00,  4.82it/s]



Unnamed: 0,ema_full_name,ema_object_id,assigned_cluster,distance_to_closest,closest,closest_nb_occ,total_aliases,total_occurrences,N1_alias,N1_alias_nb_occ
0,Clinical Practice Research Datalink (CPRD) GOL...,028ac269-7f7f-56d3-8dd3-0bd93913d864,70_3,0.007017,Clinical Practice Research Datalink GOLD (CPRD...,11,2,12,Clinical Practice Research Datalink GOLD (CPRD...,11
1,Cancer Registry of Instituto Português de Onco...,0446b0c0-0bec-5999-8c59-f0e4107dabc3,0_0,0.043343,Herzinfarktverbund Essen (HIVE),1,12453,17753,Fushimi AF Registry (FAR),77
2,European Registry of Patients with McArdle dis...,060d06a8-0a14-5b8b-9fe4-882f08412d7f,3354,0.010645,European registry for patients with McArdle di...,2,2,3,European registry for patients with McArdle di...,2
3,The UK-Irish Atopic Eczema Systemic Therapy Re...,060ede1e-35d6-5dab-9e1e-d7e1e2f78c2a,5482,0.005611,Texas Trauma Registry (TTR),1,2,2,Texas Trauma Registry (TTR),1
4,syndena GmbH (former OncoTyrol) (syndena GmbH),08920b82-907a-5d03-828e-f76f7b3c735e,144_0,0.17924,Cancer Registry of Tyrol (CRT),5,3,7,Cancer Registry of Tyrol (CRT),5


## 5. Save Results
Save the results DataFrame to Excel for further analysis.

In [9]:
results_xlsx = 'data/W02/R03_evaluate_model_performance_on_ema_registries/ema_prediction_results.xlsx'
# make sure the directory exists
results_dir = Path(results_xlsx).parent
results_dir.mkdir(parents=True, exist_ok=True)
results_df.to_excel(results_xlsx, index=False)
print(f'Results saved to {results_xlsx}')

Results saved to data/W02/R03_evaluate_model_performance_on_ema_registries/ema_prediction_results.xlsx


## 6. Analysis & Metrics
Reload the results and compute the required metrics.

In [15]:
# Reload results
results_xlsx = 'data/W02/R03_evaluate_model_performance_on_ema_registries/ema_prediction_results.xlsx'
results_df = pd.read_excel(results_xlsx)
# Identify noise clusters: '0' or ending with '_0'
def is_noise(cluster):
    return str(cluster) == '0' or str(cluster).endswith('_0')
results_df['is_noise'] = results_df['assigned_cluster'].apply(is_noise)
# EMA transformation rate
n_total = results_df.shape[0]
# add a column 'identical' to check if the closest alias is the same as the EMA registry    
results_df['identical'] = results_df['ema_full_name'] == results_df['closest']
# count how many EMA registries are identical to the closest alias
n_identical = results_df['identical'].sum()
print(f'Number of identical EMA registries: {n_identical} ({(n_identical / n_total)*100:.1f}%)')
# Count how many EMA registries are in noise
n_noise = results_df['is_noise'].sum()
print(f'Number of EMA registries in noise clusters: {n_noise} ({(n_noise / n_total)*100:.1f}%)')
n_transformed = (~results_df['is_noise']).sum()
ema_transformation_rate = n_transformed / n_total
print(f'EMA transformation rate: {ema_transformation_rate*100:.1f}% ({n_transformed}/{n_total})')
# Number of clusters with at least one EMA registry
transformed_clusters = results_df.loc[~results_df['is_noise'], 'assigned_cluster'].unique()
n_clusters_with_ema = len(transformed_clusters)
print(f'Number of clusters with at least one EMA registry: {n_clusters_with_ema}')
# Clusters with multiple EMA registries
cluster_counts = results_df.loc[~results_df['is_noise'], 'assigned_cluster'].value_counts()
multi_ema_clusters = cluster_counts[cluster_counts > 1]
print(f'Clusters with multiple EMA registries: {multi_ema_clusters.to_dict()}')
# Aliases per transformed EMA registry
aliases_stats = results_df.loc[~results_df['is_noise'], 'total_aliases'].describe()
print('Aliases per transformed EMA registry:')
print(aliases_stats)

Number of identical EMA registries: 21 (8.9%)
Number of EMA registries in noise clusters: 73 (30.8%)
EMA transformation rate: 69.2% (164/237)
Number of clusters with at least one EMA registry: 126
Clusters with multiple EMA registries: {'0_1': 24, '4032': 8, '6_3': 5, '26_1': 2, '4449': 2, '0_1167': 2, '534_1': 2}
Aliases per transformed EMA registry:
count     164.000000
mean      794.585366
std      1856.610980
min         2.000000
25%         2.000000
50%         3.000000
75%        13.250000
max      5256.000000
Name: total_aliases, dtype: float64


In [12]:
# display the 10 first rows of transformed_clusters ranked by total_aliases
transformed_clusters_df = results_df.loc[~results_df['is_noise']].sort_values('total_aliases', ascending=False)
display(transformed_clusters_df.head(50)[['ema_full_name', 'closest', 'closest_nb_occ','assigned_cluster', 'total_aliases', 'total_occurrences']])


Unnamed: 0,ema_full_name,closest,closest_nb_occ,assigned_cluster,total_aliases,total_occurrences
39,FranceCoag (FranceCoag),FranceCoag (FranceCoag),3,0_1,5256,8651
19,Global Registry for COL6-related Dystrophies (...,"Nationwide, prospective, multicentre registry",1,0_1,5256,8651
8,LynxCare (LXC),North East Thames Regional Health Authority Ce...,1,0_1,5256,8651
7,Telotron - Telomera (Telotron),Institute of Toxicology of Madrid Registry (ITM),1,0_1,5256,8651
38,AOUPR EHDEN Database (AOUPR),Odense University Hospital Databases (OUHDB),1,0_1,5256,8651
37,"Health Data Architecture for Learning, INFOBAN...",Colombian Ministry of Health Database (CMHD),1,0_1,5256,8651
34,CureDRPLA Global Patient Registry (CureDRPLA G...,Stanford Atrial Fibrillation Registry (Stanfor...,1,0_1,5256,8651
27,Institut Municipal d'Assistència Sanitària Inf...,Catalan Medical Evaluations Institute database...,1,0_1,5256,8651
172,The European HBV Registry - A joint initiative...,HBV Diagnostic Registry (HBVDR),1,0_1,5256,8651
183,The United Kingdom Spinal Muscular Atrophy Pat...,Pavia Diabetes Registry (PDR),1,0_1,5256,8651
