# EMA Registries: Nearest Neighbor Assignment (Option 1)

This notebook implements and analyzes the nearest neighbor assignment approach for landing EMA registries into existing clusters.

## 1. Setup & Imports

In [1]:
import os
import os
working_dir = "/home/gpinon/more_europa/clean_rdc_experiments/projects/P05_refine_dedup"
os.chdir(working_dir)
print(f"Changed working directory to {working_dir}")
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

from src.p05_refine_dedup import config
from src.p05_refine_dedup.utils.s3_io_functions import load_parquet_from_s3

Changed working directory to /home/gpinon/more_europa/clean_rdc_experiments/projects/P05_refine_dedup


## 2. Load Data

In [2]:
# a. Load registry embeddings from S3
s3_input_embeddings = "registry_data_catalog_experiments/P05_refine_dedup/registry_names_embeddings.parquet"
bucket_name = config.BUCKET_NAME_DEV
folder_path = s3_input_embeddings.rsplit('/', 1)[0]
file_name = s3_input_embeddings.rsplit('/', 1)[-1]
embeddings_df = load_parquet_from_s3(
    bucket_name=bucket_name,
    folder_path=folder_path,
    file_name=file_name,
)
print(f'Loaded registry embeddings: {embeddings_df.shape}')



Loaded registry embeddings: (54335, 4)


In [3]:
# b. Load EMA registry embeddings from S3
s3_ema_embeddings = 'registry_data_catalog_experiments/P05_refine_dedup/ema_registry_names_embeddings.parquet'
ema_folder_path = s3_ema_embeddings.rsplit('/', 1)[0]
ema_file_name = s3_ema_embeddings.rsplit('/', 1)[-1]
ema_embeddings_df = load_parquet_from_s3(
    bucket_name=bucket_name,
    folder_path=ema_folder_path,
    file_name=ema_file_name,
)
print(f'Loaded EMA registry embeddings: {ema_embeddings_df.shape}')



Loaded EMA registry embeddings: (237, 4)


In [4]:
# c.define elements specific to the experiment
exp_name='new_eucl_0.2'
ema_prediction_results_xlsx=f"data/W02/R03_evaluate_model_performance_on_ema_registries/{exp_name}/ema_prediction_results.xlsx"
# make sure the directory exists
results_dir = Path(ema_prediction_results_xlsx).parent
results_dir.mkdir(parents=True, exist_ok=True)

In [5]:
dist_type = 'euclidean'# or 'cosine'
max_dist = 0.2 # maximum distance threshold for considering a match

In [6]:
# d. Load cluster assignments from local Excel
clusters_table_xlsx="data/W03/from_notebooks/R08_loop_grid_searchs_start_0/v1/clusters_table.xlsx"
clusters_df = pd.read_excel(clusters_table_xlsx)
print(f'Loaded clusters table: {clusters_df.shape}')
col_cluster_final = 'cluster_final'

Loaded clusters table: (54347, 36)


In [7]:
# additional preprocessing, dependent on the experiment and the selected clusters table to load
# Vectorize the assignment for popular noise items (solo clusters)
n_solo = 1
col_cluster_id = 'cluster_id' # 'cluster_final'
clusters_df[col_cluster_id] = clusters_df[col_cluster_final]
mask = clusters_df[col_cluster_final].isna() & (clusters_df['number_of_occurrences'] >= n_solo)
clusters_df.loc[mask, col_cluster_id] = [
    f"solo_{i+1}" for i in range(mask.sum())
]

## 3. Preprocessing
No normalization required. Ensure embeddings are in correct format.

In [8]:
# Ensure embeddings are numpy arrays
embeddings_df['full_name_embedding'] = embeddings_df['full_name_embedding'].apply(np.array)
ema_embeddings_df['full_name_embedding'] = ema_embeddings_df['full_name_embedding'].apply(np.array)
# clusters_df[col_cluster_id] = clusters_df[col_cluster_id].astype(str)
print('Embeddings and clusters preprocessed.')

Embeddings and clusters preprocessed.


## 4. Nearest Neighbor Search and Results Table

In [None]:
# Prepare lookup tables
emb_matrix = np.vstack(embeddings_df['full_name_embedding'].values)
ema_matrix = np.vstack(ema_embeddings_df['full_name_embedding'].values)
# Add cluster info to embeddings_df
embeddings_with_clusters = embeddings_df.merge(clusters_df, on=['full_name', 'number_of_occurrences'], how='left')
# For each EMA registry, find nearest neighbor and cluster info
results = []
for idx, ema_row in tqdm(ema_embeddings_df.iterrows(), total=ema_embeddings_df.shape[0]):
    print('---')
    print(f"Registry: {ema_row['full_name']}")
    ema_emb = ema_row['full_name_embedding'].reshape(1, -1)
    if dist_type == 'cosine':
        dists = cosine_distances(ema_emb, emb_matrix)[0]
    elif dist_type == 'euclidean':
        dists = euclidean_distances(ema_emb, emb_matrix)[0]
    # Find the closest match
    min_idx = np.argmin(dists)
    closest_row = embeddings_with_clusters.iloc[min_idx]
    if dists[min_idx] > max_dist:
        # If the closest distance is greater than the threshold, assign None
        assigned_cluster = None
        print(f"EMA {ema_row['full_name']} has no close match within threshold {max_dist}")
        print(f"Closest match: {closest_row['full_name']} with distance {round(dists[min_idx], 3)}")
    else:
        assigned_cluster = closest_row[col_cluster_id]
        print(f"EMA {ema_row['full_name']} assigned to cluster {assigned_cluster} with distance {round(dists[min_idx], 3)}")
        # Get all aliases in the same cluster
        cluster_aliases = embeddings_with_clusters[embeddings_with_clusters[col_cluster_id] == assigned_cluster]
        total_aliases = cluster_aliases.shape[0]
        total_occurrences = cluster_aliases['number_of_occurrences'].sum()
        # Most popular alias
        n1_row = cluster_aliases.sort_values('number_of_occurrences', ascending=False).iloc[0]
    results.append({
        'ema_object_id': ema_row['object_id'],
        'ema_full_name': ema_row['full_name'],
        'assigned_cluster': assigned_cluster,
        'distance_to_closest': dists[min_idx],
        'closest': closest_row['full_name'],
        'closest_nb_occ': closest_row['number_of_occurrences'],
        # total aliases should be the number of unique aliases in the cluster, or 0
        'total_aliases': total_aliases if assigned_cluster else 0,
        'total_occurrences': total_occurrences if assigned_cluster else 0,
        'N1_alias': n1_row['full_name'] if assigned_cluster else None,
        'N1_alias_nb_occ': n1_row['number_of_occurrences'] if assigned_cluster else None,
    })
results_df = pd.DataFrame(results)
display(results_df.head())

  0%|          | 1/237 [00:00<00:30,  7.78it/s]

---
Registry: Clinical Practice Research Datalink (CPRD) GOLD (CPRD GOLD)
EMA Clinical Practice Research Datalink (CPRD) GOLD (CPRD GOLD) assigned to cluster 70_3 with distance 0.11800000071525574
---
Registry: Cancer Registry of Instituto Português de Oncologia do Porto Francisco Gentil, E.P.E (IPO-Porto Cancer Registry)


  1%|▏         | 3/237 [00:00<00:30,  7.55it/s]

EMA Cancer Registry of Instituto Português de Oncologia do Porto Francisco Gentil, E.P.E (IPO-Porto Cancer Registry) has no close match within threshold 0.2
Closest match: The Nationwide Danish 1905 Cohort Study (1905 Cohort Study) with distance 0.2939999997615814
---
Registry: European Registry of Patients with McArdle disease or other rare form of muscle Glycogenoses (EUROMAC)
EMA European Registry of Patients with McArdle disease or other rare form of muscle Glycogenoses (EUROMAC) assigned to cluster 3354 with distance 0.1459999978542328
---
Registry: The UK-Irish Atopic Eczema Systemic Therapy Register (A-STAR)


  2%|▏         | 5/237 [00:00<00:32,  7.20it/s]

EMA The UK-Irish Atopic Eczema Systemic Therapy Register (A-STAR) assigned to cluster 0_1_24 with distance 0.10599999874830246
---
Registry: syndena GmbH (former OncoTyrol) (syndena GmbH)
EMA syndena GmbH (former OncoTyrol) (syndena GmbH) has no close match within threshold 0.2
Closest match: Cancer Registry of Tyrol (CRT) with distance 0.5989999771118164
---
Registry: Ambulatory EMR - OMOP


  3%|▎         | 7/237 [00:00<00:31,  7.37it/s]

EMA Ambulatory EMR - OMOP has no close match within threshold 0.2
Closest match: Observational Medical Outcomes Partnership Common Data Model (OMOP) with distance 0.6499999761581421
---
Registry: European Cystic Fibrosis Society Patient Registry (ECFSPR - Cystic Fibrosis)
EMA European Cystic Fibrosis Society Patient Registry (ECFSPR - Cystic Fibrosis) assigned to cluster 235_2 with distance 0.19499999284744263
---
Registry: Telotron - Telomera (Telotron)


  4%|▍         | 9/237 [00:01<00:32,  7.00it/s]

EMA Telotron - Telomera (Telotron) has no close match within threshold 0.2
Closest match: National Institute for Communicable Diseases Case Register (NICD Case Register) with distance 0.7110000252723694
---
Registry: LynxCare (LXC)
EMA LynxCare (LXC) has no close match within threshold 0.2
Closest match: National Swedish Inpatient Register (NIPR) with distance 0.628000020980835
---
Registry: The Bulgarian Diabetes Register and the Register of rare endocrine deseases (BDR-RRED)


  5%|▍         | 11/237 [00:01<00:31,  7.28it/s]

EMA The Bulgarian Diabetes Register and the Register of rare endocrine deseases (BDR-RRED) has no close match within threshold 0.2
Closest match: Japanese Catheter Ablation Registry for Rivaroxaban (JACRE-R) with distance 0.26600000262260437
---
Registry: HULAFE (HULAFE)
EMA HULAFE (HULAFE) has no close match within threshold 0.2
Closest match: CardioHULA registry (CardioHULA) with distance 0.6290000081062317
---
Registry: Longitudinal Patient Data Spain - OMOP (LPD SPA)


  5%|▌         | 13/237 [00:01<00:31,  7.03it/s]

EMA Longitudinal Patient Data Spain - OMOP (LPD SPA) has no close match within threshold 0.2
Closest match: Observational Medical Outcomes Partnership Common Data Model (OMOP) with distance 0.5329999923706055
---
Registry: The Valencia Health System Integrated Database (VID)
EMA The Valencia Health System Integrated Database (VID) has no close match within threshold 0.2
Closest match: Khorasan Razavi Multiple Sclerosis Registry (KRMSR) with distance 0.42100000381469727
---
Registry: IATROS (IATROS)


  6%|▋         | 15/237 [00:02<00:36,  6.14it/s]

EMA IATROS (IATROS) has no close match within threshold 0.2
Closest match: National Surveillance System for Hospital Data (SNSHD) with distance 0.5929999947547913
---
Registry: UK Renal Registry (UKRR)
EMA UK Renal Registry (UKRR) assigned to cluster 23_4 with distance 0.0
---
Registry: Linkage of Swedish national registers for psychiatric research


  7%|▋         | 17/237 [00:02<00:31,  6.93it/s]

EMA Linkage of Swedish national registers for psychiatric research has no close match within threshold 0.2
Closest match: STEMI-ENDO Registry (STEMI-ENDO) with distance 0.34200000762939453
---
Registry: IQVIA Longitudinal Patient Data - Belgium (LPD BE)
EMA IQVIA Longitudinal Patient Data - Belgium (LPD BE) has no close match within threshold 0.2
Closest match: Health Search/IQVIA Health LPD Longitudinal Patient Database (Health Search/IQVIA LPD) with distance 0.4519999921321869
---
Registry: Enroll-HD: A Prospective Registry Study in a Global Huntington’s Disease Cohort A CHDI Foundation Project (Enroll-HD)


  8%|▊         | 19/237 [00:02<00:31,  7.00it/s]

EMA Enroll-HD: A Prospective Registry Study in a Global Huntington’s Disease Cohort A CHDI Foundation Project (Enroll-HD) has no close match within threshold 0.2
Closest match: European Huntington's Disease Network's REGISTRY (EHDN REGISTRY) with distance 0.3779999911785126
---
Registry: Hospitalizations database (Portugal)
EMA Hospitalizations database (Portugal) has no close match within threshold 0.2
Closest match: RESONANCE Registry (RES) with distance 0.4480000138282776
---
Registry: Global Registry for COL6-related Dystrophies (Global COL6 Patient Registry)


  9%|▉         | 21/237 [00:02<00:30,  7.04it/s]

EMA Global Registry for COL6-related Dystrophies (Global COL6 Patient Registry) has no close match within threshold 0.2
Closest match: National ALS Registry and Biorepository (NALSR) with distance 0.5080000162124634
---
Registry: BAse Resultados DE Navarra (BARDENA)
EMA BAse Resultados DE Navarra (BARDENA) has no close match within threshold 0.2
Closest match: PEARL I Registry (PEARL I) with distance 0.7229999899864197
---
Registry: PHARMO Data Network (PHARMO Data Network)


 10%|▉         | 23/237 [00:03<00:28,  7.46it/s]

EMA PHARMO Data Network (PHARMO Data Network) has no close match within threshold 0.2
Closest match: PHARMO Database Network (PHARMO) with distance 0.26899999380111694
---
Registry: IMI 116026 HARMONY; IMI 945406 HARMONY PLUS - HARMONY Big Data Platform (HARMONY Big Data Platform)
EMA IMI 116026 HARMONY; IMI 945406 HARMONY PLUS - HARMONY Big Data Platform (HARMONY Big Data Platform) has no close match within threshold 0.2
Closest match: SKIP-SH cohort study with distance 0.5920000076293945
---
Registry: AZ Klina (AZK)


 11%|█         | 25/237 [00:03<00:29,  7.11it/s]

EMA AZ Klina (AZK) has no close match within threshold 0.2
Closest match: IMproving Pediatric and Adult Congenital Treatments Registry (IMPACT) with distance 0.6880000233650208
---
Registry: THIN® (The Health Improvement Network®) (THIN®)
EMA THIN® (The Health Improvement Network®) (THIN®) has no close match within threshold 0.2
Closest match: The Health Improvement Network (THIN) with distance 0.30799999833106995
---
Registry: Oncology EMR - OMOP


 11%|█▏        | 27/237 [00:03<00:28,  7.50it/s]

EMA Oncology EMR - OMOP has no close match within threshold 0.2
Closest match: University of Pennsylvania Health System Trauma Network Registry (UPHS Trauma Registry) with distance 0.5839999914169312
---
Registry: AZORG general OMOP database (AZORG OMOP)
EMA AZORG general OMOP database (AZORG OMOP) has no close match within threshold 0.2
Closest match: Observational Medical Outcomes Partnership Common Data Model (OMOP) with distance 0.44999998807907104
---
Registry: Institut Municipal d'Assistència Sanitària Information System (IMASIS)


 12%|█▏        | 29/237 [00:04<00:28,  7.40it/s]

EMA Institut Municipal d'Assistència Sanitària Information System (IMASIS) has no close match within threshold 0.2
Closest match: Catalan Medical Evaluations Institute database (ICAM) with distance 0.49000000953674316
---
Registry: EULAR COVAX Registry
EMA EULAR COVAX Registry assigned to cluster solo_6830 with distance 0.19900000095367432
---
Registry: ABUCASIS INCLIVA (ABUCASIS INCLIVA OMOP)


 13%|█▎        | 31/237 [00:04<00:26,  7.67it/s]

EMA ABUCASIS INCLIVA (ABUCASIS INCLIVA OMOP) has no close match within threshold 0.2
Closest match: JROAD Diagnosis Procedure Combination/Per Diem Payment System dataset (DPC) with distance 0.4699999988079071
---
Registry: DPV registry (DPV)
EMA DPV registry (DPV) has no close match within threshold 0.2
Closest match: DPV (DPV) with distance 0.4009999930858612
---
Registry: Extended Cohort for E-health, Environment and DNA (EXCEED)


 14%|█▍        | 33/237 [00:04<00:26,  7.75it/s]

EMA Extended Cohort for E-health, Environment and DNA (EXCEED) has no close match within threshold 0.2
Closest match: Epidemiological Registry of Prion Diseases in the Basque Country (ERPDBC) with distance 0.48399999737739563
---
Registry: Phollow, the community pharmacies cohort (Phollow)
EMA Phollow, the community pharmacies cohort (Phollow) has no close match within threshold 0.2
Closest match: EWOC-1 registry (EWOC) with distance 0.5770000219345093
---
Registry: The European Clarkson’s syndrome registry (EurêClark)


 15%|█▍        | 35/237 [00:04<00:26,  7.62it/s]

EMA The European Clarkson’s syndrome registry (EurêClark) has no close match within threshold 0.2
Closest match: Registry for Geriatric Trauma DGU (DGU) with distance 0.3310000002384186
---
Registry: CureDRPLA Global Patient Registry (CureDRPLA Global Patient Registry - DRPLA)
EMA CureDRPLA Global Patient Registry (CureDRPLA Global Patient Registry - DRPLA) has no close match within threshold 0.2
Closest match: American Burn Association Patient Registry (ABAPR) with distance 0.5649999976158142
---
Registry: Network of Sentinel General Practitioners (SGP)


 16%|█▌        | 37/237 [00:05<00:27,  7.23it/s]

EMA Network of Sentinel General Practitioners (SGP) has no close match within threshold 0.2
Closest match: Health Outcome Predictive Evaluation for Corona Virus Disease 2019 (HOPE-COVID19) with distance 0.4449999928474426
---
Registry: European Multicentre Bronchiectasis Audit and Research Collaboration (EMBARC)
EMA European Multicentre Bronchiectasis Audit and Research Collaboration (EMBARC) assigned to cluster 54 with distance 0.0
---
Registry: Health Data Architecture for Learning, INFOBANCO (INFOBANCO)


 16%|█▋        | 39/237 [00:05<00:26,  7.55it/s]

EMA Health Data Architecture for Learning, INFOBANCO (INFOBANCO) has no close match within threshold 0.2
Closest match: Colombian Ministry of Health Database (CMHD) with distance 0.5889999866485596
---
Registry: AOUPR EHDEN Database (AOUPR)
EMA AOUPR EHDEN Database (AOUPR) has no close match within threshold 0.2
Closest match: Odense University Hospital Databases (OUHDB) with distance 0.6150000095367432
---
Registry: FranceCoag (FranceCoag)


 17%|█▋        | 40/237 [00:05<00:42,  4.67it/s]

EMA FranceCoag (FranceCoag) assigned to cluster 0_1_442 with distance 0.0
---
Registry: Stockholm CREAtinine Measurements project (SCREAM)
EMA Stockholm CREAtinine Measurements project (SCREAM) assigned to cluster 612_5 with distance 0.07900000363588333


 18%|█▊        | 42/237 [00:06<00:35,  5.44it/s]

---
Registry: Pedianet network
EMA Pedianet network has no close match within threshold 0.2
Closest match: Massachusetts General Hospital SCAD Registry (MGH SCAD Registry) with distance 0.34700000286102295
---
Registry: Dutch Keratinocyte Cancer Collaborative (DKCC - Skin Cancer)


 19%|█▊        | 44/237 [00:06<00:30,  6.36it/s]

EMA Dutch Keratinocyte Cancer Collaborative (DKCC - Skin Cancer) has no close match within threshold 0.2
Closest match: Dutch Cutaneous Lymphoma Registry (DCLR) with distance 0.5320000052452087
---
Registry: BELpREG: the Belgian pregnancy registry (BELpREG)
EMA BELpREG: the Belgian pregnancy registry (BELpREG) has no close match within threshold 0.2
Closest match: Schwabing City Hospital Acute Myocardial Infarction Registry (AMI) with distance 0.21299999952316284
---
Registry: Norwegian Health Registers


 19%|█▉        | 46/237 [00:06<00:28,  6.72it/s]

EMA Norwegian Health Registers has no close match within threshold 0.2
Closest match: Swedish National Catheter Ablation Registry (SNCAR) with distance 0.24899999797344208
---
Registry: The Norwegian Prescribed Drug Registry (NorPD)
EMA The Norwegian Prescribed Drug Registry (NorPD) assigned to cluster 6_3_17_1_16_1_1_1_1_8_2 with distance 0.16699999570846558
---
Registry: ARCA Cardiology (ARCA)


 20%|██        | 48/237 [00:06<00:26,  7.18it/s]

EMA ARCA Cardiology (ARCA) has no close match within threshold 0.2
Closest match: World Health Organization Multinational Monitoring of Trends and Determinants in Cardiovascular Diseases (MONICA) with distance 0.40799999237060547
---
Registry: ERN eUROGEN registry (ERN eUROGEN registry)
EMA ERN eUROGEN registry (ERN eUROGEN registry) has no close match within threshold 0.2
Closest match: Institutional Register of The Elderly with Hip Fracture (IREHF) with distance 0.4269999861717224
---
Registry: European Clinical Research Alliance On Infectious Diseases (ECRAID)-Base (ECRAID-Base)


 21%|██        | 50/237 [00:07<00:26,  6.93it/s]

EMA European Clinical Research Alliance On Infectious Diseases (ECRAID)-Base (ECRAID-Base) has no close match within threshold 0.2
Closest match: STEMI Network Registry (STEMI) with distance 0.47200000286102295
---
Registry: Database of Fondazione ReS (ReS database)
EMA Database of Fondazione ReS (ReS database) has no close match within threshold 0.2
Closest match: Hospital Episode Statistics-Admitted Patient Care (HES-APC) with distance 0.36399999260902405
---
Registry: INvestigating SIGnificant Health TrendS in Growth Hormone Treatments Registry (INSIGHTS-GHT)


 22%|██▏       | 52/237 [00:07<00:28,  6.51it/s]

EMA INvestigating SIGnificant Health TrendS in Growth Hormone Treatments Registry (INSIGHTS-GHT) has no close match within threshold 0.2
Closest match: Dutch National Registry of Growth Hormone Treatment in adults (DNRGHTA) with distance 0.41499999165534973
---
Registry: Hospital Episode Statistics (HES)
EMA Hospital Episode Statistics (HES) assigned to cluster 69_1 with distance 0.0
---
Registry: European Network and Registry for Homocystinurias and Methylation Defects - E-HOD (E-HOD)


 23%|██▎       | 54/237 [00:07<00:30,  6.01it/s]

EMA European Network and Registry for Homocystinurias and Methylation Defects - E-HOD (E-HOD) has no close match within threshold 0.2
Closest match: European Severe Chronic Neutropenia Registry (SCNER) with distance 0.2290000021457672
---
Registry: Multiple Sclerosis Documentation System (MSDS) - AOK PLUS Linked Database (MSDS-AOK PLUS)
EMA Multiple Sclerosis Documentation System (MSDS) - AOK PLUS Linked Database (MSDS-AOK PLUS) has no close match within threshold 0.2
Closest match: Austrian National Registry of Mountain Accidents (ANRMA) with distance 0.3880000114440918
---
Registry: Platform-Residras and Residras (Dravet-SCN1A-PCHD19 Registry)


 24%|██▎       | 56/237 [00:08<00:26,  6.79it/s]

EMA Platform-Residras and Residras (Dravet-SCN1A-PCHD19 Registry) has no close match within threshold 0.2
Closest match: Multicenter Registry for Hyperbaric Oxygen Therapy (MRHOT) with distance 0.49900001287460327
---
Registry: Hospital Vall d'Hebron (VH)
EMA Hospital Vall d'Hebron (VH) has no close match within threshold 0.2
Closest match: Catalan Healthcare-associated Infections Surveillance Programme (VINCat) with distance 0.5559999942779541
---
Registry: Croatia National Public Health Information System (Nacionalni javnozdravstveni informacijski sustav) (NAJS)


 24%|██▍       | 58/237 [00:08<00:24,  7.21it/s]

EMA Croatia National Public Health Information System (Nacionalni javnozdravstveni informacijski sustav) (NAJS) has no close match within threshold 0.2
Closest match: Croatian Institute of Public Health Database (CIPH) with distance 0.4390000104904175
---
Registry: AZ Maria Middelares (AZMM)
EMA AZ Maria Middelares (AZMM) has no close match within threshold 0.2
Closest match: ESPOIR cohort study (ESPOIR) with distance 0.6460000276565552
---
Registry: LRx Claims - SK


 25%|██▌       | 60/237 [00:08<00:23,  7.46it/s]

EMA LRx Claims - SK has no close match within threshold 0.2
Closest match: Vasaloppet Registry (VR) with distance 0.8230000138282776
---
Registry: Hillel Yaffe -Kineret (HYKHDL)
EMA Hillel Yaffe -Kineret (HYKHDL) has no close match within threshold 0.2
Closest match: Observatório Nacional de Saúde (ONSA) with distance 0.6769999861717224
---
Registry: Initiative for Quality Improvement and Epidemiology in Diabetes (IQED)


 26%|██▌       | 62/237 [00:08<00:22,  7.63it/s]

EMA Initiative for Quality Improvement and Epidemiology in Diabetes (IQED) has no close match within threshold 0.2
Closest match: United Network for Organ Sharing Dataset (UNOS) with distance 0.45500001311302185
---
Registry: Respiratory Syncytial Virus (RSV) Observatory
EMA Respiratory Syncytial Virus (RSV) Observatory has no close match within threshold 0.2
Closest match: RSV Global Online Mortality Database (RSV GOLD) with distance 0.4830000102519989
---
Registry: World Bleeding Disorders Registry (World Bleeding Disorders Registry)


 27%|██▋       | 64/237 [00:09<00:24,  7.15it/s]

EMA World Bleeding Disorders Registry (World Bleeding Disorders Registry) has no close match within threshold 0.2
Closest match: Carotid Artery Revascularization Using the Boston Scientific FilterWire EX/EZ and the EndoTex NexStent (CABERNET) with distance 0.26899999380111694
---
Registry: Czech Registry of Monoclonal Gammopathies (RMG)
EMA Czech Registry of Monoclonal Gammopathies (RMG) assigned to cluster 1636_1 with distance 0.0
---
Registry: PedNet Haemophilia registry (PHR)


 28%|██▊       | 66/237 [00:09<00:23,  7.35it/s]

EMA PedNet Haemophilia registry (PHR) has no close match within threshold 0.2
Closest match: Prague Charles University General Hospital Registry (PCUGHR) with distance 0.23199999332427979
---
Registry: Multiple Sclerosis DataConnect (MSDC)
EMA Multiple Sclerosis DataConnect (MSDC) has no close match within threshold 0.2
Closest match: Iranian Twin Registry (ITR) with distance 0.3790000081062317
---
Registry: Myotubular and Centronuclear Myopathy Patient Registry (MTM & CNM Patient Registry)


 29%|██▊       | 68/237 [00:09<00:24,  6.81it/s]

EMA Myotubular and Centronuclear Myopathy Patient Registry (MTM & CNM Patient Registry) assigned to cluster solo_12528 with distance 0.14300000667572021
---
Registry: Unidade Local de Saúde de Castelo Branco (ULSCB)
EMA Unidade Local de Saúde de Castelo Branco (ULSCB) has no close match within threshold 0.2
Closest match: Cancer Registry of Beira (CRB) with distance 0.6200000047683716
---
Registry: INvestigating SIGnificant Health TrendS in progressive fibrosing Interstitial Lung Disease (INSIGHTS-ILD)


 30%|██▉       | 70/237 [00:10<00:23,  7.20it/s]

EMA INvestigating SIGnificant Health TrendS in progressive fibrosing Interstitial Lung Disease (INSIGHTS-ILD) has no close match within threshold 0.2
Closest match: Spanish Society of Neonatology Registry (SSNR) with distance 0.375
---
Registry: Sweden National Prescribed Drugs Register / Läkemedelsregistret (NPDR)
EMA Sweden National Prescribed Drugs Register / Läkemedelsregistret (NPDR) has no close match within threshold 0.2
Closest match: Swedish National Prescribed Drug Register (SNPDR) with distance 0.3089999854564667
---
Registry: Landspatientregisteret (National Patient Register) (LPR (NPR))


 30%|███       | 72/237 [00:10<00:21,  7.52it/s]

EMA Landspatientregisteret (National Patient Register) (LPR (NPR)) has no close match within threshold 0.2
Closest match: Norwegian National Patient Register (NPR) with distance 0.29600000381469727
---
Registry: US Open Claims (DxHx)
EMA US Open Claims (DxHx) has no close match within threshold 0.2
Closest match: Swiss Paediatric Surveillance Unit (SPSU) with distance 0.7149999737739563
---
Registry: Deutsches Hämophilieregister (DHR)


 31%|███       | 74/237 [00:10<00:21,  7.50it/s]

EMA Deutsches Hämophilieregister (DHR) assigned to cluster solo_13824 with distance 0.19599999487400055
---
Registry: PAN Cancer Research Platform (PAN Registry)
EMA PAN Cancer Research Platform (PAN Registry) has no close match within threshold 0.2
Closest match: Centers for Disease Control Universal Data Collection (CDC UDC) with distance 0.43299999833106995
---
Registry: Dutch Pregnancy Drug Register (DPDR)


 32%|███▏      | 76/237 [00:10<00:20,  7.68it/s]

EMA Dutch Pregnancy Drug Register (DPDR) has no close match within threshold 0.2
Closest match: National Danish Spine Registry (DaneSpine) with distance 0.30300000309944153
---
Registry: Therapy Monitor Multiple Myeloma Germany (TM MM DE)
EMA Therapy Monitor Multiple Myeloma Germany (TM MM DE) has no close match within threshold 0.2
Closest match: South African Inflammatory Bowel Disease Registry (SAIBD) with distance 0.4440000057220459
---
Registry: IQVIA Medical Research Data - OMOP (IMRD)


 33%|███▎      | 78/237 [00:11<00:20,  7.69it/s]

EMA IQVIA Medical Research Data - OMOP (IMRD) has no close match within threshold 0.2
Closest match: Observational Medical Outcomes Partnership Common Data Model (OMOP) with distance 0.5479999780654907
---
Registry: European Rare Diseases Research Alliance Data Hub (ERDERA-DH)
EMA European Rare Diseases Research Alliance Data Hub (ERDERA-DH) has no close match within threshold 0.2
Closest match: European Rare Disease Registration Infrastructure (ERDRI) with distance 0.3840000033378601
---
Registry: German Pharmacoepidemiological Research Database (GePaRD)


 34%|███▍      | 80/237 [00:11<00:21,  7.44it/s]

EMA German Pharmacoepidemiological Research Database (GePaRD) assigned to cluster 3279 with distance 0.0
---
Registry: IQVIA Claims - OMOP (Japan Claims)
EMA IQVIA Claims - OMOP (Japan Claims) has no close match within threshold 0.2
Closest match: JMDC Claims Database (JMDC) with distance 0.6240000128746033
---
Registry: Danish Health Care Registries (DHCR)


 35%|███▍      | 82/237 [00:11<00:21,  7.28it/s]

EMA Danish Health Care Registries (DHCR) assigned to cluster solo_9235 with distance 0.12200000137090683
---
Registry: IQVIA(TM) LRx Germany
EMA IQVIA(TM) LRx Germany has no close match within threshold 0.2
Closest match: IQVIA Prescription Registry (IQVIA) with distance 0.6169999837875366
---
Registry: German Atopic Dermatitis Registry TREATgermany (TREATgermany-AD Registry)


 35%|███▌      | 84/237 [00:11<00:22,  6.70it/s]

EMA German Atopic Dermatitis Registry TREATgermany (TREATgermany-AD Registry) assigned to cluster 338 with distance 0.1120000034570694
---
Registry: REGISTRY: a study by the European Huntington’s Disease Network (EHDN - REGISTRY)
EMA REGISTRY: a study by the European Huntington’s Disease Network (EHDN - REGISTRY) assigned to cluster 26_1 with distance 0.17000000178813934
---
Registry: UK Cystic Fibrosis Registry (UK CF Registry)


 36%|███▋      | 86/237 [00:12<00:21,  7.11it/s]

EMA UK Cystic Fibrosis Registry (UK CF Registry) assigned to cluster solo_12483 with distance 0.09799999743700027
---
Registry: Hepatitis Delta International Network (HDIN) - Patient Registry (HDIN - Hepatitis D)
EMA Hepatitis Delta International Network (HDIN) - Patient Registry (HDIN - Hepatitis D) has no close match within threshold 0.2
Closest match: National Register of Psychiatric Hospitalizations (NRPH) with distance 0.3089999854564667
---
Registry: Galactosemia Patient Registry (GalNet)


 37%|███▋      | 88/237 [00:12<00:19,  7.47it/s]

EMA Galactosemia Patient Registry (GalNet) has no close match within threshold 0.2
Closest match: Neonatal Research Network in Japan (NRNJ) with distance 0.20000000298023224
---
Registry: Research Repository @Fondazione IRCCS Ca' Granda Ospedale Maggiore Policlinico (POLIMI)
EMA Research Repository @Fondazione IRCCS Ca' Granda Ospedale Maggiore Policlinico (POLIMI) has no close match within threshold 0.2
Closest match: Italian Registry for Rare Diseases (IRRD) with distance 0.6230000257492065
---
Registry: IQVIA Medical Research Data Thin - OMOP (IMRD THIN)


 38%|███▊      | 90/237 [00:12<00:19,  7.68it/s]

EMA IQVIA Medical Research Data Thin - OMOP (IMRD THIN) has no close match within threshold 0.2
Closest match: Observational Medical Outcomes Partnership Common Data Model (OMOP) with distance 0.5630000233650208
---
Registry: Shamir - Kineret (SKHDL)
EMA Shamir - Kineret (SKHDL) has no close match within threshold 0.2
Closest match: Baden-Wuerttemberg Register for Coercive Measures (BWRCM) with distance 0.7490000128746033
---
Registry: IRCCS Policlinico San Donato (PSD)


 39%|███▉      | 92/237 [00:13<00:19,  7.59it/s]

EMA IRCCS Policlinico San Donato (PSD) has no close match within threshold 0.2
Closest match: Registry of Glaucoma Outcomes Research (RiGOR) with distance 0.5109999775886536
---
Registry: LRx claims - CZ
EMA LRx claims - CZ has no close match within threshold 0.2
Closest match: UK National Transplant Registry (UKNTR) with distance 0.8730000257492065
---
Registry: Estonian Biobank (EBB)


 39%|███▉      | 93/237 [00:13<00:19,  7.35it/s]

EMA Estonian Biobank (EBB) has no close match within threshold 0.2
Closest match: Estonian Genome Center Biobank (EGC) with distance 0.28700000047683716
---
Registry: Système National des Données de Santé (French national health system main database) (SNDS)


 40%|████      | 95/237 [00:13<00:23,  6.12it/s]

EMA Système National des Données de Santé (French national health system main database) (SNDS) has no close match within threshold 0.2
Closest match: Système National des Données de Santé (SNDS) with distance 0.20000000298023224
---
Registry: Amyloid Imaging to Prevent Alzheimer’s Disease (AMYPAD) Prognostic and Natural History Study (PNHS) (AMYPAD PNHS)
EMA Amyloid Imaging to Prevent Alzheimer’s Disease (AMYPAD) Prognostic and Natural History Study (PNHS) (AMYPAD PNHS) has no close match within threshold 0.2
Closest match: French National Registry of Childhood Hematopoietic Malignancies (NRCH) with distance 0.4480000138282776
---
Registry: Health Search/IQVIA Health Longitudinal Patient Database (LPD)


 41%|████      | 97/237 [00:13<00:21,  6.37it/s]

EMA Health Search/IQVIA Health Longitudinal Patient Database (LPD) assigned to cluster solo_6908 with distance 0.1599999964237213
---
Registry: EFEMERIS (EFEMERIS)
EMA EFEMERIS (EFEMERIS) assigned to cluster 0_401 with distance 0.0
---
Registry: The Information System for Research in Primary Care (SIDIAP) (SIDIAP)


 42%|████▏     | 99/237 [00:14<00:22,  6.17it/s]

EMA The Information System for Research in Primary Care (SIDIAP) (SIDIAP) assigned to cluster 50_5 with distance 0.14399999380111694
---
Registry: ERN RARE-LIVER prospective research registry (R-LIVER rare liver disease registry)
EMA ERN RARE-LIVER prospective research registry (R-LIVER rare liver disease registry) has no close match within threshold 0.2
Closest match: The University of Texas M. D. Anderson Cancer Center's Tumor Registry with distance 0.4099999964237213


 42%|████▏     | 100/237 [00:14<00:20,  6.62it/s]

---
Registry: VieCuri Medical Center - EHR (VMC - EHR)
EMA VieCuri Medical Center - EHR (VMC - EHR) has no close match within threshold 0.2
Closest match: Pediatric Radio Frequency Ablation Registry (PRFAR) with distance 0.6380000114440918
---
Registry: AU EMR Data - OMOP


 43%|████▎     | 102/237 [00:14<00:18,  7.18it/s]

EMA AU EMR Data - OMOP has no close match within threshold 0.2
Closest match: Observational Medical Outcomes Partnership Common Data Model (OMOP) with distance 0.6610000133514404
---
Registry: EUROmediCAT central database (EUROmediCAT)
EMA EUROmediCAT central database (EUROmediCAT) has no close match within threshold 0.2
Closest match: EUROmediCAT (EUROmediCAT) with distance 0.296999990940094
---
Registry: European Register for Multiple Sclerosis (EUReMS)


 44%|████▍     | 104/237 [00:14<00:18,  7.36it/s]

EMA European Register for Multiple Sclerosis (EUReMS) assigned to cluster 1677_1 with distance 0.11400000005960464
---
Registry: POMME (POMME)
EMA POMME (POMME) has no close match within threshold 0.2
Closest match: ABIOMED World Registry (BVS) with distance 0.5899999737739563
---
Registry: Cancer Analysis System (CAS)


 45%|████▍     | 106/237 [00:15<00:17,  7.44it/s]

EMA Cancer Analysis System (CAS) assigned to cluster 0_634 with distance 0.0
---
Registry: TaUH patient cohort (FinOMOP) (FinOMOP_Tampere)
EMA TaUH patient cohort (FinOMOP) (FinOMOP_Tampere) has no close match within threshold 0.2
Closest match: STABILITY with distance 0.4819999933242798
---
Registry: British Society for Rheumatology Biologics Register for Rheumatoid Arthritis (BSRBR-RA)


 46%|████▌     | 108/237 [00:15<00:17,  7.46it/s]

EMA British Society for Rheumatology Biologics Register for Rheumatoid Arthritis (BSRBR-RA) assigned to cluster 772_1_1 with distance 0.0
---
Registry: Casiopea Plus (C+)
EMA Casiopea Plus (C+) has no close match within threshold 0.2
Closest match: CAP2 (Continued Access to PREVAIL registry) (CAP2) with distance 0.6710000038146973
---
Registry: Netherlands Cancer Registry (NCR)


 46%|████▋     | 110/237 [00:15<00:17,  7.29it/s]

EMA Netherlands Cancer Registry (NCR) assigned to cluster 6_29 with distance 0.0
---
Registry: Hospitalizations database (Spain)
EMA Hospitalizations database (Spain) has no close match within threshold 0.2
Closest match: Perinatal Register of South-East Queensland (SEQPR) with distance 0.4320000112056732
---
Registry: RIOJA SALUD (RIOJA SALUD)


 47%|████▋     | 112/237 [00:15<00:17,  7.12it/s]

EMA RIOJA SALUD (RIOJA SALUD) has no close match within threshold 0.2
Closest match: Regional Register of AIDS cases in La Rioja (RRACLR) with distance 0.6039999723434448
---
Registry: Danish Health Data Registries (DKHD)
EMA Danish Health Data Registries (DKHD) assigned to cluster 4596 with distance 0.1979999989271164
---
Registry: Big Data in Healthcare from Aragon (BiGAN)


 48%|████▊     | 114/237 [00:16<00:18,  6.83it/s]

EMA Big Data in Healthcare from Aragon (BiGAN) has no close match within threshold 0.2
Closest match: Public Data Analysis for Health Research and Innovation Program of Catalonia (PADRIS) with distance 0.5299999713897705
---
Registry: Biologika in der Kinderrheumatologie (BIKER)
EMA Biologika in der Kinderrheumatologie (BIKER) assigned to cluster solo_1206 with distance 0.1589999943971634
---
Registry: Cystic Fibrosis Registry of Ireland (Irish CF Registry)


 49%|████▉     | 116/237 [00:16<00:16,  7.23it/s]

EMA Cystic Fibrosis Registry of Ireland (Irish CF Registry) has no close match within threshold 0.2
Closest match: Irish Cystic Fibrosis Registry (ICFR) with distance 0.20800000429153442
---
Registry: Clinical Data Warehouse of the Bordeaux University Hospital (CDWBordeaux)
EMA Clinical Data Warehouse of the Bordeaux University Hospital (CDWBordeaux) has no close match within threshold 0.2
Closest match: Patient registry of ROFlumilast In real LifE (PROFILE) with distance 0.4699999988079071
---
Registry: The Cancer Registry of Norway (CRN)


 50%|████▉     | 118/237 [00:16<00:16,  7.31it/s]

EMA The Cancer Registry of Norway (CRN) assigned to cluster 6_19_1 with distance 0.0
---
Registry: The United Kingdom Facioscapulohumeral Dystrophy Patient Registry (The UK FSHD Patient Registry)
EMA The United Kingdom Facioscapulohumeral Dystrophy Patient Registry (The UK FSHD Patient Registry) has no close match within threshold 0.2
Closest match: French National Registry for Thrombotic Microangiopathies (REA-Microangiopathies) with distance 0.31700000166893005
---
Registry: Semmelweis University Clinical Data (SUCD)


 51%|█████     | 120/237 [00:17<00:15,  7.58it/s]

EMA Semmelweis University Clinical Data (SUCD) has no close match within threshold 0.2
Closest match: Moorfields Genetic Register (MGR) with distance 0.47099998593330383
---
Registry: CENTOGENE Biodatabank in Parkinson Disease (CNTG BDB)
EMA CENTOGENE Biodatabank in Parkinson Disease (CNTG BDB) has no close match within threshold 0.2
Closest match: Chinese SLE Treatment and Research Group (CSTAR) with distance 0.45899999141693115
---
Registry: Belgian Cystic Fibrosis Registry (BCFR)


 51%|█████▏    | 122/237 [00:17<00:15,  7.41it/s]

EMA Belgian Cystic Fibrosis Registry (BCFR) assigned to cluster 1975 with distance 0.0
---
Registry: International Cartilage Regeneration & Joint Preservation Society Patient Registry (ICRS Patient Registry)
EMA International Cartilage Regeneration & Joint Preservation Society Patient Registry (ICRS Patient Registry) assigned to cluster solo_16727 with distance 0.18700000643730164
---
Registry: Disease Analyzer - OMOP (DA France)


 52%|█████▏    | 124/237 [00:17<00:14,  7.68it/s]

EMA Disease Analyzer - OMOP (DA France) has no close match within threshold 0.2
Closest match: Observational Medical Outcomes Partnership Common Data Model (OMOP) with distance 0.5870000123977661
---
Registry: Portugal North Region Cancer Registry (RORENO - Oncology)
EMA Portugal North Region Cancer Registry (RORENO - Oncology) has no close match within threshold 0.2
Closest match: North Region Cancer Registry of Portugal (RORENO) with distance 0.2329999953508377
---
Registry: Hospital District of Helsinki and Uusimaa patient cohort (FinOMOP) (FinOMOP_Helsinki)


 53%|█████▎    | 126/237 [00:17<00:14,  7.56it/s]

EMA Hospital District of Helsinki and Uusimaa patient cohort (FinOMOP) (FinOMOP_Helsinki) has no close match within threshold 0.2
Closest match: French National Reference Laboratory for Blood Groups (CNRGS) with distance 0.36899998784065247
---
Registry: Oslo University Hospital Clinical Data Warehouse (OUHCDW)
EMA Oslo University Hospital Clinical Data Warehouse (OUHCDW) has no close match within threshold 0.2
Closest match: Oslo University Hospital Clinical Database (OUHCD) with distance 0.28700000047683716
---
Registry: Centro Oncologico Modenese (COMNET) EHDEN database (COMNET)


 54%|█████▍    | 128/237 [00:18<00:14,  7.41it/s]

EMA Centro Oncologico Modenese (COMNET) EHDEN database (COMNET) has no close match within threshold 0.2
Closest match: Zhongshan Ophthalmic Center-Brien Holden Vision Institute High Myopia Registry (ZOC-BHVI) with distance 0.5049999952316284
---
Registry: Longitudinal Data Collection from Patients with Spinal Muscular Atrophy (SMArtCARE)
EMA Longitudinal Data Collection from Patients with Spinal Muscular Atrophy (SMArtCARE) has no close match within threshold 0.2
Closest match: NCI-supported Breast Cancer Cooperative Family Registry (BCCFR) with distance 0.42100000381469727
---
Registry: Lääketoimitukset (Kanta - Reseptikeskus)


 54%|█████▍    | 129/237 [00:18<00:14,  7.35it/s]

EMA Lääketoimitukset (Kanta - Reseptikeskus) has no close match within threshold 0.2
Closest match: Kanta database (Kanta) with distance 0.6679999828338623
---
Registry: Other data source (Other DS)


 55%|█████▌    | 131/237 [00:18<00:16,  6.24it/s]

EMA Other data source (Other DS) has no close match within threshold 0.2
Closest match: Uniform Data Set (UDS) with distance 0.6330000162124634
---
Registry: Psoriasis Observatory
EMA Psoriasis Observatory has no close match within threshold 0.2
Closest match: Neoral MOST (MOST) with distance 0.45899999141693115
---
Registry: Ospedale Bambino Gesù Pediatric Oncology Health Data database (OBG-POHD)


 56%|█████▌    | 133/237 [00:19<00:18,  5.75it/s]

EMA Ospedale Bambino Gesù Pediatric Oncology Health Data database (OBG-POHD) has no close match within threshold 0.2
Closest match: Danish Medicinal Product Statistics (DMPS) with distance 0.4000000059604645
---
Registry: Longitudinal Patients Database - OMOP (LPD ITA)
EMA Longitudinal Patients Database - OMOP (LPD ITA) has no close match within threshold 0.2
Closest match: Observational Medical Outcomes Partnership Common Data Model (OMOP) with distance 0.4740000069141388
---
Registry: Central Registry of Rare Diseases (CRRD)


 57%|█████▋    | 135/237 [00:19<00:15,  6.62it/s]

EMA Central Registry of Rare Diseases (CRRD) has no close match within threshold 0.2
Closest match: Registry of the Centre for Rare Disorders (CRD) with distance 0.3490000069141388
---
Registry: Unidade Local de Saúde de Matosinhos (ULSM)
EMA Unidade Local de Saúde de Matosinhos (ULSM) has no close match within threshold 0.2
Closest match: TREAT-NMD Duchenne Muscular Dystrophy Registries (TREAT-NMD) with distance 0.6259999871253967
---
Registry: Danny Platform (Danny Platform)


 58%|█████▊    | 137/237 [00:19<00:14,  6.96it/s]

EMA Danny Platform (Danny Platform) has no close match within threshold 0.2
Closest match: Canadian Network for Observational Drug Effect Studies (CNODES) with distance 0.7070000171661377
---
Registry: InGef Research Database (InGef RDB)
EMA InGef Research Database (InGef RDB) has no close match within threshold 0.2
Closest match: Finnish ELBW Infant Register (FELBWIR) with distance 0.5649999976158142
---
Registry: Odense Pharmacoepidemiological Database (OPED)


 59%|█████▊    | 139/237 [00:19<00:13,  7.08it/s]

EMA Odense Pharmacoepidemiological Database (OPED) assigned to cluster 6_176 with distance 0.0
---
Registry: Optimum Patient Care Research Database Australia (OPCRDA) (OPCRDA)
EMA Optimum Patient Care Research Database Australia (OPCRDA) (OPCRDA) has no close match within threshold 0.2
Closest match: Optimum Patient Care Research Database (OPCRD) with distance 0.43799999356269836
---
Registry: Translational Research in Europe - Assessment and Treatment of Neuromuscular Diseases (TREAT-NMD)


 59%|█████▉    | 141/237 [00:20<00:13,  6.92it/s]

EMA Translational Research in Europe - Assessment and Treatment of Neuromuscular Diseases (TREAT-NMD) assigned to cluster 997 with distance 0.09300000220537186
---
Registry: Medical University of Vienna data source (MUV)
EMA Medical University of Vienna data source (MUV) has no close match within threshold 0.2
Closest match: Medical University of Vienna Heart Failure Registry (MUVHFR) with distance 0.515999972820282
---
Registry: European Rare Kidney Disease Registry (ERKReg)


 60%|██████    | 143/237 [00:20<00:13,  7.06it/s]

EMA European Rare Kidney Disease Registry (ERKReg) assigned to cluster solo_15621 with distance 0.1420000046491623
---
Registry: RITA-MAITT (RITA-MAITT)
EMA RITA-MAITT (RITA-MAITT) has no close match within threshold 0.2
Closest match: Alpha-1 Antitrypsin Deficiency U.K. Registry (AATD U.K. Registry) with distance 0.5609999895095825
---
Registry: Health Data Research Platform of the Balearic Islands (PRISIB) (PRISIB)


 61%|██████    | 145/237 [00:20<00:13,  6.60it/s]

EMA Health Data Research Platform of the Balearic Islands (PRISIB) (PRISIB) has no close match within threshold 0.2
Closest match: Public Data Analysis for Health Research and Innovation Program of Catalonia (PADRIS) with distance 0.46000000834465027
---
Registry: CUF Database (CUF DB)
EMA CUF Database (CUF DB) has no close match within threshold 0.2
Closest match: Consortium for the Evaluation of African-Americans with Early Rheumatoid Arthritis (CLEAR) with distance 0.5659999847412109
---
Registry: Belgian Neuromuscular Diseases Registry - Spinal Muscular Atrophy (BNMDR-SMA)


 62%|██████▏   | 147/237 [00:21<00:12,  6.93it/s]

EMA Belgian Neuromuscular Diseases Registry - Spinal Muscular Atrophy (BNMDR-SMA) has no close match within threshold 0.2
Closest match: Belgian Neuromuscular Disease Registry (BNMDR) with distance 0.31200000643730164
---
Registry: TREAT NL/BE registry (TREatment of Atopic eczema, the Netherlands and Belgium) (TREAT NL/BE registry)
EMA TREAT NL/BE registry (TREatment of Atopic eczema, the Netherlands and Belgium) (TREAT NL/BE registry) has no close match within threshold 0.2
Closest match: TREatment of ATopic eczema, the Netherlands (TREAT NL) with distance 0.2879999876022339
---
Registry: Hospital Universitario Virgen Macarena EHR (HUVM)


 63%|██████▎   | 149/237 [00:21<00:12,  7.32it/s]

EMA Hospital Universitario Virgen Macarena EHR (HUVM) has no close match within threshold 0.2
Closest match: Helsinki University Hospital Medical Records (HUHMR) with distance 0.5410000085830688
---
Registry: EULAR COVID-19 Registry
EMA EULAR COVID-19 Registry has no close match within threshold 0.2
Closest match: EULAR COVID-19 register (EULAR) with distance 0.22100000083446503
---
Registry: European network of population-based registries for the epidemiological surveillance of congenital anomalies (EUROCAT)


 64%|██████▎   | 151/237 [00:21<00:11,  7.33it/s]

EMA European network of population-based registries for the epidemiological surveillance of congenital anomalies (EUROCAT) assigned to cluster 6_74_2_1_1 with distance 0.0
---
Registry: Galilee - Kineret (GKHDL)
EMA Galilee - Kineret (GKHDL) has no close match within threshold 0.2
Closest match: German National Hospital Discharge Registry (GNHDR) with distance 0.7440000176429749
---
Registry: Initiative for quality improvement and epidemiology among children and adolescents with diabetes (IQECAD)


 65%|██████▍   | 153/237 [00:21<00:11,  7.30it/s]

EMA Initiative for quality improvement and epidemiology among children and adolescents with diabetes (IQECAD) has no close match within threshold 0.2
Closest match: British Columbia Burn Registry (BCBR) with distance 0.38600000739097595
---
Registry: E-health medical database (EMD)
EMA E-health medical database (EMD) has no close match within threshold 0.2
Closest match: European Reference Networks for Rare Endocrine Disorders (Endo-ERN) with distance 0.41200000047683716
---
Registry: SAIL Databank (SAIL)


 65%|██████▌   | 155/237 [00:22<00:11,  7.32it/s]

EMA SAIL Databank (SAIL) has no close match within threshold 0.2
Closest match: Secured Anonymised Information Linkage Databank (SAIL) with distance 0.44699999690055847
---
Registry: EpiChron Cohort
EMA EpiChron Cohort has no close match within threshold 0.2
Closest match: EpiCom Inception Cohort (EpiCom) with distance 0.47099998593330383
---
Registry: The United Kingdom National Registry for Myotonic Dystrophy (UK DM Patient Registry)


 66%|██████▌   | 157/237 [00:22<00:11,  6.98it/s]

EMA The United Kingdom National Registry for Myotonic Dystrophy (UK DM Patient Registry) has no close match within threshold 0.2
Closest match: UK Myotonic Dystrophy Patient Registry (UKDM) with distance 0.20999999344348907
---
Registry: Egas Moniz Database (EMDB)
EMA Egas Moniz Database (EMDB) has no close match within threshold 0.2
Closest match: European Database for Multiple Sclerosis (EDMUS) with distance 0.550000011920929
---
Registry: Pharmacovigilance Program from Laboratory Signals at La Paz University Hospital (PPLS-LPUH)


 67%|██████▋   | 158/237 [00:22<00:11,  7.05it/s]

EMA Pharmacovigilance Program from Laboratory Signals at La Paz University Hospital (PPLS-LPUH) has no close match within threshold 0.2
Closest match: Spanish Pharmacovigilance Database of Adverse Drug Reactions (FEDRA) with distance 0.42399999499320984
---
Registry: IQVIA CTcue NL Network (CTcue NL)


 68%|██████▊   | 160/237 [00:22<00:11,  6.52it/s]

EMA IQVIA CTcue NL Network (CTcue NL) has no close match within threshold 0.2
Closest match: Italian Haemophilia Register (IHR) with distance 0.6240000128746033
---
Registry: Servicio Cántabro de Salud and IDIVAL (SCIVAL)
EMA Servicio Cántabro de Salud and IDIVAL (SCIVAL) has no close match within threshold 0.2
Closest match: Khorasan Razavi Multiple Sclerosis Registry (KRMSR) with distance 0.5370000004768372
---
Registry: University Medicine, University Hospital Carl Gustav Carus Dresden (UM Dresden)


 68%|██████▊   | 162/237 [00:23<00:10,  6.97it/s]

EMA University Medicine, University Hospital Carl Gustav Carus Dresden (UM Dresden) has no close match within threshold 0.2
Closest match: Clinical Cancer Registry Dresden (CCRD) with distance 0.5820000171661377
---
Registry: AIFA Monitoring Registries Platform (wMRs)
EMA AIFA Monitoring Registries Platform (wMRs) has no close match within threshold 0.2
Closest match: Serbian Vascular Registry (SerbVasc) with distance 0.3179999887943268
---
Registry: PHARMACOVIGILANCE IN JUVENILE IDIOPATHIC ARTHRITIS PATIENTS (PHARMACHILD) TREATED WITH BIOLOGIC AGENTS AND/OR METHOTREXATE. (Pharmachild)


 69%|██████▉   | 164/237 [00:23<00:10,  6.88it/s]

EMA PHARMACOVIGILANCE IN JUVENILE IDIOPATHIC ARTHRITIS PATIENTS (PHARMACHILD) TREATED WITH BIOLOGIC AGENTS AND/OR METHOTREXATE. (Pharmachild) has no close match within threshold 0.2
Closest match: Hearts in Rhythm Organization Registry (HiRO) with distance 0.4129999876022339
---
Registry: Ukrainian Lymphoma Registry (ULR)
EMA Ukrainian Lymphoma Registry (ULR) assigned to cluster 0_1_515 with distance 0.0
---
Registry: Save Sight Registries (SSR)


 70%|███████   | 166/237 [00:23<00:10,  7.00it/s]

EMA Save Sight Registries (SSR) assigned to cluster solo_7485 with distance 0.0
---
Registry: The PRES European Network of Registries for Autoinflammatory Diseases in Childhood (EUROFEVER)
EMA The PRES European Network of Registries for Autoinflammatory Diseases in Childhood (EUROFEVER) has no close match within threshold 0.2
Closest match: World Health Organization International Clinical Trials Registry (WHO ICTR) with distance 0.3930000066757202
---
Registry: Healthcare Emergency Information System (HEIS)


 71%|███████   | 168/237 [00:24<00:09,  6.97it/s]

EMA Healthcare Emergency Information System (HEIS) has no close match within threshold 0.2
Closest match: Swansea IBD Registry (SICSI) with distance 0.5040000081062317
---
Registry: IQVIA Disease Analyzer Germany (IQVIA DA Germany)
EMA IQVIA Disease Analyzer Germany (IQVIA DA Germany) has no close match within threshold 0.2
Closest match: GENEVA (GENEVA) with distance 0.5770000219345093
---
Registry: Global Fukutin-Related Protein Registry (Global FKRP Registry - LGMDR9)


 71%|███████▏  | 169/237 [00:24<00:09,  7.03it/s]

EMA Global Fukutin-Related Protein Registry (Global FKRP Registry - LGMDR9) has no close match within threshold 0.2
Closest match: Inkosi Albert Luthuli Central Hospital Colorectal Cancer Registry (IALCH CRC Registry) with distance 0.3840000033378601
---
Registry: Rheumatoid Arthritis - Observation of Biologic Therapies (RABBIT)


 72%|███████▏  | 170/237 [00:24<00:11,  5.92it/s]

EMA Rheumatoid Arthritis - Observation of Biologic Therapies (RABBIT) assigned to cluster 0_1133 with distance 0.17499999701976776
---
Registry: Registo Nacional de Doentes Reumáticos (Reuma.pt)


 73%|███████▎  | 172/237 [00:24<00:10,  5.96it/s]

EMA Registo Nacional de Doentes Reumáticos (Reuma.pt) has no close match within threshold 0.2
Closest match: Head Injury Registry in Taiwan (HIRT) with distance 0.31299999356269836
---
Registry: German Cystic fibrosis registry - Muko.web (German CF Registry)
EMA German Cystic fibrosis registry - Muko.web (German CF Registry) has no close match within threshold 0.2
Closest match: German Cystic Fibrosis Registry (GCR) with distance 0.40799999237060547
---
Registry: The European HBV Registry - A joint initiative of TherVacB and DZIF (HBV Registry)


 73%|███████▎  | 174/237 [00:25<00:10,  5.89it/s]

EMA The European HBV Registry - A joint initiative of TherVacB and DZIF (HBV Registry) has no close match within threshold 0.2
Closest match: HBV Diagnostic Registry (HBVDR) with distance 0.43799999356269836
---
Registry: South East Scoltand Cancer Database (DataLoch) (SESCD / DataLoch)
EMA South East Scoltand Cancer Database (DataLoch) (SESCD / DataLoch) has no close match within threshold 0.2
Closest match: Scottish Cancer Register (SCR) with distance 0.460999995470047
---
Registry: Genomics England (GEL) (GEL)


 74%|███████▍  | 176/237 [00:25<00:09,  6.33it/s]

EMA Genomics England (GEL) (GEL) has no close match within threshold 0.2
Closest match: GEL Registry (GEL) with distance 0.5429999828338623
---
Registry: UK National Neonatal Research Database (NNRD)
EMA UK National Neonatal Research Database (NNRD) assigned to cluster solo_12903 with distance 0.13199999928474426
---
Registry: European patient registry on TRAPS syndrome (EUROTRAPS - Autoinflammatory diseases)


 75%|███████▌  | 178/237 [00:25<00:08,  6.59it/s]

EMA European patient registry on TRAPS syndrome (EUROTRAPS - Autoinflammatory diseases) has no close match within threshold 0.2
Closest match: World Health Organization International Clinical Trials Registry (WHO ICTR) with distance 0.44200000166893005
---
Registry: Longitudinal Patient Data - France (LPD FRA)
EMA Longitudinal Patient Data - France (LPD FRA) has no close match within threshold 0.2
Closest match: Health Search/IQVIA Health LPD Longitudinal Patient Database (Health Search/IQVIA LPD) with distance 0.4860000014305115
---
Registry: Drug claims information system (PHARM)


 76%|███████▌  | 180/237 [00:25<00:08,  6.91it/s]

EMA Drug claims information system (PHARM) has no close match within threshold 0.2
Closest match: Ambillikai Registry (AR) with distance 0.492000013589859
---
Registry: Mortality Information System (MIS)
EMA Mortality Information System (MIS) has no close match within threshold 0.2
Closest match: Mortality Information System (SIM) with distance 0.38499999046325684
---
Registry: Caserta claims database (Caserta database)


 77%|███████▋  | 182/237 [00:26<00:07,  6.88it/s]

EMA Caserta claims database (Caserta database) has no close match within threshold 0.2
Closest match: Malmö Myocardial Infarction Registry (MMIR) with distance 0.6060000061988831
---
Registry: Secured Access to innovative medicines for CHildren, adolescents and young adults with cAncer (SACHA France)
EMA Secured Access to innovative medicines for CHildren, adolescents and young adults with cAncer (SACHA France) has no close match within threshold 0.2
Closest match: French National Childhood Cancer Registry (SFCE) with distance 0.4659999907016754
---
Registry: European Porphyria Registry (EPR)


 78%|███████▊  | 184/237 [00:26<00:08,  6.57it/s]

EMA European Porphyria Registry (EPR) has no close match within threshold 0.2
Closest match: BELIEVE SVT Registry (BELIEVE SVT) with distance 0.375
---
Registry: The United Kingdom Spinal Muscular Atrophy Patient Registry (UK SMA Patient Registry)
EMA The United Kingdom Spinal Muscular Atrophy Patient Registry (UK SMA Patient Registry) has no close match within threshold 0.2
Closest match: National Health Service Central Registers (NHSCR) with distance 0.34299999475479126
---
Registry: The World Federation of Hemophilia Gene Therapy Registry (The WFH Gene Therapy Registry)


 78%|███████▊  | 186/237 [00:26<00:07,  7.00it/s]

EMA The World Federation of Hemophilia Gene Therapy Registry (The WFH Gene Therapy Registry) has no close match within threshold 0.2
Closest match: World Federation of Haemophilia Registries (WFH) with distance 0.3790000081062317
---
Registry: Team Gesundheit GKV Claims data (GKV, SHI)
EMA Team Gesundheit GKV Claims data (GKV, SHI) has no close match within threshold 0.2
Closest match: German Statutory Health Insurance Registry (GSHIR) with distance 0.5870000123977661
---
Registry: UK Biobank (UKB)


 79%|███████▉  | 187/237 [00:26<00:07,  6.67it/s]

EMA UK Biobank (UKB) assigned to cluster 627 with distance 0.11400000005960464
---
Registry: Advancing the Patient Experience in Chronic Obstructive Pulmonary Disease (COPD) Registry (APEX COPD Registry)


 79%|███████▉  | 188/237 [00:27<00:09,  5.17it/s]

EMA Advancing the Patient Experience in Chronic Obstructive Pulmonary Disease (COPD) Registry (APEX COPD Registry) assigned to cluster 97_3 with distance 0.1340000033378601
---
Registry: Collaboration on Quality Improvement Initiative for Achieving Excellence in Standards of COPD Care (CONQUEST - COPD)


 80%|████████  | 190/237 [00:27<00:09,  4.91it/s]

EMA Collaboration on Quality Improvement Initiative for Achieving Excellence in Standards of COPD Care (CONQUEST - COPD) has no close match within threshold 0.2
Closest match: Clinical Oncology Data Integration project (CODI) with distance 0.42500001192092896
---
Registry: Initiative for Quality improvement and Epidemiology in Multidisciplinary Diabetic Foot Clinics (IQED-Foot)
EMA Initiative for Quality improvement and Epidemiology in Multidisciplinary Diabetic Foot Clinics (IQED-Foot) has no close match within threshold 0.2
Closest match: Westmead Hospital's Foot Wound Clinic Registry (WHFWCR) with distance 0.4970000088214874
---
Registry: Poriya - Kineret (PKHDL)


 81%|████████  | 192/237 [00:28<00:07,  5.81it/s]

EMA Poriya - Kineret (PKHDL) has no close match within threshold 0.2
Closest match: Duke Cancer Institute Patient Care Monitor (PCM) with distance 0.6570000052452087
---
Registry: BIFAP - Base de Datos para la Investigación Farmacoepidemiológica en el Ámbito Público (Pharmacoepidemiological Research Database for Public Health Systems) (BIFAP)
EMA BIFAP - Base de Datos para la Investigación Farmacoepidemiológica en el Ámbito Público (Pharmacoepidemiological Research Database for Public Health Systems) (BIFAP) has no close match within threshold 0.2
Closest match: Registry to Evaluate Early and Long-term PAH Management (REVEAL) with distance 0.25200000405311584
---
Registry: European registry and network for intoxication type metabolic diseases (E-IMD)


 82%|████████▏ | 194/237 [00:28<00:06,  6.56it/s]

EMA European registry and network for intoxication type metabolic diseases (E-IMD) assigned to cluster 497_1 with distance 0.11500000208616257
---
Registry: Lung Cancer Focus DATASET (LUCAS)
EMA Lung Cancer Focus DATASET (LUCAS) has no close match within threshold 0.2
Closest match: LUCAS Lung Cancer Clinical Registry (LUCAS) with distance 0.3919999897480011
---
Registry: Clinical Hospital Center Zvezdara - Heliant (CHCZ)


 83%|████████▎ | 196/237 [00:28<00:06,  6.23it/s]

EMA Clinical Hospital Center Zvezdara - Heliant (CHCZ) has no close match within threshold 0.2
Closest match: Central Health Information System of the Republic of Croatia (CEZIH) with distance 0.593999981880188
---
Registry: Rhekiss: Rheuma - Kinderwunsch und Schwangerschaft (Rhekiss)
EMA Rhekiss: Rheuma - Kinderwunsch und Schwangerschaft (Rhekiss) has no close match within threshold 0.2
Closest match: German Pregnancy Register Rhekiss (Rhekiss) with distance 0.5929999947547913
---
Registry: MS-Register of the National MS-Society of Germany (DMSG, Bundesverband e.V.) (German MS-Register)


 84%|████████▎ | 198/237 [00:28<00:05,  6.85it/s]

EMA MS-Register of the National MS-Society of Germany (DMSG, Bundesverband e.V.) (German MS-Register) has no close match within threshold 0.2
Closest match: Global Research on Acute Conditions Team (GREAT) with distance 0.28299999237060547
---
Registry: CDM – Charge Detail Master - OMOP (CDM)
EMA CDM – Charge Detail Master - OMOP (CDM) has no close match within threshold 0.2
Closest match: Optum Clinformatics® Data Mart (CDM) with distance 0.5120000243186951
---
Registry: Federated Hospital Data Network (FHDN)


 84%|████████▍ | 199/237 [00:29<00:05,  7.05it/s]

EMA Federated Hospital Data Network (FHDN) has no close match within threshold 0.2
Closest match: National Healthcare Data System (NHDS) with distance 0.5040000081062317
---
Registry: Barzilai - Kineret (BKHDL)


 85%|████████▍ | 201/237 [00:29<00:05,  6.02it/s]

EMA Barzilai - Kineret (BKHDL) has no close match within threshold 0.2
Closest match: National Registry of Home Enteral Nutrition (NADYASENPE) with distance 0.6729999780654907
---
Registry: Auria Clinical Informatics (FinOMOP) (ACI)
EMA Auria Clinical Informatics (FinOMOP) (ACI) has no close match within threshold 0.2
Closest match: Survey Mirror Slovakia (SMS) with distance 0.5699999928474426
---
Registry: Chronic Toxicities Related to Treatment in Patients With Localized Cancer (CANTO) (CANTO)


 86%|████████▌ | 203/237 [00:29<00:05,  6.60it/s]

EMA Chronic Toxicities Related to Treatment in Patients With Localized Cancer (CANTO) (CANTO) has no close match within threshold 0.2
Closest match: Leipzig Prospective Vascular Ultrasound Registry (LPVUR) with distance 0.3529999852180481
---
Registry: AZ Oostende (previously: AZ Damiaan) (AZ Oostende)
EMA AZ Oostende (previously: AZ Damiaan) (AZ Oostende) has no close match within threshold 0.2
Closest match: Antwerp Cancer Registry (ACR) with distance 0.6949999928474426
---
Registry: Actionable Real-world evidence network (ARWEN)


 86%|████████▋ | 205/237 [00:30<00:05,  6.23it/s]

EMA Actionable Real-world evidence network (ARWEN) has no close match within threshold 0.2
Closest match: ART Italian Registry (ART-IR) with distance 0.5370000004768372
---
Registry: Neurage-DB (Neurage-DB)
EMA Neurage-DB (Neurage-DB) has no close match within threshold 0.2
Closest match: EUROCAT (European Surveillance of Congenital Anomalies) (EUROCAT) with distance 0.5809999704360962
---
Registry: BIG-PAC (BIG-PAC)


 87%|████████▋ | 207/237 [00:30<00:04,  6.24it/s]

EMA BIG-PAC (BIG-PAC) has no close match within threshold 0.2
Closest match: BIG-register (BIG) with distance 0.5839999914169312
---
Registry: Prostate Cancer Observatory
EMA Prostate Cancer Observatory has no close match within threshold 0.2
Closest match: Genetic and Imaging of Familial Hypercholesterolemia in Han Nationality Study (GIFH) with distance 0.47099998593330383
---
Registry: Danish registries (access/analysis) (Danish registries (access/analysis))


 88%|████████▊ | 209/237 [00:30<00:04,  6.71it/s]

EMA Danish registries (access/analysis) (Danish registries (access/analysis)) has no close match within threshold 0.2
Closest match: Danish Health Registries with distance 0.4059999883174896
---
Registry: RABBIT-SpA: Disease register for axial spondyloarthritis and psoriatic arthritis (RABBIT-SpA)
EMA RABBIT-SpA: Disease register for axial spondyloarthritis and psoriatic arthritis (RABBIT-SpA) has no close match within threshold 0.2
Closest match: Prostate Testing for Cancer and Treatment Study (ProtecT) with distance 0.3089999854564667
---
Registry: Fin-CARING2 (Fin-CARING2)


 89%|████████▉ | 211/237 [00:30<00:03,  6.66it/s]

EMA Fin-CARING2 (Fin-CARING2) has no close match within threshold 0.2
Closest match: National CJD Surveillance Unit Register (NCJDSU) with distance 0.5699999928474426
---
Registry: Clinical Practice Research Datalink (CPRD)
EMA Clinical Practice Research Datalink (CPRD) assigned to cluster 70_1 with distance 0.0
---
Registry: aggregate Gargano Mortality Study (aGMS)


 90%|████████▉ | 213/237 [00:31<00:03,  6.90it/s]

EMA aggregate Gargano Mortality Study (aGMS) has no close match within threshold 0.2
Closest match: UK General Mortality Register (GMR) with distance 0.574999988079071
---
Registry: Optimum Patient Care Research Database (OPCRD)
EMA Optimum Patient Care Research Database (OPCRD) assigned to cluster solo_8426 with distance 0.0
---
Registry: EURAP International Registry of Antiepileptic Drugs and Pregnancy (EURAP)


 91%|█████████ | 215/237 [00:31<00:03,  6.61it/s]

EMA EURAP International Registry of Antiepileptic Drugs and Pregnancy (EURAP) assigned to cluster 673_1 with distance 0.16200000047683716
---
Registry: Polish Hospital Claims Database (HCD)
EMA Polish Hospital Claims Database (HCD) has no close match within threshold 0.2
Closest match: European Blood and Marrow Transplantation Group (EBMT) with distance 0.45100000500679016
---
Registry: Dutch Haemophilia Registry (HemoNED - Haemophilia)


 92%|█████████▏| 217/237 [00:31<00:02,  7.12it/s]

EMA Dutch Haemophilia Registry (HemoNED - Haemophilia) has no close match within threshold 0.2
Closest match: National Hemophilia Registry of the Netherlands (NHRN) with distance 0.3610000014305115
---
Registry: Specialist Cohort Event Monitoring (SCEM)
EMA Specialist Cohort Event Monitoring (SCEM) has no close match within threshold 0.2
Closest match: Registry for chronic radiation syndrome (CRS Registry) with distance 0.6169999837875366
---
Registry: ITCC International Data Integration Platform (ITCC-IDIP)


 92%|█████████▏| 219/237 [00:32<00:02,  7.51it/s]

EMA ITCC International Data Integration Platform (ITCC-IDIP) has no close match within threshold 0.2
Closest match: EARLY-MYO trial (EARLY-MYO) with distance 0.4970000088214874
---
Registry: University Clinical Center of Serbia - Heliant (UCCS)
EMA University Clinical Center of Serbia - Heliant (UCCS) has no close match within threshold 0.2
Closest match: Treatment Registry of the Clinic of Neurology, University Clinical Center of Serbia (TRCN-UCCS) with distance 0.6460000276565552
---
Registry: Comparative, Prospective Registry of Newly Initiated Therapies for Pulmonary Hypertension (COMPERA)


 93%|█████████▎| 221/237 [00:32<00:02,  7.23it/s]

EMA Comparative, Prospective Registry of Newly Initiated Therapies for Pulmonary Hypertension (COMPERA) assigned to cluster 534_1 with distance 0.0
---
Registry: International Severe Asthma Registry (ISAR)
EMA International Severe Asthma Registry (ISAR) assigned to cluster solo_138 with distance 0.0
---
Registry: Multiple Sclerosis Centre of Catalonia (Cemcat)


 94%|█████████▍| 223/237 [00:32<00:01,  7.25it/s]

EMA Multiple Sclerosis Centre of Catalonia (Cemcat) has no close match within threshold 0.2
Closest match: South Carolina Vascular Surgical Society Registry (SCVSSR) with distance 0.37599998712539673
---
Registry: Sweden National Cancer Register / Cancerregistret (NCR)
EMA Sweden National Cancer Register / Cancerregistret (NCR) assigned to cluster 6_2_4_2 with distance 0.1599999964237213
---
Registry: Norwegian Porphyria Registry (Norwegian Porphyria Registry)


 95%|█████████▍| 225/237 [00:32<00:01,  6.71it/s]

EMA Norwegian Porphyria Registry (Norwegian Porphyria Registry) assigned to cluster solo_13042 with distance 0.17599999904632568
---
Registry: Medicines Intelligence Data Platform (MedIntel)
EMA Medicines Intelligence Data Platform (MedIntel) has no close match within threshold 0.2
Closest match: Sundhedsstyrelsens Centrale Odontologiske Register (SCOR) with distance 0.5640000104904175
---
Registry: SABI (SABI)


 96%|█████████▌| 227/237 [00:33<00:01,  7.24it/s]

EMA SABI (SABI) has no close match within threshold 0.2
Closest match: ORBI (ORBI) with distance 0.6349999904632568
---
Registry: Integrated Primary Care Information (IPCI) (IPCI)
EMA Integrated Primary Care Information (IPCI) (IPCI) has no close match within threshold 0.2
Closest match: Maccabi Health Services Cancer and Osteoporosis Registries (MHS) with distance 0.49000000953674316
---
Registry: Nivel Primary Care Database (Nivel-PCD)


 97%|█████████▋| 229/237 [00:33<00:01,  6.69it/s]

EMA Nivel Primary Care Database (Nivel-PCD) assigned to cluster 840 with distance 0.0
---
Registry: Hospital Information System (HIS)
EMA Hospital Information System (HIS) has no close match within threshold 0.2
Closest match: Illustration of the Management and Prognosis of Japanese Patients with CS registry (IMP-JCS) with distance 0.42399999499320984
---
Registry: Belgian Neuromuscular Diseases Registry (BNMDR)


 97%|█████████▋| 230/237 [00:33<00:01,  6.75it/s]

EMA Belgian Neuromuscular Diseases Registry (BNMDR) assigned to cluster 4449 with distance 0.03799999877810478
---
Registry: Casa di Cura Igea OMOP CDM (CCI - CDM)


 98%|█████████▊| 232/237 [00:34<00:00,  6.30it/s]

EMA Casa di Cura Igea OMOP CDM (CCI - CDM) has no close match within threshold 0.2
Closest match: Maccabi Healthcare Services Cancer Registry (MHS Cancer Registry) with distance 0.6830000281333923
---
Registry: ARS Toscana (ARS)
EMA ARS Toscana (ARS) has no close match within threshold 0.2
Closest match: Malmö Myocardial Infarction Registry (MMIR) with distance 0.5350000262260437
---
Registry: Biobanco-iMM (BB-iMM)


 99%|█████████▊| 234/237 [00:34<00:00,  6.74it/s]

EMA Biobanco-iMM (BB-iMM) has no close match within threshold 0.2
Closest match: Bank of Biological Materials (BBM) with distance 0.4970000088214874
---
Registry: Terveydenhuollon hoitoilmoitusrekisteri (Finland Care Register for Health Care) (Terveys-Hilmo)
EMA Terveydenhuollon hoitoilmoitusrekisteri (Finland Care Register for Health Care) (Terveys-Hilmo) has no close match within threshold 0.2
Closest match: Patient Registries at Slone: MDS (PRS:MDS) with distance 0.38100001215934753
---
Registry: IADB.nl (IADB)


100%|█████████▉| 236/237 [00:34<00:00,  7.25it/s]

EMA IADB.nl (IADB) has no close match within threshold 0.2
Closest match: Haute-Garonne Cancer Registry (FR-HGA) with distance 0.5799999833106995
---
Registry: French National Registry of Rare Diseases (Banque Nationale de Données Maladies Rares) (BNDMR)
EMA French National Registry of Rare Diseases (Banque Nationale de Données Maladies Rares) (BNDMR) has no close match within threshold 0.2
Closest match: French National Rare Disease Registry (BNDMR) with distance 0.21699999272823334
---
Registry: HTI – Hospital Treatment Insights (HTI)


100%|██████████| 237/237 [00:34<00:00,  6.84it/s]

EMA HTI – Hospital Treatment Insights (HTI) has no close match within threshold 0.2
Closest match: French TTP Registry (TTP) with distance 0.6840000152587891





Unnamed: 0,ema_object_id,ema_full_name,assigned_cluster,distance_to_closest,closest,closest_nb_occ,total_aliases,total_occurrences,N1_alias,N1_alias_nb_occ
0,028ac269-7f7f-56d3-8dd3-0bd93913d864,Clinical Practice Research Datalink (CPRD) GOL...,70_3,0.118468,Clinical Practice Research Datalink GOLD (CPRD...,11,2,12,Clinical Practice Research Datalink GOLD (CPRD...,11.0
1,0446b0c0-0bec-5999-8c59-f0e4107dabc3,Cancer Registry of Instituto Português de Onco...,,0.294426,The Nationwide Danish 1905 Cohort Study (1905 ...,1,0,0,,
2,060d06a8-0a14-5b8b-9fe4-882f08412d7f,European Registry of Patients with McArdle dis...,3354,0.145912,European registry for patients with McArdle di...,2,2,3,European registry for patients with McArdle di...,2.0
3,060ede1e-35d6-5dab-9e1e-d7e1e2f78c2a,The UK-Irish Atopic Eczema Systemic Therapy Re...,0_1_24,0.105932,International Network of Obstetric Survey Syst...,1,3,4,Italian Obstetric Surveillance System (ItOSS),2.0
4,08920b82-907a-5d03-828e-f76f7b3c735e,syndena GmbH (former OncoTyrol) (syndena GmbH),,0.598728,Cancer Registry of Tyrol (CRT),5,0,0,,


## 5. Save Results
Save the results DataFrame to Excel for further analysis.

In [10]:
results_df.to_excel(ema_prediction_results_xlsx, index=False)
print(f'Results saved to {ema_prediction_results_xlsx}')

Results saved to data/W02/R03_evaluate_model_performance_on_ema_registries/new_eucl_0.2/ema_prediction_results.xlsx


## 6. Analysis & Metrics

In [12]:
# reload results
exp_name='new_eucl_0.2'
ema_prediction_results_xlsx=f"data/W02/R03_evaluate_model_performance_on_ema_registries/{exp_name}/ema_prediction_results.xlsx"
results_df = pd.read_excel(ema_prediction_results_xlsx)

In [15]:
# Identify noise clusters: '0' or ending with '_0'
def is_noise(cluster):
    return str(cluster) == '0' or str(cluster).endswith('_0')

# Compute transformation rate: how many EMA registries were transformed into clusters (assigned_cluster Not None)
transformed_count = results_df['assigned_cluster'].notna().sum()
print(f"Transformation rate: {transformed_count}/ {results_df.shape[0]} ({transformed_count/results_df.shape[0]*100:.2f}%)")

# Number of distinct clusters with at least one EMA registry
distinct_clusters = results_df['assigned_cluster'].dropna().unique()
print(f"Distinct clusters with at least one EMA registry: {len(distinct_clusters)}")

# Clusters with multiple EMA registries
multiple_ema_clusters = results_df.groupby('assigned_cluster').filter(lambda x: len(x) > 1)
print(f"Clusters with multiple EMA registries: {multiple_ema_clusters['assigned_cluster'].nunique()}")
display(multiple_ema_clusters[['assigned_cluster']])

# Number of Aliases per transformed EMA registry (set to int)
results_df['total_aliases'] = results_df['total_aliases'].fillna(0).astype(int)
# Number of occurrences per transformed EMA registry (set to int)
results_df['total_occurrences'] = results_df['total_occurrences'].fillna(0).astype(int)
print(f"Mean Number of aliases per transformed EMA registry: {results_df['total_aliases'].mean():.2f}")
print(f"Mean Number of occurrences per transformed EMA registry: {results_df['total_occurrences'].mean():.2f}")

Transformation rate: 51/ 237 (21.52%)
Distinct clusters with at least one EMA registry: 51
Clusters with multiple EMA registries: 0


Unnamed: 0,assigned_cluster


Mean Number of aliases per transformed EMA registry: 0.95
Mean Number of occurrences per transformed EMA registry: 14.78


In [14]:
# display the 10 first rows of transformed clusters ranked by total_aliases
top_clusters = results_df[results_df['assigned_cluster'].notna()].sort_values('total_aliases', ascending=False).head(10)
print("Top 10 clusters by number of aliases:")
display(top_clusters[['assigned_cluster', 'total_aliases', 'total_occurrences']])

Top 10 clusters by number of aliases:


Unnamed: 0,assigned_cluster,total_aliases,total_occurrences
149,6_74_2_1_1,17,115
116,6_19_1,15,605
108,6_29,14,1076
106,772_1_1,14,106
213,673_1,14,49
222,6_2_4_2,12,122
51,69_1,12,165
63,1636_1,9,23
83,26_1,8,70
6,235_2,8,63
