In [95]:
import os
working_dir = "/home/gpinon/more_europa/clean_rdc_experiments/projects/P04_official_reg_db_creation"
os.chdir(working_dir)
print(f"Changed working directory to {working_dir}")
import logging
import time
import pandas as pd
import json
from pathlib import Path
from dotenv import load_dotenv
import pycountry

from src.p04_official_reg_db_creation import config
import llm_backends
from llm_backends.cache import DiskCacheStorage
from llm_backends.mistral import dummy_config
from llm_backends.openai import dummy_config

Changed working directory to /home/gpinon/more_europa/clean_rdc_experiments/projects/P04_official_reg_db_creation


In [96]:
FIELD = "geographical_area"
MODEL = "small_mistral"

# Load environment variables from .env file and get API key
load_dotenv()
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [97]:
# INPUTS
registry_dataset = f"../../datasets/006_registry_names_datasets/dedup_100_famous_european_registries.json"
publis_dataset_template = f"../../datasets/006_registry_names_datasets/famous_european_registries_sample_publi_data/1.json"
prompt_txt = prompt_txt=f"etc/prompts/extraction/prompt_{FIELD}.txt"
model_config=f"etc/configs/{MODEL}_config.json"

In [98]:
# OUTPUTS
output_json = f"data/W01/R01_extraction/{MODEL}/{FIELD}/{FIELD}_extractions.json"
output_records_jsonl = f"data/W01/R01_extraction/{MODEL}/{FIELD}/{FIELD}_extractions_records.json"

In [99]:
# Ensure output directory exists
out_dir = Path(output_json).parent
out_dir.mkdir(parents=True, exist_ok=True)

# Ensure output records directory exists
records_dir = Path(output_records_jsonl).parent
records_dir.mkdir(parents=True, exist_ok=True)

# Load model configuration
with open(model_config, "r", encoding="utf-8") as f:
    model_cfg = json.load(f)

model_name = model_cfg.get("model", "unknown")
print(f"Using model: {model_name}")

# Load the annotation prompt
with open(prompt_txt, "r", encoding="utf-8") as f:
    prompt_template = f.read().strip()

# Load registry_dataset
with open(registry_dataset, "r") as file:
    registry_dataset = json.load(file)

# Load the publications dataset
# first get input directory from template
input_dir = Path(publis_dataset_template).parent
# Get all input batch files and sort them by batch number
batch_files = sorted(input_dir.glob("*.json"), key=lambda p: int(p.stem))
publis_dataset = []
# Load each batch file
for batch_file in batch_files:
    with open(batch_file, "r", encoding="utf-8") as f:
        batch_data = json.load(f)
        publis_dataset.extend(batch_data)

Using model: mistral-small-latest


In [100]:
# # select a small subset of the registry_dataset (first 5 elements)
# registry_dataset = registry_dataset[:5]

In [101]:
# display 'registry_name' and 'number_of_occurrences' for each registry, in a pandas dataframe with display()
df = pd.DataFrame(registry_dataset)
print("Registry Dataset:")
display(df[["registry_name", "number_of_occurrences"]].head())

Registry Dataset:


Unnamed: 0,registry_name,number_of_occurrences
0,Get With The Guidelines-Resuscitation,86
1,Swedish National Inpatient Register,53
2,National Registry for Radiation Workers,19
3,National Pathology Registry,11
4,Swedish Registry for Cognitive/Dementia Disorders,12


In [102]:
# Prepare prompts for LLMs
prompts_items = []
total_prompts = 0
for registry in registry_dataset:
    registry_id = registry.get("object_id", None)
    # get "list_publi_ids" from registry
    list_publi_ids = registry.get("list_publi_ids", [])
    for publi_id in list_publi_ids:
        # find publication in publis_dataset
        publication = next(
            (pub for pub in publis_dataset if pub["object_id"] == publi_id), None
        )
        if publication:
            title = publication.get("title", "<no title>")
            abstract = publication.get("abstract", "<no abstract>")
            # Prepare the prompt for the LLM. replace registry_name_to_add and registry_acronym_to_add with needed values, then
            prompt = prompt_template.replace(
                "{{registry_name_to_add}}", registry["registry_name"]
            ).replace("{{registry_acronym_to_add}}", registry["acronym"])
            prompt = f"{prompt}\nTitle: {title}\nAbstract: {abstract}"
            prompts_items.append(
                {
                    "prompt": prompt,
                    "custom_id": f"{registry_id}_{publi_id}",
                    "registry_id": registry_id,
                    "publi_id": publi_id,
                }
            )
            total_prompts += 1
            
# print the number of prompts prepared
print(f"Prepared {total_prompts} prompts for LLM processing.")

# Create a list to store the records with object_id, prompt, and raw response
prompt_response_records = []

Prepared 3753 prompts for LLM processing.


In [103]:
# show fisrt item of prompts_items
print(f"First prompt item: {prompts_items[0]}")

First prompt item: {'prompt': 'Context:\nPatient registries are structured, systematic databases that collect standardized, longitudinal, patient-level clinical data on populations defined by diseases, conditions, exposures, or treatments. They support observational research, epidemiological monitoring, clinical-outcomes evaluation, trend analyses, data-quality assessments, comparative studies, and linkage with other data sources. Administrative registries (e.g., hospital discharge, national cancer, stroke, chronic disease surveillance systems) qualify if they systematically collect patient-level data suitable for clinical or epidemiological analysis. Simple vital-statistics databases (birth/death) do NOT qualify.\n\nSituation:\nThe publication that is given (title and abstract concatenated as Text_to_analyze) is registry related (it was manually checked), meaning that:\n- a least one patient registry is mentioned in the text\n- the publication/study uses or analyzes data from that reg

In [104]:
batch_prompts_list = []
batch_size = 500
for i in range(0, len(prompts_items), batch_size):
    batch_prompts_list.append(prompts_items[i:i + batch_size])

In [105]:
batch_raw_responses_list = []
for batch_prompts in batch_prompts_list:
    # Run batch inference based on model type
    print(f"Starting batch inference with {model_name}...")

    is_openai_model = "openai" in model_config.lower()
    # if "istral" in the name of llm_judge_model_config, then we are using Mistral model
    is_mistral_model = "istral" in model_config.lower()
    if is_mistral_model:
        backend = llm_backends.MistralBatchBackend(
            api_key=os.getenv("MISTRAL_API_KEY"), cache_storage=DiskCacheStorage()
        )
    elif is_openai_model:
        backend = llm_backends.OpenAIAsyncBackend(
            api_key=os.getenv("OPENAI_API_KEY"), cache_storage=DiskCacheStorage()
        )

    batch_raw_responses = backend.infer_many(
        prompt_items=batch_prompts,
        model_config=model_cfg,
    )
    batch_raw_responses_list.append(batch_raw_responses)

Starting batch inference with mistral-small-latest...
2025-07-22 15:26:04,569 - llm_backends.cache.disk.DiskCacheStorage - INFO - Disk cache initialized at: /home/gpinon/more_europa/clean_rdc_experiments/src/llm_backends/llm_backends/.cache
Starting batch inference with mistral-small-latest...
2025-07-22 15:26:04,682 - llm_backends.cache.disk.DiskCacheStorage - INFO - Disk cache initialized at: /home/gpinon/more_europa/clean_rdc_experiments/src/llm_backends/llm_backends/.cache
Starting batch inference with mistral-small-latest...
2025-07-22 15:26:04,784 - llm_backends.cache.disk.DiskCacheStorage - INFO - Disk cache initialized at: /home/gpinon/more_europa/clean_rdc_experiments/src/llm_backends/llm_backends/.cache
Starting batch inference with mistral-small-latest...
2025-07-22 15:26:04,892 - llm_backends.cache.disk.DiskCacheStorage - INFO - Disk cache initialized at: /home/gpinon/more_europa/clean_rdc_experiments/src/llm_backends/llm_backends/.cache
Starting batch inference with mistra

In [106]:
logging.basicConfig(
    level=logging.WARNING,
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)
logging.getLogger("httpx").setLevel(logging.WARNING)

In [107]:
# batch_number = 1
# batch_llm_responses_list = []
# initial_time = time.time()
# for batch_raw_responses in batch_raw_responses_list:
#     start_time = time.time()
#     llm_responses = []
#     for raw_response in batch_raw_responses:
#         # Store the raw response with object_id and prompt for the records file
#         prompt_obj = next(
#             (p for p in prompts_items if p["custom_id"] == raw_response["custom_id"]), None
#         )
#         if prompt_obj:
#             prompt_response_records.append(
#                 {
#                     "custom_id": raw_response["custom_id"],
#                     "registry_id": prompt_obj["registry_id"],
#                     "publi_id": prompt_obj["publi_id"],
#                     "prompt": prompt_obj["prompt"],
#                     "llm_response": raw_response,
#                 }
#             )
#             # parse raw response
#             parsed_response = backend._parse_response(raw_response)
#             parsed_response["custom_id"] = raw_response.get("custom_id", "")
#             parsed_response["registry_id"] = prompt_obj["registry_id"]
#             parsed_response["publi_id"] = prompt_obj["publi_id"]
#             # print(response)
#             llm_responses.append(parsed_response)

#     # print batch_number
#     print(f" --- Batch N°{batch_number} --- ")
#     print(f"Batch inference completed with {len(llm_responses)} responses")
#     elapsed_total = (time.time() - start_time)/ 60  # Convert to minutes
#     print(f"Total time for inference : {elapsed_total:.1f} minutes")
#     print("")
#     batch_number += 1
#     batch_llm_responses_list.append(llm_responses)

# total_computation_time = (time.time() - initial_time) / 60  # Convert to minutes
# print(f"--> Total computation time for all batches: {total_computation_time:.1f} minutes <--")

In [108]:
from tqdm import tqdm

# Precompute a mapping from custom_id to prompt object
prompt_map = {p["custom_id"]: p for p in prompts_items}

batch_number = 1
batch_llm_responses_list = []
initial_time = time.time()

for batch_raw_responses in tqdm(batch_raw_responses_list, desc="Processing batches"):
    print(f"--- Processing Batch N°{batch_number} ---")
    start_time = time.time()
    llm_responses = []
    inference_number = 1
    for raw_response in tqdm(batch_raw_responses, desc=f"Batch {batch_number} processing", leave=False):
        custom_id = raw_response.get("custom_id", "")
        prompt_obj = prompt_map.get(custom_id)
        if prompt_obj:
            prompt_response_records.append({
                "custom_id": custom_id,
                "registry_id": prompt_obj["registry_id"],
                "publi_id": prompt_obj["publi_id"],
                "prompt": prompt_obj["prompt"],
                "llm_response": raw_response,
            })
            # Parse raw response and add additional info
            parsed_response = backend._parse_response(raw_response)
            parsed_response["custom_id"] = custom_id
            parsed_response["registry_id"] = prompt_obj["registry_id"]
            parsed_response["publi_id"] = prompt_obj["publi_id"]
            llm_responses.append(parsed_response)
        inference_number += 1
    
    elapsed_total = (time.time() - start_time) / 60  # Convert to minutes
    print(f"Batch inference completed with {len(llm_responses)} responses")
    print(f"Total time for inference : {elapsed_total:.1f} minutes\n")
    batch_llm_responses_list.append(llm_responses)
    batch_number += 1

total_computation_time = (time.time() - initial_time) / 60  # Convert to minutes
print(f"--> Total computation time for all batches: {total_computation_time:.1f} minutes <--")

Processing batches:   0%|          | 0/8 [00:00<?, ?it/s]

--- Processing Batch N°1 ---




2025-07-22 15:26:05,538 - llm_backends.cache.disk.DiskCacheStorage - INFO - Attempting to retrieve cache for key: 0007ea35d5d5915399b3cb0e0a05c1ac078cf4767cec6522b7dba0ee3e04e0ea
2025-07-22 15:26:05,542 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache hit for key: 0007ea35d5d5915399b3cb0e0a05c1ac078cf4767cec6522b7dba0ee3e04e0ea




Batch inference completed with 500 responses
Total time for inference : 0.0 minutes

--- Processing Batch N°2 ---




2025-07-22 15:26:05,665 - llm_backends.cache.disk.DiskCacheStorage - INFO - Attempting to retrieve cache for key: e8c496285d4354ddb3b5c5a0b9d05edacf6b8e97c13d5788232491e1ba2a7d8e
2025-07-22 15:26:05,669 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache hit for key: e8c496285d4354ddb3b5c5a0b9d05edacf6b8e97c13d5788232491e1ba2a7d8e


Processing batches:  25%|██▌       | 2/8 [00:00<00:00, 12.20it/s]

Batch inference completed with 500 responses
Total time for inference : 0.0 minutes

--- Processing Batch N°3 ---




2025-07-22 15:26:05,704 - llm_backends.cache.disk.DiskCacheStorage - INFO - Attempting to retrieve cache for key: 7ea24822fa1e01e1413e045c7d467a7afd77dcf6f65b8164a2b37c0762df819b
2025-07-22 15:26:05,708 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache hit for key: 7ea24822fa1e01e1413e045c7d467a7afd77dcf6f65b8164a2b37c0762df819b




Batch inference completed with 500 responses
Total time for inference : 0.0 minutes

--- Processing Batch N°4 ---




2025-07-22 15:26:05,739 - llm_backends.cache.disk.DiskCacheStorage - INFO - Attempting to retrieve cache for key: 5e3af4b45da28e76c8c0eb6367819a7def22545a1133e6f6f44606cd9d829c1c
2025-07-22 15:26:05,743 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache hit for key: 5e3af4b45da28e76c8c0eb6367819a7def22545a1133e6f6f44606cd9d829c1c




Batch inference completed with 500 responses
Total time for inference : 0.0 minutes

--- Processing Batch N°5 ---




2025-07-22 15:26:05,770 - llm_backends.cache.disk.DiskCacheStorage - INFO - Attempting to retrieve cache for key: 469837e2f65cd6f51db3aa3a6cef401b896ed1e4ea25b8155e27a47fceee12b0
2025-07-22 15:26:05,775 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache hit for key: 469837e2f65cd6f51db3aa3a6cef401b896ed1e4ea25b8155e27a47fceee12b0


Processing batches:  62%|██████▎   | 5/8 [00:00<00:00, 20.39it/s]

Batch inference completed with 500 responses
Total time for inference : 0.0 minutes

--- Processing Batch N°6 ---




2025-07-22 15:26:05,803 - llm_backends.cache.disk.DiskCacheStorage - INFO - Attempting to retrieve cache for key: fa975d595c6884d27ee784ba69052849a1e03fb64ffc5738b993b8cac49cdba5
2025-07-22 15:26:05,807 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache hit for key: fa975d595c6884d27ee784ba69052849a1e03fb64ffc5738b993b8cac49cdba5




Batch inference completed with 500 responses
Total time for inference : 0.0 minutes

--- Processing Batch N°7 ---




2025-07-22 15:26:05,837 - llm_backends.cache.disk.DiskCacheStorage - INFO - Attempting to retrieve cache for key: b570ab7c65b9b06fbaaad1cfcd4b406ffe001e0fca00f4aa8a3ee767a26d75e6
2025-07-22 15:26:05,844 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache hit for key: b570ab7c65b9b06fbaaad1cfcd4b406ffe001e0fca00f4aa8a3ee767a26d75e6




Batch inference completed with 500 responses
Total time for inference : 0.0 minutes

--- Processing Batch N°8 ---




2025-07-22 15:26:05,880 - llm_backends.cache.disk.DiskCacheStorage - INFO - Attempting to retrieve cache for key: 1d22dcbaca4d391235337e40f818353259eac2bc2350fa7179b19beb1c00c68a
2025-07-22 15:26:05,884 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache hit for key: 1d22dcbaca4d391235337e40f818353259eac2bc2350fa7179b19beb1c00c68a


Processing batches: 100%|██████████| 8/8 [00:00<00:00, 21.20it/s]

Batch inference completed with 253 responses
Total time for inference : 0.0 minutes

--> Total computation time for all batches: 0.0 minutes <--





In [109]:
def format_string(string):
    """Format string to remove unwanted characters."""
    # remove punctuation and special characters, lower case
    return ''.join(e for e in string if e.isalnum() or e.isspace()).lower().strip()

In [110]:
# list all european countries from west to east
european_countries = [
    "albania",
    "andorra",
    "armenia",
    "austria",
    "azerbaijan",
    "belarus",
    "belgium",
    "bosnia and herzegovina",
    "bulgaria",
    "croatia",
    "cyprus",
    "czech republic",
    "denmark",
    "estonia",
    "finland",
    "france",
    "georgia",
    "germany",
    "greece",
    "hungary",
    "iceland",
    "ireland",
    "italy",
    "kazakhstan",  # part of it in Europe
    "kosovo",
    "latvia",
    "liechtenstein",
    "lithuania",
    "luxembourg",
    "malta",
    "moldova",  # part of it in Europe
    "monaco",
    "montenegro",
    "netherlands",
    "north macedonia",  # part of it in Europe
    "norway",  # part of it in Europe
    "poland",
    "portugal",
    "romania",
    "russia",  # part of it in Europe
    "san marino",
    "serbia",
    "slovakia",
    "slovenia",
    "spain",
    "sweden",
    "switzerland",
    "turkey",  # part of it in Europe
    "ukraine",
    "uk",
    "united kingdom",
    "scotland",
    "wales",
    "england",
    "northern ireland",
    "great britain",
    "europe",
    "european union",
]

# select only the registries that have a geographical area in the list of european countries
european_countries_set = set(format_string(country) for country in european_countries)

In [111]:
# Get all countries
countries = list(pycountry.countries)

# Display country names
country_names = [country.name for country in countries]
# complete with some geogrpahical areas such as coontinents, Scandinavia or other
geo_areas_list = [
    "Europe",
    "European Union",
    "Great Britain"
    "Asia",
    "Africa",
    "America",
    "North America",
    "South America",
    "Oceania",
    "Scandinavia",
    "Middle East",
    "Caribbean",
    "Central America",
    "Eastern Europe",
    "Western Europe",
    "Northern Europe",
    "Southern Europe",
    "Central Asia",
    "South Asia",
    "Southeast Asia",
    "East Asia",
    "Western Asia",
    "North Africa",
    "Sub-Saharan Africa",
    "Latin America",
    "International",
    "Multinational",
    "Worldwide",
]
all_geo_areas = set(country_names + geo_areas_list)

In [112]:
# set a new key "European" set to false tp all registries in the registry_dataset
for registry in registry_dataset:
    registry["geographical_area"] = []  # Initialize geographical_area as an empty list
    registry["is_european"] = False  # Initialize is_european as False
    registry["is_multinational"] = False  # Initialize is_multinational as False
    registry["is_international"] = False  # Initialize is_international as False
    registry["is_worldwide"] = False  # Initialize is_worldwide as False

# now update geographical_area (list) of each registry of registry_dataset with the llm_responses (list of all reponses related to this specific registry)
for llm_responses in batch_llm_responses_list:
    if not llm_responses:
        continue  # Skip empty response batches
    for response in llm_responses:
        registry_id = response.get("registry_id", "")
        geographical_area = response.get("geographical_area", None) # a string of multiple geo areas separated by commas
        if geographical_area:
            # Find the registry in the original dataset and update its geographical_area
            for registry in registry_dataset:
                if registry["object_id"] == registry_id:
                    # format the response string geographical_area, and when a substring of it is in the european_registries_set, and not already in the geographical_area list, then append the found european country to the geographical_area list
                    for region in all_geo_areas:
                        if format_string(region) in format_string(geographical_area):
                            if region not in registry["geographical_area"]:
                                registry["geographical_area"].append(region.title())
                            if format_string(region) in european_countries_set and not registry["is_european"]:
                                registry["is_european"] = True
                    # Check for multinational, international, or worldwide and update
                    if "multinational" in format_string(geographical_area) and not registry["is_multinational"]:
                        registry["is_multinational"] = True
                    if "international" in format_string(geographical_area) and not registry["is_international"]:
                        registry["is_international"] = True
                    if "worldwide" in format_string(geographical_area) and not registry["is_worldwide"]:
                        registry["is_worldwide"] = True


# show result for first 3 registries
for registry in registry_dataset[:3]:
    print(f"Registry ID: {registry['object_id']}, Geographical Area: {registry.get('geographical_area', [])}")
    print(f"- Is European: {registry.get('is_european', False)},\n"
          f"- Is Multinational: {registry.get('is_multinational', False)},\n"
          f"- Is International: {registry.get('is_international', False)},\n"
          f"- Is Worldwide: {registry.get('is_worldwide', False)}\n")

Registry ID: 820, Geographical Area: ['America', 'United States', 'Canada', 'Multinational']
- Is European: False,
- Is Multinational: True,
- Is International: False,
- Is Worldwide: False

Registry ID: 128, Geographical Area: ['Sweden', 'Norway', 'Finland', 'Denmark', 'Multinational', 'Scandinavia']
- Is European: True,
- Is Multinational: True,
- Is International: False,
- Is Worldwide: False

Registry ID: 2940, Geographical Area: ['United Kingdom']
- Is European: True,
- Is Multinational: False,
- Is International: False,
- Is Worldwide: False



In [113]:
# Now update geographical_area (list) of each publication of publis_dataset with the llm_responses (list of all responses related to this specific publication)
# But this time it can be any country, not only european countries
for llm_responses in batch_llm_responses_list:
    if not llm_responses:
        continue  # Skip empty response batches
    for response in llm_responses:
        # registry_id = response.get("registry_id", "")
        geographical_area = response.get("geographical_area", None)  # a string of multiple geo areas separated by commas
        # once extracted, it is going to be updated, so first make it empty
        # make response geo area empty
        response["geographical_area"] = []
        if geographical_area:
            # Find the publication in the original dataset and update its geographical_area
            for publication in publis_dataset:
                if publication["object_id"] == response.get("publi_id", ""):
                    # set its geographical_area to an empty list
                    publication["geographical_area"] = []
                    # format the response string geographical_area, and when a substring of it is not already in the geographical_area list, then append the found country to the geographical_area list
                    for region in all_geo_areas:
                        if format_string(region) in format_string(geographical_area):
                            if region not in publication["geographical_area"]:
                                publication["geographical_area"].append(region.title())

# show result for first 3 publications
for publication in publis_dataset[:3]:
    print(f"Publication ID: {publication['object_id']}, Geographical Area: {publication.get('geographical_area', [])}")

Publication ID: 0010390c-e875-510a-a898-4f296e4a294a, Geographical Area: ['Sweden']
Publication ID: 001772e5-c3ba-5732-89fe-1d235cf3aef6, Geographical Area: ['Netherlands']
Publication ID: 00353829-eeef-56a0-a79d-8f55d47eb8dc, Geographical Area: ['Australia']


In [None]:
output_file_path = working_dir + "/data/from_notebooks/NW02/R03_update_geo_areas/dedup_100_famous_european_registries_with_geo_area.json"
# make sure the output directory exists
output_dir = Path(output_file_path).parent
output_dir.mkdir(parents=True, exist_ok=True)
# save in projects/P04_official_reg_db_creation/data/from_notebooks/NW02/R02_create_dataset_dedup/test/dedup_100_famous_european_registries.json
with open(output_file_path, "w", encoding="utf-8") as f:
    json.dump(registry_dataset, f, indent=4, ensure_ascii=False)

In [None]:
# save this file as famous_european_registries_sample_publi_data/{batch_number}.json wwith batch size of 500
output_folder = working_dir + "/data/from_notebooks/NW02/R03_update_geo_areas/famous_european_reg_publi_data_with_geo_area"
Path(output_folder).mkdir(parents=True, exist_ok=True)
for i in range(0, len(publis_dataset), 500):
    batch = publis_dataset[i:i + 500]
    batch_number = i // 500 + 1
    file_name = f"{batch_number}.json"
    output_file_path = os.path.join(output_folder, file_name)
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(batch, f, indent=4, ensure_ascii=False)

In [116]:
# display first 30 items of registry_dataset with columns 'registry_name', 'geographical_area', 'is_european', 'is_multinational', 'is_international', 'is_worldwide', 'number_of_occurrences'
df_registry = pd.DataFrame(registry_dataset)
print("Registry Dataset with Geographical Areas:")
display(df_registry[["registry_name", "geographical_area", "is_european", "is_multinational", "is_international", "is_worldwide", "number_of_occurrences"]].head(30))

Registry Dataset with Geographical Areas:


Unnamed: 0,registry_name,geographical_area,is_european,is_multinational,is_international,is_worldwide,number_of_occurrences
0,Get With The Guidelines-Resuscitation,"[America, United States, Canada, Multinational]",False,True,False,False,86
1,Swedish National Inpatient Register,"[Sweden, Norway, Finland, Denmark, Multination...",True,True,False,False,53
2,National Registry for Radiation Workers,[United Kingdom],True,False,False,False,19
3,National Pathology Registry,"[Denmark, Australia, Netherlands]",True,False,False,False,11
4,Swedish Registry for Cognitive/Dementia Disorders,[Sweden],True,False,False,False,12
5,National Death Index,"[Australia, America, United States, Spain]",True,False,False,False,109
6,Netherlands Heart Registration,[Netherlands],True,False,False,False,26
7,Côte d'Or Breast Cancer Registry,[France],True,False,False,False,10
8,Swedish Coronary Angiography and Angioplasty R...,[Sweden],True,False,False,False,10
9,South-Verona Psychiatric Case Register,[Italy],True,False,False,False,16


In [118]:
# count how many are european, multinational, international, worldwide
european_count = sum(1 for r in registry_dataset if r.get("is_european", False))
multinational_count = sum(1 for r in registry_dataset if r.get("is_multinational", False))
international_count = sum(1 for r in registry_dataset if r.get("is_international", False))
worldwide_count = sum(1 for r in registry_dataset if r.get("is_worldwide", False))
print(f"Total European Registries: {european_count}")
print(f"Total Multinational Registries: {multinational_count}")
print(f"Total International Registries: {international_count}")
print(f"Total Worldwide Registries: {worldwide_count}")

Total European Registries: 83
Total Multinational Registries: 21
Total International Registries: 9
Total Worldwide Registries: 3
