In [18]:
import os
# Switch the process working directory to the project folder before the remaining imports and
# before any of the relative paths used later in the notebook are resolved.
# NOTE(review): absolute, user-specific path -- the notebook will not run unchanged on another machine.
# NOTE(review): presumably `src.p04_official_reg_db_creation` below is resolved relative to this directory -- confirm.
working_dir = "/home/gpinon/more_europa/clean_rdc_experiments/projects/P04_official_reg_db_creation"
os.chdir(working_dir)
print(f"Changed working directory to {working_dir}")
import logging
import time
import pandas as pd
import json
from pathlib import Path
from dotenv import load_dotenv
import pycountry

from src.p04_official_reg_db_creation import config
import llm_backends
from llm_backends.cache import DiskCacheStorage
from llm_backends.mistral import dummy_config
# NOTE(review): this import rebinds `dummy_config`, so the Mistral import on the previous line is shadowed.
from llm_backends.openai import dummy_config

Changed working directory to /home/gpinon/more_europa/clean_rdc_experiments/projects/P04_official_reg_db_creation


# 0. Define variables and load data

In [19]:
# Extraction target and model preset used throughout the notebook.
FIELD, MODEL = "medical_condition", "small_mistral"

# Read the local .env file into the process environment, then look up both API keys
# (each is None when the variable is not set).
load_dotenv()
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [20]:
# INPUTS
# Path to the registry list (JSON). A later cell rebinds this name to the parsed content.
registry_dataset = "../../datasets/006_bis_registry_names_datasets/tool_reg_data/current_state_of_data/dedup_100_famous_european_registries.json"
# Path to one publication batch file; only its parent directory is used when the batches are loaded.
publis_dataset_template = "../../datasets/006_bis_registry_names_datasets/tool_reg_data/current_state_of_data/famous_european_registries_sample_publi_data/1.json"
# Prompt template and model configuration, selected by FIELD and MODEL respectively.
prompt_txt = f"etc/prompts/extraction/prompt_{FIELD}.txt"
model_config = f"etc/configs/{MODEL}_config.json"

In [21]:
# # OUTPUTS
# output_json = f"data/W01/R01_extraction/{MODEL}/{FIELD}/{FIELD}_extractions.json"
# output_records_jsonl = f"data/W01/R01_extraction/{MODEL}/{FIELD}/{FIELD}_extractions_records.json"

In [22]:
# # Ensure output directory exists
# out_dir = Path(output_json).parent
# out_dir.mkdir(parents=True, exist_ok=True)

# # Ensure output records directory exists
# records_dir = Path(output_records_jsonl).parent
# records_dir.mkdir(parents=True, exist_ok=True)

# Load model configuration
with open(model_config, "r", encoding="utf-8") as f:
    model_cfg = json.load(f)

model_name = model_cfg.get("model", "unknown")
print(f"Using model: {model_name}")

# Load the annotation prompt
with open(prompt_txt, "r", encoding="utf-8") as f:
    prompt_template = f.read().strip()

# Load registry_dataset.
# The name first holds the file path and is then rebound to the parsed JSON content. The
# isinstance guard makes the cell safe to re-run: once the data is loaded there is no path
# left to open, so the load is skipped instead of failing.
if isinstance(registry_dataset, (str, os.PathLike)):
    # Explicit UTF-8, like every other file read in this notebook, so the result does not
    # depend on the platform's default encoding.
    with open(registry_dataset, "r", encoding="utf-8") as file:
        registry_dataset = json.load(file)

# Load the publications dataset
# first get input directory from template
input_dir = Path(publis_dataset_template).parent
# Get all input batch files and sort them by batch number (file stems must be integers)
batch_files = sorted(input_dir.glob("*.json"), key=lambda p: int(p.stem))
publis_dataset = []
# Load each batch file and concatenate its records
for batch_file in batch_files:
    with open(batch_file, "r", encoding="utf-8") as f:
        batch_data = json.load(f)
        publis_dataset.extend(batch_data)
# print how many registries and publications we have
print(f"Loaded {len(registry_dataset)} registries and {len(publis_dataset)} publications from {len(batch_files)} batches.")

Using model: mistral-small-latest
Loaded 100 registries and 3730 publications from 8 batches.


In [23]:
# # select a small subset of the publis_dataset (first 20 elements)
# publis_dataset = publis_dataset[:20]

# 1. Prepare prompts

In [24]:
# Build one prompt item per publication: the shared template followed by that publication's
# title and abstract, tagged with the publication's object_id as custom_id.
prompts_items = [
    {
        "prompt": (
            f"{prompt_template}\nText_to_analyze:\n"
            f"Title: {publi.get('title', '<no title>')}\n"
            f"Abstract: {publi.get('abstract', '<no abstract>')}"
        ),
        "custom_id": publi.get("object_id", "<unknown>"),
    }
    for publi in publis_dataset
]
total_prompts = len(prompts_items)

# Report how many prompts were assembled
print(f"Prepared {total_prompts} prompts for LLM processing.")

Prepared 3730 prompts for LLM processing.


In [25]:
# Show the first item of prompts_items as a sanity check on the assembled prompt
print(f"First prompt item: {prompts_items[0]}")

First prompt item: {'prompt': 'CONTEXT:\nYou are an expert of real-world clinical studies, especially at litterature review. You are provided with a publication\'s title and abstract extracted from PubMed or Semantic Scholar, in which use or analysis of patient/medical registry was proven.\nRegister and registry are synonyms. \n\nDEFINITION:\nIn a clinical study or its publication, a medical condition is typically defined as a specific disease, disorder, health-related state, or procedure (or even environmental/demographic/lifestyle factors) that is the focus of the research and directly characterizes/defines the studied population or cohort\n\nDIFFERENCES.\nThis is to be distinguished from:\n1. Outcome measure / endpoint: Results or effects observed in a study.\n2. Risk factors: Characteristics increasing the likelihood of a condition.\n3. Main variables / descriptive statistics: Primary factors or data summaries analyzed in a study.\n4. Target/study population: The specific group of 

In [26]:
# Cut the prompt list into consecutive chunks of at most `batch_size` items
batch_size = 500
batch_prompts_list = [
    prompts_items[start:start + batch_size]
    for start in range(0, len(prompts_items), batch_size)
]
# Report the number of chunks
print(f"Total batches created: {len(batch_prompts_list)}")

Total batches created: 8


# 2. Make Inferences

In [27]:
# The backend type is inferred from the model config path; this does not depend on the batch,
# so it is decided once, before the loop.
config_name = model_config.lower()
is_mistral_model = "istral" in config_name
is_openai_model = "openai" in config_name
if not (is_mistral_model or is_openai_model):
    # Fail fast: without this check `backend` would be undefined below (or, worse, a stale
    # `backend` left in the kernel by an earlier run would be reused silently).
    raise ValueError(
        f"Cannot infer the LLM backend from model config path {model_config!r}: "
        "expected it to contain 'mistral' or 'openai'."
    )

batch_raw_responses_list = []
for batch_prompts in batch_prompts_list:
    print(f"Starting batch inference with {model_name}...")

    # A fresh backend (with its own cache storage object) is built for every batch.
    # Mistral takes precedence when the path matches both patterns.
    if is_mistral_model:
        backend = llm_backends.MistralBatchBackend(
            api_key=os.getenv("MISTRAL_API_KEY"), cache_storage=DiskCacheStorage()
        )
    else:
        backend = llm_backends.OpenAIAsyncBackend(
            api_key=os.getenv("OPENAI_API_KEY"), cache_storage=DiskCacheStorage()
        )

    # NOTE(review): the recorded timings suggest `infer_many` returns lazily and the actual
    # inference happens when the result is iterated in the next cell -- confirm.
    batch_raw_responses = backend.infer_many(
        prompt_items=batch_prompts,
        model_config=model_cfg,
    )
    batch_raw_responses_list.append(batch_raw_responses)

Starting batch inference with mistral-small-latest...
2025-07-23 15:44:46,207 - llm_backends.cache.disk.DiskCacheStorage - INFO - Disk cache initialized at: /home/gpinon/more_europa/clean_rdc_experiments/src/llm_backends/llm_backends/.cache
Starting batch inference with mistral-small-latest...
2025-07-23 15:44:46,360 - llm_backends.cache.disk.DiskCacheStorage - INFO - Disk cache initialized at: /home/gpinon/more_europa/clean_rdc_experiments/src/llm_backends/llm_backends/.cache
Starting batch inference with mistral-small-latest...
2025-07-23 15:44:46,466 - llm_backends.cache.disk.DiskCacheStorage - INFO - Disk cache initialized at: /home/gpinon/more_europa/clean_rdc_experiments/src/llm_backends/llm_backends/.cache
Starting batch inference with mistral-small-latest...
2025-07-23 15:44:46,560 - llm_backends.cache.disk.DiskCacheStorage - INFO - Disk cache initialized at: /home/gpinon/more_europa/clean_rdc_experiments/src/llm_backends/llm_backends/.cache
Starting batch inference with mistra

In [28]:
# Quiet the "httpx" logger: only warnings and above. Independent of the root setup below.
logging.getLogger("httpx").setLevel(logging.WARNING)

# Root configuration: warnings and above, with a second-resolution timestamp, the level name
# and the message.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.WARNING,
)

# Logger for this notebook's own messages
logger = logging.getLogger(__name__)

In [29]:
from tqdm import tqdm

# Records pairing each prompt with its raw LLM response (custom_id, prompt, llm_response)
prompt_response_records = []

# Precompute a mapping from custom_id to prompt object for constant-time lookups
prompt_map = {p["custom_id"]: p for p in prompts_items}

# One list of parsed responses per batch, in batch order
batch_llm_responses_list = []
# Responses whose custom_id matches no prepared prompt; reported at the end instead of
# being dropped silently.
unmatched_responses = 0
initial_time = time.time()

for batch_number, batch_raw_responses in enumerate(
    tqdm(batch_raw_responses_list, desc="Processing batches"), start=1
):
    print(f"--- Processing Batch N°{batch_number} ---")
    start_time = time.time()
    llm_responses = []
    for raw_response in tqdm(batch_raw_responses, desc=f"Batch {batch_number} processing", leave=False):
        custom_id = raw_response.get("custom_id", "")
        prompt_obj = prompt_map.get(custom_id)
        if not prompt_obj:
            unmatched_responses += 1
            continue
        prompt_response_records.append({
            "custom_id": custom_id,
            "prompt": prompt_obj["prompt"],
            "llm_response": raw_response,
        })
        # Parse the raw response and tag it with the id of the publication it belongs to.
        # `backend` is the object left over from the inference cell.
        parsed_response = backend._parse_response(raw_response)
        parsed_response["custom_id"] = custom_id
        llm_responses.append(parsed_response)

    elapsed_total = (time.time() - start_time) / 60  # Convert to minutes
    print(f"Batch inference completed with {len(llm_responses)} responses")
    print(f"Total time for inference : {elapsed_total:.1f} minutes\n")
    batch_llm_responses_list.append(llm_responses)

total_computation_time = (time.time() - initial_time) / 60  # Convert to minutes
print(f"--> Total computation time for all batches: {total_computation_time:.1f} minutes <--")
if unmatched_responses:
    print(f"Skipped {unmatched_responses} responses whose custom_id matched no prepared prompt.")

Processing batches:   0%|          | 0/8 [00:00<?, ?it/s]

--- Processing Batch N°1 ---




2025-07-23 15:44:47,104 - llm_backends.cache.disk.DiskCacheStorage - INFO - Attempting to retrieve cache for key: 54f1d4d54c667156f9612a89f78dda3c5a801d586b8cba6792797013fc9b3279
2025-07-23 15:44:47,105 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache file not found for key: 54f1d4d54c667156f9612a89f78dda3c5a801d586b8cba6792797013fc9b3279
2025-07-23 15:45:09,257 - llm_backends.cache.disk.DiskCacheStorage - INFO - Storing cache for key: 54f1d4d54c667156f9612a89f78dda3c5a801d586b8cba6792797013fc9b3279
2025-07-23 15:45:09,274 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache stored successfully for key: 54f1d4d54c667156f9612a89f78dda3c5a801d586b8cba6792797013fc9b3279


Processing batches:  12%|█▎        | 1/8 [00:22<02:35, 22.22s/it]

Batch inference completed with 500 responses
Total time for inference : 0.4 minutes

--- Processing Batch N°2 ---




2025-07-23 15:45:09,310 - llm_backends.cache.disk.DiskCacheStorage - INFO - Attempting to retrieve cache for key: 851859564fdae8d61fd76b925b3d358d1f11dd7684379586c512045ea63a12af
2025-07-23 15:45:09,311 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache file not found for key: 851859564fdae8d61fd76b925b3d358d1f11dd7684379586c512045ea63a12af
2025-07-23 15:45:23,213 - llm_backends.cache.disk.DiskCacheStorage - INFO - Storing cache for key: 851859564fdae8d61fd76b925b3d358d1f11dd7684379586c512045ea63a12af
2025-07-23 15:45:23,230 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache stored successfully for key: 851859564fdae8d61fd76b925b3d358d1f11dd7684379586c512045ea63a12af


Processing batches:  25%|██▌       | 2/8 [00:36<01:44, 17.36s/it]

Batch inference completed with 500 responses
Total time for inference : 0.2 minutes

--- Processing Batch N°3 ---




2025-07-23 15:45:23,266 - llm_backends.cache.disk.DiskCacheStorage - INFO - Attempting to retrieve cache for key: 65d8a374f0327d5c664311ce986523952b7bb1718f12d2e72881cf158ef23ffb
2025-07-23 15:45:23,267 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache file not found for key: 65d8a374f0327d5c664311ce986523952b7bb1718f12d2e72881cf158ef23ffb
2025-07-23 15:45:37,270 - llm_backends.cache.disk.DiskCacheStorage - INFO - Storing cache for key: 65d8a374f0327d5c664311ce986523952b7bb1718f12d2e72881cf158ef23ffb
2025-07-23 15:45:37,345 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache stored successfully for key: 65d8a374f0327d5c664311ce986523952b7bb1718f12d2e72881cf158ef23ffb


Processing batches:  38%|███▊      | 3/8 [00:50<01:19, 15.89s/it]

Batch inference completed with 500 responses
Total time for inference : 0.2 minutes

--- Processing Batch N°4 ---




2025-07-23 15:45:37,418 - llm_backends.cache.disk.DiskCacheStorage - INFO - Attempting to retrieve cache for key: 1c8fe73bc9e12a48f6c98ce583d39d9c14de262facbc882786bae51fa14f82db
2025-07-23 15:45:37,420 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache file not found for key: 1c8fe73bc9e12a48f6c98ce583d39d9c14de262facbc882786bae51fa14f82db
2025-07-23 15:45:53,755 - llm_backends.cache.disk.DiskCacheStorage - INFO - Storing cache for key: 1c8fe73bc9e12a48f6c98ce583d39d9c14de262facbc882786bae51fa14f82db
2025-07-23 15:45:53,772 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache stored successfully for key: 1c8fe73bc9e12a48f6c98ce583d39d9c14de262facbc882786bae51fa14f82db


Processing batches:  50%|█████     | 4/8 [01:06<01:04, 16.09s/it]

Batch inference completed with 500 responses
Total time for inference : 0.3 minutes

--- Processing Batch N°5 ---




2025-07-23 15:45:53,810 - llm_backends.cache.disk.DiskCacheStorage - INFO - Attempting to retrieve cache for key: d99ba01a3f3827f983228640d7ff5644838e649f1156427e501d32fa5627f4cc
2025-07-23 15:45:53,811 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache file not found for key: d99ba01a3f3827f983228640d7ff5644838e649f1156427e501d32fa5627f4cc
2025-07-23 15:46:08,702 - llm_backends.cache.disk.DiskCacheStorage - INFO - Storing cache for key: d99ba01a3f3827f983228640d7ff5644838e649f1156427e501d32fa5627f4cc
2025-07-23 15:46:08,719 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache stored successfully for key: d99ba01a3f3827f983228640d7ff5644838e649f1156427e501d32fa5627f4cc


Processing batches:  62%|██████▎   | 5/8 [01:21<00:47, 15.68s/it]

Batch inference completed with 500 responses
Total time for inference : 0.2 minutes

--- Processing Batch N°6 ---




2025-07-23 15:46:08,758 - llm_backends.cache.disk.DiskCacheStorage - INFO - Attempting to retrieve cache for key: 66c6e2e3e4f39f1cff203ce812fbbb8c2e778583afa736b6860cdcc7477b7d34
2025-07-23 15:46:08,759 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache file not found for key: 66c6e2e3e4f39f1cff203ce812fbbb8c2e778583afa736b6860cdcc7477b7d34
2025-07-23 15:46:23,601 - llm_backends.cache.disk.DiskCacheStorage - INFO - Storing cache for key: 66c6e2e3e4f39f1cff203ce812fbbb8c2e778583afa736b6860cdcc7477b7d34
2025-07-23 15:46:23,618 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache stored successfully for key: 66c6e2e3e4f39f1cff203ce812fbbb8c2e778583afa736b6860cdcc7477b7d34


Processing batches:  75%|███████▌  | 6/8 [01:36<00:30, 15.41s/it]

Batch inference completed with 500 responses
Total time for inference : 0.2 minutes

--- Processing Batch N°7 ---




2025-07-23 15:46:23,654 - llm_backends.cache.disk.DiskCacheStorage - INFO - Attempting to retrieve cache for key: 56a2b483ad55b0a151d6ccf75f1eeeec57b4bd5806cd9802f1bc60eaf028279f
2025-07-23 15:46:23,655 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache file not found for key: 56a2b483ad55b0a151d6ccf75f1eeeec57b4bd5806cd9802f1bc60eaf028279f
2025-07-23 15:46:37,985 - llm_backends.cache.disk.DiskCacheStorage - INFO - Storing cache for key: 56a2b483ad55b0a151d6ccf75f1eeeec57b4bd5806cd9802f1bc60eaf028279f
2025-07-23 15:46:38,002 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache stored successfully for key: 56a2b483ad55b0a151d6ccf75f1eeeec57b4bd5806cd9802f1bc60eaf028279f


Processing batches:  88%|████████▊ | 7/8 [01:50<00:15, 15.08s/it]

Batch inference completed with 500 responses
Total time for inference : 0.2 minutes

--- Processing Batch N°8 ---




2025-07-23 15:46:38,026 - llm_backends.cache.disk.DiskCacheStorage - INFO - Attempting to retrieve cache for key: 0f8ce8e2e4bb839cf47f3b1eae6a9f6a630b7f6976d778e579154d840026eadc
2025-07-23 15:46:38,026 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache file not found for key: 0f8ce8e2e4bb839cf47f3b1eae6a9f6a630b7f6976d778e579154d840026eadc
2025-07-23 15:46:49,965 - llm_backends.cache.disk.DiskCacheStorage - INFO - Storing cache for key: 0f8ce8e2e4bb839cf47f3b1eae6a9f6a630b7f6976d778e579154d840026eadc
2025-07-23 15:46:49,974 - llm_backends.cache.disk.DiskCacheStorage - INFO - Cache stored successfully for key: 0f8ce8e2e4bb839cf47f3b1eae6a9f6a630b7f6976d778e579154d840026eadc


Processing batches: 100%|██████████| 8/8 [02:02<00:00, 15.36s/it]

Batch inference completed with 230 responses
Total time for inference : 0.2 minutes

--> Total computation time for all batches: 2.0 minutes <--





# 3. Update the medical condition field in the publications dataset

In [41]:
import re

from tqdm import tqdm

# A bracketed group without nested brackets, e.g. "[Crohn's Disease; Ulcerative Colitis]"
_BRACKET_GROUP = re.compile(r"\[([^\[\]]*)\]")


def _split_terms(text):
    """Split a ';'-separated string into stripped, non-empty terms."""
    return [part.strip() for part in text.split(";") if part.strip()]


def _parse_extracted_field(raw_value):
    """Separate main terms from bracketed details in an extracted field string.

    Example: "A; B [x; y]" -> (["A", "B"], ["x", "y"]).

    Every bracket group contributes to the details, and groups are removed by position, so
    a detail that repeats a main term (e.g. "Cancer [Cancer]") no longer erases that term.

    Args:
        raw_value: the extracted field as a string.

    Returns:
        A (main_terms, details) tuple of lists of strings.
    """
    details = [
        detail
        for group in _BRACKET_GROUP.findall(raw_value)
        for detail in _split_terms(group)
    ]
    main_text = _BRACKET_GROUP.sub("", raw_value)
    # When both bracket characters occur, any stray bracket left over is dropped as well.
    if "[" in raw_value and "]" in raw_value:
        main_text = main_text.replace("[", "").replace("]", "")
    return _split_terms(main_text), details


# Prebuild a mapping from publication object_id to the publication record
pub_map = {pub.get("object_id", ""): pub for pub in publis_dataset}

# Update the publis_dataset records in place with the extracted field
for llm_responses in tqdm(batch_llm_responses_list, desc="Updating publications batches"):
    if not llm_responses:
        continue  # Skip empty batches
    for response in tqdm(llm_responses, desc="Processing responses", leave=False):
        # The value is handled as a string -- NOTE(review): confirm the parser never returns a list here.
        updated_field = response.get(FIELD, None)
        if updated_field is None:
            continue
        publi = pub_map.get(response.get("custom_id", ""))
        if publi is None:
            continue
        conditions, details = _parse_extracted_field(updated_field)
        publi[FIELD] = conditions
        publi[f"{FIELD}_details"] = details

# Print the first 20 updated publications (their IDs and the updated field)
n = 20
print(f"First {n} updated publications with extracted field:")
for publi in publis_dataset[:n]:
    publi_id = publi.get("object_id", "<unknown>")
    extracted_field = publi.get(FIELD, "<not extracted>")
    print(f"Publication ID: {publi_id}")
    print(f"Extracted {FIELD}: {extracted_field}")
    print(f"Details: {publi.get(f'{FIELD}_details', '<no details>')}")
    print("---")

Updating publications batches: 100%|██████████| 8/8 [00:00<00:00, 111.21it/s]

First 20 updated publications with extracted field:
Publication ID: 0010390c-e875-510a-a898-4f296e4a294a
Extracted medical_condition: ['Inflammatory Bowel Disease']
Details: ["Crohn's Disease", 'Ulcerative Colitis']
---
Publication ID: 001772e5-c3ba-5732-89fe-1d235cf3aef6
Extracted medical_condition: ['Acute Ischemic Stroke']
Details: ['Anterior Circulation Occlusion']
---
Publication ID: 00353829-eeef-56a0-a79d-8f55d47eb8dc
Extracted medical_condition: ['Cancer']
Details: ['Lung', 'Oesophagus', 'Pharynx', 'Mouth', 'Pancreas', 'Stomach', 'Liver', 'Gallbladder', 'Breast', 'Bowel', 'Prostate', 'Skin (Melanoma)', 'Lip']
---
Publication ID: 004237b4-b88d-5a59-9cbf-2a62b6db833a
Extracted medical_condition: ['Coronary Artery Disease']
Details: ['Coronary Lesions']
---
Publication ID: 00684acf-743e-5882-9f11-09c087b589e4
Extracted medical_condition: ['Inflammatory Bowel Disease']
Details: ['Crohn’s Disease', 'Ulcerative Colitis', 'Unclassified Colitis']
---
Publication ID: 006a71b6-1e34-5f1b-




# 4. Aggregate the extracted field into the registry_dataset

In [31]:
def format_string(string):
    """Normalise a term: keep only letters, digits and whitespace, then lower-case and trim.

    Removed characters are dropped, not replaced, so "In-Hospital" becomes "inhospital".
    """
    kept_chars = [char for char in string if char.isalnum() or char.isspace()]
    cleaned = "".join(kept_chars)
    return cleaned.lower().strip()

In [None]:
# # now update updated_field in registry_dataset, as a list of all the medical conditions found in the publications, and the number of times they occurred
# # medical_condition in the publications dataset is already a list of medical conditions
# for registry in registry_dataset:
#     # Get the list_publi_ids from the registry
#     list_publi_ids = registry.get("list_publi_ids", [])
#     # Initialize a dictionary to count occurrences of medical conditions
#     terms_counts = {}
#     formatted_terms_counts = {}
#     # Iterate through the publications in publis_dataset
#     for publi in publis_dataset:
#         if publi.get("object_id") in list_publi_ids:
#             # Get the medical conditions from the publication
#             updated_field = publi.get(FIELD, [])
#             # Iterate through the medical conditions and count occurrences
#             for term in updated_field:
#                 formatted_term = format_string(term)
#                 if formatted_term:
#                     # if formatted_condition is already in the formatted conditions counts (also formatted)
#                     if formatted_term in formatted_terms_counts:
#                         # retrieve the official_term from the terms_counts dictionary
#                         terms_counts[formatted_term.title()] += 1
#                         formatted_terms_counts[formatted_term] += 1
#                     else:
#                         terms_counts[formatted_term.title()] = 1
#                         formatted_terms_counts[formatted_term] = 1

#     # Update the registry with the condition counts
#     registry[FIELD] = terms_counts

# # Print the first 3 registries with their medical condition counts
# for registry in registry_dataset[:3]:
#     print(f"Registry: {registry.get('registry_name', 'Unknown')}")
#     print(f"Medical Conditions Counts: {registry.get(FIELD, {})}\n")
    

-------------
Processing registry: Get With The Guidelines-Resuscitation
-- Processing publi: 04932b72-5eaf-5e22-8381-c8b14d81950f --
term: Pediatric In-Hospital Cardiac Arrest, formatted_term: pediatric inhospital cardiac arrest
New formatted term: pediatric inhospital cardiac arrest
Added counts: {'Pediatric Inhospital Cardiac Arrest': 1}
Added formatted counts: {'pediatric inhospital cardiac arrest': 1}
-- Processing publi: 122d8472-811f-54e4-abcd-94ade2605fb0 --
term: Cardiac Arrest, formatted_term: cardiac arrest
New formatted term: cardiac arrest
Added counts: {'Pediatric Inhospital Cardiac Arrest': 1, 'Cardiac Arrest': 1}
Added formatted counts: {'pediatric inhospital cardiac arrest': 1, 'cardiac arrest': 1}
-- Processing publi: 1292a9e6-bc6e-5a8b-9fe7-768cbc6ca368 --
term: In-Hospital Cardiac Arrest, formatted_term: inhospital cardiac arrest
New formatted term: inhospital cardiac arrest
Added counts: {'Pediatric Inhospital Cardiac Arrest': 1, 'Cardiac Arrest': 1, 'Inhospital Ca

In [38]:
from tqdm import tqdm

# Lookup table: publication object_id -> the terms stored under FIELD for that publication
pub_dict = {publi.get("object_id"): publi.get(FIELD, []) for publi in publis_dataset}

# For every registry, count how often each normalised term occurs across its publications
# and store the counts under FIELD, ordered from most to least frequent.
for registry in tqdm(registry_dataset, desc="Processing registries"):
    terms_counts = {}
    linked_terms = (
        term
        for pub_id in registry.get("list_publi_ids", [])
        for term in pub_dict.get(pub_id, [])
    )
    for term in linked_terms:
        key = format_string(term).title()
        if not key:
            continue  # nothing left after normalisation
        terms_counts[key] = terms_counts.get(key, 0) + 1
    # Sorting is stable, so terms with equal counts keep their first-seen order
    ranked_pairs = sorted(terms_counts.items(), key=lambda pair: -pair[1])
    registry[FIELD] = dict(ranked_pairs)

# Print the first 3 registries with their ranked medical condition counts
for registry in registry_dataset[:3]:
    print(f"Registry: {registry.get('registry_name', 'Unknown')}")
    print(f"Ranked Medical Conditions Counts: {registry.get(FIELD, {})}\n")

Processing registries: 100%|██████████| 100/100 [00:00<00:00, 5094.56it/s]

Registry: Get With The Guidelines-Resuscitation
Ranked Medical Conditions Counts: {'Inhospital Cardiac Arrest': 35, 'Cardiac Arrest': 33, 'Pediatric Inhospital Cardiac Arrest': 7, 'Cardiopulmonary Arrest': 2, 'Maternal Cardiac Arrest': 2, 'Neonatal Critical Illness': 1, 'Neonatal Cardiopulmonary Resuscitation': 1, 'Covid19': 1, 'Ventricular Fibrillation': 1, 'Cardiac Disease': 1, 'Bradycardia': 1, 'Congenital Heart Disease': 1, 'Pediatric Pulseless Arrest': 1, 'Endstage Kidney Disease': 1}

Registry: Swedish National Inpatient Register
Ranked Medical Conditions Counts: {'Stroke': 3, 'Coeliac Disease': 3, 'Celiac Disease': 3, 'Atrial Fibrillation': 2, 'Covid19': 2, 'Gastric Cancer': 1, 'Chronic Obstructive Pulmonary Disease': 1, 'Rheumatoid Arthritis': 1, 'Bicycle Crash Injuries': 1, 'Influenza': 1, 'Major Depressive Disorder': 1, 'Brain Tumors': 1, 'Serious Arrhythmia': 1, 'Pulmonary Arterial Hypertension': 1, 'Systemic Lupus Erythematosus': 1, 'Elderly Population': 1, 'Porphyria': 1, 




# 5. Save the results: registries and publications datasets

In [39]:
# Save the publications as {batch_number}.json files, `output_batch_size` records per file
output_folder = working_dir + f"/data/from_notebooks/NW02/R04_update_{FIELD}s/famous_european_reg_publi_data_with_{FIELD}"
Path(output_folder).mkdir(parents=True, exist_ok=True)
output_batch_size = 500
# Count the files as they are written. The previous `len // 500 + 1` formula over-counted by
# one whenever the dataset size was an exact multiple of the batch size (and reported 1 file
# for an empty dataset).
saved_files = 0
for batch_number, start in enumerate(range(0, len(publis_dataset), output_batch_size), start=1):
    batch = publis_dataset[start:start + output_batch_size]
    output_file_path = os.path.join(output_folder, f"{batch_number}.json")
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(batch, f, indent=4, ensure_ascii=False)
    saved_files += 1
# print how many files were saved
print(f"Saved {saved_files} files in {output_folder}")

Saved 8 files in /home/gpinon/more_europa/clean_rdc_experiments/projects/P04_official_reg_db_creation/data/from_notebooks/NW02/R04_update_medical_conditions/famous_european_reg_publi_data_with_medical_condition


In [40]:
# Destination file for the registry dataset, under data/from_notebooks/NW02/R04_update_{FIELD}s/
output_file_path = f"{working_dir}/data/from_notebooks/NW02/R04_update_{FIELD}s/dedup_100_famous_european_registries_with_{FIELD}.json"
# Create the parent folder when it is missing
output_dir = Path(output_file_path).parent
output_dir.mkdir(parents=True, exist_ok=True)
# Write the registries as indented UTF-8 JSON, keeping non-ASCII characters unescaped
with open(output_file_path, mode="w", encoding="utf-8") as out_handle:
    json.dump(registry_dataset, out_handle, ensure_ascii=False, indent=4)