In [1]:
import os
working_dir = "/home/gpinon/more_europa/clean_rdc_experiments/projects/P04_official_reg_db_creation"
os.chdir(working_dir)
print(f"Changed working directory to {working_dir}")
import logging
import time
import pandas as pd
import json
from pathlib import Path
from dotenv import load_dotenv
import weaviate
import boto3

from src.p04_official_reg_db_creation import config

Changed working directory to /home/gpinon/more_europa/clean_rdc_experiments/projects/P04_official_reg_db_creation


# Load all new registry names

In [None]:
# create a list of registries with publis info by loading all files with format f"projects/P04_official_reg_db_creation/data/W01/R04_extract_registry_name/registries_to_publis/first_30/{batch_number}.jsonl"
registries_to_publis = []
for batch_number in range(1, 90):
    file_path = Path(
        working_dir
        + "/data/from_scripts/SW01/R04_extract_registry_name/registries_to_publis/"
        + f"{batch_number}.jsonl"
    )
    if file_path.exists():
        with open(file_path, "r") as file:
            for line in file:
                registries_to_publis.append(json.loads(line))
# print count of items
print(f"Number of registries to publis: {len(registries_to_publis)}")

Number of registries to publis: 202623


In [3]:
# show fist 5 items
print("First 5 registries to publis:")
for i, registry in enumerate(registries_to_publis[:5]):
    print(f"{i+1}: {registry}")

First 5 registries to publis:
1: {'index': 1, 'registry_name': 'Spanish ABPM Registry', 'acronym': 'ABPM', 'is_official': True, 'object_id': '993ed7d7-0938-50f6-ab49-e2df8aae17d6'}
2: {'index': 2, 'registry_name': 'A population-based cancer registry', 'acronym': '', 'is_official': False, 'object_id': '27dabc72-9c19-58cd-8b98-12625fb024f8'}
3: {'index': 3, 'registry_name': 'Fasa Registry for Systolic Heart Failure', 'acronym': 'FARSH', 'is_official': True, 'object_id': '2ff2d330-c460-5444-8932-c659d359c03f'}
4: {'index': 4, 'registry_name': 'OnCovid Registry', 'acronym': 'OnCovid', 'is_official': True, 'object_id': '02d7128e-48b2-5dab-8b1a-7422b3a36696'}
5: {'index': 5, 'registry_name': 'New York State Cancer Registry', 'acronym': 'NYSCR', 'is_official': True, 'object_id': 'b16b8cdb-2192-53cc-a7fd-b5c1bfe4b418'}


# Deduplication

In [4]:
def format_string(string):
    """Format string to remove unwanted characters."""
    # remove punctuation and special characters, lower case
    return ''.join(e for e in string if e.isalnum() or e.isspace()).lower().strip()

In [5]:
from tqdm import tqdm

registries_dict = {}
index = 1

for registry in tqdm(registries_to_publis, desc="Processing EMA sources"):
    formatted_name = format_string(registry["registry_name"])
    # if registry["is_official"] is False:
    if registry["is_official"] is True:
        if formatted_name not in registries_dict:
            registries_dict[formatted_name] = {
                "object_id": index,
                "registry_name": registry["registry_name"],
                "acronym": registry.get("acronym", ""),
                "geographical_area": [],
                "number_of_occurrences": 1,
                "list_publi_ids": [registry["object_id"]],
            }
            index += 1
        else:
            registries_dict[formatted_name]["number_of_occurrences"] += 1
            registries_dict[formatted_name]["list_publi_ids"].append(registry["object_id"])

registries_list = list(registries_dict.values())

Processing EMA sources:   0%|          | 0/202623 [00:00<?, ?it/s]

Processing EMA sources: 100%|██████████| 202623/202623 [00:02<00:00, 81024.43it/s]


In [6]:
# count many registries are in the list
print(f"Number of unique official registries: {len(registries_list)}")
# filter out 

Number of unique official registries: 54335


In [7]:
# check how many registries have more than 100 occurrence
registries_with_many_occurrences = [
    r for r in registries_list if r["number_of_occurrences"] > 100
]
print(
    f"Number of registries with more than 100 occurrences: {len(registries_with_many_occurrences)}"
)

Number of registries with more than 100 occurrences: 124


In [8]:
# plot the distribution of the number of occurrences per registry wiht plotly, using a log10 scale for the x-axis
# filter out the registries with more than 100 occurrences for better visualization
import plotly.express as px
df = pd.DataFrame(registries_list)
df_plot = df[df["number_of_occurrences"] <= 100]
fig = px.histogram(
    df_plot,
    x="number_of_occurrences",
    log_x=True,
    title="Distribution of the number of occurrences per registry (log10 scale)",
)
fig.update_layout(
    xaxis_title="Number of occurrences (log10 scale)",
    yaxis_title="Number of registries",
)

## Batch saving

In [None]:
# save the updated registries_dict with metadata to new jsonl files in batches of 2000 in format "/data/from_notebooks/NW02/R02_create_dataset_dedup/test/registries_dataset_v1/{batch_number}.jsonl"
# in s3 bucket BUCKET_NAME_DEV in folder registry_data_catalog_experiments/P04_official_reg_db_creation/registries_dataset_version2/v1/{batch_number}.jsonl
from src.p04_official_reg_db_creation.utils  import load_jsonl_from_s3, upload_jsonl_to_s3

experiment_folder_path = "registry_data_catalog_experiments/P04_official_reg_db_creation/registries_dataset_version2/v4"
folder_path = experiment_folder_path + "/registries_dataset"
for i in range(0, len(registries_list), 2000):
    batch = registries_list[i:i + 2000]
    batch_number = i // 2000 + 1
    file_name = f"{batch_number}.jsonl"
    upload_jsonl_to_s3(
        batch,
        config.BUCKET_NAME_DEV,
        folder_path,
        file_name
    )



# Enhance the metadata with additional information

In [10]:
# reload the files into one list
experiment_folder_path = "registry_data_catalog_experiments/P04_official_reg_db_creation/registries_dataset_version2/v4"
folder_path = experiment_folder_path + "/registries_dataset"
# retrieve total_batches = how many files are in the folder
s3 = boto3.client("s3")
response = s3.list_objects_v2(Bucket=config.BUCKET_NAME_DEV, Prefix=folder_path)
total_batches = len(response.get("Contents", []))
registries_list = []
for batch_number in range(1, total_batches + 1):
    file_name = f"{batch_number}.jsonl"
    batch = load_jsonl_from_s3(config.BUCKET_NAME_DEV, folder_path, file_name)
    registries_list.extend(batch)



In [11]:
# rank registries by number of publications and show top 10
top_10_registries = sorted(
    registries_list,
    key=lambda x: x["number_of_occurrences"],
    reverse=True
)[:10]
print("Top 10 registries by number of publications:")
# show as pandas dataframe with display
top_10_df = pd.DataFrame(top_10_registries)
display(top_10_df)

Top 10 registries by number of publications:


Unnamed: 0,object_id,registry_name,acronym,geographical_area,number_of_occurrences,list_publi_ids
0,23,Surveillance Epidemiology and End Results Program,SEER,[],5406,"[13547e66-329f-5891-a8cb-3c2888f296ec, a27dca3..."
1,76,Danish National Patient Registry,NPR,[],2523,"[d6581fd6-73c6-52b9-af17-2a515ac77706, 3851550..."
2,86,Scientific Registry of Transplant Recipients,SRTR,[],1300,"[f8c41912-2335-5f56-97b7-150939d34969, 88ece81..."
3,111,Swedish National Patient Register,NPR,[],1076,"[7941dc72-8e68-5fd1-a5ed-f74fb78410cb, 911a193..."
4,281,Netherlands Cancer Registry,NCR,[],996,"[ad3412aa-0b7b-5ce9-a35f-e8949b0d998f, 61be169..."
5,50,Danish Cancer Registry,CR,[],774,"[fe749e1e-8b71-5cfc-af03-7e61e1337a25, 888f937..."
6,495,Danish National Prescription Registry,DNPR,[],684,"[b762284e-101c-52eb-a4fd-8dc633b3b24c, a29200e..."
7,37,Global Registry of Acute Coronary Events,GRACE,[],652,"[73255b6b-21fc-5dc6-a20b-af0fb67c9cde, 0d2124e..."
8,496,Danish Civil Registration System,CRS,[],637,"[b762284e-101c-52eb-a4fd-8dc633b3b24c, a29200e..."
9,71,Medical Birth Registry of Norway,MBRN,[],626,"[b7281010-24ea-552d-80a5-a60e70f3fca2, 2376977..."


## Load metadata from weaviate

In [12]:
# Add the metadata of the publications, previously extracted from Weaviate, to the registries_dict
# track time of execution
start_time = time.time()

weaviate_client = weaviate.connect_to_custom(**config.WEAVIATE_PROD_CONF)
collections = weaviate_client.collections  #
# load publications
collection_publications = collections.get("Publication_v2")
# load data source names
metadata_from_publis = []
for item in collection_publications.iterator(include_vector=False):
    # Extract subset of properties
    metadata_from_publis.append(
        {
            k: v
            for k, v in item.properties.items()
            if k
            in [
                "object_id",
                # "title",
                # "abstract",
                "geographical_area",
                # "medical_condition",
                # "outcome_measure",
                # "population_sex",
                # "population_age_group",
                # "population_size",
                # "population_follow_up",
            ]
        }
    )
# close weaviate connection
weaviate_client.close()

In [13]:
from tqdm import tqdm

# Step 1: Build a mapping from publication object_id to geographical_area
publi_geoarea_map = {
    publi["object_id"]: publi.get("geographical_area", [])
    for publi in metadata_from_publis
}

# Step 2: Fill registry geographical_area efficiently
for registry in tqdm(registries_list, desc="Processing geographical areas"):
    geo_areas = set(registry["geographical_area"])  # Use set for fast lookup
    for pub_id in registry["list_publi_ids"]:
        areas = publi_geoarea_map.get(pub_id, [])
        if areas and isinstance(areas, list):
            for area in areas:
                formatted_area = format_string(area)
                if (
                    formatted_area not in geo_areas
                    and formatted_area != "not found"
                    and formatted_area != "not specified"
                    and formatted_area != ""
                ):
                    geo_areas.add(formatted_area)
    registry["geographical_area"] = list(geo_areas)

Processing geographical areas: 100%|██████████| 54335/54335 [00:01<00:00, 48321.35it/s]


### Select European registries

In [19]:
# list all european countries from west to east
european_registries = [
    "albania",
    "andorra",
    "armenia",
    "austria",
    "azerbaijan",
    "belarus",
    "belgium",
    "bosnia and herzegovina",
    "bulgaria",
    "croatia",
    "cyprus",
    "czech republic",
    "denmark",
    "estonia",
    "finland",
    "france",
    "georgia",
    "germany",
    "greece",
    "hungary",
    "iceland",
    "ireland",
    "italy",
    "kazakhstan",  # part of it in Europe
    "kosovo",
    "latvia",
    "liechtenstein",
    "lithuania",
    "luxembourg",
    "malta",
    "moldova",  # part of it in Europe
    "monaco",
    "montenegro",
    "netherlands",
    "north macedonia",  # part of it in Europe
    "norway",  # part of it in Europe
    "poland",
    "portugal",
    "romania",
    "russia",  # part of it in Europe
    "san marino",
    "serbia",
    "slovakia",
    "slovenia",
    "spain",
    "sweden",
    "switzerland",
    "turkey",  # part of it in Europe
    "ukraine",
    "uk",
    "united kingdom",
    "scotland",
    "wales",
    "england",
    "northern ireland",
    "great britain",
    "europ",
]

# select only the registries that have a geographical area in the list of european countries
european_registries_set = set(format_string(country) for country in european_registries)

In [20]:
# select european registries
european_registries_list = [
    registry
    for registry in registries_list
    if any(
        format_string(area) in european_registries_set
        for area in registry["geographical_area"]
    )
]
print(f"Number of european registries: {len(european_registries_list)}")

# select famous european registries that have at least 10 publications (number_of_occurrences >= 10)
famous_european_registries = [
    registry
    for registry in european_registries_list
    if registry["number_of_occurrences"] >= 10
]
print(f"Number of famous (>=10) european registries: {len(famous_european_registries)}")

Number of european registries: 18137
Number of famous (>=10) european registries: 1155


In [None]:
# select randomly 100 registries from the famous_european_registries_list
import random
random.seed(42)  # for reproducibility
famous_european_registries_sample = random.sample(
    famous_european_registries, 100
)

output_file_path = working_dir + "/data/from_notebooks/NW02/R02_create_dataset_dedup/test/dedup_100_famous_european_registries.json"
# save in projects/P04_official_reg_db_creation/data/from_notebooks/NW02/R02_create_dataset_dedup/test/dedup_100_famous_european_registries.json
with open(output_file_path, "w", encoding="utf-8") as f:
    json.dump(famous_european_registries_sample, f, indent=4, ensure_ascii=False)

## Create new file with all the info of papers in the famous european registries

In [22]:
# reload the file to check
with open(output_file_path, "r", encoding="utf-8") as f:
    famous_european_registries_sample = json.load(f)

In [23]:
# define set of all distinct publication ids from the sample
distinct_publication_ids = set()
for registry in famous_european_registries_sample:
    distinct_publication_ids.update(registry["list_publi_ids"])
# print number of distinct publication ids
print(f"Number of distinct publication ids in the sample: {len(distinct_publication_ids)}")

Number of distinct publication ids in the sample: 3730


In [27]:
from collections import OrderedDict
from tqdm import tqdm

# Create famous_european_registries_sample_publi_data
# track time of execution
start_time = time.time()

weaviate_client = weaviate.connect_to_custom(**config.WEAVIATE_PROD_CONF)
collections = weaviate_client.collections  #
# load publications
collection_publications = collections.get("Publication_v2")
# load data source names
metadata_from_publis = []
ordered_keys = [
    "object_id",
    "title",
    "abstract",
    "geographical_area",
    "medical_condition",
    "outcome_measure",
    "population_sex",
    "population_age_group",
    "population_size",
    "population_follow_up",
]

# Try to get the total number of items for tqdm progress bar (if supported)
try:
    total_items = collection_publications.count()
except Exception:
    total_items = None

for item in tqdm(collection_publications.iterator(include_vector=False), total=total_items, desc="Extracting publication metadata"):
    # if the publication is in distinct_publication_ids, add it to the metadata_from_publis
    if item.properties.get("object_id") in distinct_publication_ids:
        # Extract subset of properties in the given order
        metadata_from_publis.append(
            OrderedDict(
                (k, item.properties.get(k, None))
                for k in ordered_keys
            )
        )

# close weaviate connection
weaviate_client.close()

Extracting publication metadata: 217736it [01:11, 3060.49it/s]


In [None]:
# save this file as famous_european_registries_sample_publi_data/{batch_number}.json wwith batch size of 500
output_folder = working_dir + "/data/from_notebooks/NW02/R02_create_dataset_dedup/test/famous_european_registries_sample_publi_data"
Path(output_folder).mkdir(parents=True, exist_ok=True)
for i in range(0, len(metadata_from_publis), 500):
    batch = metadata_from_publis[i:i + 500]
    batch_number = i // 500 + 1
    file_name = f"{batch_number}.json"
    output_file_path = os.path.join(output_folder, file_name)
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(batch, f, indent=4, ensure_ascii=False)

## Batch saving

In [18]:
# use the same method to upload the updated registries_list to S3 in batches of 2000
experiment_folder_path = "registry_data_catalog_experiments/P04_official_reg_db_creation/registries_dataset_version2/v4"
folder_path = experiment_folder_path + "/registry_dataset_with_publis_metadata"

for i in range(0, len(registries_list), 2000):
    batch = registries_list[i:i + 2000]
    batch_number = i // 2000 + 1
    file_name = f"{batch_number}.jsonl"
    upload_jsonl_to_s3(
        batch,
        config.BUCKET_NAME_DEV,
        folder_path,
        file_name
    )



