In [1]:
import os
working_dir = "/home/gpinon/more_europa/clean_rdc_experiments/projects/P04_official_reg_db_creation"
os.chdir(working_dir)
print(f"Changed working directory to {working_dir}")
import logging
import time
import pandas as pd
import json
from pathlib import Path
from dotenv import load_dotenv
import weaviate

from src.p04_official_reg_db_creation import config

Changed working directory to /home/gpinon/more_europa/clean_rdc_experiments/projects/P04_official_reg_db_creation


# 0. EMA data sources loading

In [2]:
# track time of execution
start_time = time.time()

weaviate_client = weaviate.connect_to_custom(**config.WEAVIATE_PROD_CONF)
collections = weaviate_client.collections  #
# load publications
collection_publications = collections.get("DataSource_v2")
# load data source names
ema_sources = []
for item in collection_publications.iterator(include_vector=False):
    # Extract subset of properties
    ema_sources.append(
        {
            k: v
            for k, v in item.properties.items()
            if k
            in [
                "object_id",
                "title",
                "acronym",
                "registry_related",
                "geographical_area",
                # "medical_condition",
            ]
        }
    )
# close weaviate connection
weaviate_client.close()
df_ema_sources = pd.DataFrame(ema_sources)
# print count of items
print(f"Number of items loaded: {len(df_ema_sources)}")
display(df_ema_sources.head())

Number of items loaded: 237


Unnamed: 0,title,object_id,geographical_area,acronym,registry_related
0,Clinical Practice Research Datalink (CPRD) GOLD,028ac269-7f7f-56d3-8dd3-0bd93913d864,[United Kingdom],CPRD GOLD,False
1,Cancer Registry of Instituto Português de Onco...,0446b0c0-0bec-5999-8c59-f0e4107dabc3,[Portugal],IPO-Porto Cancer Registry,True
2,European Registry of Patients with McArdle dis...,060d06a8-0a14-5b8b-9fe4-882f08412d7f,"[Denmark, France, Germany, Greece, Italy, Neth...",EUROMAC,True
3,The UK-Irish Atopic Eczema Systemic Therapy Re...,060ede1e-35d6-5dab-9e1e-d7e1e2f78c2a,"[Ireland, United Kingdom]",A-STAR,True
4,syndena GmbH (former OncoTyrol),08920b82-907a-5d03-828e-f76f7b3c735e,[Austria],syndena GmbH,True


In [None]:
# create a list of registries with publis info by loading all files with format f"projects/P04_official_reg_db_creation/data/SW01/R04_extract_registry_name/registries_to_publis/first_30/{batch_number}.jsonl"
registries_to_publis = []
for batch_number in range(1, 90):
    file_path = Path(
        working_dir
        + "/data/from_scripts/SW01/R04_extract_registry_name/registries_to_publis/"
        + f"{batch_number}.jsonl"
    )
    if file_path.exists():
        with open(file_path, "r") as file:
            for line in file:
                registries_to_publis.append(json.loads(line))
# print count of items
print(f"Number of registries to publis: {len(registries_to_publis)}")

Number of registries to publis: 202623


In [4]:
def format_string(string):
    """Format string to remove unwanted characters."""
    # remove punctuation and special characters, lower case
    return ''.join(e for e in string if e.isalnum() or e.isspace()).lower().strip()

In [None]:
from tqdm import tqdm

registries_list = []

for registry in tqdm(ema_sources, desc="Processing EMA sources"):
    # Initialize the registry info if it doesn't exist
    if registry["object_id"] not in registries_list:
        # Get all matching official publications for the registry using list comprehension
        matched_publi = [
            item["object_id"]
            for item in registries_to_publis
            if item["is_official"] and format_string(item["registry_name"]) == format_string(registry["title"])
        ]
        number_of_occurrences = len(matched_publi)
        list_publi_ids = [item["publication_id"]  for item in matched_publi]
        registries_list.append(
            {
                "object_id": registry["object_id"],
                "registry_name": registry["title"],
                "acronym": registry.get("acronym", ""),
                "geographical_area": registry.get("geographical_area", ""),
                "number_of_occurrences": number_of_occurrences,
                "list_publi_ids": list_publi_ids,
            }
        )

# Count how many registries have publications
count_registries_with_publi = sum(
    1 for registry in registries_list if registry["number_of_occurrences"] > 0
)
print(f"Number of registries with publications: {count_registries_with_publi}")

Processing EMA sources:   0%|          | 0/237 [00:00<?, ?it/s]

Processing EMA sources: 100%|██████████| 237/237 [04:35<00:00,  1.16s/it]

Number of registries with publications: 36





In [6]:
# # show one example of a registry with publications
# example_registry = next(
#     (registry for registry in registries_dict.values() if registry["list_publi_ids"]), None
# )
# example_registry


In [None]:
# save as json file in data/from_notebooks/NW02/R01_create_ema_registries_with_publis/test/registries_dataset_v1.json
output_file_path = Path(
    working_dir + "/data/from_notebooks/NW02/R01_create_ema_registries_with_publis/test/registries_dataset_v4.json"
)
output_file_path.parent.mkdir(parents=True, exist_ok=True)
# with open(output_file_path, "w") as file:
#     for registry_id, registry_info in registries_dict.items():
#         # write each registry as a json line
#         file.write(json.dumps({"object_id": registry_id, **registry_info}) + "\n")

with open(output_file_path, "w", encoding="utf-8") as f:
    json.dump(registries_list, f, indent=4, ensure_ascii=False)

In [None]:
# reload the file to reuse it
registries_dataset_path = Path(
    working_dir + "/data/from_notebooks/NW02/R01_create_ema_registries_with_publis/test/registries_dataset_v4.json"
)
with open(registries_dataset_path, "r", encoding="utf-8") as f:
    registries_list = json.load(f)

In [6]:
# rank registries by number of publications and show top 10
top_10_registries = sorted(
    registries_list,
    key=lambda x: x["number_of_occurrences"],
    reverse=True
)[:10]
print("Top 10 registries by number of publications:")
# show as pandas dataframe with display
top_10_df = pd.DataFrame(top_10_registries)
display(top_10_df)

Top 10 registries by number of publications:


Unnamed: 0,object_id,registry_name,acronym,geographical_area,number_of_occurrences,list_publi_ids
0,7a813447-3329-5753-912b-74eff851244e,Netherlands Cancer Registry,NCR,[Netherlands],996,[{'publication_id': 'ad3412aa-0b7b-5ce9-a35f-e...
1,1617d382-bed9-5a20-bfbc-5c25b3546a75,UK Renal Registry,UKRR,[United Kingdom],297,[{'publication_id': 'aef415bb-10ad-5d38-a2ec-d...
2,d1445f95-c673-5364-aeb3-123b949ea722,UK Biobank,UKB,[United Kingdom],203,[{'publication_id': '794ac5a0-216a-5a58-9d05-4...
3,416793b2-a1ca-56f6-ad55-f1902a4260cc,Hospital Episode Statistics,HES,[United Kingdom],135,[{'publication_id': 'ac6a3080-688c-5933-a721-1...
4,e5a4c78e-5b12-5277-89af-e9e5c95dd662,Clinical Practice Research Datalink,CPRD,[United Kingdom],127,[{'publication_id': 'd2b3bd8e-c077-5123-8201-1...
5,6484f2d7-2774-5e14-96f7-0d5779e48932,UK Cystic Fibrosis Registry,UK CF Registry,[United Kingdom],110,[{'publication_id': '35e8f651-145e-5f80-a85b-9...
6,0b3de87c-d9fc-585b-ad4f-39b5911096d0,European Cystic Fibrosis Society Patient Registry,ECFSPR - Cystic Fibrosis,"[Albania, Armenia, Austria, Belarus, Bulgaria,...",32,[{'publication_id': 'f21a64cb-082b-5755-887f-b...
7,ef56a0c9-eb8e-5cd5-b514-53bbfece55dd,"Comparative, Prospective Registry of Newly Ini...",COMPERA,"[Austria, Belgium, Germany, Hungary, Italy, La...",32,[{'publication_id': '7114f1ac-ac4a-51c7-ac63-2...
8,f09c5a8f-c4b1-5377-97d7-d471e5abe1fa,International Severe Asthma Registry,ISAR,"[Argentina, Australia, Bulgaria, Canada, Colom...",26,[{'publication_id': 'e1022a6a-fb40-589a-a2de-6...
9,7a064efd-e4c3-5f3a-a846-a54e94652855,British Society for Rheumatology Biologics Reg...,BSRBR-RA,[United Kingdom],19,[{'publication_id': 'd2890d08-5fb8-5234-b9ff-f...


## Create new file with all the info of papers using EMA data sources

In [None]:
# reload the file to reuse it
registries_dataset_path = Path(
    working_dir + "/data/from_notebooks/NW02/R01_create_ema_registries_with_publis/test/registries_dataset_v4.json"
)
with open(registries_dataset_path, "r", encoding="utf-8") as f:
    registries_list = json.load(f)

In [3]:
# define set of all distinct publication ids from the sample
distinct_publication_ids = set()
for registry in registries_list:
    distinct_publication_ids.update(registry["list_publi_ids"])
# print number of distinct publication ids
print(f"Number of distinct publication ids in the sample: {len(distinct_publication_ids)}")

Number of distinct publication ids in the sample: 2068


In [5]:
from collections import OrderedDict
from tqdm import tqdm

# Create famous_european_registries_sample_publi_data
# track time of execution
start_time = time.time()

weaviate_client = weaviate.connect_to_custom(**config.WEAVIATE_PROD_CONF)
collections = weaviate_client.collections  #
# load publications
collection_publications = collections.get("Publication_v2")
# load data source names
metadata_from_publis = []
ordered_keys = [
    "object_id",
    "title",
    "abstract",
    "geographical_area",
    "medical_condition",
    "outcome_measure",
    "population_sex",
    "population_age_group",
    "population_size",
    "population_follow_up",
]

# Try to get the total number of items for tqdm progress bar (if supported)
try:
    total_items = collection_publications.count()
except Exception:
    total_items = None

for item in tqdm(collection_publications.iterator(include_vector=False), total=total_items, desc="Extracting publication metadata"):
    # if the publication is in distinct_publication_ids, add it to the metadata_from_publis
    if item.properties.get("object_id") in distinct_publication_ids:
        # Extract subset of properties in the given order
        metadata_from_publis.append(
            OrderedDict(
                (k, item.properties.get(k, None))
                for k in ordered_keys
            )
        )

# close weaviate connection
weaviate_client.close()

Extracting publication metadata: 217736it [01:09, 3139.36it/s]


In [None]:
# save this file as famous_european_registries_sample_publi_data/{batch_number}.json wwith batch size of 500
output_folder = working_dir + "/data/from_notebooks/NW02/R01_create_ema_registries_with_publis/test/ema_registries_dataset_publi_data"
Path(output_folder).mkdir(parents=True, exist_ok=True)
for i in range(0, len(metadata_from_publis), 500):
    batch = metadata_from_publis[i:i + 500]
    batch_number = i // 500 + 1
    file_name = f"{batch_number}.json"
    output_file_path = os.path.join(output_folder, file_name)
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(batch, f, indent=4, ensure_ascii=False)