In [1]:
import json
import boto3
import pandas as pd

from p06_search_engine import config

# constants

In [2]:
# Bucket queried when loading the registries that come from publications.
S3_BUCKET_NAME = "s3-common-dev20231214174437248800000002"

# Key prefix, inside the bucket, under which the publication registry files are listed.
PATH_DATA_REGISTRIES_PUBLICATIONS = (
    "registry_data_catalog_experiments/"
    "P04_official_reg_db_creation/"
    "datasets_versions/"
    "update_medical_condition/"
    "registry_data/"
)

# Local JSON file holding the registries of the EMA data catalogue.
# NOTE(review): absolute path into one user's home directory; as written the
# notebook only runs on a machine where this exact path exists.
PATH_DATA_REGISTRIES_EMA = (
    "/home/jperrio/"
    "registry_data_catalog_experiments/"
    "datasets/"
    "007_search_engine_datasets/"
    "999_ema_registires.json"
)

# Loading

In [3]:
# Start from an empty dataset; the loading cells append one dict per registry.
registries = list()

## from publications

In [4]:
# Populate dataset of registries with registries from publications.
# Every object found under the prefix is read line by line; each non-blank
# line is parsed as one JSON document describing a registry.

# instantiate s3 client
s3 = boto3.client("s3")

# A single list call returns at most 1000 keys, so go through the paginator
# to make sure no file is silently left out.
pages = s3.get_paginator("list_objects_v2").paginate(
    Bucket=S3_BUCKET_NAME,
    Prefix=PATH_DATA_REGISTRIES_PUBLICATIONS,
)

# iterate over files
for page in pages:

    # a page has no "Contents" entry when nothing matches the prefix
    for file in page.get("Contents", []):

        # iterate over lines
        for line in s3.get_object(Bucket=S3_BUCKET_NAME, Key=file["Key"])["Body"].iter_lines():

            # skip blank lines, which are not valid JSON documents
            if not line.strip():
                continue

            # convert line to dict
            d_registry = json.loads(line)

            # create registry (missing locations / conditions become empty containers)
            registry = {
                "registry_id": d_registry["object_id"],
                "registry_name": d_registry["registry_name"],
                "registry_acronym": d_registry["acronym"],
                "registry_locations": d_registry["geographical_area"] or [],
                "registry_conditions": d_registry["medical_condition"] or {},
                "registry_occurrences": d_registry["number_of_occurrences"],
            }

            # add registry to dataset
            registries.append(registry)

## from EMA data catalogue

In [5]:
# Populate dataset of registries with registries from ema data catalogue

# read registries (explicit UTF-8 so the result does not depend on the locale default)
with open(PATH_DATA_REGISTRIES_EMA, "r", encoding="utf-8") as file:
    l_registries = json.load(file)

# sort registries by name; ids below are assigned in this order
l_registries = sorted(l_registries, key=lambda x: x["title"])

# retrieve last id from existing registries (as ema registries do not have an id);
# fall back to 0 so the cell still works when no registry was loaded before it
last_id = max((registry["registry_id"] for registry in registries), default=0)

# iterate over registries
for idx, d_registry in enumerate(l_registries, start=1):

    # create registry: ids continue after the highest existing one, and every
    # condition is stored with None as its occurrence value.
    # Unlike the publication entries, no "registry_occurrences" key is set here.
    registry = {
        "registry_id": last_id + idx,
        "registry_name": d_registry["title"],
        "registry_acronym": d_registry["acronym"],
        "registry_locations": d_registry["geographical_area"] or [],
        "registry_conditions": {condition: None for condition in (d_registry["medical_condition"] or [])},
    }

    # add registry to dataset
    registries.append(registry)

# Clean conditions

In [6]:
def _keep_known_conditions(registry):
    """Return a copy of ``registry`` without the conditions whose occurrence is None."""
    known = {}
    for condition, occurrence in registry["registry_conditions"].items():
        if occurrence is not None:
            known[condition] = occurrence

    cleaned = dict(registry)
    cleaned["registry_conditions"] = known
    return cleaned


# NOTE(review): conditions loaded from the EMA catalogue are all stored with
# None, so this step leaves those registries with no condition at all.
# Confirm this is intended.
registries = list(map(_keep_known_conditions, registries))

# Add occurrences to locations

In [7]:
# Compute global occurrences per locations.
# Normalisation has to be strip-then-capitalize, exactly like the per-registry
# lookup below. Capitalizing first turns " france" into "france" once stripped,
# a key that never matches the "France" looked up for the registry.
locations_occurences = (
    pd.DataFrame(registries)["registry_locations"]
    .explode()
    .str.strip()
    .str.capitalize()
    .value_counts()
    .to_dict()
)

# Update registries with occurrences per locations: the list of locations
# becomes a mapping {normalised location: global count}.
registries = [
    {
        **registry,
        "registry_locations": {
            location: locations_occurences.get(location, 0)
            # normalise each raw location once and reuse it as key and lookup
            for location in (raw.strip().capitalize() for raw in registry["registry_locations"])
        },
    }
    for registry in registries
]

# Saving

In [8]:
# Write the final list of registries as 4-space indented JSON to the path
# taken from the project config.
with open(config.PATH_DATA_REGISTRIES, "w") as output_file:
    output_file.write(json.dumps(registries, indent=4))