In [19]:
from typing import Dict, List, Any, Hashable, Tuple, Set
from collections import defaultdict
from datetime import datetime
import pandas as pd
import numpy as np
import json

from models import StudyResult
from utils import generate_key
from config import SCALAR_FIELDS, NON_SCALAR_FIELDS


In [20]:
EXPECTED_TABLES = StudyResult.expected_tables()

In [21]:
def process_study_file(file_loc: str) -> List[StudyResult]:

    batch_results: List[StudyResult] = []
    
    df_studies = pd.read_parquet(file_loc)
    df_studies = pd.json_normalize(df_studies["studies"])
    
    for idx, study in df_studies.iterrows():
        nct_index = SCALAR_FIELDS["nct_id"]
        nct_id = study.get(nct_index)
    
        if not nct_id:
            continue
    
        try:
            result = transform_single_study(nct_id, study)
            batch_results.append(result)
            
        except Exception as e:
            raise
    
    return batch_results

In [22]:
def merge_batch_results(batch_results: List[StudyResult]) -> Dict[str, List[Dict]]:
    
    merged: Dict[str, List[Dict]] = defaultdict(list)
    # print(f" LEN BATCH BEFORE MERGE ---{len(batch_results)}")
    # print(f" TYPE BATCH Before MERGE ---{type(batch_results)}")

    # print(type(batch_results[0].tables()))
    # print("-----")
    print(f"{type(batch_results)}")
    print(f"{len(batch_results)}")
    for study_result in batch_results:
        for table, rows in study_result.tables().items():
            merged[table].extend(rows)

    missing = set(EXPECTED_TABLES) - merged.keys()
    if missing:
        raise ValueError(f"Missing tables: {missing}")


    return merged


In [23]:
def transform_single_study(nct_id: str, study: pd.Series) -> StudyResult:
    study_key = generate_key(nct_id)
    result = defaultdict(list)

    study_fields = transform_scalar_fields(study_key, study)
    result["studies"].append(study_fields)
    
    # identificationModule
    secondary_ids, nct_aliases = transform_identification_module(study_key, study)
    result["secondary_ids"].extend(secondary_ids)
    result["nct_aliases"].extend(nct_aliases)

    # sponsorCollaboratorsModule
    sponsor, study_sponsor, collaborators, study_collaborators = (
        transform_sponsor_and_collaborators_module(nct_id, study_key, study)
    )
    result["sponsors"].extend(sponsor)
    result["study_sponsors"].extend(study_sponsor)
    result["collaborators"].extend(collaborators)
    result["study_collaborators"].extend(study_collaborators)


    # conditionsModule
    conditions, study_conditions, keywords, study_keywords = transform_conditions_module(
        nct_id, study_key, study
    )
    result["conditions"].extend(conditions)
    result["study_conditions"].extend(study_conditions)
    result["keywords"].extend(keywords)
    result["study_keywords"].extend(study_keywords)


    # armsInterventionsModule
    (
        arm_groups,
        arm_interventions,
        intervention_names,
        study_intervention_names,
        other_interventions_names,
        study_other_interventions_names,
    ) = transform_arms_interventions_module(study_key, study)
    result["arm_groups"].extend(arm_groups)
    result["arm_interventions"].extend(arm_interventions)
    result["intervention_names"].extend(intervention_names)
    result["study_intervention_names"].extend(study_intervention_names)
    result["other_interventions_names"].extend(other_interventions_names)
    result["study_other_interventions_names"].extend(study_other_interventions_names)


    # outcomesModule
    primary_outcomes, secondary_outcomes, other_outcomes = transform_outcomes_module(study_key, study)
    result["primary_outcomes"].extend(primary_outcomes)
    result["secondary_outcomes"].extend(secondary_outcomes)
    result["other_outcomes"].extend(other_outcomes)


    # contactsLocationsModule
    central_contacts, study_central_contacts, locations, study_locations= transform_contacts_location_module(
        study_key, study
    )
    result["central_contacts"].extend(central_contacts)
    result["study_central_contacts"].extend(study_central_contacts)
    result["locations"].extend(locations)
    result["study_locations"].extend(study_locations)


    # referencesModule
    references, link_references, ipd_references = transform_reference_module(study_key, study)
    result["references"].extend(references)
    result["link_references"].extend(link_references)
    result["ipd_references"].extend(ipd_references)


    # outcomeMeasuresModule
    (
        outcome_measures,
        outcome_measure_groups,
        outcome_measure_denom_units,
        outcome_measure_denom_counts,
        outcome_measure_measurements,
        outcome_measure_analyses,
        outcome_measure_comparison_groups,
    ) = transform_outcome_measures_module(study_key, study)
    result["outcome_measures"].extend(outcome_measures)
    result["outcome_measure_groups"].extend(outcome_measure_groups)
    result["outcome_measure_denom_units"].extend(outcome_measure_denom_units)
    result["outcome_measure_denom_counts"].extend(outcome_measure_denom_counts)
    result["outcome_measure_measurements"].extend(outcome_measure_measurements)
    result["outcome_measure_analyses"].extend(outcome_measure_analyses)
    result["outcome_measure_comparison_groups"].extend(
        outcome_measure_comparison_groups
    )


    

    return StudyResult(
        studies=result["studies"],
        secondary_ids=result["secondary_ids"],
        nct_aliases=result["nct_aliases"],
        sponsors=result["sponsors"],
        study_sponsors=result["study_sponsors"],
        collaborators=result["collaborators"],
        study_collaborators=result["study_collaborators"],
        conditions=result["conditions"],
        study_conditions=result["study_conditions"],
        keywords=result["keywords"],
        study_keywords=result["study_keywords"],
        arm_groups=result["arm_groups"],
        arm_interventions=result["arm_interventions"],
        intervention_names=result["intervention_names"],
        study_intervention_names=result["study_intervention_names"],
        other_interventions_names=result["other_interventions_names"],
        study_other_interventions_names=result["study_other_interventions_names"],
        primary_outcomes=result["primary_outcomes"],
        secondary_outcomes=result["secondary_outcomes"],
        other_outcomes=result["other_outcomes"],
        central_contacts=result["central_contacts"],
        study_central_contacts=result["study_central_contacts"],
        locations=result["locations"],
        study_locations=result["study_locations"],
        references=result["references"],
        link_references=result["link_references"],
        ipd_references=result["ipd_references"],
        outcome_measures = result["outcome_measures"],
        outcome_measure_groups = result["outcome_measure_groups"],
        outcome_measure_denom_units = result["outcome_measure_denom_units"],
        outcome_measure_denom_counts = result["outcome_measure_denom_counts"],
        outcome_measure_measurements = result["outcome_measure_measurements"],
        outcome_measure_analyses= result["outcome_measure_analyses"],
        outcome_measure_comparison_groups = result["outcome_measure_comparison_groups"],
        

    )

In [24]:
def transform_scalar_fields(study_key: str, study_data: pd.Series) -> Dict:
    study_record = dict()

    study_record["study_key"] = study_key
    for entity_key in SCALAR_FIELDS:
        index_field = SCALAR_FIELDS.get(entity_key)

        study_record[entity_key] = study_data.get(index_field)

    return study_record

In [25]:
def transform_identification_module(study_key: str, study_data: pd.Series) -> Tuple:

    secondary_ids = []
    nct_aliases = []

    identification_index = NON_SCALAR_FIELDS["identification"]["index_field"]

    #Secondary id infos
    secondary_id_infos = study_data.get(f'{identification_index}.secondaryIdInfos')

    if (
        isinstance(secondary_id_infos, (list, np.ndarray))
        and len(secondary_id_infos) > 0
    ):
        for secondary_id_info in secondary_id_infos:
            secondary_id = secondary_id_info.get("id")
            secondary_id_key = generate_key(study_key, secondary_id)

            secondary_ids.append(
                {
                    "secondary_id_key": secondary_id_key,
                    "study_key": study_key,
                    "id": secondary_id,
                    "type": secondary_id_info.get("type"),
                    "domain": secondary_id_info.get("domain"),
                    "link": secondary_id_info.get("link"),
                }
            )

        nct_id_aliases = study_data.get(f'{identification_index}.nctIdAliases')

        if (
            isinstance(nct_id_aliases, (list, np.ndarray))
            and len(nct_id_aliases) > 0
        ):
            for nct_id_alias in nct_id_aliases:
                nct_aliases.append(
                    {
                        "study_key": study_key,
                        "id_alias": nct_id_alias,
                    }
                )

    return nct_aliases, secondary_ids



In [26]:

def transform_sponsor_and_collaborators_module(
    nct_id: str, study_key: str, study_data: pd.Series
) -> Tuple:
    sponsor = []
    study_sponsor = []
    collaborators = []
    study_collaborators = []

    sponsor_collaborator_index = NON_SCALAR_FIELDS["sponsor_collaborators"][
        "index_field"
    ]

    ## sponsor name and class are scalar values and MUST be transformed as so
    lead_sponsor_name = study_data.get(f"{sponsor_collaborator_index}.leadSponsor.name")
    lead_sponsor_class = study_data.get(
        f"{sponsor_collaborator_index}.leadSponsor.class"
    )

    if pd.notna(lead_sponsor_name) and pd.notna(lead_sponsor_class):

        sponsor_key = generate_key(lead_sponsor_name, lead_sponsor_class)
        sponsor.append(
            {
                "sponsor_key": sponsor_key,
                "name": lead_sponsor_name,
                "sponsor_class": lead_sponsor_class,
            }
        )

        study_sponsor.append({"study_key": study_key, "sponsor_key": sponsor_key})

    # collaborators
    collaborators_list = study_data.get(
        f"{sponsor_collaborator_index}.collaborators"
    )

    if (
        isinstance(collaborators_list, (list, np.ndarray))
        and len(collaborators_list) > 0
    ):
        for collaborator in collaborators_list:
            collaborator_key = generate_key(
                collaborator.get("name"), collaborator.get("class")
            )

            collaborators.append(
                {
                    "collaborator_key": collaborator_key,
                    "name": collaborator.get("name"),
                    "collaborator_class": collaborator.get("class"),
                }
            )

            study_collaborators.append(
                {
                    "study_key": study_key,
                    "collaborator_key": collaborator_key,
                }
            )


    return sponsor, study_sponsor, collaborators, study_collaborators

          


In [27]:
def transform_conditions_module(nct_id: str, study_key: str, study_data: pd.Series) -> Tuple:
    conditions = []
    study_conditions = []
    keywords = []
    study_keywords = []

    conditions_index = NON_SCALAR_FIELDS["conditions"]["index_field"]

    conditions_list = study_data.get(f"{conditions_index}.conditions")

    if isinstance(conditions_list, (list, np.ndarray)) and len(conditions_list) > 0:
        for condition in conditions_list:
            condition_key = generate_key(condition)

            conditions.append(
                {"condition_key": condition_key, "condition_name": condition}
            )

            study_conditions.append(
                {
                    "study_key": study_key,
                    "condition_key": condition_key,
                }
            )

    keywords_list = study_data.get(f"{conditions_index}.keywords")

    if isinstance(keywords_list, (list, np.ndarray)) and len(keywords_list) > 0:

        for keyword in keywords_list:
            keyword_key = generate_key(keyword)

            keywords.append({"keyword_key": keyword_key, "keyword_name": keyword})

            study_keywords.append(
                {
                    "study_key": study_key,
                    "keyword_key": keyword_key,
                }
            )

    return conditions, study_conditions, keywords, study_keywords


In [28]:

def transform_arms_interventions_module(study_key: str, study_data: pd.Series) -> Tuple:

    arm_groups = []
    arm_interventions = []

    intervention_names = []
    study_intervention_names = []
    other_interventions_names = []
    study_other_interventions_names = []

    arms_interventions_index = NON_SCALAR_FIELDS["arms_interventions"]["index_field"]
    arm_groups_list = study_data.get(f"{arms_interventions_index}.armGroups")

    if isinstance(arm_groups_list, (list, np.ndarray)) and len(arm_groups_list) > 0:
        for arm_group in arm_groups_list:
            arm_label = arm_group.get("label")
            arm_description = arm_group.get("description")
            arm_type = arm_group.get("type")

            arm_group_key = generate_key(
                study_key, arm_label, arm_description, arm_type
            )

            arm_groups.append(
                {
                    "study_key": study_key,
                    "arm_group_key": arm_group_key,
                    "arm_label": arm_label,
                    "arm_description": arm_description,
                    "arm_type": arm_type,
                }
            )

            arm_interventions_list = arm_group.get("interventionNames")
            if (
                isinstance(arm_interventions_list, (list, np.ndarray))
                and len(arm_interventions_list) > 0
            ):

                for arm_intervention in arm_interventions_list:
                    arm_interventions.append(
                        {
                            "study_key": study_key,
                            "arm_group_key": arm_group_key,
                            "arm_intervention_name": arm_intervention,
                        }
                    )

    interventions_list = study_data.get(f"{arms_interventions_index}.interventions")
    if (
        isinstance(interventions_list, (list, np.ndarray))
        and len(interventions_list) > 0
    ):
        for intervention in interventions_list:
            main_name = intervention.get("name")
            intervention_type = intervention.get("type")
            description = intervention.get("description")

            intervention_key = generate_key(main_name, intervention_type)
            intervention_names.append(
                {
                    "intervention_key": intervention_key,
                    "intervention_name": main_name,
                    "intervention_type": intervention_type,
                    "description": description,
                }
            )

            study_intervention_names.append(
                {"study_key": study_key,
                 "intervention_key": intervention_key,
                 "is_primary_name": True
                 }
            )

            other_names = intervention.get("otherNames")

            if isinstance(other_names, (list, np.ndarray)) and len(other_names) > 0:
                for other_name in other_names:
                    if other_name == main_name:
                        continue  # some studies put the main name in the list of other names

                    intervention_key = generate_key(other_name, intervention_type)
                    other_interventions_names.append(
                        {
                            "intervention_key": intervention_key,
                            "intervention_name": other_name,
                            "intervention_type": intervention_type, #inherit from parent
                            "description": description,  #inherit from parent
                        }
                    )

                    study_other_interventions_names.append(
                        {
                            "study_key": study_key,
                            "intervention_key": intervention_key,
                            "is_primary_name": False,
                        }
                    )
            # armGroupLabels is excluded. check docs/excluded_fields.md for reasons

    return (
        arm_groups,
        arm_interventions,
        intervention_names,
        study_intervention_names,
        other_interventions_names,
        study_other_interventions_names,
    )


In [29]:

def transform_outcomes_module(study_key: str, study_data: pd.Series) -> Tuple:
    primary_outcomes = []
    secondary_outcomes = []
    other_outcomes = []

    outcomes_module_index = NON_SCALAR_FIELDS["outcomes"]["index_field"]

    primary_outcomes_list = study_data.get(f"{outcomes_module_index}.primaryOutcomes")

    if (
        isinstance(primary_outcomes_list, (list, np.ndarray))
        and len(primary_outcomes_list) > 0
    ):
        for primary_outcome in primary_outcomes_list:

            primary_outcomes.append(
                {
                    "study_key": study_key,
                    "measure": primary_outcome.get("measure"),
                    "description": primary_outcome.get("description"),
                    "time_frame": primary_outcome.get("timeFrame"),
                }
            )

        secondary_outcomes_list = study_data.get(f"{outcomes_module_index}.secondaryOutcomes")
        if (
            isinstance(secondary_outcomes_list, (list, np.ndarray))
            and len(secondary_outcomes_list) > 0
        ):
            for secondary_outcome in secondary_outcomes_list:
                secondary_outcomes.append(
                    {
                        "study_key": study_key,
                        "measure": secondary_outcome.get("measure"),
                        "description": secondary_outcome.get("description"),
                        "time_frame": secondary_outcome.get("timeFrame"),
                    }
                )

        other_outcomes_list = study_data.get(f"{outcomes_module_index}.otherOutcomes")
        if (
            isinstance(other_outcomes_list, (list, np.ndarray))
            and len(other_outcomes_list) > 0
        ):
            for other_outcome in other_outcomes_list:
                other_outcomes.append(
                    {
                        "study_key": study_key,
                        "measure": other_outcome.get("measure"),
                        "description": other_outcome.get("description"),
                        "time_frame": other_outcome.get("timeFrame"),
                    }
                )


    return primary_outcomes, secondary_outcomes, other_outcomes


In [30]:

def transform_contacts_location_module(study_key: str, study_data: pd.Series) -> Tuple:
    central_contacts = []
    study_central_contacts = []
    locations = []
    study_locations = []

    contacts_locations_index = NON_SCALAR_FIELDS["contacts_location"]["index_field"]

    #contacts
    central_contacts_list = study_data.get(f"{contacts_locations_index}.centralContacts")


    if (
        isinstance(central_contacts_list, (list, np.ndarray))
        and len(central_contacts_list) > 0
    ):

        for central_contact in central_contacts_list:
            name = central_contact.get("name")
            role = central_contact.get("role")
            phone = central_contact.get("phone")
            email = central_contact.get("email")

            central_contact_key = generate_key(name, role, phone, email)

            central_contacts.append(
                {
                    "contact_key": central_contact_key,
                    "contact_name": name,
                    "contact_role": role,
                    "contact_phone": phone,
                    "contact_email": email,
                }
            )

            study_central_contacts.append(
                {
                    "study_key": study_key,
                    "contact_key": central_contact_key,
                }
            )

    #locations
    locations_list = study_data.get(f"{contacts_locations_index}.locations")

    if isinstance(locations_list, (list, np.ndarray)) and len(locations_list) > 0:
        for location in locations_list:
            facility = location.get("facility")
            city = location.get("city")
            state = location.get("state")
            country = location.get("country")

            location_key = generate_key(facility, city, state, country)
            curr_location = {
                "location_key": location_key,
                "facility": facility,
                "city": city,
                "state": state,
                "country": state
            }
            geopoint = location.get("geoPoint")

            if isinstance(geopoint, dict) and geopoint:
                curr_location["lat"] = (
                    float(geopoint.get("lat")) if geopoint.get("lat") else None
                )
                curr_location["lon"] = (
                    float(geopoint.get("lon")) if geopoint.get("lon") else None
                )

            locations.append(curr_location)

            study_locations.append(
                {
                    "study_key": study_key,
                    "location_key": location_key,
                    "status": location.get("status"),
                    "contacts": location.get("contacts"), #json blob
                }
            )

    return central_contacts, study_central_contacts, locations, study_locations



In [31]:


def transform_reference_module(study_key: str, study_data: pd.Series) -> Tuple:
    study_references = []
    link_references = []
    ipd_references = []


    references_index = NON_SCALAR_FIELDS["references"]["index_field"]

    references_list = study_data.get(f"{references_index}.references")

    if isinstance(references_list, (list, np.ndarray)) and len(references_list) > 0:

        for reference in references_list:
            pmid = reference.get("pmid")
            reference_key = generate_key(study_key, pmid)
            study_references.append(
                {
                    "study_key": study_key,
                    "ref_key": reference_key,
                    "pmid": pmid,
                    "type": reference.get("type"),
                    "citation": reference.get("citation"),
                }
            )

    links_list = study_data.get(f"{references_index}.seeAlsoLinks")
    if isinstance(links_list, (list, np.ndarray)) and len(links_list) > 0:

        for link in links_list:
            label = link.get("label")
            link_references.append(
                {
                    "study_key": study_key,
                    "link_key": link,
                    "label": label,
                    "url": link.get("url"),
                }
            )

    ipds_list = study_data.get(f"{references_index}.availIpds")

    if isinstance(ipds_list, (list, np.ndarray)) and len(ipds_list) > 0:

        for ipd in ipds_list:
            ipd_id = ipd.get("id")
            ipd_type = ipd.get("type")
            ipd_url = ipd.get("url")

            ipd_key = generate_key(study_key, ipd_id, ipd_type, ipd_url)
            ipd_references.append(
                {
                    "study_key": study_key,
                    "ipd_key": ipd_key,
                    "id": ipd_id,
                    "type": ipd_type,
                    "url": ipd_url,
                    "comment": ipd.get("comment"),
                }
            )

    return study_references, link_references, ipd_references


In [32]:


def transform_outcome_measures_module(study_key: str, study_data: pd.Series) -> Tuple:
    outcome_measures = []
    outcome_measure_groups = []
    outcome_measure_denom_units = []
    outcome_measure_denom_counts = []
    outcome_measure_measurements = []
    outcome_measure_analyses = []
    outcome_measure_comparison_groups = []

    outcomes_index = NON_SCALAR_FIELDS["outcome_measures"]["index_field"]
    outcomes_measures_list = study_data.get(f"{outcomes_index}.outcomeMeasures")

    if (
        isinstance(outcomes_measures_list, (list, np.ndarray))
        and len(outcomes_measures_list) > 0
    ):
        for outcome_measure in outcomes_measures_list:
            outcome_measure_title = outcome_measure.get("title")
            outcome_measure_type = outcome_measure.get("type")
            outcome_measure_key = generate_key(
                study_key, outcome_measure_title, outcome_measure_type
            )

            outcome_measures.append(
                {
                    "outcome_measure_key": outcome_measure_key,
                    "study_key": study_key,
                    "outcome_type": outcome_measure_type,
                    "title": outcome_measure_title,
                    "description": outcome_measure.get("description"),
                    "population_description": outcome_measure.get(
                        "populationDescription"
                    ),
                    "reporting_status": outcome_measure.get("reportingStatus"),
                    "anticipated_posting_date": outcome_measure.get(
                        "anticipatedPostingDate"
                    ),
                    "param_type": outcome_measure.get("paramType"),
                    "dispersion_type": outcome_measure.get("dispersionType"),
                    "unit_of_measure": outcome_measure.get("unitOfMeasure"),
                    "calculate_pct": outcome_measure.get("calculatePct"),
                    "time_frame": outcome_measure.get("timeFrame"),
                    "denom_units_selected": outcome_measure.get("denomUnitsSelected"),
                }
            )

            groups = outcome_measure.get("groups")
            if isinstance(groups, (list, np.ndarray)) and len(groups) > 0:
                for group in groups:
                    group_id = group.get("id")
                    outcome_group_key = generate_key(
                        study_key, outcome_measure_key, group_id
                    )

                    outcome_measure_groups.append(
                        {
                            "outcome_group_key": outcome_group_key,
                            "study_key": study_key,
                            "outcome_measure_key": outcome_measure_key,
                            "group_id": group_id,
                            "title": group.get("title"),
                            "description": group.get("description"),
                        }
                    )

            denoms_list = outcome_measure.get("denoms")
            if isinstance(denoms_list, (list, np.ndarray)) and len(denoms_list) > 0:
                for denom in denoms_list:
                    denom_unit = denom.get("units")
                    denom_unit = denom_unit.upper() if denom_unit else "UNKNOWN"
                    denom_unit_key = generate_key(denom_unit)

                    outcome_measure_denom_units.append(
                        {
                            "denom_unit_key": denom_unit_key,
                            "denom_unit": denom_unit,
                        }
                    )

                    denom_counts = denom.get("counts")
                    if (
                        isinstance(denom_counts, (list, np.ndarray))
                        and len(denom_counts) > 0
                    ):
                        for denom_count in denom_counts:
                            denom_count_group_id = denom_count.get("groupId")
                            denom_count_group_key = generate_key(
                                study_key, outcome_measure_key, denom_count_group_id
                            )
                            denom_count_key = generate_key(
                                study_key,
                                outcome_measure_key,
                                denom_unit_key,
                                denom_count_group_id,
                            )

                            outcome_measure_denom_counts.append(
                                {
                                    "denom_count_key": denom_count_key,
                                    "study_key": study_key,
                                    "outcome_measure_key": outcome_measure_key,
                                    "denom_unit_key": denom_unit_key,
                                    "denom_group_key": denom_count_group_key,
                                    "group_id": denom_count_group_id,
                                    "denom_value": denom_count.get("value"),
                                }
                            )

            classes = outcome_measure.get("classes")
            if isinstance(classes, (list, np.ndarray)) and len(classes) > 0:
                for single_class in classes:
                    # class is not a true container and only wraps categories contrary to what the docs say
                    categories = single_class.get("categories")

                    if (
                        isinstance(categories, (list, np.ndarray))
                        and len(categories) > 0
                    ):
                        for cat_idx, category in enumerate(categories):
                            measurements = category.get("measurements")
                            if (
                                isinstance(measurements, (list, np.ndarray))
                                and len(measurements) > 0
                            ):
                                for measurement in measurements:
                                    meas_group_id = measurement.get("groupId")
                                    meas_group_key = generate_key(
                                        study_key, outcome_measure_key, meas_group_id
                                    )  # group keys must be created the same way

                                    outcome_measure_measurements.append(
                                        {
                                            "measurement_key": meas_group_key,
                                            "outcome_measure_key": outcome_measure_key,
                                            "study_key": study_key,
                                            "group_id": meas_group_id,
                                            "value": measurement.get("value"),
                                            "lower_limit": measurement.get(
                                                "lowerLimit"
                                            ),
                                            "upper_limit": measurement.get(
                                                "upperLimit"
                                            ),
                                            "spread": measurement.get("spread"),
                                            "comment": measurement.get("comment"),
                                        }
                                    )

            analyses = outcome_measure.get("analyses")
            if isinstance(analyses, (list, np.ndarray)) and len(analyses) > 0:
                for analysis in analyses:
                    param_type = (analysis.get("paramType"),)
                    param_value = analysis.get("paramValue")

                    analysis_key = generate_key(
                        study_key, outcome_measure_key, param_type, param_value
                    )

                    outcome_measure_analyses.append(
                        {
                            "analysis_key": analysis_key,
                            "study_key": study_key,
                            "outcome_measure_key": outcome_measure_key,
                            "param_type": param_type,
                            "param_value": param_value,
                            "dispersion_type": analysis.get("dispersionType"),
                            "dispersion_value": analysis.get("dispersionValue"),
                            "statistical_method": analysis.get("statisticalMethod"),
                            "statistical_comment": analysis.get("statisticalComment"),
                            "p_value": analysis.get("pValue"),
                            "p_value_comment": analysis.get("pValueComment"),
                            "non_inferiority_type": analysis.get("nonInferiorityType"),
                        }
                    )

                    analysis_comparison_groups = analysis.get("groupIds")
                    if (
                        isinstance(analysis_comparison_groups, (list, np.ndarray))
                        and len(analysis_comparison_groups) > 0
                    ):
                        for group_id in analysis_comparison_groups:
                            group_key = generate_key(
                                study_key, outcome_measure_key, group_id
                            )
                            outcome_measure_comparison_groups.append(
                                {   "study_key": study_key,
                                    "outcome_measure_key": outcome_measure_key,
                                    "analysis_key": analysis_key,
                                    "group_key": group_key,
                                    "group_id": group_id,
                                }
                            )

    return (
        outcome_measures,
        outcome_measure_groups,
        outcome_measure_denom_units,
        outcome_measure_denom_counts,
        outcome_measure_measurements,
        outcome_measure_analyses,
        outcome_measure_comparison_groups,
    )


In [33]:
def post_process_tables(results: Dict[str, List[Dict]]) -> List[pd.DataFrame]:

    df_studies = pd.DataFrame(results["studies"])

    # identificationModule
    df_secondary_ids = pd.DataFrame(results["secondary_ids"])
    df_nct_aliases = pd.DataFrame(results["nct_aliases"])

    # sponsorCollaboratorsModule
    df_sponsors = pd.DataFrame(results["sponsors"])
    df_study_sponsors = pd.DataFrame(results["study_sponsors"])
    df_collaborators = pd.DataFrame(results["collaborators"])
    df_study_collaborators = pd.DataFrame(results["study_collaborators"])

    # conditionsModule
    df_conditions = pd.DataFrame(results["conditions"])
    df_study_conditions = pd.DataFrame(results["study_conditions"])
    df_keywords = pd.DataFrame(results["keywords"])
    df_study_keywords = pd.DataFrame(results["study_keywords"])

    # armsInterventionsModule
    df_arm_groups = pd.DataFrame(results["arm_groups"])
    df_arm_interventions = pd.DataFrame(results["arm_interventions"])
    df_intervention_names = pd.DataFrame(results["intervention_names"])
    df_study_intervention_names = pd.DataFrame(results["study_intervention_names"])
    df_other_interventions_names = pd.DataFrame(results["other_interventions_names"])
    df_study_other_interventions_names = pd.DataFrame(results["study_other_interventions_names"])

    # outcomesModule
    df_primary_outcomes = pd.DataFrame(results["primary_outcomes"])
    df_secondary_outcomes = pd.DataFrame(results["secondary_outcomes"])
    df_other_outcomes = pd.DataFrame(results["other_outcomes"])

    # contactsLocationsModule
    df_central_contacts = pd.DataFrame(results["central_contacts"])
    df_study_central_contacts = pd.DataFrame(results["study_central_contacts"])
    df_locations = pd.DataFrame(results["locations"])
    df_study_locations = pd.DataFrame(results["study_locations"])

    
    # referencesModule
    df_references = pd.DataFrame(results["references"])
    df_link_references = pd.DataFrame(results["link_references"])
    df_ipd_references = pd.DataFrame(results["ipd_references"])

    
     # outcomeMeasuresModule
    df_outcome_measures = pd.DataFrame(results["outcome_measures"])
    df_outcome_measure_groups= pd.DataFrame(results["outcome_measure_groups"])
    df_outcome_measure_denom_units = pd.DataFrame(results["outcome_measure_denom_units"])
    df_outcome_measure_denom_counts = pd.DataFrame(results["outcome_measure_denom_counts"])
    df_outcome_measure_measurements = pd.DataFrame(results["outcome_measure_measurements"])
    df_outcome_measure_analyses = pd.DataFrame(results["outcome_measure_analyses"])
    df_outcome_measure_comparison_groups = pd.DataFrame(results["outcome_measure_comparison_groups"])


    

    return [
        df_studies,
        df_secondary_ids,
        df_nct_aliases,
        df_sponsors,
        df_study_sponsors,
        df_collaborators,
        df_study_collaborators,
        df_conditions,
        df_study_conditions,
        df_keywords,
        df_study_keywords,
        df_arm_groups,
        df_arm_interventions,
        df_intervention_names,
        df_study_intervention_names,
        df_other_interventions_names,
        df_study_other_interventions_names,
        df_primary_outcomes,
        df_secondary_outcomes,
        df_other_outcomes,
        df_central_contacts,
        df_study_central_contacts,
        df_locations,
        df_study_locations,
        df_references,
        df_link_references,
        df_ipd_references,
        df_outcome_measures,
        df_outcome_measure_groups,
        df_outcome_measure_denom_units,
        df_outcome_measure_denom_counts,
        df_outcome_measure_measurements,
        df_outcome_measure_analyses,
        df_outcome_measure_comparison_groups
        
    ]



In [34]:
def transform_studies_batch():
    try:
        batch_result = process_study_file("1.parquet")
        merged_batch_results = merge_batch_results(batch_result)
        dfs = post_process_tables(merged_batch_results)

        return dfs

    except Exception as e:
        raise


In [35]:
(
    df_studies,
    df_secondary_ids,
    df_nct_aliases,
    df_sponsors,
    df_study_sponsors,
    df_collaborators,
    df_study_collaborators,
    df_conditions,
    df_study_conditions,
    df_keywords,
    df_study_keywords,
    df_arm_groups,
    df_arm_interventions,
    df_intervention_names,
    df_study_intervention_names,
    df_other_interventions_names,
    df_study_other_interventions_names,
    df_primary_outcomes,
    df_secondary_outcomes,
    df_other_outcomes,
    df_central_contacts,
    df_study_central_contacts,
    df_locations,
    df_study_locations,
    df_references,
    df_link_references,
    df_ipd_references,
    df_outcome_measures,
    df_outcome_measure_groups,
    df_outcome_measure_denom_units,
    df_outcome_measure_denom_counts,
    df_outcome_measure_measurements,
    df_outcome_measure_analyses,
    df_outcome_measure_comparison_groups
    

) = transform_studies_batch()


<class 'list'>
1000


In [17]:
# print(df_studies.head())

In [18]:
df_studies.to_csv("data/study_data.csv", index=False)

In [41]:
df_secondary_ids.to_csv("data/secondary_ids.csv", index=False)
df_nct_aliases.to_csv("data/nct_aliases.csv", index=False)

In [42]:
df_sponsors.to_csv("data/sponsors.csv", index=False)
df_study_sponsors.to_csv("data/bridge_study_sponsors.csv", index=False)
df_collaborators.to_csv("data/collab.csv", index=False)
df_study_collaborators.to_csv("data/bridge_study_collab.csv", index=False)

In [32]:
df_conditions.to_csv("data/conditions.csv", index=False)
df_study_conditions.to_csv("data/study_conditions.csv", index=False)
df_keywords.to_csv("data/keywords.csv", index=False)
df_study_keywords.to_csv("data/study_keywords.csv", index=False)

In [19]:
df_arm_groups.to_csv("data/arms.csv", index=False)
df_arm_interventions.to_csv("data/arm_interventions.csv", index=False)
df_intervention_names.to_csv("data/interventions.csv", index=False)
df_study_intervention_names.to_csv("data/study_intervention_names.csv", index=False)
df_other_interventions_names.to_csv("data/other_interventions_names.csv", index=False)
df_study_other_interventions_names.to_csv("data/study_other_interventions_names.csv", index=False)

In [17]:
df_primary_outcomes.to_csv("data/df_primary_outcomes.csv", index=False)
df_secondary_outcomes.to_csv("data/df_secondary_outcomes.csv", index=False)
df_other_outcomes.to_csv("data/df_other_outcomes.csv", index=False)

In [21]:
df_central_contacts.to_csv("data/contacts.csv", index=False)
df_study_central_contacts.to_csv("data/study_contacts.csv", index=False)
df_study_locations.to_csv("data/study_locations.csv", index=False)
df_locations.to_csv("data/locations.csv", index=False)

In [104]:
df_references.to_csv("data/refs.csv", index=False)
df_link_references.to_csv("data/links.csv", index=False)
df_ipd_references.to_csv("data/ipds.csv", index=False)

In [36]:
df_outcome_measures.to_csv("data/df_outcome_measures.csv", index=False)
df_outcome_measure_groups.to_csv("data/df_outcome_measure_groups.csv", index=False)
df_outcome_measure_denom_units.to_csv("data/df_outcome_measure_denom_units.csv", index=False)
df_outcome_measure_denom_counts.to_csv("data/df_outcome_measure_denom_counts.csv", index=False)
df_outcome_measure_measurements.to_csv("data/df_outcome_measure_measurements.csv", index=False)
df_outcome_measure_analyses.to_csv("data/df_outcome_measure_analyses.csv", index=False)
df_outcome_measure_comparison_groups.to_csv("data/df_outcome_measure_comparison_groups.csv", index=False)

In [21]:
duplicates.to_csv("data/dups.csv", index=False)

In [None]:
# studies[studies['study_key'] == '9de216aef0c75756']