In [300]:
from typing import Dict, List, Any, Hashable, Tuple, Set
from datetime import datetime
import pandas as pd
import numpy as np
import hashlib
import json

In [301]:
# Maps each scalar output column of the flat study record (dict key) to the
# dotted source column it is read from (dict value). extract_study_fields looks
# every path up with `study_data.get(path)`, so a path that is absent from the
# row simply yields None for that column.
SINGLE_FIELDS = {
    # Identification
    "nct_id": "protocolSection.identificationModule.nctId",
    "brief_title": "protocolSection.identificationModule.briefTitle",
    "official_title": "protocolSection.identificationModule.officialTitle",
    "acronym": "protocolSection.identificationModule.acronym",
    "org_study_id": "protocolSection.identificationModule.orgStudyIdInfo.id",
    # Description
    "brief_summary": "protocolSection.descriptionModule.briefSummary",
    "detailed_desc": "protocolSection.descriptionModule.detailedDescription",

    # Sponsor (only the responsible-party type; sponsors themselves are
    # extracted separately by extract_sponsors)
    "responsible_party": "protocolSection.sponsorCollaboratorsModule.responsibleParty.type",

    # Design (single values)
    "study_type": "protocolSection.designModule.studyType",
    "patient_registry": "protocolSection.designModule.patientRegistry",
    "enrollment_type": "protocolSection.designModule.enrollmentInfo.type",
    "enrollment_count": "protocolSection.designModule.enrollmentInfo.count",
    "design_allocation": "protocolSection.designModule.designInfo.allocation",
    "design_intervention_model": "protocolSection.designModule.designInfo.interventionModel",
    "design_intervention_model_desc": "protocolSection.designModule.designInfo.interventionModelDescription",
    "design_primary_purpose": "protocolSection.designModule.designInfo.primaryPurpose",
    "design_observational_model": "protocolSection.designModule.designInfo.observationalModel",
    "design_time_perspective": "protocolSection.designModule.designInfo.timePerspective",
    "design_masking": "protocolSection.designModule.designInfo.maskingInfo.masking",
    # Biospecimen
    "biospec_retention": "protocolSection.designModule.bioSpec.retention",
    "biospec_desc": "protocolSection.designModule.bioSpec.description",
    # Eligibility
    "eligibility_criteria": "protocolSection.eligibilityModule.eligibilityCriteria",
    "healthy_volunteers": "protocolSection.eligibilityModule.healthyVolunteers",
    "sex": "protocolSection.eligibilityModule.sex",
    "min_age": "protocolSection.eligibilityModule.minimumAge",
    "max_age": "protocolSection.eligibilityModule.maximumAge",
    "population_desc": "protocolSection.eligibilityModule.studyPopulation",
    "sampling_method": "protocolSection.eligibilityModule.samplingMethod",
    # Status
    "overall_status": "protocolSection.statusModule.overallStatus",
    "last_known_status": "protocolSection.statusModule.lastKnownStatus",
    "status_verified_date": "protocolSection.statusModule.statusVerifiedDate",
    "start_date": "protocolSection.statusModule.startDateStruct.date",
    "start_date_type": "protocolSection.statusModule.startDateStruct.type",
    "first_submit_date": "protocolSection.statusModule.studyFirstSubmitDate",
    "last_update_submit_date": "protocolSection.statusModule.lastUpdateSubmitDate",
    "completion_date": "protocolSection.statusModule.completionDateStruct.date",
    "completion_date_type": "protocolSection.statusModule.completionDateStruct.type",
    "why_stopped": "protocolSection.statusModule.whyStopped",
    "has_expanded_access": "protocolSection.statusModule.expandedAccessInfo.hasExpandedAccess",
    # Oversight
    "has_dmc": "protocolSection.oversightModule.oversightHasDmc",
    "is_fda_regulated_drug": "protocolSection.oversightModule.isFdaRegulatedDrug",
    "is_fda_regulated_device": "protocolSection.oversightModule.isFdaRegulatedDevice",
    "is_unapproved_device": "protocolSection.oversightModule.isUnapprovedDevice",
    "is_us_export": "protocolSection.oversightModule.isUsExport",
    # Individual participant data
    "ipd_sharing": "protocolSection.ipdSharingStatementModule.ipdSharing",
    "ipd_desc": "protocolSection.ipdSharingStatementModule.description",
    "ipd_time_frame": "protocolSection.ipdSharingStatementModule.timeFrame",
    "ipd_access_criteria": "protocolSection.ipdSharingStatementModule.accessCriteria",
    "ipd_url": "protocolSection.ipdSharingStatementModule.url",
    
    # Results point of contact (read from resultsSection, not protocolSection)
    "poc_title": "resultsSection.moreInfoModule.pointOfContact.title",
    "poc_organization": "resultsSection.moreInfoModule.pointOfContact.organization",
    "poc_email": "resultsSection.moreInfoModule.pointOfContact.email",
    "poc_phone": "resultsSection.moreInfoModule.pointOfContact.phone",
    "poc_phone_ext": "resultsSection.moreInfoModule.pointOfContact.phoneExt",

    # Participant flow
    "flow_pre_assignment_details": "resultsSection.participantFlowModule.preAssignmentDetails",
    "flow_recruitment_details": "resultsSection.participantFlowModule.recruitmentDetails",
    # NOTE: the output column uses the spelling "analysed" while the source
    # field is "typeUnitsAnalyzed"; both are runtime strings, keep them in sync
    # with any downstream schema.
    "flow_type_units_analysed": "resultsSection.participantFlowModule.typeUnitsAnalyzed",
    
    # Certain agreements
    "certain_agreement_pi_sponsor_employee": "resultsSection.moreInfoModule.certainAgreement.piSponsorEmployee",
    "certain_agreement_restrictive": "resultsSection.moreInfoModule.certainAgreement.restrictiveAgreement",
    "certain_agreement_other_details": "resultsSection.moreInfoModule.certainAgreement.otherDetails",
    "certain_agreement_restriction_type": "resultsSection.moreInfoModule.certainAgreement.restrictionType",
    
    
    # Submission tracking
    "sub_tracking_estimated_results_date": "derivedSection.miscInfoModule.submissionTracking.estimatedResultsFirstSubmitDate",
    
    # Miscellaneous
    "version_holder": "derivedSection.miscInfoModule.versionHolder",
    "has_results": "hasResults",  # top-level field, not nested under a section
    "last_updated": "protocolSection.statusModule.lastUpdatePostDateStruct.date",
    "limitations_desc": "resultsSection.moreInfoModule.limitationsAndCaveats.description",

}


In [302]:
# Declarative description of every non-scalar part of a study record.
#
# Each entry describes one source structure:
#   index_field        dotted source column holding the structure (the only key
#                      the extract_* functions in this notebook actually read)
#   object_type        shape of the source value: "simple_dict",
#                      "simple_array", "array_of_dicts", "nested_array_of_dicts"
#   fields             attributes taken from each element, either as
#                      (output_name, source_name) pairs or bare source names
#   table_name         dimension/fact table the rows are destined for
#   bridge_table_name  study <-> dimension link table, where one exists
#   transformer_method name of the extract_* function that handles the entry
NESTED_FIELDS = {
    "sponsor": {  # not nested, but treated as a separate dimension
        "index_field": "protocolSection.sponsorCollaboratorsModule.leadSponsor",
        # Was "simple dict" (with a space); every other spec uses "simple_dict".
        "object_type": "simple_dict",
        "fields": [
            ("lead_sponsor_name", "name"),
            ("lead_sponsor_class", "class"),
        ],
        "table_name": "sponsors",
        "bridge_table_name": "study_sponsors",
        "transformer_method": "extract_sponsors",
    },
    "collaborators": {
        "index_field": "protocolSection.sponsorCollaboratorsModule.collaborators",
        "object_type": "array_of_dicts",
        "fields": [
            ("sponsor_name", "name"),
            ("sponsor_class", "class"),
        ],
        # Was "sponsor"; collaborators and the lead sponsor are emitted into the
        # same sponsor list by extract_sponsors, so they share one table name.
        "table_name": "sponsors",
        "bridge_table_name": "study_sponsors",
        "transformer_method": "extract_sponsors",
    },
    "conditions": {
        "index_field": "protocolSection.conditionsModule.conditions",
        "object_type": "simple_array",
        "table_name": "conditions",
        "bridge_table_name": "bridge_study_conditions",
        "field_name": "condition_name",
        "transformer_method": "extract_conditions",
    },
    "keywords": {
        "index_field": "protocolSection.conditionsModule.keywords",
        "object_type": "simple_array",
        "table_name": "keywords",
        "bridge_table_name": "bridge_study_keywords",
        "field_name": "keyword",
        "transformer_method": "extract_keywords",
    },
    "interventions": {
        "index_field": "protocolSection.armsInterventionsModule.interventions",
        "object_type": "array_of_dicts",
        "fields": [
            ("intervention_name", "name"),
            ("intervention_desc", "description"),
            ("intervention_type", "type"),
        ],
        "table_name": "interventions",
        "bridge_table_name": "bridge_study_interventions",
        "transformer_method": "extract_interventions",
    },
    "arm_groups": {
        "index_field": "protocolSection.armsInterventionsModule.armGroups",
        "object_type": "array_of_dicts",
        "table_name": "study_arm_group_interventions",
        "fields": [
            ("arm_group_label", "label"),
            ("arm_group_type", "type"),
            ("arm_group_desc", "description"),
        ],
        "transformer_method": "extract_arm_groups",
    },

    "central_contacts": {
        "index_field": "protocolSection.contactsLocationsModule.centralContacts",
        "object_type": "array_of_dicts",
        "table_name": "contacts",
        "bridge_table_name": "study_contacts",
        "fields": [
            ("name", "name"),
            ("role", "role"),
            ("email", "email"),
            ("phone", "phone"),
            ("phoneExt", "phoneExt"),
        ],
        "transformer_method": "extract_central_contacts",
    },
    "locations": {
        "index_field": "protocolSection.contactsLocationsModule.locations",
        "object_type": "array_of_dicts",
        "table_name": "sites",
        "bridge_table_name": "study_sites",
        "fields": [
            ("site_facility", "facility"),
            ("city", "city"),
            ("state", "state"),
            ("zip", "zip"),
            ("country", "country"),
            ("site_status", "status"),
        ],
        "nested": {
            "geoPoint": {
                "object_type": "simple_dict",
                "fields": ["lat", "lon"],
            },
            # Contacts are carried through as-is (saved as a JSON blob).
            "contacts": {
                "object_type": "nested_array_of_dicts",
                "table_name": "contacts",
                "bridge_table_name": "location_contacts",
                "fields": [
                    ("name", "name"),
                    ("role", "role"),
                    ("email", "email"),
                    ("phone", "phone"),
                    ("phoneExt", "phoneExt"),
                ],
                # NOTE(review): no extract_contacts function is defined in the
                # visible part of this notebook - confirm it exists elsewhere.
                "transformer_method": "extract_contacts",
            },
        },
        "transformer_method": "extract_locations",
    },

    # References module
    "references": {
        "index_field": "protocolSection.referencesModule.references",
        "object_type": "array_of_dicts",
        "table_name": "study_publications",
        "fields": ["pmid", "type"],
        "transformer_method": "extract_references",
    },

    "see_also": {
        "index_field": "protocolSection.referencesModule.seeAlsoLinks",
        "object_type": "array_of_dicts",
        "table_name": "study_see_also",
        "fields": ["label", "url"],
        "transformer_method": "extract_links",
    },

    "avail_ipds": {
        "index_field": "protocolSection.referencesModule.availIpds",
        "object_type": "array_of_dicts",
        "table_name": "study_ipds",
        "fields": ["id", "type", "url", "comment"],
        "transformer_method": "extract_ipds",
    },

    # Participant flow groups
    'flow_groups': {
        'index_field': 'resultsSection.participantFlowModule.groups',
        # 'type' is kept for any existing reader; 'object_type' is the key every
        # other spec uses, so generic code can rely on it being present.
        'type': 'array_of_dicts',
        'object_type': 'array_of_dicts',
        'bridge_table_name': 'study_flow_groups',
        'fields': ['id', 'title', 'description'],
        "transformer_method": "extract_flow_groups",
    },

    # Participant flow periods
    'flow_periods': {
        'index_field': 'resultsSection.participantFlowModule.periods',
        'type': 'array_of_dicts',  # legacy key, see flow_groups
        'object_type': 'array_of_dicts',
        'table_name': 'flow_periods',
        'bridge_table_name': 'study_flow_periods',
        # NOTE(review): other specs call this key 'fields'; left unchanged in
        # case something reads 'extract_fields' - confirm and unify.
        'extract_fields': ['title'],
        'nested': {
            'milestones': ['type', 'comment', 'achievements'],
            'dropWithdraws': ['type', 'comment', 'reasons']
        },
        # Was "extract_milestone_achievements", which is not defined; the
        # function that processes flow periods is extract_flow_events.
        "transformer_method": "extract_flow_events",
    },

}

In [303]:
def generate_key(*args) -> str:
    """Generate a deterministic surrogate key from the input values.

    Every argument is converted with ``str`` and the parts are joined with
    ``|`` before hashing, so the same values in the same order always produce
    the same key.

    ``None`` contributes an empty part instead of being dropped. Dropping it
    shifted the remaining values to the left, which made different inputs
    collide (``("a", None, "b")`` hashed the same as ``("a", "b")``). Keys for
    inputs that contain no ``None`` are unchanged by this.

    Args:
        *args: Values identifying the entity; order is significant.

    Returns:
        The first 16 hexadecimal characters of the SHA-256 digest.
    """
    combined = "|".join("" if arg is None else str(arg) for arg in args)
    return hashlib.sha256(combined.encode()).hexdigest()[:16]

In [304]:
def extract_study_fields(study_key: str, study_data: pd.Series) -> Dict:
    """Build the flat study record for one row.

    Args:
        study_key: Surrogate key of the study.
        study_data: One row of the normalized studies frame.

    Returns:
        A dict with ``study_key`` first, followed by one entry per
        ``SINGLE_FIELDS`` column; a source path missing from the row yields
        ``None`` for that column.
    """
    scalar_values = {
        column: study_data.get(source_path)
        for column, source_path in SINGLE_FIELDS.items()
    }
    return {"study_key": study_key, **scalar_values}

In [305]:
def extract_sponsors(idx: int, study_key: str, study_data: pd.Series):
    """Collect the lead sponsor and collaborators of one study.

    Args:
        idx: Row label of the study, used only in the diagnostic message.
        study_key: Surrogate key of the study.
        study_data: One row of the normalized studies frame.

    Returns:
        ``(sponsors, study_sponsors)``: sponsor dimension rows and the
        study-to-sponsor bridge rows (``is_lead`` marks the lead sponsor).
    """
    sponsor_rows = []
    bridge_rows = []

    def _register(name, sponsor_class, is_lead):
        # One dimension row plus one bridge row per sponsor occurrence.
        key = generate_key(name, sponsor_class)
        sponsor_rows.append(
            {"sponsor_key": key, "name": name, "sponsor_class": sponsor_class}
        )
        bridge_rows.append(
            {"study_key": study_key, "sponsor_key": key, "is_lead": is_lead}
        )

    # The lead sponsor is flattened into scalar columns, so its attributes are
    # read straight from the row rather than from a nested dict.
    lead_path = NESTED_FIELDS["sponsor"]["index_field"]
    lead_name = study_data.get(f'{lead_path}.name')
    lead_class = study_data.get(f'{lead_path}.class')

    if pd.isna(lead_name) or pd.isna(lead_class):
        print(f"No lead sponsor found for {idx}")
    else:
        _register(lead_name, lead_class, True)

    # Collaborators arrive as a list/array of dicts.
    collaborators = study_data.get(NESTED_FIELDS["collaborators"]["index_field"])
    if isinstance(collaborators, (list, np.ndarray)) and len(collaborators) > 0:
        for entry in collaborators:
            _register(entry.get("name"), entry.get("class"), False)

    return sponsor_rows, bridge_rows

In [306]:
def extract_conditions(idx: int, study_key: str, study_data: pd.Series) -> Tuple | None:
    """Collect the conditions studied in one study.

    Args:
        idx: Row label of the study, used only in the diagnostic message.
        study_key: Surrogate key of the study.
        study_data: One row of the normalized studies frame.

    Returns:
        ``(conditions, study_conditions)``: condition dimension rows and the
        study-to-condition bridge rows. Both are empty (and a message is
        printed) when the study lists no conditions.
    """
    raw_conditions = study_data.get(NESTED_FIELDS["conditions"]["index_field"])

    if not isinstance(raw_conditions, (list, np.ndarray)) or len(raw_conditions) == 0:
        print(f"No conditions found for {idx}")
        return [], []

    keyed = [(generate_key(name), name) for name in raw_conditions]
    condition_rows = [
        {"condition_key": key, "condition_name": name} for key, name in keyed
    ]
    bridge_rows = [
        {"study_key": study_key, "condition_key": key} for key, _ in keyed
    ]
    return condition_rows, bridge_rows

In [307]:
def extract_keywords(idx: Hashable, study_key: str, study_data: pd.Series) -> Tuple:
    """Collect the keywords attached to one study.

    Args:
        idx: Row label of the study (unused here).
        study_key: Surrogate key of the study.
        study_data: One row of the normalized studies frame.

    Returns:
        ``(keywords, study_keywords)``: keyword dimension rows and the
        study-to-keyword bridge rows; both empty when there are no keywords.
    """
    keyword_rows = []
    bridge_rows = []

    raw_keywords = study_data.get(NESTED_FIELDS["keywords"]["index_field"])
    if not (isinstance(raw_keywords, (list, np.ndarray)) and len(raw_keywords) > 0):
        return keyword_rows, bridge_rows

    for term in raw_keywords:
        term_key = generate_key(term)
        keyword_rows.append({"keyword_key": term_key, "keyword_name": term})
        bridge_rows.append({"study_key": study_key, "keyword_key": term_key})

    return keyword_rows, bridge_rows

In [308]:
def extract_interventions(idx: Hashable, study_key: str, study_data: pd.Series) -> Tuple:
    """Collect interventions of one study, one row per intervention name.

    The primary name and every entry of ``otherNames`` become separate rows;
    alternative names reuse the type and description of their parent.

    Args:
        idx: Row label of the study (unused here).
        study_key: Surrogate key of the study.
        study_data: One row of the normalized studies frame.

    Returns:
        ``(intervention_names, study_interventions)``: intervention rows and
        the study-to-intervention bridge rows (``is_primary_name`` tells the
        main name from an alias).
    """
    name_rows = []
    bridge_rows = []

    raw_interventions = study_data.get(NESTED_FIELDS["interventions"]["index_field"])
    if not (isinstance(raw_interventions, (list, np.ndarray)) and len(raw_interventions) > 0):
        return name_rows, bridge_rows

    for entry in raw_interventions:
        primary_name = entry.get("name")
        kind = entry.get("type")
        details = entry.get("description")

        aliases = entry.get("otherNames")
        if not (isinstance(aliases, (list, np.ndarray)) and len(aliases) > 0):
            aliases = ()

        # Primary name first, then aliases; some studies repeat the main name
        # inside otherNames, so those repeats are skipped.
        labelled_names = [(primary_name, True)]
        labelled_names.extend(
            (alias, False) for alias in aliases if not (alias == primary_name)
        )

        for label, is_primary in labelled_names:
            name_key = generate_key(label, kind)
            name_rows.append({
                "intervention_key": name_key,
                "intervention_name": label,
                "intervention_type": kind,
                "description": details,
            })
            bridge_rows.append({
                "study_key": study_key,
                "intervention_key": name_key,
                "is_primary_name": is_primary,
            })

    return name_rows, bridge_rows

In [309]:
def extract_arm_groups(idx: Hashable, study_key: str, study_data: pd.Series) -> List | None:
    """Flatten the arm groups of one study into arm/intervention rows.

    Args:
        idx: Row label of the study (unused here).
        study_key: Surrogate key of the study.
        study_data: One row of the normalized studies frame.

    Returns:
        One row per (arm, intervention name) pair. An arm that names no
        interventions still yields a single row whose
        ``arm_intervention_name`` is ``None``.
    """
    rows = []

    arms = study_data.get(NESTED_FIELDS["arm_groups"]["index_field"])
    if not (isinstance(arms, (list, np.ndarray)) and len(arms) > 0):
        return rows

    for arm in arms:
        label = arm.get("label")
        description = arm.get("description")
        arm_type = arm.get("type")
        arm_key = generate_key(study_key, label, description, arm_type)

        intervention_names = arm.get("interventionNames")
        if not (isinstance(intervention_names, (list, np.ndarray)) and len(intervention_names) > 0):
            # Keep the arm visible even without interventions.
            intervention_names = [None]

        for intervention_name in intervention_names:
            rows.append({
                "study_key": study_key,
                "arm_intervention_key": arm_key,
                "arm_label": label,
                "arm_description": description,
                "arm_type": arm_type,
                "arm_intervention_name": intervention_name,
            })

    return rows

In [310]:
def extract_central_contacts(idx: Hashable, study_key: str, study_data: pd.Series) -> Tuple:
    """Collect the central contacts of one study.

    Args:
        idx: Row label of the study (unused here).
        study_key: Surrogate key of the study.
        study_data: One row of the normalized studies frame.

    Returns:
        ``(central_contacts, study_central_contacts)``: contact rows keyed by
        name, role, phone and email, plus the study-to-contact bridge rows.
    """
    contact_rows, bridge_rows = [], []

    raw_contacts = study_data.get(NESTED_FIELDS["central_contacts"]["index_field"])
    has_contacts = isinstance(raw_contacts, (list, np.ndarray)) and len(raw_contacts) > 0

    for entry in (raw_contacts if has_contacts else ()):
        name = entry.get("name")
        role = entry.get("role")
        phone = entry.get("phone")
        email = entry.get("email")
        contact_key = generate_key(name, role, phone, email)

        contact_rows.append({
            "contact_key": contact_key,
            "contact_name": name,
            "contact_role": role,
            "contact_phone": phone,
            "contact_email": email,
        })
        bridge_rows.append({"study_key": study_key, "contact_key": contact_key})

    return contact_rows, bridge_rows

In [311]:
def resolve_location_status(location_statuses: Set) -> str:
    """Collapse the set of statuses reported for one location into one value.

    Rules, in order:
      * no status at all                       -> ``"UNKNOWN"``
      * exactly one status                     -> that status
      * RECRUITING + NOT_YET_RECRUITING only   -> ``"RECRUITING"`` (progression)
      * RECRUITING together with a final status (COMPLETED, TERMINATED,
        WITHDRAWN, checked in that order)      -> the final status
      * RECRUITING with anything else          -> ``"RECRUITING_STATUS_UNCLEAR"``
      * several non-recruiting statuses        -> the alphabetically first one

    The last rule used to return ``list(location_statuses)[0]``. Set iteration
    order for strings depends on hash randomization, so the same input could
    give different answers in different runs; picking the smallest value keeps
    "any of them will do" while making the result reproducible.

    Args:
        location_statuses: Distinct status strings seen for the location.

    Returns:
        The resolved status string.
    """
    if not location_statuses:
        return "UNKNOWN"

    if len(location_statuses) == 1:
        return next(iter(location_statuses))

    # Progression case.
    if location_statuses == {"RECRUITING", "NOT_YET_RECRUITING"}:
        return "RECRUITING"

    if "RECRUITING" in location_statuses:
        for final_status in ("COMPLETED", "TERMINATED", "WITHDRAWN"):
            if final_status in location_statuses:
                return final_status  # study ended, can't recruit

        # RECRUITING plus other ambiguous statuses.
        return "RECRUITING_STATUS_UNCLEAR"

    # Multiple non-recruiting statuses: any would do, choose deterministically.
    return min(location_statuses, key=str)

In [312]:
def extract_locations(idx: Hashable, study_key: str, study_data: pd.Series) -> Tuple:
    """Extract the locations of one study, with status resolution.

    A location is identified by facility, city, state and country. When the
    same location appears several times in a study, all of its reported
    statuses are gathered and collapsed with ``resolve_location_status``.

    Fixes relative to the previous version:
      * ``country`` is filled from the country field (it was filled from the
        state).
      * The status is derived from the source entries. It used to be read from
        the output location dicts, which never carry a ``status`` key, so every
        row resolved to ``"UNKNOWN"``.
      * A coordinate of ``0`` is kept; the truthiness test dropped it.
      * ``lat``/``lon`` are always present (``None`` when there is no geoPoint)
        so every location row has the same keys.

    Args:
        idx: Row label of the study (unused here).
        study_key: Surrogate key of the study.
        study_data: One row of the normalized studies frame.

    Returns:
        ``(locations, study_locations)``: location rows and the
        study-to-location bridge rows, one of each per source entry.
    """
    locations = []
    study_locations = []

    locations_index = NESTED_FIELDS["locations"]["index_field"]
    locations_list = study_data.get(locations_index)

    if not (isinstance(locations_list, (list, np.ndarray)) and len(locations_list) > 0):
        return locations, study_locations

    def _coordinate(value):
        # Only a missing/blank value maps to None; 0 is a valid coordinate.
        if value is None or value == "":
            return None
        return float(value)

    statuses_by_location = {}  # location_key -> set of statuses seen for it
    pending_links = []         # (location_key, contacts) per source entry

    for location in locations_list:
        facility = location.get("facility")
        city = location.get("city")
        state = location.get("state")
        country = location.get("country")

        location_key = generate_key(facility, city, state, country)

        geopoint = location.get("geoPoint")
        if not isinstance(geopoint, dict):
            geopoint = {}

        locations.append({
            "location_key": location_key,
            "facility": facility,
            "city": city,
            "state": state,
            "country": country,
            "lat": _coordinate(geopoint.get("lat")),
            "lon": _coordinate(geopoint.get("lon")),
        })

        seen_statuses = statuses_by_location.setdefault(location_key, set())
        status = location.get("status")
        if status:
            seen_statuses.add(status)

        pending_links.append((location_key, location.get("contacts")))

    # Statuses can only be resolved once every entry has been seen.
    for location_key, contacts in pending_links:
        study_locations.append({
            "study_key": study_key,
            "location_key": location_key,
            "status": resolve_location_status(statuses_by_location[location_key]),
            "status_type": "",  # TODO: actual or inferred - not populated yet
            "contacts": contacts,
        })

    return locations, study_locations

In [313]:
def extract_references(idx: Hashable, study_key: str, study_data: pd.Series) -> List:
    """Extract the literature references of one study.

    ``ref_key`` is built from the study and the reference's PubMed id. For a
    reference without a pmid the citation text is used instead; previously all
    such references of a study shared one key, because the key was computed
    from the study and a missing pmid only. Keys of references that do have a
    pmid are unchanged.

    Args:
        idx: Row label of the study (unused here).
        study_key: Surrogate key of the study.
        study_data: One row of the normalized studies frame.

    Returns:
        One row per reference with ``study_key``, ``ref_key``, ``pmid``,
        ``type`` and ``citation``.
    """
    study_references = []

    references_index = NESTED_FIELDS["references"]["index_field"]
    references_list = study_data.get(references_index)

    if isinstance(references_list, (list, np.ndarray)) and len(references_list) > 0:

        for reference in references_list:
            pmid = reference.get('pmid')
            citation = reference.get("citation")

            # Fall back to the citation so pmid-less references stay distinct.
            identity = pmid if pmid is not None else citation
            reference_key = generate_key(study_key, identity)

            study_references.append({
                "study_key": study_key,
                "ref_key": reference_key,
                "pmid": pmid,
                "type": reference.get("type"),
                "citation": citation
            })

    return study_references



In [314]:
def extract_links(idx: Hashable, study_key: str, study_data: pd.Series) -> List:
    """Extract the "see also" links of one study.

    Args:
        idx: Row label of the study (unused here).
        study_key: Surrogate key of the study.
        study_data: One row of the normalized studies frame.

    Returns:
        One row per link with ``study_key``, ``link_key`` (derived from the
        study and the link label), ``label`` and ``url``.
    """
    raw_links = study_data.get(NESTED_FIELDS["see_also"]["index_field"])
    if not isinstance(raw_links, (list, np.ndarray)) or len(raw_links) == 0:
        return []

    rows = []
    for entry in raw_links:
        caption = entry.get('label')
        rows.append({
            "study_key": study_key,
            "link_key": generate_key(study_key, caption),
            "label": caption,
            "url": entry.get("url"),
        })
    return rows

In [315]:
def extract_ipds(idx: Hashable, study_key: str, study_data: pd.Series) -> List:
    """Extract the available individual-participant-data entries of one study.

    Args:
        idx: Row label of the study (unused here).
        study_key: Surrogate key of the study.
        study_data: One row of the normalized studies frame.

    Returns:
        One row per entry with ``study_key``, ``ipd_key`` (derived from the
        study, id, type and url), ``id``, ``type``, ``url`` and ``comment``.
    """
    raw_ipds = study_data.get(NESTED_FIELDS["avail_ipds"]["index_field"])
    if not isinstance(raw_ipds, (list, np.ndarray)) or len(raw_ipds) == 0:
        return []

    rows = []
    for entry in raw_ipds:
        entry_id, entry_type, entry_url = entry.get('id'), entry.get('type'), entry.get('url')
        rows.append({
            "study_key": study_key,
            "ipd_key": generate_key(study_key, entry_id, entry_type, entry_url),
            "id": entry_id,
            "type": entry_type,
            "url": entry_url,
            "comment": entry.get("comment"),
        })
    return rows

In [316]:
def extract_flow_groups(idx: Hashable, study_key: str, study_data: pd.Series) -> List:
    """Extract the participant-flow groups of one study.

    Args:
        idx: Row label of the study (unused here).
        study_key: Surrogate key of the study.
        study_data: One row of the normalized studies frame.

    Returns:
        One row per group with ``study_key``, ``group_key`` (derived from the
        study and the group id), ``group_id``, ``title`` and ``description``.
    """
    raw_groups = study_data.get(NESTED_FIELDS["flow_groups"]["index_field"])
    if not isinstance(raw_groups, (list, np.ndarray)) or len(raw_groups) == 0:
        return []

    rows = []
    for group in raw_groups:
        source_id = group.get('id')
        rows.append({
            "study_key": study_key,
            "group_key": generate_key(study_key, source_id),
            "group_id": source_id,
            "title": group.get("title"),
            "description": group.get("description"),
        })
    return rows



In [317]:
def extract_flow_events(idx: Hashable, study_key: str, study_data: pd.Series) -> List:
    """Flatten participant-flow periods of one study into event rows.

    Each period contributes its milestones (``event_class`` "ACHIEVEMENT")
    followed by its drop/withdraw entries (``event_class`` "WITHDRAWAL"). An
    entry with per-group counts yields one row per count; an entry without
    them yields a single placeholder row with ``group_id`` "UNKNOWN" and no
    subject count.

    Args:
        idx: Row label of the study (unused here).
        study_key: Surrogate key of the study.
        study_data: One row of the normalized studies frame.

    Returns:
        The list of event rows, empty when the study has no flow periods.
    """
    def _has_items(value) -> bool:
        return isinstance(value, (list, np.ndarray)) and len(value) > 0

    events = []

    periods = study_data.get(NESTED_FIELDS["flow_periods"]["index_field"])
    if not _has_items(periods):
        return events

    # (field on the period, event_class label, field holding per-group counts)
    event_sources = (
        ('milestones', "ACHIEVEMENT", 'achievements'),
        ('dropWithdraws', "WITHDRAWAL", 'reasons'),
    )

    for period in periods:
        title = period.get('title')
        period_key = generate_key(study_key, title)

        for container_field, event_class, counts_field in event_sources:
            entries = period.get(container_field)
            if not _has_items(entries):
                continue

            for entry in entries:
                event_type = entry.get('type')
                group_counts = entry.get(counts_field)

                if _has_items(group_counts):
                    for group_count in group_counts:
                        events.append({
                            "study_key": study_key,
                            "period_key": period_key,
                            "event_class": event_class,
                            "event_type": event_type,
                            "period_title": title,
                            "group_id": group_count.get('groupId'),
                            "num_subjects": group_count.get('numSubjects'),
                        })
                else:
                    # Placeholder row; key order deliberately matches the
                    # order these rows have always been emitted with.
                    events.append({
                        "study_key": study_key,
                        "event_class": event_class,
                        "event_type": event_type,
                        "period_key": period_key,
                        "period_title": title,
                        "group_id": "UNKNOWN",
                        "num_subjects": None,  # not 0
                    })

    return events

In [None]:
# Accumulators: one flat list of row dicts per output table. The extraction
# loop below extends them study by study; they are turned into DataFrames once
# the loop has finished.
all_studies = []
all_sponsors = []
all_study_sponsors = []

all_conditions = []
all_study_conditions = []

all_keywords = []
all_study_keywords = []

all_arm_group_interventions = []

all_interventions = []
# NOTE(review): nothing in the visible code appends to this list - confirm it
# is still needed.
all_interventions_other_names = []
all_study_interventions = []

all_locations = []
all_study_locations = []
all_central_contacts = []
all_study_central_contacts = []

all_study_references = []
all_study_links = []

all_ipds = []

all_flow_groups = []
all_flow_period_events = []


# Load the raw batch; each element of the 'studies' column is one nested study
# record. json_normalize flattens nested dicts into dotted column names (the
# paths used in SINGLE_FIELDS / NESTED_FIELDS) and leaves lists as cell values.
df = pd.read_parquet("1.parquet")
df_studies = pd.json_normalize(df['studies'].tolist())

# Main extraction pass: for every study row, derive its surrogate key and feed
# the row through each extract_* function, appending the results to the
# accumulators defined above.
for idx, study in df_studies.iterrows():
    # The study key is derived from the NCT id alone.
    # (nct_index does not change between iterations.)
    nct_index = SINGLE_FIELDS['nct_id']
    nct_id = study.get(nct_index)
    
    study_key = generate_key(nct_id)


    # study (flat scalar columns)
    study_record = extract_study_fields(study_key, study)
    all_studies.append(study_record)

    # sponsors (lead sponsor + collaborators)
    sponsors, study_sponsors = extract_sponsors(idx, study_key, study)
    all_sponsors.extend(sponsors)
    all_study_sponsors.extend(study_sponsors)

    # conditions and keywords
    conditions, study_conditions = extract_conditions(idx, study_key, study)
    all_conditions.extend(conditions)
    all_study_conditions.extend(study_conditions)

    keywords, study_keywords = extract_keywords(idx, study_key, study)
    all_keywords.extend(keywords)
    all_study_keywords.extend(study_keywords)


    # groups and interventions
    arm_group_interventions = extract_arm_groups(idx, study_key, study)
    all_arm_group_interventions.extend(arm_group_interventions)

    interventions, study_interventions = extract_interventions(idx, study_key, study)
    all_interventions.extend(interventions)
    all_study_interventions.extend(study_interventions)

    # contacts and locations
    central_contacts, study_central_contacts  = extract_central_contacts(idx, study_key, study)
    all_central_contacts.extend(central_contacts)
    all_study_central_contacts.extend(study_central_contacts)

    locations, study_locations = extract_locations(idx, study_key, study)
    all_locations.extend(locations)
    all_study_locations.extend(study_locations)


    # links and references
    references = extract_references(idx, study_key, study)
    all_study_references.extend(references)

    links = extract_links(idx, study_key, study)
    all_study_links.extend(links)

    ipds = extract_ipds(idx, study_key, study)
    all_ipds.extend(ipds)

    # participant flow (results section)
    flow_groups = extract_flow_groups(idx,study_key, study)
    all_flow_groups.extend(flow_groups)

    flow_period_events = extract_flow_events(idx, study_key, study)
    all_flow_period_events.extend(flow_period_events)




# Materialize each accumulator as a DataFrame in one step (building the frame
# once from a list of dicts, rather than growing it row by row). An empty
# accumulator produces a frame with no columns.
studies = pd.DataFrame(all_studies)

df_sponsors = pd.DataFrame(all_sponsors)
df_study_sponsors = pd.DataFrame(all_study_sponsors)

df_conditions = pd.DataFrame(all_conditions)
df_study_conditions= pd.DataFrame(all_study_conditions)

df_keywords = pd.DataFrame(all_keywords)
df_study_keywords = pd.DataFrame(all_study_keywords)

df_arm_group_interventions = pd.DataFrame(all_arm_group_interventions)
df_interventions = pd.DataFrame(all_interventions)
df_study_interventions = pd.DataFrame(all_study_interventions)


df_central_contacts = pd.DataFrame(all_central_contacts)
df_study_central_contacts = pd.DataFrame(all_study_central_contacts)
df_locations = pd.DataFrame(all_locations)
df_study_locations = pd.DataFrame(all_study_locations)


df_references = pd.DataFrame(all_study_references)
df_links = pd.DataFrame(all_study_links)

df_ipds = pd.DataFrame(all_ipds)

df_flow_groups = pd.DataFrame(all_flow_groups)
df_flow_period_events = pd.DataFrame(all_flow_period_events)


print("----------------")
print(f"STUDIES {len(studies)}")
print("----------------")


# Dedupe and inspect: for every table, print the row count before and after
# dropping duplicate rows, and rebind the table name to the deduplicated frame.
def dedupe_and_report(frame: pd.DataFrame, subset: list, label: str, deduped_label: str = "") -> pd.DataFrame:
    """Drop duplicate rows from ``frame`` and print its size before and after.

    Args:
        frame: DataFrame to deduplicate; it is not modified.
        subset: Column names that together identify a duplicate row.
        label: Text printed in front of the row count before deduplication.
        deduped_label: Text printed in front of the row count afterwards.
            When empty, ``"DEDUPED " + label`` is used.

    Returns:
        A new DataFrame that keeps the first row of each ``subset`` combination
        (the default ``keep="first"`` of ``DataFrame.drop_duplicates``).
    """
    if not deduped_label:
        deduped_label = f"DEDUPED {label}"
    print(f"{label} {len(frame)}")
    deduped = frame.drop_duplicates(subset=subset)
    print(f"{deduped_label} {len(deduped)}")
    return deduped


# NOTE: a few labels below carry a leading space or mixed case. They are passed
# explicitly so the printed report stays exactly as it was.

df_sponsors = dedupe_and_report(df_sponsors, ["sponsor_key"], "SPONSORS")
df_study_sponsors = dedupe_and_report(
    df_study_sponsors, ["sponsor_key", "study_key"], "STUDY SPONSORS"
)
print("----------------")

df_conditions = dedupe_and_report(df_conditions, ["condition_key"], "CONDITIONS")
df_study_conditions = dedupe_and_report(
    df_study_conditions, ["condition_key", "study_key"], "STUDY CONDITIONS"
)
print("----------------")

df_keywords = dedupe_and_report(df_keywords, ["keyword_key"], "KEYWORDS")
df_study_keywords = dedupe_and_report(
    df_study_keywords, ["keyword_key", "study_key"], "STUDY KEYWORDS"
)
print("----------------")


# Debug aid: list study/intervention pairs that occur more than once.
# duplicates = df_study_interventions[
#     df_study_interventions.duplicated(subset=["intervention_key", "study_key"], keep=False)
# ].sort_values(["study_key", "intervention_key"])
# print(df_study_interventions.groupby(['study_key', 'intervention_key']).size().sort_values(ascending=False).head(20))
# print(duplicates.head(20))


df_interventions = dedupe_and_report(df_interventions, ["intervention_key"], "INTERVENTIONS")
df_study_interventions = dedupe_and_report(
    df_study_interventions, ["intervention_key", "study_key"], "STUDY INTERVENTIONS"
)
print("----------------")

df_arm_group_interventions = dedupe_and_report(
    df_arm_group_interventions,
    ["arm_intervention_key", "study_key", "arm_intervention_name"],
    "ARM GROUPS INTERVENTION",
)
print("----------------")

df_locations = dedupe_and_report(
    df_locations, ["location_key"], "LOCATIONS", " DEDUPED LOCATIONS"
)

# Debug aid: list study/location pairs that occur more than once.
# duplicates = df_study_locations[
#     df_study_locations.duplicated(subset=["study_key", "location_key"], keep=False)
# ]
# print(duplicates.head(20))

df_study_locations = dedupe_and_report(
    df_study_locations,
    ["location_key", "study_key"],
    " STUDY LOCATIONS",
    " DEDUPED STUDY LOCATIONS",
)
print("----------------")

df_central_contacts = dedupe_and_report(
    df_central_contacts, ["contact_key"], "CONTACTS", " DEDUPED CONTACTS"
)
df_study_central_contacts = dedupe_and_report(
    df_study_central_contacts, ["contact_key", "study_key"], "STUDY CONTACTS"
)
print("----------------")

df_references = dedupe_and_report(df_references, ["study_key", "ref_key"], "REFERENCES")
# Fix: the post-dedup count used to be printed under the plain "LINKS" label,
# making it indistinguishable from the pre-dedup count.
df_links = dedupe_and_report(df_links, ["study_key", "link_key", "url"], "LINKS")
print("----------------")

df_ipds = dedupe_and_report(df_ipds, ["study_key", "ipd_key"], "IPDS")
print("----------------")

df_flow_groups = dedupe_and_report(df_flow_groups, ["study_key", "group_key"], "FLOW GROUPS")

# One shared key definition so the inspection below and the deduplication
# always look at the same columns.
FLOW_EVENT_KEY_COLUMNS = ["study_key", "period_key", "group_id", "event_class", "event_type"]

# Show a sample of the flow-event rows that share a key (keep=False marks
# every member of a duplicate group, not only the later ones).
duplicates = df_flow_period_events[
    df_flow_period_events.duplicated(subset=FLOW_EVENT_KEY_COLUMNS, keep=False)
]
print(duplicates.head(10))
df_flow_period_events = dedupe_and_report(
    df_flow_period_events, FLOW_EVENT_KEY_COLUMNS, "EVENTS", "DEDUPED Events"
)
print("----------------")

In [140]:
# Export the study table to CSV; index=False leaves the DataFrame index out of the file.
studies.to_csv("data/study_data.csv", index=False)

In [None]:
# Export the sponsor table and the study-to-sponsor bridge table to CSV
# (index=False leaves the DataFrame index out of the files).
df_sponsors.to_csv("data/sponsors.csv", index=False)
df_study_sponsors.to_csv("data/bridge_study_sponsors.csv", index=False)

In [None]:
# Export the condition table and the study-to-condition table to CSV
# (index=False leaves the DataFrame index out of the files).
df_conditions.to_csv("data/conditions.csv", index=False)
df_study_conditions.to_csv("data/study_conditions.csv", index=False)

In [None]:
# Export the keyword table and the study-to-keyword table to CSV
# (index=False leaves the DataFrame index out of the files).
df_keywords.to_csv("data/keywords.csv", index=False)
df_study_keywords.to_csv("data/study_keywords.csv", index=False)

In [None]:
# Export the intervention table, the study-to-intervention table and the
# arm-group intervention table to CSV (index=False leaves the DataFrame index
# out of the files).
df_interventions.to_csv("data/interventions.csv", index=False)
df_study_interventions.to_csv("data/study_interventions.csv", index=False)
df_arm_group_interventions.to_csv("data/arm_groups_intrv.csv", index=False)

In [None]:
# Export the central-contact table and the study-to-contact table to CSV
# (index=False leaves the DataFrame index out of the files).
df_central_contacts.to_csv("data/contacts.csv", index=False)
df_study_central_contacts.to_csv("data/study_contacts.csv", index=False)

In [85]:
# Export the study-to-location table and the location table to CSV
# (index=False leaves the DataFrame index out of the files).
df_study_locations.to_csv("data/study_locations.csv", index=False)
df_locations.to_csv("data/locations.csv", index=False)

In [229]:
# studies[studies['study_key'] == 'e5631c51402e5389']

In [104]:
# Export the per-study reference table and the per-study link table to CSV
# (index=False leaves the DataFrame index out of the files).
df_references.to_csv("data/refs.csv", index=False)
df_links.to_csv("data/links.csv", index=False)

In [257]:
# Export the IPD table to CSV; index=False leaves the DataFrame index out of the file.
df_ipds.to_csv("data/ipds.csv", index=False)

In [320]:
# Export the flow-group table and the flow period-event table to CSV
# (index=False leaves the DataFrame index out of the files).
df_flow_groups.to_csv("data/flow_groups.csv", index=False)
df_flow_period_events.to_csv("data/flow_events.csv", index=False)