In [1]:
from typing import Dict, List, Any, Hashable, Tuple
from datetime import datetime
import pandas as pd
import hashlib
import json

In [5]:
# Mapping of output column name -> dot-separated path of the value inside one
# study record. extract_study_fields() walks this mapping and resolves every
# path with deep_get(), so each entry becomes one key of the flat study record.
# NOTE(review): the paths look like the ClinicalTrials.gov study JSON layout
# (protocolSection / resultsSection / derivedSection ...) - confirm against the
# actual data source before adding or renaming entries.
SINGLE_FIELDS = {
    # Identification
    "nct_id": "protocolSection.identificationModule.nctId",
    "brief_title": "protocolSection.identificationModule.briefTitle",
    "official_title": "protocolSection.identificationModule.officialTitle",
    "acronym": "protocolSection.identificationModule.acronym",
    "org_study_id": "protocolSection.identificationModule.orgStudyIdInfo.id",
    "org_study_type": "protocolSection.identificationModule.orgStudyIdInfo.type",
    "org_study_link": "protocolSection.identificationModule.orgStudyIdInfo.link",

    # Description
    "brief_summary": "protocolSection.descriptionModule.briefSummary",
    "detailed_desc": "protocolSection.descriptionModule.detailedDescription",

    # ResponsibleParty
    "responsible_party": "protocolSection.sponsorCollaboratorsModule.responsibleParty.type",

    # Design (single values)
    "study_type": "protocolSection.designModule.studyType",
    "patient_registry": "protocolSection.designModule.patientRegistry",
    "enrollment_type": "protocolSection.designModule.enrollmentInfo.type",
    "enrollment_count": "protocolSection.designModule.enrollmentInfo.count",
    "design_allocation": "protocolSection.designModule.designInfo.allocation",
    "design_intervention_model": "protocolSection.designModule.designInfo.interventionModel",
    "design_intervention_model_desc": "protocolSection.designModule.designInfo.interventionModelDescription",
    "design_primary_purpose": "protocolSection.designModule.designInfo.primaryPurpose",
    "design_observational_model": "protocolSection.designModule.designInfo.observationalModel",
    "design_time_perspective": "protocolSection.designModule.designInfo.timePerspective",
    "design_masking": "protocolSection.designModule.designInfo.maskingInfo.masking",
    "design_masking_desc": "protocolSection.designModule.designInfo.maskingInfo.maskingDescription",
    "design_who_masked": "protocolSection.designModule.designInfo.maskingInfo.whoMasked",

    # Expanded access
    "exp_acc_type_individual": "protocolSection.designModule.expandedAccessTypes.individual",
    "exp_acc_type_intermediate": "protocolSection.designModule.expandedAccessTypes.intermediate",
    "exp_acc_type_treatment": "protocolSection.designModule.expandedAccessTypes.treatment",

    # Biospecimen
    "biospec_retention": "protocolSection.designModule.bioSpec.retention",
    "biospec_desc": "protocolSection.designModule.bioSpec.description",

    # Eligibility
    "eligibility_criteria": "protocolSection.eligibilityModule.eligibilityCriteria",
    "healthy_volunteers": "protocolSection.eligibilityModule.healthyVolunteers",
    "sex": "protocolSection.eligibilityModule.sex",
    "gender_based": "protocolSection.eligibilityModule.genderBased",
    "gender_desc": "protocolSection.eligibilityModule.genderDescription",
    "min_age": "protocolSection.eligibilityModule.minimumAge",
    "max_age": "protocolSection.eligibilityModule.maximumAge",
    "population_desc": "protocolSection.eligibilityModule.studyPopulation",
    "sampling_method": "protocolSection.eligibilityModule.samplingMethod",

    # Status
    "overall_status": "protocolSection.statusModule.overallStatus",
    "last_known_status": "protocolSection.statusModule.lastKnownStatus",
    "status_verified_date": "protocolSection.statusModule.statusVerifiedDate",
    "delayed_posting": "protocolSection.statusModule.delayedPosting",
    "start_date": "protocolSection.statusModule.startDateStruct.date",
    "start_date_type": "protocolSection.statusModule.startDateStruct.type",
    "first_submit_date": "protocolSection.statusModule.studyFirstSubmitDate",
    "first_submit_qc_date": "protocolSection.statusModule.studyFirstSubmitQcDate",
    "last_update_submit_date": "protocolSection.statusModule.lastUpdateSubmitDate",
    "completion_date": "protocolSection.statusModule.completionDateStruct.date",
    "completion_date_type": "protocolSection.statusModule.completionDateStruct.type",
    "why_stopped": "protocolSection.statusModule.whyStopped",
    "has_expanded_access": "protocolSection.statusModule.expandedAccessInfo.hasExpandedAccess",

    # Oversight
    "has_dmc": "protocolSection.oversightModule.oversightHasDmc",
    "is_fda_regulated_drug": "protocolSection.oversightModule.isFdaRegulatedDrug",
    "is_fda_regulated_device": "protocolSection.oversightModule.isFdaRegulatedDevice",
    "is_unapproved_device": "protocolSection.oversightModule.isUnapprovedDevice",
    "is_us_export": "protocolSection.oversightModule.isUsExport",

    # Individual participant data
    "ipd_sharing": "protocolSection.ipdSharingStatementModule.ipdSharing",
    "ipd_desc": "protocolSection.ipdSharingStatementModule.description",
    "ipd_time_frame": "protocolSection.ipdSharingStatementModule.timeFrame",
    "ipd_access_criteria": "protocolSection.ipdSharingStatementModule.accessCriteria",
    "ipd_url": "protocolSection.ipdSharingStatementModule.url",

    # Large documents
    "large_doc_no_sap": "documentSection.largeDocumentModule.noSap",

    # Miscellaneous
    "version_holder": "derivedSection.miscInfoModule.versionHolder",
    "has_results": "hasResults",
    "last_updated": "protocolSection.statusModule.lastUpdatePostDateStruct.date",
    "unposted_responsible_party": "annotationSection.annotationModule.unpostedAnnotation.unpostedResponsibleParty",
    "limitations_desc": "resultsSection.moreInfoModule.limitationsAndCaveats.description",

    # Certain agreements
    "certain_agreement_pi_sponsor_employee": "resultsSection.moreInfoModule.certainAgreement.piSponsorEmployee",
    "certain_agreement_restriction_type": "resultsSection.moreInfoModule.certainAgreement.restrictionType",
    "certain_agreement_restrictive": "resultsSection.moreInfoModule.certainAgreement.restrictiveAgreement",
    "certain_agreement_other_details": "resultsSection.moreInfoModule.certainAgreement.otherDetails",

    # Point of contact
    "poc_title": "resultsSection.moreInfoModule.pointOfContact.title",
    "poc_organization": "resultsSection.moreInfoModule.pointOfContact.organization",
    "poc_email": "resultsSection.moreInfoModule.pointOfContact.email",
    "poc_phone": "resultsSection.moreInfoModule.pointOfContact.phone",
    "poc_phone_ext": "resultsSection.moreInfoModule.pointOfContact.phoneExt",

    # Submission tracking
    "sub_tracking_estimated_results_date": "derivedSection.miscInfoModule.submissionTracking.estimatedResultsFirstSubmitDate",
    "sub_tracking_first_mcp_date": "derivedSection.miscInfoModule.submissionTracking.firstMcpInfo.postDateStruct.date",
    "sub_tracking_first_mcp_type": "derivedSection.miscInfoModule.submissionTracking.firstMcpInfo.postDateStruct.type",
}



In [9]:
def generate_key(*args) -> str:
    """Generate a deterministic 16-hex-character surrogate key from input values.

    Every argument is converted with ``str()``, the parts are joined with
    ``"|"``, and the first 16 hex digits of the SHA-256 digest of the UTF-8
    encoded result are returned.

    ``None`` arguments contribute an empty string but keep their position, so
    ``generate_key("a", None, "b")`` and ``generate_key("a", "b")`` produce
    different keys. (Dropping ``None`` entirely, as this function used to do,
    shifted the remaining values and made such calls collide.) Keys for calls
    that contain no ``None`` argument are unchanged.

    Note:
        ``None`` and ``""`` are still indistinguishable, and values that
        themselves contain ``"|"`` can still collide with a multi-argument
        call; avoid the separator in key components.

    Args:
        *args: Values that together identify the entity.

    Returns:
        A 16-character lowercase hexadecimal string.
    """
    # Keep one slot per argument: removing None would move later values left
    # and let distinct inputs hash to the same key.
    combined = "|".join("" if arg is None else str(arg) for arg in args)
    return hashlib.sha256(combined.encode("utf-8")).hexdigest()[:16]

In [10]:
def deep_get(data, path: str):
    """Look up ``path`` in either a flat ``pd.Series`` or a nested dict.

    * ``pd.Series`` (e.g. one row of a normalised DataFrame): ``path`` is used
      verbatim as a single label via ``Series.get``; an unknown label gives
      ``None``.
    * anything else: ``path`` is split on ``"."`` and followed one key at a
      time through nested dicts. ``None`` is returned as soon as the current
      value is not a dict, and a missing key likewise yields ``None``.
    """
    # Rows of a normalised DataFrame carry the full dotted name as their label.
    if isinstance(data, pd.Series):
        return data.get(path)

    # Nested-dict traversal, bailing out at the first non-dict level.
    node = data
    for part in path.split('.'):
        if not isinstance(node, dict):
            return None
        node = node.get(part)
    return node

In [11]:
def extract_study_fields(study_key: str, study_df: Any) -> Dict:
    """Build one flat record for a single study.

    Args:
        study_key: Surrogate key stored under ``"study_key"`` in the result.
        study_df: One study, in any form ``deep_get`` accepts: a nested dict
            or a flattened ``pd.Series`` row. Despite the name this is a
            single study, not a DataFrame; the caller in this file passes a
            row produced by ``DataFrame.iterrows()``.

    Returns:
        A dict whose first entry is ``"study_key"``, followed by one entry per
        ``SINGLE_FIELDS`` key (in mapping order) holding whatever ``deep_get``
        returns for that key's path.
    """
    study_record = {'study_key': study_key}
    # items() yields name and path together, avoiding a second lookup per key.
    study_record.update(
        (column, deep_get(study_df, path))
        for column, path in SINGLE_FIELDS.items()
    )
    return study_record

In [16]:
# Accumulators for the output tables. Only all_studies is filled by the code
# in this cell; the other lists are not touched here (presumably populated by
# extraction steps that are not part of this cell - TODO confirm).
all_studies = []
all_sponsors = []
all_study_sponsors = []

all_conditions = []
all_study_conditions = []

all_keywords = []
all_study_keywords = []

all_study_arm_groups = []
all_study_arm_group_interventions = []

all_interventions = []
all_interventions_other_names = []
all_study_interventions = []

# Load the 'studies' column of the parquet file and flatten every study into
# one row. json_normalize names the columns with dot-separated paths, which is
# the form the SINGLE_FIELDS paths are looked up in.
df = pd.read_parquet("page1.parquet")
df_studies = pd.json_normalize(df['studies'].tolist())


for idx, study in df_studies.iterrows():
    # `study` is a pd.Series, so deep_get does a flat lookup by full dotted name.
    nct_id = deep_get(study, 'protocolSection.identificationModule.nctId')
    # NOTE(review): a row without an nctId would presumably carry NaN here, and
    # all such rows would then share one key - verify the input never lacks it.
    study_key = generate_key(nct_id)


    # study: one flat record per input row
    study_record = extract_study_fields(study_key, study)
    all_studies.append(study_record)


# One row per study, one column per key of the extracted records.
df_studies1 = pd.DataFrame(all_studies)

In [17]:
# Write the flat study table to CSV; index=False keeps the DataFrame's row
# index out of the file.
df_studies1.to_csv("output1.csv", index=False)