In [16]:
import pandas as pd
import pyarrow.parquet as pq

In [17]:
# Open "page1.parquet" with pyarrow's ParquetFile; `pf` is used in the next cell to print the schema.
pf = pq.ParquetFile("page1.parquet")

In [18]:
# Print the Parquet schema of `pf` to see how the nested `studies` group is laid out.
print(pf.schema)

<pyarrow._parquet.ParquetSchema object at 0x000001E37DCC6F40>
required group field_id=-1 schema {
  optional group field_id=-1 studies {
    optional group field_id=-1 annotationSection {
      optional group field_id=-1 annotationModule {
        optional group field_id=-1 unpostedAnnotation {
          optional group field_id=-1 unpostedEvents (List) {
            repeated group field_id=-1 list {
              optional group field_id=-1 element {
                optional binary field_id=-1 date (String);
                optional binary field_id=-1 type (String);
              }
            }
          }
          optional binary field_id=-1 unpostedResponsibleParty (String);
        }
      }
    }
    optional group field_id=-1 derivedSection {
      optional group field_id=-1 conditionBrowseModule {
        optional group field_id=-1 ancestors (List) {
          repeated group field_id=-1 list {
            optional group field_id=-1 element {
              optional binary field_i

In [19]:
# Load the same file ("page1.parquet") into a pandas DataFrame; later cells read its 'studies' column.
df_studies = pd.read_parquet("page1.parquet")

In [20]:
# Pull the 'protocolSection' entry out of every study record (None when a record lacks the key).
all_protocol_data = df_studies['studies'].map(
    lambda study: study.get('protocolSection')
)

In [21]:
# Collect each study's 'armsInterventionsModule' entry into a plain Python list
# (despite the `_df` suffix, this is a list, not a DataFrame).
intervention_df = list(
    map(lambda section: section['armsInterventionsModule'], all_protocol_data)
)

In [22]:
# Show the first study's armsInterventionsModule entry (`intervention_df` is a list, so [0] is its first item).
print(intervention_df[0])

{'armGroups': array([{'description': "Participants in this group will receive an intervention based on the salutogenic model, aimed at enhancing the sense of coherence and promoting healthy aging. The program will focus on improving participants' overall quality of life by strengthening their internal resources and fostering resilience. The intervention will include educational sessions, guided discussions, and practical strategies for maintaining physical and mental health in older adults with hypertension and diabetes. The intervention will be delivered over a specified period, with regular follow-ups to assess progress.", 'interventionNames': array(['Behavioral: Salutogenic Model-Based Healthy Aging Program'],
             dtype=object), 'label': 'Intervention Group', 'type': 'EXPERIMENTAL'}                                                                                                                                                                                                   

In [23]:
# Flatten each study record into dotted column names
# (e.g. 'protocolSection.identificationModule.nctId', which the next cells index).
df_normalized = pd.json_normalize(df_studies['studies'])

In [24]:
# print(df_normalized)

In [25]:
# Series of NCT identifiers, one per study row.
nct_series = df_normalized.loc[:, 'protocolSection.identificationModule.nctId']

In [30]:
def _pluck_from_records(records, key):
    """Collect ``key`` from every dict in a list-like cell.

    Args:
        records: A list/array of dicts taken from one DataFrame cell, or a
            missing marker (``None`` or float NaN).
        key: Field to read from each dict.

    Returns:
        A list with one value per record (``None`` where a record is null or
        lacks ``key``), or ``None`` when the cell itself is missing.
    """
    if records is None or isinstance(records, float):
        return None
    return [None if record is None else record.get(key) for record in records]


# armGroups holds a list of dicts per study (see the sample printed earlier), so
# json_normalize keeps it as a single column instead of creating 'armGroups.label' etc.
arm_groups = df_normalized['protocolSection.armsInterventionsModule.armGroups']

# One row per study, built from the flattened (dotted-path) columns of df_normalized.
# Entries that are commented out point at fields that are either absent from this
# page or live inside nested lists, which json_normalize does not flatten into columns.
study_df = pd.DataFrame({
    #identification
    'nct_id': df_normalized['protocolSection.identificationModule.nctId'],
    'title': df_normalized['protocolSection.identificationModule.officialTitle'],
    'acronym': df_normalized['protocolSection.identificationModule.acronym'],
    'organization_name': df_normalized['protocolSection.identificationModule.organization.fullName'],
    'organization_type': df_normalized['protocolSection.identificationModule.organization.class'],

    #sponsors and collaborators
    'sponsor_name': df_normalized['protocolSection.sponsorCollaboratorsModule.leadSponsor.name'],
    'sponsor_type': df_normalized['protocolSection.sponsorCollaboratorsModule.leadSponsor.class'],
    # 'collaborator_name': df_normalized['protocolSection.sponsorCollaboratorsModule.collaborators.name'],
    # 'collaborator_type': df_normalized['protocolSection.sponsorCollaboratorsModule.collaborators.class'],

    #description
    'brief_summary': df_normalized['protocolSection.descriptionModule.briefSummary'],
    'detailed_description': df_normalized['protocolSection.descriptionModule.detailedDescription'],

    #conditions
    'condition': df_normalized['protocolSection.conditionsModule.conditions'],
    'keyword': df_normalized['protocolSection.conditionsModule.keywords'],

    #design
    'study_type': df_normalized['protocolSection.designModule.studyType'],
    'patient_registry': df_normalized['protocolSection.designModule.patientRegistry'],
    'target_duration': df_normalized['protocolSection.designModule.targetDuration'],
    # phases is the study-phase field of designModule (targetDuration is a separate field)
    'study_phase': df_normalized['protocolSection.designModule.phases'],

    'design_allocation': df_normalized['protocolSection.designModule.designInfo.allocation'],
    'design_intervention_model': df_normalized['protocolSection.designModule.designInfo.interventionModel'],
    'design_intervention_model_desc': df_normalized['protocolSection.designModule.designInfo.interventionModelDescription'],

    'design_primary_purpose': df_normalized['protocolSection.designModule.designInfo.primaryPurpose'],
    'design_observational_model': df_normalized['protocolSection.designModule.designInfo.observationalModel'],
    'design_time_perspective': df_normalized['protocolSection.designModule.designInfo.timePerspective'],
    'design_masking': df_normalized['protocolSection.designModule.designInfo.maskingInfo.masking'],
    # NOTE(review): 'decsription' is misspelled; name kept so the existing CSV header does not change.
    'design_masking_decsription': df_normalized['protocolSection.designModule.designInfo.maskingInfo.maskingDescription'],
    'design_who_masked': df_normalized['protocolSection.designModule.designInfo.maskingInfo.whoMasked'],

    #expanded access
    # 'exp_acc_type_individual': df_normalized['protocolSection.designModule.expandedAccessTypes.individual'],
    'exp_acc_type_intermediate': df_normalized['protocolSection.designModule.expandedAccessTypes.intermediate'],
    'exp_acc_type_treatment': df_normalized['protocolSection.designModule.expandedAccessTypes.treatment'],

    #Biospec
    'biospec_retention': df_normalized['protocolSection.designModule.bioSpec.retention'],
    'biospec_description': df_normalized['protocolSection.designModule.bioSpec.description'],

    #enrollment
    'enrollment_type': df_normalized['protocolSection.designModule.enrollmentInfo.type'],
    'enrollment_count': df_normalized['protocolSection.designModule.enrollmentInfo.count'],

    #arm group
    # one list per study, with one entry per arm (None when the study has no armGroups)
    'arm_group_label': arm_groups.map(lambda groups: _pluck_from_records(groups, 'label')),
    'arm_group_type': arm_groups.map(lambda groups: _pluck_from_records(groups, 'type')),

    # 'arm_group_description': df_normalized['protocolSection.armsInterventionsModule.armGroups.description'],
    # 'arm_group_intervention_names': df_normalized['protocolSection.armsInterventionsModule.armGroups.interventionNames'],

    #intervention
    # 'intervention_type': df_normalized['protocolSection.armsInterventionsModule.interventions.type'],
    # 'intervention_name': df_normalized['protocolSection.armsInterventionsModule.interventions.name'],
    # 'intervention_description': df_normalized['protocolSection.armsInterventionsModule.interventions.description'],
    # 'intervention_arm_group_labels': df_normalized['protocolSection.armsInterventionsModule.interventions.armGroupLabels'],
    # 'intervention_other_names': df_normalized['protocolSection.armsInterventionsModule.interventions.otherNames'],

    #outcomes
    # 'primary_outcome_measure': df_normalized['protocolSection.outcomesModule.primaryOutcomes.measure'],
    # 'primary_outcome_description': df_normalized['protocolSection.outcomesModule.primaryOutcomes.description'],
    # 'primary_outcome_time_frame': df_normalized['protocolSection.outcomesModule.primaryOutcomes.timeFrame'],

    # 'secondary_outcome_measure': df_normalized['protocolSection.outcomesModule.secondaryOutcomes.measure'],
    # 'secondary_outcome_description': df_normalized['protocolSection.outcomesModule.secondaryOutcomes.description'],
    # 'secondary_outcome_time_frame': df_normalized['protocolSection.outcomesModule.secondaryOutcomes.timeFrame'],

    # 'other_outcome_measure': df_normalized['protocolSection.outcomesModule.otherOutcomes.measure'],
    # 'other_outcome_description': df_normalized['protocolSection.outcomesModule.otherOutcomes.description'],
    # 'other_outcome_time_frame': df_normalized['protocolSection.outcomesModule.otherOutcomes.timeFrame'],

    #eligibility
    'eligibility_criteria': df_normalized['protocolSection.eligibilityModule.eligibilityCriteria'],
    'healthy_volunteers': df_normalized['protocolSection.eligibilityModule.healthyVolunteers'],

    'sex': df_normalized['protocolSection.eligibilityModule.sex'],
    'gender_based': df_normalized['protocolSection.eligibilityModule.genderBased'],
    'gender_description': df_normalized['protocolSection.eligibilityModule.genderDescription'],
    'min_age': df_normalized['protocolSection.eligibilityModule.minimumAge'],
    'max_age': df_normalized['protocolSection.eligibilityModule.maximumAge'],
    'population_description': df_normalized['protocolSection.eligibilityModule.studyPopulation'],
    'sampling_method': df_normalized['protocolSection.eligibilityModule.samplingMethod'],

    #central contact
    # 'central_contact_name': df_normalized['protocolSection.contactsLocationsModule.centralContacts.name'],
    # 'central_contact_role': df_normalized['protocolSection.contactsLocationsModule.centralContacts.role'],
    # 'central_contact_phone': df_normalized['protocolSection.contactsLocationsModule.centralContacts.phone'],
    # 'central_contact_phone_ext': df_normalized['protocolSection.contactsLocationsModule.centralContacts.phoneExt'],
    # 'central_contact_email': df_normalized['protocolSection.contactsLocationsModule.centralContacts.email'],

     #study officials
    # 'official_name': df_normalized['protocolSection.contactsLocationsModule.overallOfficials.name'],
    # 'official_role': df_normalized['protocolSection.contactsLocationsModule.overallOfficials.role'],
    # 'official_affiliation': df_normalized['protocolSection.contactsLocationsModule.overallOfficials.affiliation'],

    #location
    # 'facility_name': df_normalized['protocolSection.contactsLocationsModule.locations.facility'],
    # 'facility_recruitment_status': df_normalized['protocolSection.contactsLocationsModule.locations.status'],
    # 'facility_city': df_normalized['protocolSection.contactsLocationsModule.locations.city'],
    # 'facility_state': df_normalized['protocolSection.contactsLocationsModule.locations.state'],
    # 'facility_zip': df_normalized['protocolSection.contactsLocationsModule.locations.zip'],
    # 'facility_country': df_normalized['protocolSection.contactsLocationsModule.locations.country'],


    #location contacts
    # 'facility_contact_name': df_normalized['protocolSection.contactsLocationsModule.locations.contacts.name'],
    # 'facility_contact_role': df_normalized['protocolSection.contactsLocationsModule.locations.contacts.role'],
    # 'facility_contact_phone': df_normalized['protocolSection.contactsLocationsModule.locations.contacts.phone'],
    # 'facility_contact_phone_ext': df_normalized['protocolSection.contactsLocationsModule.locations.contacts.phoneExt'],
    # 'facility_contact_email': df_normalized['protocolSection.contactsLocationsModule.locations.contacts.email'],


    #reference
    # 'ref_pmid': df_normalized['protocolSection.referencesModule.references.pmid'],
    # 'ref_type': df_normalized['protocolSection.referencesModule.references.type'],
    # 'ref_citation': df_normalized['protocolSection.referencesModule.references.citation'],

    #retraction
    # 'ret_pmid': df_normalized['protocolSection.referencesModule.references.retractions.pmid'],
    # 'ret_source': df_normalized['protocolSection.referencesModule.references.retractions.status'],

    #see also link
    # 'see_also_label': df_normalized['protocolSection.referencesModule.seeAlsoLinks.label'],
    # 'see_also_url': df_normalized['protocolSection.referencesModule.seeAlsoLinks.url'],

    #status
    'overall_status': df_normalized['protocolSection.statusModule.overallStatus'],
    'last_known_status': df_normalized['protocolSection.statusModule.lastKnownStatus'],

    'start_date': df_normalized['protocolSection.statusModule.startDateStruct.date'],
    'start_date_type': df_normalized['protocolSection.statusModule.startDateStruct.type'],

    'first_submit_date': df_normalized['protocolSection.statusModule.studyFirstSubmitDate'],
    'first_submit_qc_date': df_normalized['protocolSection.statusModule.studyFirstSubmitQcDate'],

    'last_update_submit': df_normalized['protocolSection.statusModule.lastUpdateSubmitDate'],
    'completion_date': df_normalized['protocolSection.statusModule.completionDateStruct.date'],
    'completion_date_type': df_normalized['protocolSection.statusModule.completionDateStruct.type'],

    'why_stopped': df_normalized['protocolSection.statusModule.whyStopped'],
    'has_exp_access': df_normalized['protocolSection.statusModule.expandedAccessInfo.hasExpandedAccess'],
    # 'exp_access_id': df_normalized['protocolSection.statusModule.expandedAccessInfo.nctId'],

    #oversight
    'has_dmc': df_normalized['protocolSection.oversightModule.oversightHasDmc'],
    'is_fda_regulated_drug': df_normalized['protocolSection.oversightModule.isFdaRegulatedDrug'],
    'is_fda_regulated_device': df_normalized['protocolSection.oversightModule.isFdaRegulatedDevice'],
    'is_unapproved_device': df_normalized['protocolSection.oversightModule.isUnapprovedDevice'],
    # 'is_ppsd': df_normalized['protocolSection.oversightModule.isPpsd'],


     # individual participant data
    'ipd_sharing': df_normalized['protocolSection.ipdSharingStatementModule.ipdSharing'],
    'ipd_sharing_description': df_normalized['protocolSection.ipdSharingStatementModule.description'],
    'ipd_sharing_info_type': df_normalized['protocolSection.ipdSharingStatementModule.infoTypes'],
    'ipd_sharing_time_frame': df_normalized['protocolSection.ipdSharingStatementModule.timeFrame'],
    'ipd_sharing_access_criteria': df_normalized['protocolSection.ipdSharingStatementModule.accessCriteria'],
    'ipd_sharing_url': df_normalized['protocolSection.ipdSharingStatementModule.url'],


    # results

    # participant_flow
    'flow_pre_assignment_details': df_normalized['resultsSection.participantFlowModule.preAssignmentDetails'],
    'flow_recruitment_details': df_normalized['resultsSection.participantFlowModule.recruitmentDetails'],
    'flow_type_unit_analysed': df_normalized['resultsSection.participantFlowModule.typeUnitsAnalyzed'],
    # 'flow_groups': df_normalized['resultsSection.participantFlowModule.groups'],
          # 'flow_group_id': df_normalized['resultsSection.participantFlowModule.groups.id'],
          # 'flow_group_title': df_normalized['resultsSection.participantFlowModule.groups.title'],
          # 'flow_group_description': df_normalized['resultsSection.participantFlowModule.groups.description'],

    # 'flow_period': df_normalized['resultsSection.participantFlowModule.periods'],
          # 'flow_period_title': df_normalized['resultsSection.participantFlowModule.periods.title'],
          # 'flow_period_milestone': df_normalized['resultsSection.participantFlowModule.periods.milestones'],
              # 'flow_milestone_type': df_normalized['resultsSection.participantFlowModule.periods.milestones.type'],
              # 'flow_milestone_comment': df_normalized['resultsSection.participantFlowModule.periods.milestones.comment'],

          # 'flow_milestone_achievments': df_normalized['resultsSection.participantFlowModule.periods.milestones.achievements'],
                # 'flow_achievment_grp_id': df_normalized['resultsSection.participantFlowModule.periods.milestones.achievements.groupId'],
                # 'flow_achievment_grp_comment': df_normalized['resultsSection.participantFlowModule.periods.milestones.achievements.comment'],
                # 'flow_achievment_grp_num_participants': df_normalized['resultsSection.participantFlowModule.periods.milestones.achievements.numSubjects'],
                # 'flow_achievment_grp_num_units': df_normalized['resultsSection.participantFlowModule.periods.milestones.achievements.numUnits'],

         # 'flow_drop_withdraws': df_normalized['resultsSection.participantFlowModule.periods.dropWithdraws'],
               # 'flow_drop_withdraw_type': df_normalized['resultsSection.participantFlowModule.periods.dropWithdraws.type'],
               # 'flow_drop_wdraw_comment': df_normalized['resultsSection.participantFlowModule.periods.dropWithdraws.comment'],

               # 'flow_drop_wdraw_reason': df_normalized['resultsSection.participantFlowModule.periods.dropWithdraws.reasons'],
                    # 'flow_drop_wdraw_reason_grp_id': df_normalized['resultsSection.participantFlowModule.periods.dropWithdraws.reasons.groupId'],
                    # 'flow_drop_wdraw_reason_grp_comment': df_normalized['resultsSection.participantFlowModule.periods.dropWithdraws.comment'],
                    # 'flow_drop_wdraw_reason_grp_num_subject': df_normalized['resultsSection.participantFlowModule.periods.dropWithdraws.reasons.numSubjects'],

    # baseline characteristics
    'bsln_pop_desc': df_normalized['resultsSection.baselineCharacteristicsModule.populationDescription'],
    'bsln_pop_units_analysed': df_normalized['resultsSection.baselineCharacteristicsModule.typeUnitsAnalyzed'],


    # 'bsln_groups': df_normalized['resultsSection.baselineCharacteristicsModule.groups'],
    #     'bsln_group_id': df_normalized['resultsSection.baselineCharacteristicsModule.groups.id'],
    #     'bsln_group_title': df_normalized['resultsSection.baselineCharacteristicsModule.groups.title'],
    #     'bsln_group_desc': df_normalized['resultsSection.baselineCharacteristicsModule.groups.description'],

    # 'bsln_denom': df_normalized['resultsSection.baselineCharacteristicsModule.denoms'],
    #     'bsln_denom_units': df_normalized['resultsSection.baselineCharacteristicsModule.denoms.units'],
    #     'bsln_denom_count': df_normalized['resultsSection.baselineCharacteristicsModule.denoms.counts'],

    #         'bsln_denom_grp_id': df_normalized['resultsSection.baselineCharacteristicsModule.denoms.counts.groupId'],
    #         'bsln_denom_group_count': df_normalized['resultsSection.baselineCharacteristicsModule.denoms.counts.value'],

     # 'bsln_measures': df_normalized['resultsSection.baselineCharacteristicsModule.measures'],
     #    'bsln_measures_title': df_normalized['resultsSection.baselineCharacteristicsModule.measures.title'],
     #    'bsln_measures_desc': df_normalized['resultsSection.baselineCharacteristicsModule.measures.description'],
     #    'bsln_measures_pop_desc': df_normalized['resultsSection.baselineCharacteristicsModule.measures.populationDescription'],

     #    'bsln_measures_param_type': df_normalized['resultsSection.baselineCharacteristicsModule.measures.paramType'],
     #    'bsln_dispersion_type': df_normalized['resultsSection.baselineCharacteristicsModule.measures.dispersionType'],
     #    'bsln_measures_unit_of_meas': df_normalized['resultsSection.baselineCharacteristicsModule.measures.unitOfMeasure'],

     #    'bsln_measures_calc_pct': df_normalized['resultsSection.baselineCharacteristicsModule.measures.calculatePct'],
     #    'bsln_denom_units': df_normalized['resultsSection.baselineCharacteristicsModule.measures.denomUnitsSelected'],

     #    'bsln_measure_denoms': df_normalized['resultsSection.baselineCharacteristicsModule.measures.denoms'],
             #    'bsln_measure_denom_units': df_normalized['resultsSection.baselineCharacteristicsModule.measures.denoms.units'],
             #    'bsln_measure_denom_counts': df_normalized['resultsSection.baselineCharacteristicsModule.measures.denoms.counts'],

                    # 'bsln_measure_denom_counts_grp_id': df_normalized['resultsSection.baselineCharacteristicsModule.measures.denoms.counts.groupId'],
                    # 'bsln_measure_denom_counts_value': df_normalized['resultsSection.baselineCharacteristicsModule.measures.denoms.counts.value'],
    #    'bsln_class': df_normalized['resultsSection.baselineCharacteristicsModule.measures.classes'],
    #    'bsln_class_title': df_normalized['resultsSection.baselineCharacteristicsModule.measures.classes.denoms'],
    #        'bsln_class_denom_units': df_normalized['resultsSection.baselineCharacteristicsModule.measures.classes.denoms.units'],
    #        'bsln_class_denom_counts': df_normalized['resultsSection.baselineCharacteristicsModule.measures.classes.denoms.counts'],

    #            'bsln_class_denom_count_grp_id': df_normalized['resultsSection.baselineCharacteristicsModule.measures.classes.denoms.counts.groupId'],
    #            'bsln_class_denom_count_value': df_normalized['resultsSection.baselineCharacteristicsModule.measures.classes.denoms.counts.value'],

    # measures -> classes -> categories -> measurements are nested lists in the study
    # structure, so json_normalize creates no dotted columns for them and indexing
    # these paths raises KeyError. Kept commented out like the other list-valued fields.
    # 'bsln_category': df_normalized['resultsSection.baselineCharacteristicsModule.measures.classes.categories'],
    #     'bsln_category_title': df_normalized['resultsSection.baselineCharacteristicsModule.measures.classes.categories.title'],
    #     'bsln_category_measurments': df_normalized['resultsSection.baselineCharacteristicsModule.measures.classes.categories.measurements'],

    #         'bsln_meas_group_id': df_normalized['resultsSection.baselineCharacteristicsModule.measures.classes.categories.measurements.groupId'],
    #         'bsln_meas_value': df_normalized['resultsSection.baselineCharacteristicsModule.measures.classes.categories.measurements.value'],
    #         'bsln_meas_spread': df_normalized['resultsSection.baselineCharacteristicsModule.measures.classes.categories.measurements.spread'],
    #         'bsln_meas_u_limit': df_normalized['resultsSection.baselineCharacteristicsModule.measures.classes.categories.measurements.upperLimit'],
    #         'bsln_meas_l_limit': df_normalized['resultsSection.baselineCharacteristicsModule.measures.classes.categories.measurements.lowerLimit'],
    #         'bsln_meas_comment': df_normalized['resultsSection.baselineCharacteristicsModule.measures.classes.categories.measurements.comment'],

    'last_updated': df_normalized['protocolSection.statusModule.lastUpdatePostDateStruct.date']
})

In [31]:
# print(study_df.head(6))

In [49]:
# Write the first 20 rows of study_df to sample.csv, omitting the index column.
study_df.iloc[:20].to_csv("sample.csv", index=False)

In [15]:
# Flip study_df so each field becomes a row and each study a column.
# NOTE(review): this cell's execution count (15) is lower than that of the cell that
# builds study_df (30); re-run the notebook top to bottom to confirm it still works.
transposed = study_df.T