In [116]:
from typing import Dict, List, Any, Hashable, Tuple
from datetime import datetime
import pandas as pd
import numpy as np
import hashlib
import json

In [117]:
# Scalar (one value per study) fields.
# Maps the output column name to the dotted path that is looked up on the
# flattened study record (extract_study_fields calls study_data.get(path)).
# Insertion order here is the order in which the columns are written into
# each study record.
SINGLE_FIELDS = {
    # Identification
    "nct_id": "protocolSection.identificationModule.nctId",
    "brief_title": "protocolSection.identificationModule.briefTitle",
    "official_title": "protocolSection.identificationModule.officialTitle",
    "acronym": "protocolSection.identificationModule.acronym",
    "org_study_id": "protocolSection.identificationModule.orgStudyIdInfo.id",
    # Description
    "brief_summary": "protocolSection.descriptionModule.briefSummary",
    "detailed_desc": "protocolSection.descriptionModule.detailedDescription",

    # Sponsor (only the responsible-party type; sponsor names are handled via NESTED_FIELDS)
    "responsible_party": "protocolSection.sponsorCollaboratorsModule.responsibleParty.type",
    
    # Design (single values)
    "study_type": "protocolSection.designModule.studyType",
    "patient_registry": "protocolSection.designModule.patientRegistry",
    "enrollment_type": "protocolSection.designModule.enrollmentInfo.type",
    "enrollment_count": "protocolSection.designModule.enrollmentInfo.count",
    "design_allocation": "protocolSection.designModule.designInfo.allocation",
    "design_intervention_model": "protocolSection.designModule.designInfo.interventionModel",
    "design_intervention_model_desc": "protocolSection.designModule.designInfo.interventionModelDescription",
    "design_primary_purpose": "protocolSection.designModule.designInfo.primaryPurpose",
    "design_observational_model": "protocolSection.designModule.designInfo.observationalModel",
    "design_time_perspective": "protocolSection.designModule.designInfo.timePerspective",
    "design_masking": "protocolSection.designModule.designInfo.maskingInfo.masking",
    # Biospecimen
    "biospec_retention": "protocolSection.designModule.bioSpec.retention",
    "biospec_desc": "protocolSection.designModule.bioSpec.description",
    # Eligibility
    "eligibility_criteria": "protocolSection.eligibilityModule.eligibilityCriteria",
    "healthy_volunteers": "protocolSection.eligibilityModule.healthyVolunteers",
    "sex": "protocolSection.eligibilityModule.sex",
    "min_age": "protocolSection.eligibilityModule.minimumAge",
    "max_age": "protocolSection.eligibilityModule.maximumAge",
    "population_desc": "protocolSection.eligibilityModule.studyPopulation",
    "sampling_method": "protocolSection.eligibilityModule.samplingMethod",
    # Status
    "overall_status": "protocolSection.statusModule.overallStatus",
    "last_known_status": "protocolSection.statusModule.lastKnownStatus",
    "status_verified_date": "protocolSection.statusModule.statusVerifiedDate",
    "start_date": "protocolSection.statusModule.startDateStruct.date",
    "start_date_type": "protocolSection.statusModule.startDateStruct.type",
    "first_submit_date": "protocolSection.statusModule.studyFirstSubmitDate",
    "last_update_submit_date": "protocolSection.statusModule.lastUpdateSubmitDate",
    "completion_date": "protocolSection.statusModule.completionDateStruct.date",
    "completion_date_type": "protocolSection.statusModule.completionDateStruct.type",
    "why_stopped": "protocolSection.statusModule.whyStopped",
    "has_expanded_access": "protocolSection.statusModule.expandedAccessInfo.hasExpandedAccess",
    # Oversight
    "has_dmc": "protocolSection.oversightModule.oversightHasDmc",
    "is_fda_regulated_drug": "protocolSection.oversightModule.isFdaRegulatedDrug",
    "is_fda_regulated_device": "protocolSection.oversightModule.isFdaRegulatedDevice",
    "is_unapproved_device": "protocolSection.oversightModule.isUnapprovedDevice",
    "is_us_export": "protocolSection.oversightModule.isUsExport",
    # Individual participant data
    "ipd_sharing": "protocolSection.ipdSharingStatementModule.ipdSharing",
    "ipd_desc": "protocolSection.ipdSharingStatementModule.description",
    "ipd_time_frame": "protocolSection.ipdSharingStatementModule.timeFrame",
    "ipd_access_criteria": "protocolSection.ipdSharingStatementModule.accessCriteria",
    "ipd_url": "protocolSection.ipdSharingStatementModule.url",
    # Miscellaneous (paths here come from three different top-level sections)
    "version_holder": "derivedSection.miscInfoModule.versionHolder",
    "has_results": "hasResults",
    "last_updated": "protocolSection.statusModule.lastUpdatePostDateStruct.date",
    "limitations_desc": "resultsSection.moreInfoModule.limitationsAndCaveats.description",
    # Certain agreements
    "certain_agreement_pi_sponsor_employee": "resultsSection.moreInfoModule.certainAgreement.piSponsorEmployee",
    "certain_agreement_restrictive": "resultsSection.moreInfoModule.certainAgreement.restrictiveAgreement",
    "certain_agreement_other_details": "resultsSection.moreInfoModule.certainAgreement.otherDetails",
    "certain_agreement_restriction_type": "resultsSection.moreInfoModule.certainAgreement.restrictionType",
    # Point of contact
    "poc_title": "resultsSection.moreInfoModule.pointOfContact.title",
    "poc_organization": "resultsSection.moreInfoModule.pointOfContact.organization",
    "poc_email": "resultsSection.moreInfoModule.pointOfContact.email",
    "poc_phone": "resultsSection.moreInfoModule.pointOfContact.phone",
    "poc_phone_ext": "resultsSection.moreInfoModule.pointOfContact.phoneExt",
    # Submission tracking
    "sub_tracking_estimated_results_date": "derivedSection.miscInfoModule.submissionTracking.estimatedResultsFirstSubmitDate",
}

# Multi-valued / nested study attributes.
# Each entry describes one nested structure of the flattened study record:
#   index_field        dotted path of the structure on the study record
#   object_type        shape tag: simple_dict, simple_array, array_of_dicts,
#                      nested_simple_array, nested_array_of_dicts
#   fields             (output_name, source_key) pairs for dict-shaped entries
#   field_name         output column name for array-of-scalar entries
#   nested             sub-structures found inside each element
#   table_name / bridge_table_name
#                      target dimension / bridge table names
#   transformer_method name of the extract_* function handling the entry
# In the code visible in this notebook only "index_field" is read at runtime;
# the remaining keys are descriptive metadata.
NESTED_FIELDS = {
    "sponsor": {
        "index_field": "protocolSection.sponsorCollaboratorsModule.leadSponsor",
        # Spelled with an underscore like every other object_type tag.
        "object_type": "simple_dict",
        "fields": [
            ("lead_sponsor_name", "name"),
            ("lead_sponsor_class", "class"),
        ],
        "table_name": "sponsors",
        "bridge_table_name": "study_sponsors",
        "transformer_method": "extract_sponsors",
    },
    "collaborators": {
        "index_field": "protocolSection.sponsorCollaboratorsModule.collaborators",
        "object_type": "array_of_dicts",
        "fields": [
            ("sponsor_name", "name"),
            ("sponsor_class", "class"),
        ],
        # Collaborators and the lead sponsor are written to the same table
        # by extract_sponsors, so both entries name it identically.
        "table_name": "sponsors",
        "bridge_table_name": "study_sponsors",
        "transformer_method": "extract_sponsors",
    },
    "conditions": {
        "index_field": "protocolSection.conditionsModule.conditions",
        "object_type": "simple_array",
        "table_name": "conditions",
        "bridge_table_name": "bridge_study_conditions",
        "field_name": "condition_name",
        "transformer_method": "extract_conditions",
    },
    "keywords": {
        "index_field": "protocolSection.conditionsModule.keywords",
        "object_type": "simple_array",
        "table_name": "keywords",
        "bridge_table_name": "bridge_study_keywords",
        "field_name": "keyword",
        "transformer_method": "extract_keywords",
    },
    "interventions": {
        "index_field": "protocolSection.armsInterventionsModule.interventions",
        "object_type": "array_of_dicts",
        "fields": [
            ("intervention_name", "name"),
            ("intervention_desc", "description"),
            ("intervention_type", "type"),
        ],
        "nested": {
            "otherNames": {
                "object_type": "nested_simple_array",
                "table_name": "interventions",
                # extract_interventions links alias names to the study through
                # the same bridge rows as the primary name, so this points at
                # the parent's bridge table (was a placeholder string).
                "bridge_table_name": "bridge_study_interventions",
            }
        },
        "table_name": "interventions",
        "bridge_table_name": "bridge_study_interventions",
        "transformer_method": "extract_interventions",
    },
    "arm_groups": {
        "index_field": "protocolSection.armsInterventionsModule.armGroups",
        "object_type": "array_of_dicts",
        "table_name": "study_arm_groups",
        "fields": [
            ("arm_group_label", "label"),
            ("arm_group_type", "type"),
            ("arm_group_desc", "description"),
        ],
        "nested": {
            "interventionNames": {
                "object_type": "nested_simple_array",
                "bridge_table_name": "arm_group_interventions",
                "field_name": "intervention_name",
            },
        },
        # Belongs to the arm_groups entry itself, not to its "nested" mapping,
        # whose values must all be sub-structure descriptions.
        "transformer_method": "extract_arm_groups",
    },

    "central_contacts": {
        "index_field": "protocolSection.contactsLocationsModule.centralContacts",
        "object_type": "array_of_dicts",
        "table_name": "contacts",
        "bridge_table_name": "study_contacts",
        "fields": [
            ("name", "name"),
            ("role", "role"),
            ("email", "email"),
            ("phone", "phone"),
            ("phoneExt", "phoneExt"),
        ],
        "transformer_method": "extract_central_contacts",
    },
    "locations": {
        "index_field": "protocolSection.contactsLocationsModule.locations",
        "object_type": "array_of_dicts",
        "table_name": "sites",
        "bridge_table_name": "study_sites",
        "fields": [
            ("site_facility", "facility"),
            ("city", "city"),
            ("state", "state"),
            ("zip", "zip"),
            ("country", "country"),
            ("site_status", "status"),
        ],
        "nested": {
            "geoPoint": {
                "object_type": "simple_dict",
                "fields": ["lat", "lon"],
            },
            # Contacts are saved as a JSON blob on the location row
            # (see extract_locations); they are not split into their own table.
            "contacts": {
                "object_type": "nested_array_of_dicts",
                "table_name": "contacts",
                "bridge_table_name": "location_contacts",
                "fields": [
                    ("name", "name"),
                    ("role", "role"),
                    ("email", "email"),
                    ("phone", "phone"),
                    ("phoneExt", "phoneExt"),
                ],
                "transformer_method": "extract_contacts",
            },
        },
        "transformer_method": "extract_locations",
    },
}

In [118]:
def generate_key(*args) -> str:
    """Generate a deterministic surrogate key from the input values.

    Every argument is converted with ``str``, the parts are joined with
    ``"|"`` and hashed with SHA-256; the first 16 hex digits of the digest
    are returned.

    A ``None`` argument contributes an empty segment instead of being
    dropped, so each value keeps its position in the key material:
    ``generate_key("a", None)`` and ``generate_key(None, "a")`` no longer
    collide. Keys for calls that contain no ``None`` are unchanged.

    Args:
        *args: Values that together identify the entity. ``None`` means
            "missing" and is treated like an empty string.

    Returns:
        A 16-character lowercase hexadecimal string.
    """
    # Keep one segment per argument so a missing value cannot shift the
    # remaining values into a different position.
    combined = "|".join("" if arg is None else str(arg) for arg in args)
    return hashlib.sha256(combined.encode()).hexdigest()[:16]

In [119]:
def extract_study_fields(study_key: str, study_data: pd.Series) -> Dict:
    """Build the flat study record for one study.

    Args:
        study_key: Surrogate key of the study; stored under ``"study_key"``.
        study_data: One flattened study row; queried with ``.get`` so a
            missing path yields ``None`` instead of raising.

    Returns:
        A dict holding ``study_key`` followed by one entry per
        ``SINGLE_FIELDS`` key, in ``SINGLE_FIELDS`` order.
    """
    flat_record = {"study_key": study_key}
    flat_record.update(
        (column, study_data.get(source_path))
        for column, source_path in SINGLE_FIELDS.items()
    )
    return flat_record

In [120]:
def extract_sponsors(idx: int, study_key: str, study_data: pd.Series):
    """Collect the lead sponsor and collaborators of one study.

    Args:
        idx: Row label of the study; only used in the diagnostic message.
        study_key: Surrogate key of the study.
        study_data: One flattened study row.

    Returns:
        ``(sponsors, study_sponsors)``: sponsor rows (``sponsor_key``,
        ``name``, ``sponsor_class``) and bridge rows (``study_key``,
        ``sponsor_key``, ``is_lead``). The lead sponsor, when present, comes
        first and is the only row with ``is_lead`` set to ``True``.
    """
    sponsor_rows = []
    bridge_rows = []

    def _register(name, sponsor_class, is_lead):
        # One sponsor row plus the bridge row linking it to this study.
        key = generate_key(name, sponsor_class)
        sponsor_rows.append(
            {"sponsor_key": key, "name": name, "sponsor_class": sponsor_class}
        )
        bridge_rows.append(
            {"study_key": study_key, "sponsor_key": key, "is_lead": is_lead}
        )

    # The lead sponsor is flattened into scalar columns, so its name and class
    # are read directly from the row rather than from a nested object.
    lead_sponsor_index = NESTED_FIELDS["sponsor"]["index_field"]
    lead_name = study_data.get(f'{lead_sponsor_index}.name')
    lead_class = study_data.get(f'{lead_sponsor_index}.class')

    if pd.isna(lead_name) or pd.isna(lead_class):
        print(f"No lead sponsor found for {idx}")
    else:
        _register(lead_name, lead_class, True)

    # Collaborators arrive as a sequence of dict-like entries.
    collaborator_entries = study_data.get(NESTED_FIELDS["collaborators"]["index_field"])
    if isinstance(collaborator_entries, (list, np.ndarray)) and len(collaborator_entries) > 0:
        for entry in collaborator_entries:
            _register(entry.get("name"), entry.get("class"), False)

    return sponsor_rows, bridge_rows

In [121]:
def extract_conditions(idx: int, study_key: str, study_data: pd.Series) -> Tuple | None:
    """Collect the conditions listed for one study.

    Args:
        idx: Row label of the study; only used in the diagnostic message.
        study_key: Surrogate key of the study.
        study_data: One flattened study row.

    Returns:
        ``(conditions, study_conditions)``: condition rows
        (``condition_key``, ``condition_name``) and bridge rows
        (``study_key``, ``condition_key``). Both lists are empty, and a
        message is printed, when the study lists no conditions.
    """
    raw_conditions = study_data.get(NESTED_FIELDS["conditions"]["index_field"])

    has_conditions = (
        isinstance(raw_conditions, (list, np.ndarray)) and len(raw_conditions) > 0
    )
    if not has_conditions:
        print(f"No conditions found for {idx}")
        return [], []

    # Hash each name once and reuse the key for both output tables.
    keyed_names = [(generate_key(name), name) for name in raw_conditions]
    conditions = [
        {"condition_key": key, "condition_name": name} for key, name in keyed_names
    ]
    study_conditions = [
        {"study_key": study_key, "condition_key": key} for key, _ in keyed_names
    ]
    return conditions, study_conditions

In [122]:
def extract_keywords(idx: Hashable, study_key: str, study_data: pd.Series) -> Tuple:
    """Collect the keywords listed for one study.

    Args:
        idx: Row label of the study (not used by this function).
        study_key: Surrogate key of the study.
        study_data: One flattened study row.

    Returns:
        ``(keywords, study_keywords)``: keyword rows (``keyword_key``,
        ``keyword_name``) and bridge rows (``study_key``, ``keyword_key``);
        both empty when the study has no keywords.
    """
    keyword_rows, bridge_rows = [], []

    raw_keywords = study_data.get(NESTED_FIELDS["keywords"]["index_field"])
    if not isinstance(raw_keywords, (list, np.ndarray)) or len(raw_keywords) == 0:
        return keyword_rows, bridge_rows

    for term in raw_keywords:
        term_key = generate_key(term)
        keyword_rows.append({"keyword_key": term_key, "keyword_name": term})
        bridge_rows.append({"study_key": study_key, "keyword_key": term_key})

    return keyword_rows, bridge_rows

In [123]:
def extract_interventions(idx: Hashable, study_key: str, study_data: pd.Series) -> Tuple:
    """Collect the interventions of one study, including their alias names.

    Every intervention yields one row for its primary name and one row for
    each entry of ``otherNames`` that differs from the primary name. Alias
    rows reuse the parent's type and description.

    Args:
        idx: Row label of the study (not used by this function).
        study_key: Surrogate key of the study.
        study_data: One flattened study row.

    Returns:
        ``(intervention_names, study_interventions)``: intervention rows
        (``intervention_key``, ``intervention_name``, ``intervention_type``,
        ``description``) and bridge rows (``study_key``,
        ``intervention_key``, ``is_primary_name``).
    """
    name_rows = []
    bridge_rows = []

    def _record(name, kind, desc, is_primary):
        # The key is derived from name + type; the description is not part of it.
        key = generate_key(name, kind)
        name_rows.append({
            "intervention_key": key,
            "intervention_name": name,
            "intervention_type": kind,
            "description": desc,
        })
        bridge_rows.append({
            "study_key": study_key,
            "intervention_key": key,
            "is_primary_name": is_primary,
        })

    raw_interventions = study_data.get(NESTED_FIELDS["interventions"]["index_field"])
    if not (isinstance(raw_interventions, (list, np.ndarray)) and len(raw_interventions) > 0):
        return name_rows, bridge_rows

    for entry in raw_interventions:
        primary_name = entry.get("name")
        kind = entry.get("type")
        desc = entry.get("description")
        _record(primary_name, kind, desc, True)

        aliases = entry.get("otherNames")
        if isinstance(aliases, (list, np.ndarray)) and len(aliases) > 0:
            for alias in aliases:
                if alias == primary_name:
                    # Some studies repeat the main name among the other names.
                    continue
                _record(alias, kind, desc, False)

    return name_rows, bridge_rows

In [124]:
def extract_arm_groups(idx: Hashable, study_key: str, study_data: pd.Series) -> List | None:
    """Flatten the arm groups of one study into arm/intervention rows.

    Each arm produces one row per entry of its ``interventionNames``; an arm
    without any intervention names produces a single row whose
    ``arm_intervention_name`` is ``None``.

    Args:
        idx: Row label of the study (not used by this function).
        study_key: Surrogate key of the study.
        study_data: One flattened study row.

    Returns:
        A list of row dicts (``study_key``, ``arm_intervention_key``,
        ``arm_label``, ``arm_description``, ``arm_type``,
        ``arm_intervention_name``); empty when the study has no arm groups.
    """
    rows = []

    arms = study_data.get(NESTED_FIELDS["arm_groups"]["index_field"])
    if not isinstance(arms, (list, np.ndarray)) or len(arms) == 0:
        return rows

    for arm in arms:
        label = arm.get("label")
        description = arm.get("description")
        arm_type = arm.get("type")

        # One key per arm: every intervention row of the arm shares it.
        arm_key = generate_key(study_key, label, description, arm_type)

        linked_names = arm.get("interventionNames")
        if not (isinstance(linked_names, (list, np.ndarray)) and len(linked_names) > 0):
            # Keep the arm visible even when it references no intervention.
            linked_names = [None]

        rows.extend(
            {
                "study_key": study_key,
                "arm_intervention_key": arm_key,
                "arm_label": label,
                "arm_description": description,
                "arm_type": arm_type,
                "arm_intervention_name": linked_name,
            }
            for linked_name in linked_names
        )

    return rows

In [125]:
def extract_central_contacts(idx: Hashable, study_key: str, study_data: pd.Series) -> Tuple:
    """Collect the central contacts of one study.

    Only ``name``, ``role``, ``phone`` and ``email`` are read from each
    contact entry; together they also form the contact key.

    Args:
        idx: Row label of the study (not used by this function).
        study_key: Surrogate key of the study.
        study_data: One flattened study row.

    Returns:
        ``(central_contacts, study_central_contacts)``: contact rows
        (``contact_key``, ``contact_name``, ``contact_role``,
        ``contact_phone``, ``contact_email``) and bridge rows
        (``study_key``, ``contact_key``).
    """
    contact_rows, bridge_rows = [], []

    raw_contacts = study_data.get(NESTED_FIELDS["central_contacts"]["index_field"])
    if not isinstance(raw_contacts, (list, np.ndarray)) or len(raw_contacts) == 0:
        return contact_rows, bridge_rows

    for entry in raw_contacts:
        name, role, phone, email = (
            entry.get(field) for field in ("name", "role", "phone", "email")
        )
        contact_key = generate_key(name, role, phone, email)

        contact_rows.append({
            "contact_key": contact_key,
            "contact_name": name,
            "contact_role": role,
            "contact_phone": phone,
            "contact_email": email,
        })
        bridge_rows.append({"study_key": study_key, "contact_key": contact_key})

    return contact_rows, bridge_rows

In [126]:
def extract_locations(idx: Hashable, study_key: str, study_data: pd.Series) -> Tuple:
    """Extract the locations (sites) of one study.

    Location contacts are kept denormalized: the raw ``contacts`` value of
    each location is stored as-is on the location row (a JSON-like blob)
    instead of being split into its own table. This avoids snowflaking the
    schema while preserving all contact information for downstream use.

    Args:
        idx: Row label of the study (not used by this function).
        study_key: Surrogate key of the study.
        study_data: One flattened study row.

    Returns:
        ``(locations, study_locations)``: location rows (``location_key``,
        ``status``, ``facility``, ``city``, ``state``, ``country``,
        ``contacts`` and, when a geo point is available, scalar ``lat`` and
        ``lon``) and bridge rows (``study_key``, ``location_key``).
    """
    locations = []
    study_locations = []

    locations_index = NESTED_FIELDS["locations"]["index_field"]
    locations_list = study_data.get(locations_index)

    if isinstance(locations_list, (list, np.ndarray)) and len(locations_list) > 0:
        for location in locations_list:
            facility = location.get("facility")
            city = location.get("city")
            state = location.get("state")
            country = location.get("country")

            # Status and contacts are deliberately not part of the key: the
            # same site can appear in several studies with different values.
            location_key = generate_key(facility, city, state, country)

            curr_location = {
                "location_key": location_key,
                "status": location.get("status"),
                "facility": facility,
                "city": city,
                "state": state,
                "country": country,
                "contacts": location.get("contacts"),  # json blob
            }

            geopoint = location.get("geoPoint")
            # Only a mapping exposes .get(); anything else is treated as
            # "no coordinates" rather than raising AttributeError.
            if isinstance(geopoint, dict) and len(geopoint) > 0:
                # No trailing comma here: it would store lat as a 1-tuple.
                curr_location["lat"] = geopoint.get("lat")
                curr_location["lon"] = geopoint.get("lon")

            locations.append(curr_location)

            study_locations.append(
                {
                    "study_key": study_key,
                    "location_key": location_key,
                }
            )

    return locations, study_locations

In [127]:
# Accumulators: every extract_* call in the loop below returns row dicts for
# one study; they are collected here and converted to DataFrames afterwards.
all_studies = []
all_sponsors = []
all_study_sponsors = []

all_conditions = []
all_study_conditions = []

all_keywords = []
all_study_keywords = []

all_arm_group_interventions = []

all_interventions = []
# NOTE(review): nothing in the visible code appends to this list; alias names
# are returned by extract_interventions together with the primary names.
all_interventions_other_names = []
all_study_interventions = []

all_locations = []
all_study_locations = []
all_central_contacts = []
all_study_central_contacts = []

# Load the raw batch and flatten each study into one row whose column names
# are dotted paths (the form SINGLE_FIELDS / NESTED_FIELDS refer to).
# NOTE(review): assumes every entry of the 'studies' column is a nested dict — confirm.
df = pd.read_parquet("1.parquet")
df_studies = pd.json_normalize(df['studies'].tolist())

for idx, study in df_studies.iterrows():
    # The study's surrogate key is derived from its NCT id alone.
    nct_index = SINGLE_FIELDS['nct_id']
    nct_id = study.get(nct_index)
    
    study_key = generate_key(nct_id)


    # study: one flat record with all scalar fields
    study_record = extract_study_fields(study_key, study)
    all_studies.append(study_record)

    # sponsors (lead sponsor + collaborators)
    sponsors, study_sponsors = extract_sponsors(idx, study_key, study)
    all_sponsors.extend(sponsors)
    all_study_sponsors.extend(study_sponsors)

    # conditions and keywords
    conditions, study_conditions = extract_conditions(idx, study_key, study)
    all_conditions.extend(conditions)
    all_study_conditions.extend(study_conditions)

    keywords, study_keywords = extract_keywords(idx, study_key, study)
    all_keywords.extend(keywords)
    all_study_keywords.extend(study_keywords)


    # groups and interventions
    arm_group_interventions = extract_arm_groups(idx, study_key, study)
    all_arm_group_interventions.extend(arm_group_interventions)

    interventions, study_interventions = extract_interventions(idx, study_key, study)
    all_interventions.extend(interventions)
    all_study_interventions.extend(study_interventions)

    # contacts and locations
    central_contacts, study_central_contacts  = extract_central_contacts(idx, study_key, study)
    all_central_contacts.extend(central_contacts)
    all_study_central_contacts.extend(study_central_contacts)

    locations, study_locations = extract_locations(idx, study_key, study)
    all_locations.extend(locations)
    all_study_locations.extend(study_locations)


# Materialize each accumulator list as a DataFrame (one row per collected dict).
# These frames still contain duplicates; they are deduplicated further below.
studies = pd.DataFrame(all_studies)

df_sponsors = pd.DataFrame(all_sponsors)
df_study_sponsors = pd.DataFrame(all_study_sponsors)

df_conditions = pd.DataFrame(all_conditions)
df_study_conditions= pd.DataFrame(all_study_conditions)

df_keywords = pd.DataFrame(all_keywords)
df_study_keywords = pd.DataFrame(all_study_keywords)

df_arm_group_interventions = pd.DataFrame(all_arm_group_interventions)
df_interventions = pd.DataFrame(all_interventions)
df_study_interventions = pd.DataFrame(all_study_interventions)


df_central_contacts = pd.DataFrame(all_central_contacts)
df_study_central_contacts = pd.DataFrame(all_study_central_contacts)
df_locations = pd.DataFrame(all_locations)
df_study_locations = pd.DataFrame(all_study_locations)


# Report row counts before and after deduplication for every table.
# Dimension tables are deduplicated on their surrogate key, bridge tables on
# the (entity key, study key) pair. Each df_* name is rebound to its
# deduplicated frame, so later cells export the deduplicated data.
# NOTE(review): drop_duplicates(subset=...) raises KeyError when a frame was
# built from an empty list (it then has no columns) — relevant for batches in
# which no study has, e.g., keywords.
print("----------------")
print(f"STUDIES {len(studies)}")
print("----------------")


#dedupe and inspect
print(f"SPONSORS {len(df_sponsors)}")
df_sponsors = df_sponsors.drop_duplicates(subset=["sponsor_key"])
print(f"DEDUPED SPONSORS {len(df_sponsors)}")

print(f"STUDY SPONSORS {len(df_study_sponsors)}")
df_study_sponsors = df_study_sponsors.drop_duplicates(subset=["sponsor_key", "study_key"])
print(f"DEDUPED STUDY SPONSORS {len(df_study_sponsors)}")
print("----------------")

print(f"CONDITIONS {len(df_conditions)}")
df_conditions = df_conditions.drop_duplicates(subset=["condition_key"])
print(f"DEDUPED CONDITIONS {len(df_conditions)}")

print(f"STUDY CONDITIONS {len(df_study_conditions)}")
df_study_conditions = df_study_conditions.drop_duplicates(subset=["condition_key", "study_key"])
print(f"DEDUPED STUDY CONDITIONS {len(df_study_conditions)}")
print("----------------")

print(f"KEYWORDS {len(df_keywords)}")
df_keywords = df_keywords.drop_duplicates(subset=["keyword_key"])
print(f"DEDUPED KEYWORDS {len(df_keywords)}")

print(f"STUDY KEYWORDS {len(df_study_keywords)}")
df_study_keywords = df_study_keywords.drop_duplicates(subset=["keyword_key", "study_key"])
print(f"DEDUPED STUDY KEYWORDS {len(df_study_keywords)}")
print("----------------")


# Debugging aid (disabled): list the duplicated (study, intervention) pairs.
# duplicates = df_study_interventions[
#     df_study_interventions.duplicated(subset=["intervention_key", "study_key"], keep=False)
# ].sort_values(["study_key", "intervention_key"])
# print(df_study_interventions.groupby(['study_key', 'intervention_key']).size().sort_values(ascending=False).head(20))
# print(duplicates.head(20))


# The intervention key is built from name + type only, so rows that differ
# just in description collapse to the first one seen.
print(f"INTERVENTIONS {len(df_interventions)}")
df_interventions = df_interventions.drop_duplicates(subset=["intervention_key"])
print(f"DEDUPED INTERVENTIONS {len(df_interventions)}")

print(f"STUDY INTERVENTIONS {len(df_study_interventions)}")
df_study_interventions = df_study_interventions.drop_duplicates(subset=["intervention_key", "study_key"])
print(f"DEDUPED STUDY INTERVENTIONS {len(df_study_interventions)}")
print("----------------")

# Debugging aid (disabled): list the duplicated arm/intervention rows.
# duplicates = df_arm_group_interventions[
#     df_arm_group_interventions.duplicated(subset=["arm_intervention_key", "study_key", "arm_intervention_name"], keep=False)
# ]
# print(duplicates.head(20))


print(f"ARM GROUPS INTERVENTION {len(df_arm_group_interventions)}")
df_arm_group_interventions = df_arm_group_interventions.drop_duplicates(subset=["arm_intervention_key", "study_key", "arm_intervention_name"])
print(f"DEDUPED ARM GROUPS INTERVENTION {len(df_arm_group_interventions)}")
print("----------------")

# The location key covers facility/city/state/country; status and contacts of
# the first occurrence are the ones kept.
print(f"LOCATIONS {len(df_locations)}")
df_locations = df_locations.drop_duplicates(subset=["location_key"])
print(f" DEDUPED LOCATIONS {len(df_locations)}")

print(f" STUDY LOCATIONS {len(df_study_locations)}")
df_study_locations = df_study_locations.drop_duplicates(subset=["location_key", "study_key"])
print(f" DEDUPED STUDY LOCATIONS {len(df_study_locations)}")
print("----------------")


print(f"CONTACTS {len(df_central_contacts)}")
df_central_contacts = df_central_contacts.drop_duplicates(subset=["contact_key"])
print(f" DEDUPED CONTACTS {len(df_central_contacts)}")

print(f"STUDY CONTACTS {len(df_study_central_contacts)}")
df_study_central_contacts = df_study_central_contacts.drop_duplicates(subset=["contact_key", "study_key"])
print(f"DEDUPED STUDY CONTACTS {len(df_study_central_contacts)}")


----------------
STUDIES 1000
----------------
SPONSORS 1631
DEDUPED SPONSORS 1176
STUDY SPONSORS 1631
DEDUPED STUDY SPONSORS 1631
----------------
CONDITIONS 1754
DEDUPED CONDITIONS 1355
STUDY CONDITIONS 1754
DEDUPED STUDY CONDITIONS 1754
----------------
KEYWORDS 2419
DEDUPED KEYWORDS 2235
STUDY KEYWORDS 2419
DEDUPED STUDY KEYWORDS 2419
----------------
INTERVENTIONS 2740
DEDUPED INTERVENTIONS 2400
STUDY INTERVENTIONS 2740
DEDUPED STUDY INTERVENTIONS 2668
----------------
ARM GROUPS INTERVENTION 2426
DEDUPED ARM GROUPS INTERVENTION 2426
----------------
LOCATIONS 5247
 DEDUPED LOCATIONS 4782
 STUDY LOCATIONS 5247
 DEDUPED STUDY LOCATIONS 5180
----------------
CONTACTS 381
 DEDUPED CONTACTS 380
STUDY CONTACTS 381
DEDUPED STUDY CONTACTS 381


In [46]:
# Export the flat study table (one row per study) without the DataFrame index.
# NOTE(review): the export cells assume the data/ directory already exists.
studies.to_csv("data/study_data.csv", index=False)

In [274]:
# Export the sponsor table and its study bridge table.
# NOTE(review): this is the only bridge file with a "bridge_" prefix; the
# other bridge exports are named "study_*.csv".
df_sponsors.to_csv("data/sponsors.csv", index=False)
df_study_sponsors.to_csv("data/bridge_study_sponsors.csv", index=False)

In [276]:
# Export the condition table and its study bridge table.
df_conditions.to_csv("data/conditions.csv", index=False)
df_study_conditions.to_csv("data/study_conditions.csv", index=False)

In [279]:
# Export the keyword table and its study bridge table.
df_keywords.to_csv("data/keywords.csv", index=False)
df_study_keywords.to_csv("data/study_keywords.csv", index=False)

In [18]:
# Export the intervention table, its study bridge table and the flattened
# arm-group/intervention rows.
df_interventions.to_csv("data/interventions.csv", index=False)
df_study_interventions.to_csv("data/study_interventions.csv", index=False)
df_arm_group_interventions.to_csv("data/arm_groups_intrv.csv", index=False)

In [128]:
# Export the central-contact table and its study bridge table.
df_central_contacts.to_csv("data/contacts.csv", index=False)
df_study_central_contacts.to_csv("data/study_contacts.csv", index=False)

In [129]:
# Export the location bridge table and the location table (the latter carries
# the raw contacts blob of each location).
df_study_locations.to_csv("data/study_locations.csv", index=False)
df_locations.to_csv("data/locations.csv", index=False)