# Import Libraries

In [245]:
# Import libraries
import requests
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore
from function_query import query_pdc
import openpyxl
import xlsxwriter


# Define variables and url

In [246]:
# Variables
variables = {
    "pdc_study_identifier": "PDC000585",
    "offset": 0, # set offset records to pull
    "limit": 10,  # limit number of records,
    "data_type": "log2_ratio" # option: unshared_log2_ratio 
}



In [303]:
%run queries.ipynb

# Readme

In [248]:
readme = readme_df

# Program - Project

In [252]:
study_data = query_pdc(query= query_study_info, variables=variables)
matrix = json.loads(study_data.content)['data']['study']
study_df = pd.DataFrame(matrix)

Sending query.


In [253]:
program_project = study_df[["program_id", 'project_id']].transpose()
program_project.columns = ['name']

In [254]:
program_project["id"] = program_project.index
program_project.head()
program_project = program_project.reindex(columns=['id', 'name'])
program_project.head()

Unnamed: 0,id,name
program_id,program_id,fa99a299-0d83-11ea-9bfa-0a42f3c845fe
project_id,project_id,ba34699a-caff-461c-b77e-a27a6bf305f0


# Case_Matrix

In [255]:
speciment_data = query_pdc(query= query_biospecimen, variables=variables)

Sending query.


In [256]:
matrix = json.loads(speciment_data.content)['data']["biospecimenPerStudy"]
biospecimen_df = pd.DataFrame(matrix[1:], columns=matrix[0])

In [257]:
case_matrix = biospecimen_df[['case_submitter_id', "sample_submitter_id", "aliquot_submitter_id"]]
case_matrix = case_matrix[2:]

# Case

In [259]:
case_data = query_pdc(query= query_case, variables=variables)
matrix = json.loads(case_data.content)['data']['case']
case_data_df = pd.DataFrame(matrix)

Sending query.


In [260]:
case = pd.merge(left=biospecimen_df, right=case_data_df, on="case_id")
columns_to_keep = [col for col in case.columns if not col.endswith('_y')]
case = case[columns_to_keep]
case = case.rename(columns={col: col.rstrip('_x') for col in case.columns})


In [261]:
check_names = ['case_submitter_id', 'external_case_id', 'disease_type', 'pool',
       'primary_site', 'status', 'taxon', 'case_is_ref', 'consent_type',
       'days_to_consent', 'days_to_lost_to_followup', 'index_date',
       'lost_to_followup']
to_remove = list(set(case.columns).difference(check_names))

In [262]:
case.drop(columns=to_remove, inplace=True)
case = case.reindex(columns=check_names)

# Demographic

In [264]:
# Variables
variables = {
    "pdc_study_identifier": "PDC000585",
    "study_id": study_df['study_id'][0],
    "offset": 0,  # set offset records to pull
    "limit": 10,  # limit number of records
    "data_type": "log2_ratio"  # option: unshared_log2_ratio
}

In [265]:
demographics_data = query_pdc(query= query_demographcis, variables=variables)
matrix = json.loads(demographics_data.content)['data']["paginatedCaseDemographicsPerStudy"]["caseDemographicsPerStudy"]
demographics_data = pd.DataFrame(matrix[1:], columns=matrix[0])

Sending query.


In [266]:
demographics_data['demographic_id'] = demographics_data['demographics'].apply(lambda diag_list: diag_list[0]['demographic_id'] if diag_list else None)

In [267]:
demographics_df = for_demographics(matrix = matrix)

In [268]:
demographic = pd.merge(left=demographics_data, right=demographics_df, on="demographic_id")

In [269]:
check_names = ['case_submitter_id', 'ethnicity', 'gender', 'race', 'age_at_index',
       'age_is_obfuscated', 'cause_of_death', 'cause_of_death_source',
       'country_of_residence_at_enrollment', 'days_to_birth', 'days_to_death',
       'occupation_duration_years', 'premature_at_birth', 'vital_status',
       'weeks_gestation_at_birth', 'year_of_birth', 'year_of_death']
to_remove = list(set(demographic.columns).difference(check_names))

In [270]:
demographic.drop(columns=to_remove, inplace=True)
demographic = demographic.reindex(columns=check_names)

In [271]:
demographic.head()

Unnamed: 0,case_submitter_id,ethnicity,gender,race,age_at_index,age_is_obfuscated,cause_of_death,cause_of_death_source,country_of_residence_at_enrollment,days_to_birth,days_to_death,occupation_duration_years,premature_at_birth,vital_status,weeks_gestation_at_birth,year_of_birth,year_of_death
0,Withheld,not reported,not reported,not reported,,,,,,,,,,,,,
1,Pool,not reported,not reported,not reported,,,,,,,,,,,,,
2,AP-BTE5,not hispanic or latino,female,black or african american,50.0,,,,,,,,,Dead,,,
3,AP-LU5F,not hispanic or latino,female,black or african american,62.0,,,,,,,,,Dead,,,
4,AP-Y8GF,not hispanic or latino,female,white,68.0,,,,,,,,,Dead,,,


# Diagnosis

In [273]:
diagnose_data = query_pdc(query= query_diagnose, variables=variables)
matrix = json.loads(diagnose_data.content)['data']["paginatedCaseDiagnosesPerStudy"]["caseDiagnosesPerStudy"]
diagnose_data_df = pd.DataFrame(matrix[1:], columns=matrix[0])
diagnose_data_df.columns

Sending query.


Index(['case_id', 'case_submitter_id', 'disease_type', 'primary_site',
       'diagnoses'],
      dtype='object')

In [274]:
diagnose_data_df['diagnosis_id'] = diagnose_data_df['diagnoses'].apply(lambda diag_list: diag_list[0]['diagnosis_id'] if diag_list else None)
diagnose_data_df.head()

Unnamed: 0,case_id,case_submitter_id,disease_type,primary_site,diagnoses,diagnosis_id
0,c748755b-e001-4244-be38-b581d81c8ea5,Withheld,Other,Not Reported,[{'diagnosis_id': 'd3996c4b-7f5c-427b-97ba-3c8...,d3996c4b-7f5c-427b-97ba-3c87522c77ce
1,5073e46b-53cf-4771-bf65-392b48f04f52,Pool,Other,Not Reported,[{'diagnosis_id': '80568346-c093-498e-8e39-416...,80568346-c093-498e-8e39-41642af1320e
2,33b95c97-4216-474c-85bf-2dfd671a7f34,AP-BTE5,"Epithelial Neoplasms, NOS",Ovary,[{'diagnosis_id': 'dc05c0d0-6eed-432e-9938-ab6...,dc05c0d0-6eed-432e-9938-ab66abc3d6c8
3,0917e0cd-4a2e-4f22-b4da-379b76ec91c7,AP-LU5F,"Epithelial Neoplasms, NOS",Ovary,[{'diagnosis_id': 'd3f53162-f2f5-4696-94cf-568...,d3f53162-f2f5-4696-94cf-5681c971ff9c
4,1e6a6cdf-48e3-46b8-af60-67971d2af4a8,AP-Y8GF,"Epithelial Neoplasms, NOS",Ovary,[{'diagnosis_id': '2df96eaa-4b38-4624-895f-f97...,2df96eaa-4b38-4624-895f-f97c63216c26


In [275]:
diagnose_df = for_diagnosis(matrix = matrix)

In [276]:
diagnosis = pd.merge(left=diagnose_data_df, right=diagnose_df, on="diagnosis_id")

In [277]:
check_names = ['case_submitter_id', 'age_at_diagnosis', 'days_to_last_follow_up', 'days_to_last_known_disease_status', 
       'days_to_recurrence', 'diagnosis_is_primary_disease', 'last_known_disease_status', 'morphology', 
       'primary_diagnosis', 'progression_or_recurrence', 'site_of_resection_or_biopsy', 'tissue_or_organ_of_origin', 
       'tumor_grade', 'tumor_stage', 'adrenal_hormone', 'ajcc_clinical_m', 'ajcc_clinical_n', 'ajcc_clinical_stage', 
       'ajcc_clinical_t', 'ajcc_pathologic_m', 'ajcc_pathologic_n', 'ajcc_pathologic_stage', 'ajcc_pathologic_t',
       'ajcc_staging_system_edition', 'anaplasia_present', 'anaplasia_present_type', 'ann_arbor_b_symptoms',
       'ann_arbor_b_symptoms_described', 'ann_arbor_clinical_stage', 'ann_arbor_extranodal_involvement', 
       'ann_arbor_pathologic_stage', 'best_overall_response', 'breslow_thickness', 'burkitt_lymphoma_clinical_variant',
       'child_pugh_classification', 'circumferential_resection_margin', 'classification_of_tumor', 'cog_liver_stage', 
       'cog_neuroblastoma_risk_group', 'cog_renal_stage', 'cog_rhabdomyosarcoma_risk_group', 'colon_polyps_history',
       'days_to_best_overall_response', 'days_to_diagnosis', 'days_to_hiv_diagnosis', 'days_to_new_event', 
       'eln_risk_classification', 'enneking_msts_grade', 'enneking_msts_metastasis', 'enneking_msts_stage', 
       'enneking_msts_tumor_site', 'esophageal_columnar_dysplasia_degree', 'esophageal_columnar_metaplasia_present', 
       'figo_stage', 'figo_staging_edition_year', 'first_symptom_prior_to_diagnosis', 
       'gastric_esophageal_junction_involvement', 'gleason_grade_group', 'gleason_grade_tertiary', 
       'gleason_patterns_percent', 'goblet_cells_columnar_mucosa_present', 'gross_tumor_weight', 
       'hiv_positive', 'hpv_positive_type', 'hpv_status', 'icd_10_code', 'igcccg_stage', 'inpc_grade',
       'inpc_histologic_group', 'inrg_stage', 'inss_stage', 'international_prognostic_index', 'irs_group', 
       'irs_stage', 'ishak_fibrosis_score', 'iss_stage', 'largest_extrapelvic_peritoneal_focus', 'laterality',
       'ldh_level_at_diagnosis', 'ldh_normal_range_upper', 'lymph_nodes_positive', 'lymph_nodes_tested', 
       'lymphatic_invasion_present', 'margin_distance', 'margins_involved_site', 'masaoka_stage',
       'medulloblastoma_molecular_classification', 'metastasis_at_diagnosis', 'metastasis_at_diagnosis_site',
       'method_of_diagnosis', 'mitosis_karyorrhexis_index', 'new_event_anatomic_site', 'new_event_type', 
       'non_nodal_regional_disease', 'non_nodal_tumor_deposits', 'ovarian_specimen_status', 
       'ovarian_surface_involvement', 'overall_survival', 'percent_tumor_invasion', 
       'perineural_invasion_present', 'peripancreatic_lymph_nodes_positive', 
       'peripancreatic_lymph_nodes_tested', 'peritoneal_fluid_cytological_status',
       'pregnant_at_diagnosis', 'primary_gleason_grade', 'prior_malignancy', 
       'prior_treatment', 'progression_free_survival', 'progression_free_survival_event', 
       'residual_disease', 'satellite_nodule_present', 'secondary_gleason_grade', 
       'sites_of_involvement', 'supratentorial_localization', 'synchronous_malignancy', 
       'tumor_cell_content', 'tumor_confined_to_organ_of_origin', 'tumor_depth', 
       'tumor_focality', 'tumor_largest_dimension_diameter', 'tumor_regression_grade', 
       'vascular_invasion_present', 'vascular_invasion_type', 'weiss_assessment_score', 
       'who_cns_grade', 'who_nte_grade', 'wilms_tumor_histologic_subtype', 'year_of_diagnosis']
to_remove = list(set(diagnosis.columns).difference(check_names))

In [278]:
diagnosis.drop(columns=to_remove, inplace=True)
diagnosis = diagnosis.reindex(columns=check_names)
diagnosis.head()

Unnamed: 0,case_submitter_id,age_at_diagnosis,days_to_last_follow_up,days_to_last_known_disease_status,days_to_recurrence,diagnosis_is_primary_disease,last_known_disease_status,morphology,primary_diagnosis,progression_or_recurrence,...,tumor_focality,tumor_largest_dimension_diameter,tumor_regression_grade,vascular_invasion_present,vascular_invasion_type,weiss_assessment_score,who_cns_grade,who_nte_grade,wilms_tumor_histologic_subtype,year_of_diagnosis
0,Withheld,,,,,Not Reported,not reported,Not Reported,Not Reported,yes,...,,,,,,,,,,
1,Pool,,,,,Not Reported,not reported,Not Reported,Not Reported,yes,...,,,,,,,,,,
2,AP-BTE5,16263.0,751.0,751.0,386.0,True,Distant met recurrence/progression,Unknown,High-grade serous carcinoma,yes,...,,,,,,,,,,
3,AP-LU5F,20166.0,77.0,77.0,30.0,True,Distant met recurrence/progression,Unknown,High-grade serous carcinoma,yes,...,,,,,,,,,,
4,AP-Y8GF,22117.0,1660.0,1660.0,534.0,True,Distant met recurrence/progression,Unknown,High-grade serous carcinoma,yes,...,,,,,,,,,,


# Exposure

In [279]:
# Variables
variables = {
    "pdc_study_identifier": "PDC000127",
    "study_id": "a5da6836-c92e-4bdc-8f84-d28d629fc383",
    "offset": 0,  # set offset records to pull
    "limit": 10,  # limit number of records
    "data_type": "log2_ratio"  # option: unshared_log2_ratio
}

In [280]:
exposure_data = query_pdc(query= query_exposure, variables= variables)
matrix = json.loads(exposure_data.content)['data']["paginatedCaseExposuresPerStudy"]["caseExposuresPerStudy"]
exposure_data_df = pd.DataFrame(matrix[1:], columns=matrix[0])
exposure_data_df.columns

Sending query.


Index(['case_id', 'case_submitter_id', 'disease_type', 'primary_site',
       'exposures'],
      dtype='object')

In [281]:
exposure_data_df['exposure_id'] = exposure_data_df['exposures'].apply(lambda diag_list: diag_list[0]['exposure_id'] if diag_list else None)
exposure_data_df.head()

Unnamed: 0,case_id,case_submitter_id,disease_type,primary_site,exposures,exposure_id
0,bf7ade95-1fb8-11e9-b7f8-0a80fada099c,C3L-00360,Clear Cell Renal Cell Carcinoma,Kidney,[{'exposure_id': '19e54723-2c37-11ec-b712-0a4e...,19e54723-2c37-11ec-b712-0a4e2186f121
1,b76d3749-1fb8-11e9-b7f8-0a80fada099c,C3L-00097,Clear Cell Renal Cell Carcinoma,Kidney,[{'exposure_id': '19c4a43a-2c37-11ec-b712-0a4e...,19c4a43a-2c37-11ec-b712-0a4e2186f121
2,0e79bef5-1fba-11e9-b7f8-0a80fada099c,QC5,Other,Not Reported,[],
3,cc8a63fd-1fb8-11e9-b7f8-0a80fada099c,C3L-00583,Clear Cell Renal Cell Carcinoma,Kidney,[{'exposure_id': '1a319da2-2c37-11ec-b712-0a4e...,1a319da2-2c37-11ec-b712-0a4e2186f121
4,b5158703-1fb9-11e9-b7f8-0a80fada099c,C3N-01648,Clear Cell Renal Cell Carcinoma,Kidney,[{'exposure_id': '1e9160be-2c37-11ec-b712-0a4e...,1e9160be-2c37-11ec-b712-0a4e2186f121


In [282]:
exposure_df = for_case(matrix = matrix)
exposure_df.head()

Unnamed: 0,exposure_id,exposure_submitter_id,alcohol_days_per_week,alcohol_drinks_per_day,alcohol_history,alcohol_intensity,asbestos_exposure,cigarettes_per_day,coal_dust_exposure,environmental_tobacco_smoke_exposure,...,age_at_onset,alcohol_type,exposure_duration,exposure_duration_years,exposure_type,marijuana_use_per_week,parent_with_radiation_exposure,secondhand_smoke_as_child,smokeless_tobacco_quit_age,tobacco_use_per_day
0,1a8d2cd0-2c37-11ec-b712-0a4e2186f121,C3L-00791-EX,,,Not Reported,Unknown,,,,,...,,,,,,,,,,
1,19e54723-2c37-11ec-b712-0a4e2186f121,C3L-00360-EX,,,Yes,Occasional Drinker,,20.0,,,...,,,,,,,,,,
2,19c4a43a-2c37-11ec-b712-0a4e2186f121,C3L-00097-EX,,,Yes,Occasional Drinker,,20.0,,,...,,,,,,,,,,
3,1a319da2-2c37-11ec-b712-0a4e2186f121,C3L-00583-EX,,,No,Lifelong Non-Drinker,,,,,...,,,,,,,,,,
4,1e9160be-2c37-11ec-b712-0a4e2186f121,C3N-01648-EX,,,Yes,Occasional Drinker,,15.0,,,...,,,,,,,,,,


In [283]:
exposure = pd.merge(left=exposure_data_df, right=exposure_df, on="exposure_id")

In [284]:
check_names = ['case_submitter_id', 'age_at_onset', 'alcohol_days_per_week',
    'alcohol_drinks_per_day', 'alcohol_history', 'alcohol_intensity',
    'alcohol_type', 'asbestos_exposure', 'cigarettes_per_day', 
    'coal_dust_exposure', 'environmental_tobacco_smoke_exposure', 
    'exposure_duration', 'exposure_duration_years', 'exposure_type', 
    'marijuana_use_per_week', 'pack_years_smoked', 'parent_with_radiation_exposure', 
    'radon_exposure', 'respirable_crystalline_silica_exposure', 'secondhand_smoke_as_child', 
    'smokeless_tobacco_quit_age', 'smoking_frequency', 'time_between_waking_and_first_smoke', 
    'tobacco_smoking_onset_year', 'tobacco_smoking_quit_year', 'tobacco_smoking_status', 
    'tobacco_use_per_day', 'type_of_smoke_exposure', 'type_of_tobacco_used', 'years_smoked']
to_remove = list(set(exposure.columns).difference(check_names))

In [285]:
exposure = exposure.drop(columns=to_remove)
exposure = exposure.reindex(columns=check_names)
exposure.head()

Unnamed: 0,case_submitter_id,age_at_onset,alcohol_days_per_week,alcohol_drinks_per_day,alcohol_history,alcohol_intensity,alcohol_type,asbestos_exposure,cigarettes_per_day,coal_dust_exposure,...,smokeless_tobacco_quit_age,smoking_frequency,time_between_waking_and_first_smoke,tobacco_smoking_onset_year,tobacco_smoking_quit_year,tobacco_smoking_status,tobacco_use_per_day,type_of_smoke_exposure,type_of_tobacco_used,years_smoked
0,C3L-00360,,,,Yes,Occasional Drinker,,,20.0,,...,,,,1963.0,2003.0,4,,,,40.0
1,C3L-00097,,,,Yes,Occasional Drinker,,,20.0,,...,,,,1975.0,1997.0,3,,,,22.0
2,C3L-00583,,,,No,Lifelong Non-Drinker,,,,,...,,,,,,1,,,,
3,C3N-01648,,,,Yes,Occasional Drinker,,,15.0,,...,,,,1978.0,2017.0,4,,,,39.0
4,C3L-00814,,,,Not Reported,Unknown,,,,,...,,,,,,7,,,,


# Family History -- to do

case_submitter_id', 'relationship_age_at_diagnosis',
       'relationship_gender', 'relationship_primary_diagnosis',
       'relationship_type', 'relative_with_cancer_history        ',
       'relatives_with_cancer_history_count 

# Treatment

In [286]:

# Variables
variables = {
    "pdc_study_identifier": "PDC000436",
    "study_id": "a5da6836-c92e-4bdc-8f84-d28d629fc383",
    "offset": 0,  # set offset records to pull
    "limit": 10,  # limit number of records
    "data_type": "log2_ratio"  # option: unshared_log2_ratio
}

In [287]:
treatments_data = query_pdc(query= query_treatments, variables=variables)
matrix = json.loads(treatments_data.content)['data']["paginatedCaseTreatmentsPerStudy"]["caseTreatmentsPerStudy"]
treatments_data_df = pd.DataFrame(matrix[1:], columns=matrix[0])
treatments_data_df.head()

Sending query.


Unnamed: 0,case_id,case_submitter_id,disease_type,primary_site,treatments
0,e3c38f0f-db71-445b-95e6-f189c2504bc0,AP-E98M,Lung Adenocarcinoma,Bronchus and lung,[]
1,90e33601-748b-4df7-bff3-a60ee8a09df3,AP-G79N,Lung Adenocarcinoma,Bronchus and lung,[{'treatment_id': '3203f46f-9014-4752-9125-2d0...
2,5629fe7e-ee5e-4e2e-bc4b-2c4a99183d61,AP-RA2G,Lung Adenocarcinoma,Bronchus and lung,[{'treatment_id': '9e3b90e2-1905-4368-b23c-6f7...
3,131b3172-8e24-4bb8-be95-38566b0f84c9,AP-DMUM,Lung Adenocarcinoma,Bronchus and lung,[{'treatment_id': '60126c4d-f1be-4ad1-b358-0c6...
4,8fba4d73-5cb9-43fc-b073-ddd05b921cac,AP-ZLDH,Lung Adenocarcinoma,Bronchus and lung,[{'treatment_id': 'a4aab2aa-259c-40d6-85b8-0ce...


In [288]:
treatments_data_df['treatment_id'] = treatments_data_df['treatments'].apply(lambda diag_list: diag_list[0]['treatment_id'] if diag_list else None)
treatments_data_df.head()

Unnamed: 0,case_id,case_submitter_id,disease_type,primary_site,treatments,treatment_id
0,e3c38f0f-db71-445b-95e6-f189c2504bc0,AP-E98M,Lung Adenocarcinoma,Bronchus and lung,[],
1,90e33601-748b-4df7-bff3-a60ee8a09df3,AP-G79N,Lung Adenocarcinoma,Bronchus and lung,[{'treatment_id': '3203f46f-9014-4752-9125-2d0...,3203f46f-9014-4752-9125-2d0d9bd85356
2,5629fe7e-ee5e-4e2e-bc4b-2c4a99183d61,AP-RA2G,Lung Adenocarcinoma,Bronchus and lung,[{'treatment_id': '9e3b90e2-1905-4368-b23c-6f7...,9e3b90e2-1905-4368-b23c-6f7fa577493c
3,131b3172-8e24-4bb8-be95-38566b0f84c9,AP-DMUM,Lung Adenocarcinoma,Bronchus and lung,[{'treatment_id': '60126c4d-f1be-4ad1-b358-0c6...,60126c4d-f1be-4ad1-b358-0c653788c222
4,8fba4d73-5cb9-43fc-b073-ddd05b921cac,AP-ZLDH,Lung Adenocarcinoma,Bronchus and lung,[{'treatment_id': 'a4aab2aa-259c-40d6-85b8-0ce...,a4aab2aa-259c-40d6-85b8-0ce758520a74


In [289]:
treatments_df = for_treatment(matrix = matrix)
treatments_df.head()

Unnamed: 0,treatment_id,treatment_submitter_id,days_to_treatment_start,initial_disease_status,regimen_or_line_of_therapy,therapeutic_agents,treatment_anatomic_site,treatment_effect,treatment_intent_type,treatment_or_therapy,...,treatment_type,chemo_concurrent_to_radiation,number_of_cycles,reason_treatment_ended,route_of_administration,treatment_arm,treatment_dose,treatment_dose_units,treatment_effect_indicator,treatment_frequency
0,3203f46f-9014-4752-9125-2d0d9bd85356,AP-G79N-TR,,,,,,,Adjuvant,,...,"Radiation Therapy, NOS",,,,,,,,,
1,b087a7bf-7f18-44df-a7c3-d5db211154fe,AP-G79N-TR,,,,,,,Adjuvant,,...,Chemotherapy,,,,,,,,,
2,9e3b90e2-1905-4368-b23c-6f7fa577493c,AP-RA2G-TR,,,,,,,Adjuvant,,...,"Radiation Therapy, NOS",,,,,,,,,
3,60126c4d-f1be-4ad1-b358-0c653788c222,AP-DMUM-TR,,,,,,,Adjuvant,,...,Chemotherapy,,,,,,,,,
4,a4aab2aa-259c-40d6-85b8-0ce758520a74,AP-ZLDH-TR,,,,,,,Adjuvant,,...,"Radiation Therapy, NOS",,,,,,,,,


In [290]:
treatments = pd.merge(left=treatments_data_df, right=treatments_df, on="treatment_id")
treatments.columns

Index(['case_id', 'case_submitter_id', 'disease_type', 'primary_site',
       'treatments', 'treatment_id', 'treatment_submitter_id',
       'days_to_treatment_start', 'initial_disease_status',
       'regimen_or_line_of_therapy', 'therapeutic_agents',
       'treatment_anatomic_site', 'treatment_effect', 'treatment_intent_type',
       'treatment_or_therapy', 'treatment_outcome', 'treatment_type',
       'chemo_concurrent_to_radiation', 'number_of_cycles',
       'reason_treatment_ended', 'route_of_administration', 'treatment_arm',
       'treatment_dose', 'treatment_dose_units', 'treatment_effect_indicator',
       'treatment_frequency'],
      dtype='object')

In [291]:
check_names = ['case_submitter_id', 'chemo_concurrent_to_radiation', 
               'days_to_treatment_end', 'days_to_treatment_start', 
               'initial_disease_status', 'number_of_cycles', 'reason_treatment_ended', 
               'regimen_or_line_of_therapy', 'route_of_administration', 
               'therapeutic_agents', 'treatment_anatomic_site', 'treatment_arm', 
               'treatment_dose', 'treatment_dose_units', 'treatment_effect', 
               'treatment_effect_indicator', 'treatment_frequency', 'treatment_intent_type', 
               'treatment_or_therapy', 'treatment_outcome', 'treatment_type']
to_remove = list(set(treatments.columns).difference(check_names))

In [292]:
treatments = treatments.drop(columns=to_remove)
treatments = treatments.reindex(columns=check_names)
treatments.head()

Unnamed: 0,case_submitter_id,chemo_concurrent_to_radiation,days_to_treatment_end,days_to_treatment_start,initial_disease_status,number_of_cycles,reason_treatment_ended,regimen_or_line_of_therapy,route_of_administration,therapeutic_agents,...,treatment_arm,treatment_dose,treatment_dose_units,treatment_effect,treatment_effect_indicator,treatment_frequency,treatment_intent_type,treatment_or_therapy,treatment_outcome,treatment_type
0,AP-G79N,,,,,,,,,,...,,,,,,,Adjuvant,,,"Radiation Therapy, NOS"
1,AP-RA2G,,,,,,,,,,...,,,,,,,Adjuvant,,,"Radiation Therapy, NOS"
2,AP-DMUM,,,,,,,,,,...,,,,,,,Adjuvant,,,Chemotherapy
3,AP-ZLDH,,,,,,,,,,...,,,,,,,Adjuvant,,,"Radiation Therapy, NOS"
4,AP-WTXJ,,,,,,,,,,...,,,,,,,Adjuvant,,,Chemotherapy


# Follow up

In [293]:
follow_up_data = query_pdc(query= query_follow_up, variables=variables)
matrix = json.loads(follow_up_data.content)['data']["paginatedCaseFollowUpsPerStudy"]['caseFollowUpsPerStudy']
follow_up_data_df = pd.DataFrame(matrix[1:], columns=matrix[0])
follow_up_data_df.head()

Sending query.


Unnamed: 0,case_id,case_submitter_id,disease_type,primary_site,follow_ups
0,e3c38f0f-db71-445b-95e6-f189c2504bc0,AP-E98M,Lung Adenocarcinoma,Bronchus and lung,[]
1,90e33601-748b-4df7-bff3-a60ee8a09df3,AP-G79N,Lung Adenocarcinoma,Bronchus and lung,[{'follow_up_id': '14eb782e-c701-45de-8a79-d77...
2,5629fe7e-ee5e-4e2e-bc4b-2c4a99183d61,AP-RA2G,Lung Adenocarcinoma,Bronchus and lung,[]
3,131b3172-8e24-4bb8-be95-38566b0f84c9,AP-DMUM,Lung Adenocarcinoma,Bronchus and lung,[{'follow_up_id': 'ed050783-ee70-40e1-8a9c-d95...
4,8fba4d73-5cb9-43fc-b073-ddd05b921cac,AP-ZLDH,Lung Adenocarcinoma,Bronchus and lung,[{'follow_up_id': '5a32a0ea-e70e-4f1a-9085-efe...


In [294]:
follow_up_data_df['follow_up_id'] = follow_up_data_df['follow_ups'].apply(lambda diag_list: diag_list[0]['follow_up_id'] if diag_list else None)
follow_up_data_df.head()

Unnamed: 0,case_id,case_submitter_id,disease_type,primary_site,follow_ups,follow_up_id
0,e3c38f0f-db71-445b-95e6-f189c2504bc0,AP-E98M,Lung Adenocarcinoma,Bronchus and lung,[],
1,90e33601-748b-4df7-bff3-a60ee8a09df3,AP-G79N,Lung Adenocarcinoma,Bronchus and lung,[{'follow_up_id': '14eb782e-c701-45de-8a79-d77...,14eb782e-c701-45de-8a79-d77726e0bc14
2,5629fe7e-ee5e-4e2e-bc4b-2c4a99183d61,AP-RA2G,Lung Adenocarcinoma,Bronchus and lung,[],
3,131b3172-8e24-4bb8-be95-38566b0f84c9,AP-DMUM,Lung Adenocarcinoma,Bronchus and lung,[{'follow_up_id': 'ed050783-ee70-40e1-8a9c-d95...,ed050783-ee70-40e1-8a9c-d954e7c011c2
4,8fba4d73-5cb9-43fc-b073-ddd05b921cac,AP-ZLDH,Lung Adenocarcinoma,Bronchus and lung,[{'follow_up_id': '5a32a0ea-e70e-4f1a-9085-efe...,5a32a0ea-e70e-4f1a-9085-efef900e4dbe


In [295]:
follow_up_df = for_follows_up(matrix = matrix)
follow_up_df.head()

Unnamed: 0,follow_up_id,follow_up_submitter_id,adverse_event,barretts_esophagus_goblet_cells_present,bmi,cause_of_response,comorbidity,comorbidity_method_of_diagnosis,days_to_adverse_event,days_to_comorbidity,...,procedures_performed,recist_targeted_regions_number,recist_targeted_regions_sum,scan_tracer_used,undescended_testis_corrected,undescended_testis_corrected_age,undescended_testis_corrected_laterality,undescended_testis_corrected_method,undescended_testis_history,undescended_testis_history_laterality
0,14eb782e-c701-45de-8a79-d77726e0bc14,AP-G79N-FL,,,,,,,,,...,,,,,,,,,,
1,ed050783-ee70-40e1-8a9c-d954e7c011c2,AP-DMUM-FL,,,,,,,,,...,,,,,,,,,,
2,5a32a0ea-e70e-4f1a-9085-efef900e4dbe,AP-ZLDH-FL,,,,,,,,,...,,,,,,,,,,
3,64eab365-2d47-4a54-90cf-31d56fe9003c,AP-WTXJ-FL,,,,,,,,,...,,,,,,,,,,
4,df135c69-a3eb-433c-af7f-a59dee0235ae,AP-5JDU-FL,,,,,,,,,...,,,,,,,,,,


In [296]:
follow_ups = pd.merge(left=follow_up_data_df, right=follow_up_df, on="follow_up_id")
follow_ups.columns

Index(['case_id', 'case_submitter_id', 'disease_type', 'primary_site',
       'follow_ups', 'follow_up_id', 'follow_up_submitter_id', 'adverse_event',
       'barretts_esophagus_goblet_cells_present', 'bmi', 'cause_of_response',
       'comorbidity', 'comorbidity_method_of_diagnosis',
       'days_to_adverse_event', 'days_to_comorbidity', 'days_to_follow_up',
       'days_to_progression', 'days_to_progression_free', 'days_to_recurrence',
       'diabetes_treatment_type', 'disease_response',
       'dlco_ref_predictive_percent', 'ecog_performance_status',
       'fev1_ref_post_bronch_percent', 'fev1_ref_pre_bronch_percent',
       'fev1_fvc_pre_bronch_percent', 'fev1_fvc_post_bronch_percent', 'height',
       'hepatitis_sustained_virological_response', 'hpv_positive_type',
       'karnofsky_performance_status', 'menopause_status',
       'pancreatitis_onset_year', 'progression_or_recurrence',
       'progression_or_recurrence_anatomic_site',
       'progression_or_recurrence_type', 'ref

In [299]:
check_names = ['case_submitter_id', 'days_to_follow_up', 'adverse_event',
       'adverse_event_grade', 'aids_risk_factors',
       'barretts_esophagus_goblet_cells_present', 'bmi', 'body_surface_area',
       'cause_of_response', 'cd4_count', 'cdc_hiv_risk_factors', 'comorbidity',
       'comorbidity_method_of_diagnosis', 'days_to_adverse_event',
       'days_to_comorbidity', 'days_to_imaging', 'days_to_progression',
       'days_to_progression_free', 'days_to_recurrence',
       'diabetes_treatment_type', 'disease_response',
       'dlco_ref_predictive_percent', 'ecog_performance_status',
       'evidence_of_recurrence_type', 'eye_color',
       'fev1_fvc_post_bronch_percent', 'fev1_fvc_pre_bronch_percent',
       'fev1_ref_post_bronch_percent', 'fev1_ref_pre_bronch_percent',
       'haart_treatment_indicator', 'height',
       'hepatitis_sustained_virological_response', 'history_of_tumor',
       'history_of_tumor_type', 'hiv_viral_load',
       'hormonal_contraceptive_type', 'hormonal_contraceptive_use',
       'hormone_replacement_therapy_type', 'hpv_positive_type',
       'hysterectomy_margins_involved', 'hysterectomy_type', 'imaging_result',
       'imaging_type', 'immunosuppressive_treatment_type',
       'karnofsky_performance_status', 'menopause_status', 'nadir_cd4_count',
       'pancreatitis_onset_year', 'pregnancy_outcome', 'procedures_performed',
       'progression_or_recurrence', 'progression_or_recurrence_anatomic_site',
       'progression_or_recurrence_type', 'recist_targeted_regions_number',
       'recist_targeted_regions_sum', 'reflux_treatment_type', 'risk_factor',
       'risk_factor_treatment', 'scan_tracer_used',
       'undescended_testis_corrected', 'undescended_testis_corrected_age',
       'undescended_testis_corrected_laterality',
       'undescended_testis_corrected_method', 'undescended_testis_history',
       'undescended_testis_history_laterality', 'viral_hepatitis_serologies',
       'weight']
to_remove = list(set(follow_ups.columns).difference(check_names))

In [302]:
follow_ups = follow_ups.drop(columns=to_remove)
follow_ups = follow_ups.reindex(columns=check_names)
follow_ups.head()

Unnamed: 0,case_submitter_id,days_to_follow_up,adverse_event,adverse_event_grade,aids_risk_factors,barretts_esophagus_goblet_cells_present,bmi,body_surface_area,cause_of_response,cd4_count,...,risk_factor_treatment,scan_tracer_used,undescended_testis_corrected,undescended_testis_corrected_age,undescended_testis_corrected_laterality,undescended_testis_corrected_method,undescended_testis_history,undescended_testis_history_laterality,viral_hepatitis_serologies,weight
0,AP-G79N,670,,,,,,,,,...,,,,,,,,,,
1,AP-DMUM,676,,,,,,,,,...,,,,,,,,,,
2,AP-ZLDH,510,,,,,,,,,...,,,,,,,,,,
3,AP-WTXJ,453,,,,,,,,,...,,,,,,,,,,
4,AP-5JDU,913,,,,,,,,,...,,,,,,,,,,


# Sample -- to do

In [304]:
matrix = json.loads(case_data.content)['data']['case']
case_data_df = pd.DataFrame(matrix)

In [305]:
sample_df = for_sample(matrix = matrix)

In [311]:
check_names = ['sample_submitter_id', 'composition', 'pool', 'sample_type', 'status',
       'tissue_type', 'gdc_project_id', 'gdc_sample_id',
       'biospecimen_anatomic_site', 'biospecimen_laterality',
       'catalog_reference', 'current_weight', 'days_to_collection',
       'days_to_sample_procurement', 'diagnosis_pathologically_confirmed',
       'distance_normal_to_tumor', 'distributor_reference', 'freezing_method',
       'growth_rate', 'initial_weight', 'intermediate_dimension',
       'longest_dimension', 'method_of_sample_procurement', 'passage_count',
       'pathology_report_uuid', 'preservation_method', 'sample_is_ref',
       'sample_ordinal', 'sample_type_id', 'shortest_dimension',
       'time_between_clamping_and_freezing',
       'time_between_excision_and_freezing', 'tissue_collection_type',
       'tumor_code', 'tumor_code_id', 'tumor_descriptor']
to_remove = list(set(sample_df.columns).difference(check_names))

In [313]:
samples = sample_df.drop(columns=to_remove)
samples = samples.reindex(columns=check_names)
samples.head()

Unnamed: 0,sample_submitter_id,composition,pool,sample_type,status,tissue_type,gdc_project_id,gdc_sample_id,biospecimen_anatomic_site,biospecimen_laterality,...,sample_is_ref,sample_ordinal,sample_type_id,shortest_dimension,time_between_clamping_and_freezing,time_between_excision_and_freezing,tissue_collection_type,tumor_code,tumor_code_id,tumor_descriptor
0,AP-LM5A,Solid Tissue,,Primary Tumor,,Tumor,,,,,...,,,,,,,,,,
1,AP-U8GL,Solid Tissue,,Metastatic,,Tumor,,,,,...,,,,,,,,,,
2,AP-WC9B,Solid Tissue,,Metastatic,,Tumor,,,,,...,,,,,,,,,,
3,AP-LU5F,Solid Tissue,,Metastatic,,Tumor,,,,,...,,,,,,,,,,
4,AP-2WDT,Solid Tissue,,Primary Tumor,,Tumor,,,,,...,,,,,,,,,,


# Aliquots -- to do

In [315]:
biospecimen_df.columns

Index(['aliquot_id', 'sample_id', 'case_id', 'aliquot_submitter_id',
       'sample_submitter_id', 'case_submitter_id', 'aliquot_status',
       'case_status', 'sample_status', 'project_name', 'sample_type',
       'disease_type', 'primary_site', 'pool', 'taxon', 'externalReferences'],
      dtype='object')

aliquot_submitter_id', 'aliquot_is_ref', 'pool', 'status',
       'aliquot_quantity', 'aliquot_volume', 'amount', 'analyte_type',
       'analyte_type_id', 'concentration'

# Study

In [316]:
study_data = query_pdc(query= query_study_info, variables=variables)
matrix = json.loads(study_data.content)['data']['study']
study_df = pd.DataFrame(matrix)
study_df.head()

Sending query.


Unnamed: 0,study_id,pdc_study_id,study_submitter_id,program_id,project_id,study_name,study_description,program_name,project_name,disease_type,primary_site,analytical_fraction,experiment_type,cases_count,aliquots_count,filesCount
0,8961afcf-05de-4b7c-b471-ca1a4887ed36,PDC000436,APOLLO LUAD - Phosphoproteome - FeNTA,fa99a299-0d83-11ea-9bfa-0a42f3c845fe,d253b105-b08e-40d2-a70f-a83313f6ea39,APOLLO LUAD - Phosphoproteome - FeNTA,We present a deep proteogenomic profiling stud...,Applied Proteogenomics OrganizationaL Learning...,APOLLO1,Lung Adenocarcinoma;Other,Bronchus and lung;Not Reported,Phosphoproteome,TMT11,101,101,"[{'data_category': 'Other Metadata', 'file_typ..."


In [317]:
check_names = ['study_submitter_id', 'analytical_fraction', 'experiment_type',
       'acquisition_type', 'study_description', 'embargo_date']
to_remove = list(set(study_df.columns).difference(check_names))

In [318]:
study = study_df.drop(columns=to_remove)
study = study.reindex(columns=check_names)
study

Unnamed: 0,study_submitter_id,analytical_fraction,experiment_type,acquisition_type,study_description,embargo_date
0,APOLLO LUAD - Phosphoproteome - FeNTA,Phosphoproteome,TMT11,,We present a deep proteogenomic profiling stud...,


# Protocol

In [319]:
protocol_Data = query_pdc(query= query_protocol, variables=variables)
matrix = json.loads(protocol_Data.content)['data']['protocolPerStudy']
protocol_df = pd.DataFrame(matrix)
protocol_df.head()

Sending query.


Unnamed: 0,protocol_id,protocol_submitter_id,study_id,pdc_study_id,study_submitter_id,program_id,program_submitter_id,protocol_name,protocol_date,document_name,...,analytical_technique,chromatography_instrument_make,chromatography_instrument_model,polarity,reconstitution_solvent,reconstitution_volume,reconstitution_volume_uom,internal_standards,extraction_method,ionization_mode
0,89764a76-1053-4d1f-8be2-825ebeb08724,APOLLO LUAD - Phosphoproteome - FeNTA,8961afcf-05de-4b7c-b471-ca1a4887ed36,PDC000436,APOLLO LUAD - Phosphoproteome - FeNTA,fa99a299-0d83-11ea-9bfa-0a42f3c845fe,Applied Proteogenomics OrganizationaL Learning...,APOLLO LUAD - Phosphoproteome - FeNTA,2022-10-10,,...,,,,,,,,,,


In [320]:
check_names = ['study_submitter_id', 'protocol_name', 'protocol_date', 'document_name',
       'quantitation_strategy', 'experiment_type', 'label_free_quantitation',
       'labeled_quantitation', 'isobaric_labeling_reagent',
       'reporter_ion_ms_level', 'starting_amount', 'starting_amount_uom',
       'digestion_reagent', 'alkylation_reagent', 'enrichment_strategy',
       'enrichment', 'chromatography_dimensions_count',
       '1d_chromatography_type', '2d_chromatography_type',
       'fractions_analyzed_count', 'column_type', 'amount_on_column',
       'amount_on_column_uom', 'column_length', 'column_length_uom',
       'column_inner_diameter', 'column_inner_diameter_uom', 'particle_size',
       'particle_size_uom', 'particle_type', 'gradient_length',
       'gradient_length_uom', 'instrument_make', 'instrument_model',
       'serial_number', 'dissociation_type', 'ms1_resolution',
       'ms2_resolution', 'dda_topn', 'normalized_collision_energy',
       'acquistion_type', 'dia_multiplexing', 'dia_ims']
to_remove = list(set(protocol_df.columns).difference(check_names))

In [321]:
protocol = protocol_df.drop(columns=to_remove)
protocol = protocol.reindex(columns=check_names)
protocol

Unnamed: 0,study_submitter_id,protocol_name,protocol_date,document_name,quantitation_strategy,experiment_type,label_free_quantitation,labeled_quantitation,isobaric_labeling_reagent,reporter_ion_ms_level,...,instrument_model,serial_number,dissociation_type,ms1_resolution,ms2_resolution,dda_topn,normalized_collision_energy,acquistion_type,dia_multiplexing,dia_ims
0,APOLLO LUAD - Phosphoproteome - FeNTA,APOLLO LUAD - Phosphoproteome - FeNTA,2022-10-10,,Isobaric label quantitation analysis,TMT11,,TMT,TMT11,MS2,...,Q Exactive HF,,HCD,60000,60000,top 12,34,DDA,,


# Exp_Metadata

In [352]:
expMetadat_data_2 = query_pdc(query= query_expMetadata_2, variables=variables)
matrix = json.loads(expMetadat_data_2.content)['data']["studyExperimentalDesign"]
expMetadat_data_2 = pd.DataFrame(matrix)
expMetadat_data_2.head()

Sending query.


Unnamed: 0,pdc_study_id,study_run_metadata_id,study_run_metadata_submitter_id,study_id,study_submitter_id,analyte,acquisition_type,protocol_id,protocol_submitter_id,polarity,...,tmt_130c,tmt_131,tmt_131c,tmt_132n,tmt_132c,tmt_133n,tmt_133c,tmt_134n,tmt_134c,tmt_135n
0,PDC000436,4df8b7a1-ff19-4a0e-8186-eda36e8ac680,AP1_QEHF1_AP1_1_FeNTA,8961afcf-05de-4b7c-b471-ca1a4887ed36,APOLLO LUAD - Phosphoproteome - FeNTA,Phosphoproteome,DDA,89764a76-1053-4d1f-8be2-825ebeb08724,APOLLO LUAD - Phosphoproteome - FeNTA,,...,[{'aliquot_id': '7ba60b55-fafd-4e1b-a870-ca131...,[{'aliquot_id': '76fa308b-7212-4721-97ed-df37c...,[{'aliquot_id': 'f1ff9002-0115-4578-bd92-4880d...,,,,,,,
1,PDC000436,2ad1562f-9fc5-4a0c-9207-59efb43abafe,AP1_QEHF1_AP1_4_FeNTA,8961afcf-05de-4b7c-b471-ca1a4887ed36,APOLLO LUAD - Phosphoproteome - FeNTA,Phosphoproteome,DDA,89764a76-1053-4d1f-8be2-825ebeb08724,APOLLO LUAD - Phosphoproteome - FeNTA,,...,[{'aliquot_id': 'c0fb8fd3-c289-4338-906e-fe1a7...,[{'aliquot_id': '08f2bef2-9a82-423e-985d-7cc28...,[{'aliquot_id': '59fcce98-c804-4b2a-a354-0cd2b...,,,,,,,
2,PDC000436,7d682dcc-846a-4ecc-99f6-0642c35dea9a,AP1_QEHF1_AP1_6_FeNTA,8961afcf-05de-4b7c-b471-ca1a4887ed36,APOLLO LUAD - Phosphoproteome - FeNTA,Phosphoproteome,DDA,89764a76-1053-4d1f-8be2-825ebeb08724,APOLLO LUAD - Phosphoproteome - FeNTA,,...,[{'aliquot_id': 'de8f6a60-e03b-4ea0-b442-fa469...,[{'aliquot_id': 'c35f91e2-9447-4536-9876-7c281...,[{'aliquot_id': 'adc2dbe1-1180-4151-a242-e72a5...,,,,,,,
3,PDC000436,02c3df48-e487-4ae3-8339-0ab07b65f94b,AP1_QEHF1_AP1_7_FeNTA,8961afcf-05de-4b7c-b471-ca1a4887ed36,APOLLO LUAD - Phosphoproteome - FeNTA,Phosphoproteome,DDA,89764a76-1053-4d1f-8be2-825ebeb08724,APOLLO LUAD - Phosphoproteome - FeNTA,,...,[{'aliquot_id': 'd5f70c1e-9f33-4d55-9e68-20135...,[{'aliquot_id': 'e520950a-4c4b-46ea-86c8-eb268...,[{'aliquot_id': 'bd6bd37f-b2c0-4c67-9c56-20baf...,,,,,,,
4,PDC000436,cf59e595-4180-41d3-a7be-7495b4448851,AP1_QEHF1_AP1_8_FeNTA,8961afcf-05de-4b7c-b471-ca1a4887ed36,APOLLO LUAD - Phosphoproteome - FeNTA,Phosphoproteome,DDA,89764a76-1053-4d1f-8be2-825ebeb08724,APOLLO LUAD - Phosphoproteome - FeNTA,,...,[{'aliquot_id': 'b96e82ec-f597-4985-ab76-95885...,[{'aliquot_id': '42af905c-0dff-4410-a38a-36148...,[{'aliquot_id': 'd72ecce0-1b96-4e8c-8fef-3ca77...,,,,,,,


In [353]:
expMetadat_data_2.columns

Index(['pdc_study_id', 'study_run_metadata_id',
       'study_run_metadata_submitter_id', 'study_id', 'study_submitter_id',
       'analyte', 'acquisition_type', 'protocol_id', 'protocol_submitter_id',
       'polarity', 'experiment_type', 'plex_dataset_name', 'experiment_number',
       'number_of_fractions', 'label_free', 'itraq_113', 'itraq_114',
       'itraq_115', 'itraq_116', 'itraq_117', 'itraq_118', 'itraq_119',
       'itraq_121', 'tmt_126', 'tmt_127n', 'tmt_127c', 'tmt_128n', 'tmt_128c',
       'tmt_129n', 'tmt_129c', 'tmt_130n', 'tmt_130c', 'tmt_131', 'tmt_131c',
       'tmt_132n', 'tmt_132c', 'tmt_133n', 'tmt_133c', 'tmt_134n', 'tmt_134c',
       'tmt_135n'],
      dtype='object')

In [354]:
check_names = ['study_submitter_id', 'experiment_type', 'experiment_number',
       'plex_or_folder_name', 'fraction', 'date', 'operator',
       'replicate_number', 'condition', 'label_free', 'itraq_113', 'itraq_114',
       'itraq_115', 'itraq_116', 'itraq_117', 'itraq_118', 'itraq_119',
       'itraq_121', 'tmt_126', 'tmt_127n', 'tmt_127c', 'tmt_128n', 'tmt_128c',
       'tmt_129n', 'tmt_129c', 'tmt_130n', 'tmt_130c', 'tmt_131', 'tmt_131c',
       'tmt_132n', 'tmt_132c', 'tmt_133n', 'tmt_133c', 'tmt_134n', 'tmt_134c',
       'tmt_135n']
to_remove = list(set(expMetadat_data_2.columns).difference(check_names))

In [356]:
Exp_Metadata = expMetadat_data_2.drop(columns=to_remove)
Exp_Metadata = Exp_Metadata.reindex(columns=check_names)
Exp_Metadata.head()

Unnamed: 0,study_submitter_id,experiment_type,experiment_number,plex_or_folder_name,fraction,date,operator,replicate_number,condition,label_free,...,tmt_130c,tmt_131,tmt_131c,tmt_132n,tmt_132c,tmt_133n,tmt_133c,tmt_134n,tmt_134c,tmt_135n
0,APOLLO LUAD - Phosphoproteome - FeNTA,TMT11,8,,,,,,,,...,[{'aliquot_id': '7ba60b55-fafd-4e1b-a870-ca131...,[{'aliquot_id': '76fa308b-7212-4721-97ed-df37c...,[{'aliquot_id': 'f1ff9002-0115-4578-bd92-4880d...,,,,,,,
1,APOLLO LUAD - Phosphoproteome - FeNTA,TMT11,5,,,,,,,,...,[{'aliquot_id': 'c0fb8fd3-c289-4338-906e-fe1a7...,[{'aliquot_id': '08f2bef2-9a82-423e-985d-7cc28...,[{'aliquot_id': '59fcce98-c804-4b2a-a354-0cd2b...,,,,,,,
2,APOLLO LUAD - Phosphoproteome - FeNTA,TMT11,1,,,,,,,,...,[{'aliquot_id': 'de8f6a60-e03b-4ea0-b442-fa469...,[{'aliquot_id': 'c35f91e2-9447-4536-9876-7c281...,[{'aliquot_id': 'adc2dbe1-1180-4151-a242-e72a5...,,,,,,,
3,APOLLO LUAD - Phosphoproteome - FeNTA,TMT11,2,,,,,,,,...,[{'aliquot_id': 'd5f70c1e-9f33-4d55-9e68-20135...,[{'aliquot_id': 'e520950a-4c4b-46ea-86c8-eb268...,[{'aliquot_id': 'bd6bd37f-b2c0-4c67-9c56-20baf...,,,,,,,
4,APOLLO LUAD - Phosphoproteome - FeNTA,TMT11,9,,,,,,,,...,[{'aliquot_id': 'b96e82ec-f597-4985-ab76-95885...,[{'aliquot_id': '42af905c-0dff-4410-a38a-36148...,[{'aliquot_id': 'd72ecce0-1b96-4e8c-8fef-3ca77...,,,,,,,


# File Metadata

In [359]:
file_metadata_data = query_pdc(query= query_file_metadata, variables=variables)
matrix = json.loads(file_metadata_data.content)['data']["filesPerStudy"]
file_metadata_df = pd.DataFrame(matrix)
file_metadata_df.head()

Sending query.


Unnamed: 0,study_id,pdc_study_id,study_submitter_id,study_name,file_id,file_name,file_submitter_id,file_type,md5sum,file_location,file_size,data_category,file_format,signedUrl
0,8961afcf-05de-4b7c-b471-ca1a4887ed36,PDC000436,APOLLO LUAD - Phosphoproteome - FeNTA,APOLLO LUAD - Phosphoproteome - FeNTA,00c78821-d5c1-467d-ae58-c487997ca078,AP1_QEHF2_AP1_5_FeNTA_11.mzML.gz,AP1_QEHF2_AP1_5_FeNTA_11.mzML.gz,Open Standard,2e3251310ac398d1bf883620a30f5b39,studies/436/mzml/AP1_QEHF2_AP1_5_FeNTA_11.mzML.gz,168149549,Processed Mass Spectra,mzML,{'url': 'https://d3iwtkuvwz4jtf.cloudfront.net...
1,8961afcf-05de-4b7c-b471-ca1a4887ed36,PDC000436,APOLLO LUAD - Phosphoproteome - FeNTA,APOLLO LUAD - Phosphoproteome - FeNTA,0102a1f9-7a88-4b93-bcd7-0c1f6e376b7a,AP1_QEHF1_AP1_6_FeNTA_01.mzid.gz,AP1_QEHF1_AP1_6_FeNTA_01.mzid.gz,Open Standard,41930ee8d4e153c9e971b49cc9e9e3e3,studies/436/PSM/mzid/AP1_QEHF1_AP1_6_FeNTA_01....,159619,Peptide Spectral Matches,mzIdentML,{'url': 'https://d3iwtkuvwz4jtf.cloudfront.net...
2,8961afcf-05de-4b7c-b471-ca1a4887ed36,PDC000436,APOLLO LUAD - Phosphoproteome - FeNTA,APOLLO LUAD - Phosphoproteome - FeNTA,0219ce62-efeb-4b4f-a6c3-10e25e83b57b,AP1_QEHF1_AP1_7_FeNTA_04.mzML.gz,AP1_QEHF1_AP1_7_FeNTA_04.mzML.gz,Open Standard,6e49bb1602a607a4723746607777f998,studies/436/mzml/AP1_QEHF1_AP1_7_FeNTA_04.mzML.gz,132846793,Processed Mass Spectra,mzML,{'url': 'https://d3iwtkuvwz4jtf.cloudfront.net...
3,8961afcf-05de-4b7c-b471-ca1a4887ed36,PDC000436,APOLLO LUAD - Phosphoproteome - FeNTA,APOLLO LUAD - Phosphoproteome - FeNTA,021b98e3-21d6-40a7-b696-8486f6b74c01,AP1_QEHF1_AP1_9_FeNTA_02.mzid.gz,AP1_QEHF1_AP1_9_FeNTA_02.mzid.gz,Open Standard,d07e48254bc9bb7e9de4bb22c80e0de7,studies/436/PSM/mzid/AP1_QEHF1_AP1_9_FeNTA_02....,251935,Peptide Spectral Matches,mzIdentML,{'url': 'https://d3iwtkuvwz4jtf.cloudfront.net...
4,8961afcf-05de-4b7c-b471-ca1a4887ed36,PDC000436,APOLLO LUAD - Phosphoproteome - FeNTA,APOLLO LUAD - Phosphoproteome - FeNTA,02a3d104-b0cd-4bb0-8c6a-11b59b2625a9,AP1_QEHF2_AP1_5_FeNTA_02.mzid.gz,AP1_QEHF2_AP1_5_FeNTA_02.mzid.gz,Open Standard,26f8a31101c7971a96d0dc46c1e2dd4b,studies/436/PSM/mzid/AP1_QEHF2_AP1_5_FeNTA_02....,158982,Peptide Spectral Matches,mzIdentML,{'url': 'https://d3iwtkuvwz4jtf.cloudfront.net...


In [360]:
check_names = ['study_submitter_id', 'plex_or_folder_name', 'file_name',
       'fraction_number', 'data_category', 'file_type', 'file_format',
       'md5sum', 'file_size', "sha1"]
to_remove = list(set(file_metadata_df.columns).difference(check_names))

In [361]:
file_metada = file_metadata_df.drop(columns=to_remove)
file_metada = file_metada.reindex(columns=check_names)

# Create an excell workbook with the study information

In [362]:
# object dictionary:
study_information = {
    'Readme': pd.DataFrame(readme),
    "Project-Program": pd.DataFrame(program_project),
    "Case-Matrix": pd.DataFrame(case_matrix),
    "Case": pd.DataFrame(case),
    "Demographic": pd.DataFrame(demographic),
    "Diagnosis": pd.DataFrame(diagnosis),
    "Exposure": pd.DataFrame(exposure),
    "Family-History": pd.DataFrame(), #need to find the data
    "Treatments": pd.DataFrame(treatments),
    "Follow-up": pd.DataFrame(follow_ups),
    "Sample": pd.DataFrame(), #need to redo
    "Aliquots": pd.DataFrame(), #need to redo
    "Study": pd.DataFrame(study),
    "Protocol": pd.DataFrame(protocol),
    "Exp_Metadata": pd.DataFrame(Exp_Metadata),
    "File-Metadata": pd.DataFrame(file_metada)
}

In [363]:
with pd.ExcelWriter('output.xlsx', engine='xlsxwriter') as writer:
    for sheet_name, df in study_information.items():
        if df is not None and not df.empty:  # Check if the DataFrame is not None and not empty
            df.to_excel(writer, sheet_name=sheet_name, index=False)
        else:
            # Create an empty DataFrame and write it to the sheet
            empty_df = pd.DataFrame()
            empty_df.to_excel(writer, sheet_name=sheet_name, index=False)
