# Import Libraries

In [1]:
# Import libraries
import requests
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore
from function_query import query_pdc
import openpyxl
import xlsxwriter
import time

# Define variables and url

In [2]:

# Wall-clock timestamp taken before any query runs; the last cell subtracts it
# from a second time.time() reading to report the total run time.
start_time = time.time()

In [3]:
# User-tunable inputs; they are copied into the `variables` dict in the next cell.
pdc_study_id_input = "PDC000127"  # sent as "pdc_study_identifier"
offset_input = 0 # offset of the first record to pull
limit_input = 10 # limit on the number of records
data_type_input = 'log2_ratio' # alternative option: unshared_log2_ratio

In [4]:
# Variables handed to query_pdc together with every query in this notebook
variables = dict(
    pdc_study_identifier=pdc_study_id_input,
    offset=offset_input,
    limit=limit_input,
    data_type=data_type_input,
)


In [5]:
# Executes queries.ipynb inside this kernel.
# NOTE(review): readme_df, the query_* objects and the for_* helpers used below are
# not defined anywhere in this notebook, so they presumably come from here - confirm.
%run queries.ipynb

# Readme

In [6]:
# Table written to the 'Readme' sheet at the end of the notebook.
# readme_df is not defined in this notebook (presumably loaded by queries.ipynb - confirm).
readme = readme_df

# Program - Project

In [7]:
# NOTE(review): the next cell sends this exact request again and overwrites
# study_data, so this call looks redundant - confirm and keep only one of them.
study_data = query_pdc(query= query_study_info, variables=variables)

Sending query.


In [8]:
# Request the study record and load the 'study' payload into a DataFrame
study_data = query_pdc(variables=variables, query=query_study_info)
payload = json.loads(study_data.content)
matrix = payload['data']['study']
study_df = pd.DataFrame(data=matrix)

Sending query.


In [9]:
# Turn the two identifier columns into rows (one row per identifier).
# Assigning a single column label only works while study_df holds exactly one study row.
program_project = study_df[['program_id', 'project_id']].T
program_project.columns = ['name']

In [10]:
# Expose the row labels ('program_id', 'project_id') as an explicit 'id' column
# and place it in front of 'name'.
program_project = (
    program_project
    .assign(id=program_project.index)
    .reindex(columns=['id', 'name'])
)

# Case_Matrix

In [11]:
# Send the biospecimen query for the study selected in `variables`;
# the response is parsed in the next cell.
speciment_data = query_pdc(query= query_biospecimen, variables=variables)

Sending query.


In [12]:
# Parse the response body and pick the "biospecimenPerStudy" payload.
matrix = json.loads(speciment_data.content)['data']["biospecimenPerStudy"]
# Element 0 supplies the column labels; only elements 1.. become rows.
# NOTE(review): if every element is a record (the Aliquots section loads its payload
# with pd.DataFrame(matrix) directly), this silently drops the first record - confirm.
# The positional slice applied to case_matrix in the next cell depends on this row offset.
biospecimen_df = pd.DataFrame(matrix[1:], columns=matrix[0])

In [13]:
# Case -> sample -> aliquot identifier mapping for the 'Case-Matrix' sheet.
# NOTE(review): the first two rows are skipped; the reason is not visible here - confirm.
case_matrix = biospecimen_df.loc[
    :, ['case_submitter_id', 'sample_submitter_id', 'aliquot_submitter_id']
].iloc[2:]

# Case

In [14]:
# Request the case records and load the 'case' payload into a DataFrame
case_data = query_pdc(variables=variables, query=query_case)
payload = json.loads(case_data.content)
matrix = payload['data']['case']
case_data_df = pd.DataFrame(data=matrix)

Sending query.


In [15]:
# Join biospecimen rows with the case records on case_id. Columns present in both
# frames come back from pandas as '<name>_x' (left) and '<name>_y' (right).
case = pd.merge(left=biospecimen_df, right=case_data_df, on="case_id")
# Keep the left-hand copy of every duplicated column.
columns_to_keep = [col for col in case.columns if not col.endswith('_y')]
case = case[columns_to_keep]
# Remove only a literal '_x' suffix. str.rstrip('_x') treats its argument as a set
# of characters and would also eat trailing 'x'/'_' from names such as 'index'.
case = case.rename(
    columns={col: col[:-len('_x')] for col in case.columns if col.endswith('_x')}
)

In [16]:
# Columns of the 'Case' sheet, in output order
check_names = [
    'case_submitter_id', 'external_case_id', 'disease_type', 'pool',
    'primary_site', 'status', 'taxon', 'case_is_ref', 'consent_type',
    'days_to_consent', 'days_to_lost_to_followup', 'index_date',
    'lost_to_followup',
]
# Every other column of the merged frame is scheduled for removal
to_remove = list(set(case.columns) - set(check_names))

In [17]:
# Reduce `case` to the whitelisted columns in whitelist order (missing ones are added as NaN).
# Chained and with errors="ignore" so the cell can be re-run: once the columns are gone,
# a second in-place drop of the same labels would raise KeyError.
case = case.drop(columns=to_remove, errors="ignore").reindex(columns=check_names)

# Demographic

In [18]:
# The queries below also receive the study's id, taken from the first row of study_df
variables.update(study_id=study_df['study_id'][0])

In [19]:
# Request the per-case demographics and pick the list of case records from the response.
demographics_data = query_pdc(query=query_demographcis, variables=variables)
matrix = json.loads(demographics_data.content)['data']["paginatedCaseDemographicsPerStudy"]["caseDemographicsPerStudy"]
# Each element of `matrix` is a case record (the next cell reads
# row['demographics'][0]['demographic_id']), so all of them are loaded.
# Building the frame from matrix[1:] with matrix[0] as header dropped the first case.
demographics_data = pd.DataFrame(matrix)

Sending query.


In [20]:
# Id of the first demographics entry of each case; None when the list is empty
demographics_data['demographic_id'] = [
    entries[0]['demographic_id'] if entries else None
    for entries in demographics_data['demographics']
]

In [21]:
# Build the demographics detail table from the full record list.
# for_demographics is not defined in this notebook (presumably provided by
# queries.ipynb - confirm); its output is joined on "demographic_id" in the next cell.
demographics_df = for_demographics(matrix = matrix)

In [22]:
# Attach the detail columns to each case via the shared demographic_id (inner join)
demographic = demographics_data.merge(demographics_df, on="demographic_id")

In [23]:
# Columns of the 'Demographic' sheet, in output order
check_names = [
    'case_submitter_id', 'ethnicity', 'gender', 'race', 'age_at_index',
    'age_is_obfuscated', 'cause_of_death', 'cause_of_death_source',
    'country_of_residence_at_enrollment', 'days_to_birth', 'days_to_death',
    'occupation_duration_years', 'premature_at_birth', 'vital_status',
    'weeks_gestation_at_birth', 'year_of_birth', 'year_of_death',
]
# Every other column of the merged frame is scheduled for removal
to_remove = list(set(demographic.columns) - set(check_names))

In [24]:
# Reduce `demographic` to the whitelisted columns in whitelist order (missing ones become NaN).
# Chained and with errors="ignore" so the cell can be re-run: once the columns are gone,
# a second in-place drop of the same labels would raise KeyError.
demographic = demographic.drop(columns=to_remove, errors="ignore").reindex(columns=check_names)

# Diagnosis

In [25]:
# Request the per-case diagnoses and pick the list of case records from the response.
diagnose_data = query_pdc(query=query_diagnose, variables=variables)
matrix = json.loads(diagnose_data.content)['data']["paginatedCaseDiagnosesPerStudy"]["caseDiagnosesPerStudy"]
# Each element of `matrix` is a case record (the next cell reads
# row['diagnoses'][0]['diagnosis_id']), so all of them are loaded.
# Building the frame from matrix[1:] with matrix[0] as header dropped the first case.
diagnose_data_df = pd.DataFrame(matrix)

Sending query.


In [26]:
# Id of the first diagnosis entry of each case; None when the list is empty
diagnose_data_df['diagnosis_id'] = [
    entries[0]['diagnosis_id'] if entries else None
    for entries in diagnose_data_df['diagnoses']
]

In [27]:
# Build the diagnosis detail table from the full record list.
# for_diagnosis is not defined in this notebook (presumably provided by
# queries.ipynb - confirm); its output is joined on "diagnosis_id" in the next cell.
diagnose_df = for_diagnosis(matrix = matrix)

In [28]:
# Attach the detail columns to each case via the shared diagnosis_id (inner join)
diagnosis = diagnose_data_df.merge(diagnose_df, on="diagnosis_id")

In [29]:
# Columns of the 'Diagnosis' sheet, in output order
check_names = [
    'case_submitter_id', 'age_at_diagnosis', 'days_to_last_follow_up', 'days_to_last_known_disease_status',
    'days_to_recurrence', 'diagnosis_is_primary_disease', 'last_known_disease_status', 'morphology',
    'primary_diagnosis', 'progression_or_recurrence', 'site_of_resection_or_biopsy', 'tissue_or_organ_of_origin',
    'tumor_grade', 'tumor_stage', 'adrenal_hormone', 'ajcc_clinical_m', 'ajcc_clinical_n', 'ajcc_clinical_stage',
    'ajcc_clinical_t', 'ajcc_pathologic_m', 'ajcc_pathologic_n', 'ajcc_pathologic_stage', 'ajcc_pathologic_t',
    'ajcc_staging_system_edition', 'anaplasia_present', 'anaplasia_present_type', 'ann_arbor_b_symptoms',
    'ann_arbor_b_symptoms_described', 'ann_arbor_clinical_stage', 'ann_arbor_extranodal_involvement',
    'ann_arbor_pathologic_stage', 'best_overall_response', 'breslow_thickness', 'burkitt_lymphoma_clinical_variant',
    'child_pugh_classification', 'circumferential_resection_margin', 'classification_of_tumor', 'cog_liver_stage',
    'cog_neuroblastoma_risk_group', 'cog_renal_stage', 'cog_rhabdomyosarcoma_risk_group', 'colon_polyps_history',
    'days_to_best_overall_response', 'days_to_diagnosis', 'days_to_hiv_diagnosis', 'days_to_new_event',
    'eln_risk_classification', 'enneking_msts_grade', 'enneking_msts_metastasis', 'enneking_msts_stage',
    'enneking_msts_tumor_site', 'esophageal_columnar_dysplasia_degree', 'esophageal_columnar_metaplasia_present',
    'figo_stage', 'figo_staging_edition_year', 'first_symptom_prior_to_diagnosis',
    'gastric_esophageal_junction_involvement', 'gleason_grade_group', 'gleason_grade_tertiary',
    'gleason_patterns_percent', 'goblet_cells_columnar_mucosa_present', 'gross_tumor_weight',
    'hiv_positive', 'hpv_positive_type', 'hpv_status', 'icd_10_code', 'igcccg_stage', 'inpc_grade',
    'inpc_histologic_group', 'inrg_stage', 'inss_stage', 'international_prognostic_index', 'irs_group',
    'irs_stage', 'ishak_fibrosis_score', 'iss_stage', 'largest_extrapelvic_peritoneal_focus', 'laterality',
    'ldh_level_at_diagnosis', 'ldh_normal_range_upper', 'lymph_nodes_positive', 'lymph_nodes_tested',
    'lymphatic_invasion_present', 'margin_distance', 'margins_involved_site', 'masaoka_stage',
    'medulloblastoma_molecular_classification', 'metastasis_at_diagnosis', 'metastasis_at_diagnosis_site',
    'method_of_diagnosis', 'mitosis_karyorrhexis_index', 'new_event_anatomic_site', 'new_event_type',
    'non_nodal_regional_disease', 'non_nodal_tumor_deposits', 'ovarian_specimen_status',
    'ovarian_surface_involvement', 'overall_survival', 'percent_tumor_invasion',
    'perineural_invasion_present', 'peripancreatic_lymph_nodes_positive',
    'peripancreatic_lymph_nodes_tested', 'peritoneal_fluid_cytological_status',
    'pregnant_at_diagnosis', 'primary_gleason_grade', 'prior_malignancy',
    'prior_treatment', 'progression_free_survival', 'progression_free_survival_event',
    'residual_disease', 'satellite_nodule_present', 'secondary_gleason_grade',
    'sites_of_involvement', 'supratentorial_localization', 'synchronous_malignancy',
    'tumor_cell_content', 'tumor_confined_to_organ_of_origin', 'tumor_depth',
    'tumor_focality', 'tumor_largest_dimension_diameter', 'tumor_regression_grade',
    'vascular_invasion_present', 'vascular_invasion_type', 'weiss_assessment_score',
    'who_cns_grade', 'who_nte_grade', 'wilms_tumor_histologic_subtype', 'year_of_diagnosis',
]
# Every other column of the merged frame is scheduled for removal
to_remove = list(set(diagnosis.columns) - set(check_names))

In [30]:
# Reduce `diagnosis` to the whitelisted columns in whitelist order (missing ones become NaN).
# Chained and with errors="ignore" so the cell can be re-run: once the columns are gone,
# a second in-place drop of the same labels would raise KeyError.
diagnosis = diagnosis.drop(columns=to_remove, errors="ignore").reindex(columns=check_names)

# Exposure

Query variables for this study, shown for reference:

```python
# Variables
variables = {
    "pdc_study_identifier": "PDC000127",
    "study_id": "a5da6836-c92e-4bdc-8f84-d28d629fc383",
    "offset": 0,  # set offset records to pull
    "limit": 10,  # limit number of records
    "data_type": "log2_ratio"  # option: unshared_log2_ratio
}
```

In [31]:
# Request the per-case exposures and pick the list of case records from the response.
exposure_data = query_pdc(query=query_exposure, variables=variables)
matrix = json.loads(exposure_data.content)['data']["paginatedCaseExposuresPerStudy"]["caseExposuresPerStudy"]
# Each element of `matrix` is a case record (the lookup below reads
# row['exposures'][0]['exposure_id']), so all of them are loaded.
# Building the frame from matrix[1:] with matrix[0] as header dropped the first case.
exposure_data_df = pd.DataFrame(matrix)
# Id of the first exposure entry of each case; None when the list is empty.
exposure_data_df['exposure_id'] = exposure_data_df['exposures'].apply(
    lambda exposure_list: exposure_list[0]['exposure_id'] if exposure_list else None
)


Sending query.


In [32]:
# Build the exposure detail table from the full record list; it is joined on
# "exposure_id" in the next cell.
# NOTE(review): the other sections call a section-specific helper (for_demographics,
# for_diagnosis, for_treatment, ...); for_case on exposure records may be a
# copy-paste slip - confirm against the helper definitions.
exposure_df = for_case(matrix = matrix)

In [33]:
# Attach the detail columns to each case via the shared exposure_id (inner join)
exposure = exposure_data_df.merge(exposure_df, on="exposure_id")

In [34]:
# Columns of the 'Exposure' sheet, in output order
check_names = [
    'case_submitter_id', 'age_at_onset', 'alcohol_days_per_week',
    'alcohol_drinks_per_day', 'alcohol_history', 'alcohol_intensity',
    'alcohol_type', 'asbestos_exposure', 'cigarettes_per_day',
    'coal_dust_exposure', 'environmental_tobacco_smoke_exposure',
    'exposure_duration', 'exposure_duration_years', 'exposure_type',
    'marijuana_use_per_week', 'pack_years_smoked', 'parent_with_radiation_exposure',
    'radon_exposure', 'respirable_crystalline_silica_exposure', 'secondhand_smoke_as_child',
    'smokeless_tobacco_quit_age', 'smoking_frequency', 'time_between_waking_and_first_smoke',
    'tobacco_smoking_onset_year', 'tobacco_smoking_quit_year', 'tobacco_smoking_status',
    'tobacco_use_per_day', 'type_of_smoke_exposure', 'type_of_tobacco_used', 'years_smoked',
]
# Every other column of the merged frame is scheduled for removal
to_remove = list(set(exposure.columns) - set(check_names))

In [35]:
# Reduce `exposure` to the whitelisted columns in whitelist order (missing ones become NaN).
# errors="ignore" keeps the cell re-runnable: `exposure` is rebound here, so a second
# run would otherwise raise KeyError for labels that were already dropped.
exposure = exposure.drop(columns=to_remove, errors="ignore").reindex(columns=check_names)


# Family History -- to do

In [36]:
# TODO: Family History is not implemented yet; the draft query below is disabled
# and the workbook receives an empty 'Family-History' frame instead.
#family_data = query_pdc(query= query_family_history, variables=variables)
#matrix = json.loads(family_data.content)['data']
#matrix

In [37]:
# TODO: disabled draft; enable together with the family-history query in the previous cell.
#family_hist_df = pd.DataFrame(matrix[1:], columns=matrix[0])
#family_hist_df.head()

Expected columns: 'case_submitter_id', 'relationship_age_at_diagnosis',
'relationship_gender', 'relationship_primary_diagnosis',
'relationship_type', 'relative_with_cancer_history',
'relatives_with_cancer_history_count'

# Treatment

In [38]:
# Request the per-case treatments and pick the list of case records from the response.
treatments_data = query_pdc(query=query_treatments, variables=variables)
matrix = json.loads(treatments_data.content)['data']["paginatedCaseTreatmentsPerStudy"]["caseTreatmentsPerStudy"]
# Each element of `matrix` is a case record (the next cell reads
# row['treatments'][0]['treatment_id']), so all of them are loaded.
# Building the frame from matrix[1:] with matrix[0] as header dropped the first case.
treatments_data_df = pd.DataFrame(matrix)


Sending query.


In [39]:
# Id of the first treatment entry of each case; None when the list is empty
treatments_data_df['treatment_id'] = [
    entries[0]['treatment_id'] if entries else None
    for entries in treatments_data_df['treatments']
]


In [40]:
# Build the treatment detail table from the full record list.
# for_treatment is not defined in this notebook (presumably provided by
# queries.ipynb - confirm); its output is joined on "treatment_id" in the next cell.
treatments_df = for_treatment(matrix = matrix)


In [41]:
# Attach the detail columns to each case via the shared treatment_id (inner join)
treatments = treatments_data_df.merge(treatments_df, on="treatment_id")


In [42]:
# Columns of the 'Treatments' sheet, in output order
check_names = [
    'case_submitter_id', 'chemo_concurrent_to_radiation',
    'days_to_treatment_end', 'days_to_treatment_start',
    'initial_disease_status', 'number_of_cycles', 'reason_treatment_ended',
    'regimen_or_line_of_therapy', 'route_of_administration',
    'therapeutic_agents', 'treatment_anatomic_site', 'treatment_arm',
    'treatment_dose', 'treatment_dose_units', 'treatment_effect',
    'treatment_effect_indicator', 'treatment_frequency', 'treatment_intent_type',
    'treatment_or_therapy', 'treatment_outcome', 'treatment_type',
]
# Every other column of the merged frame is scheduled for removal
to_remove = list(set(treatments.columns) - set(check_names))

In [43]:
# Reduce `treatments` to the whitelisted columns in whitelist order (missing ones become NaN).
# errors="ignore" keeps the cell re-runnable: `treatments` is rebound here, so a second
# run would otherwise raise KeyError for labels that were already dropped.
treatments = treatments.drop(columns=to_remove, errors="ignore").reindex(columns=check_names)


# Follow up

In [44]:
# Request the per-case follow-ups and pick the list of case records from the response.
follow_up_data = query_pdc(query=query_follow_up, variables=variables)
matrix = json.loads(follow_up_data.content)['data']["paginatedCaseFollowUpsPerStudy"]['caseFollowUpsPerStudy']
# Each element of `matrix` is a case record (the next cell reads
# row['follow_ups'][0]['follow_up_id']), so all of them are loaded.
# Building the frame from matrix[1:] with matrix[0] as header dropped the first case.
follow_up_data_df = pd.DataFrame(matrix)


Sending query.


In [45]:
# Id of the first follow-up entry of each case; None when the list is empty
follow_up_data_df['follow_up_id'] = [
    entries[0]['follow_up_id'] if entries else None
    for entries in follow_up_data_df['follow_ups']
]


In [46]:
# Build the follow-up detail table from the full record list.
# for_follows_up is not defined in this notebook (presumably provided by
# queries.ipynb - confirm); its output is joined on "follow_up_id" in the next cell.
follow_up_df = for_follows_up(matrix = matrix)


In [47]:
# Attach the detail columns to each case via the shared follow_up_id (inner join)
follow_ups = follow_up_data_df.merge(follow_up_df, on="follow_up_id")


In [48]:
# Columns of the 'Follow-up' sheet, in output order
check_names = [
    'case_submitter_id', 'days_to_follow_up', 'adverse_event',
    'adverse_event_grade', 'aids_risk_factors',
    'barretts_esophagus_goblet_cells_present', 'bmi', 'body_surface_area',
    'cause_of_response', 'cd4_count', 'cdc_hiv_risk_factors', 'comorbidity',
    'comorbidity_method_of_diagnosis', 'days_to_adverse_event',
    'days_to_comorbidity', 'days_to_imaging', 'days_to_progression',
    'days_to_progression_free', 'days_to_recurrence',
    'diabetes_treatment_type', 'disease_response',
    'dlco_ref_predictive_percent', 'ecog_performance_status',
    'evidence_of_recurrence_type', 'eye_color',
    'fev1_fvc_post_bronch_percent', 'fev1_fvc_pre_bronch_percent',
    'fev1_ref_post_bronch_percent', 'fev1_ref_pre_bronch_percent',
    'haart_treatment_indicator', 'height',
    'hepatitis_sustained_virological_response', 'history_of_tumor',
    'history_of_tumor_type', 'hiv_viral_load',
    'hormonal_contraceptive_type', 'hormonal_contraceptive_use',
    'hormone_replacement_therapy_type', 'hpv_positive_type',
    'hysterectomy_margins_involved', 'hysterectomy_type', 'imaging_result',
    'imaging_type', 'immunosuppressive_treatment_type',
    'karnofsky_performance_status', 'menopause_status', 'nadir_cd4_count',
    'pancreatitis_onset_year', 'pregnancy_outcome', 'procedures_performed',
    'progression_or_recurrence', 'progression_or_recurrence_anatomic_site',
    'progression_or_recurrence_type', 'recist_targeted_regions_number',
    'recist_targeted_regions_sum', 'reflux_treatment_type', 'risk_factor',
    'risk_factor_treatment', 'scan_tracer_used',
    'undescended_testis_corrected', 'undescended_testis_corrected_age',
    'undescended_testis_corrected_laterality',
    'undescended_testis_corrected_method', 'undescended_testis_history',
    'undescended_testis_history_laterality', 'viral_hepatitis_serologies',
    'weight',
]
# Every other column of the merged frame is scheduled for removal
to_remove = list(set(follow_ups.columns) - set(check_names))

In [49]:
# Reduce `follow_ups` to the whitelisted columns in whitelist order (missing ones become NaN).
# errors="ignore" keeps the cell re-runnable: `follow_ups` is rebound here, so a second
# run would otherwise raise KeyError for labels that were already dropped.
follow_ups = follow_ups.drop(columns=to_remove, errors="ignore").reindex(columns=check_names)


# Sample

In [50]:
# Re-parse the case response fetched in the Case section so that `matrix`
# holds the case records again for the sample helper in the next cell
payload = json.loads(case_data.content)
matrix = payload['data']['case']
case_data_df = pd.DataFrame(data=matrix)

In [51]:
# Build the sample table from the case records parsed in the previous cell.
# for_sample is not defined in this notebook (presumably provided by queries.ipynb - confirm).
sample_df = for_sample(matrix = matrix)

In [52]:
# Columns of the 'Sample' sheet, in output order
check_names = [
    'sample_submitter_id', 'composition', 'pool', 'sample_type', 'status',
    'tissue_type', 'gdc_project_id', 'gdc_sample_id',
    'biospecimen_anatomic_site', 'biospecimen_laterality',
    'catalog_reference', 'current_weight', 'days_to_collection',
    'days_to_sample_procurement', 'diagnosis_pathologically_confirmed',
    'distance_normal_to_tumor', 'distributor_reference', 'freezing_method',
    'growth_rate', 'initial_weight', 'intermediate_dimension',
    'longest_dimension', 'method_of_sample_procurement', 'passage_count',
    'pathology_report_uuid', 'preservation_method', 'sample_is_ref',
    'sample_ordinal', 'sample_type_id', 'shortest_dimension',
    'time_between_clamping_and_freezing',
    'time_between_excision_and_freezing', 'tissue_collection_type',
    'tumor_code', 'tumor_code_id', 'tumor_descriptor',
]
# Every other column of the sample frame is scheduled for removal
to_remove = list(set(sample_df.columns) - set(check_names))

In [53]:
# Whitelisted sample columns in whitelist order; sample_df itself is left untouched
samples = sample_df.drop(columns=to_remove).reindex(columns=check_names)


# Aliquots

In [54]:
# Request the case/sample/aliquot records and load them into a DataFrame
aliquots_data = query_pdc(variables=variables, query=query_aliquots)
payload = json.loads(aliquots_data.content)
matrix = payload["data"]["paginatedCasesSamplesAliquots"]["casesSamplesAliquots"]
aliquots_df = pd.DataFrame(data=matrix)


Sending query.


In [55]:
# gdc_sample_id of the first sample of each case; None when the list is empty.
# NOTE(review): aliquots_df is rebuilt from a merge in a later cell, so this column
# is discarded before it is used - confirm whether it is still needed.
aliquots_df['gdc_sample_id'] = [
    entries[0]['gdc_sample_id'] if entries else None
    for entries in aliquots_df['samples']
]


Expected columns: 'aliquot_submitter_id', 'aliquot_is_ref', 'pool', 'status',
'aliquot_quantity', 'aliquot_volume', 'amount', 'analyte_type',
'analyte_type_id', 'concentration'

In [56]:
# Aliquot-level table from the helper, joined onto the biospecimen rows.
# On a name clash the biospecimen column keeps its name and the helper's copy gets a trailing '_'.
tmp = for_aliquots(matrix=matrix)
aliquots_df = biospecimen_df.merge(tmp, on="aliquot_submitter_id", suffixes=("", "_"))

In [57]:
# Columns of the 'Aliquots' sheet, in output order
check_names = [
    'aliquot_submitter_id', 'aliquot_is_ref', 'pool', 'status',
    'aliquot_quantity', 'aliquot_volume', 'amount', 'analyte_type',
    'analyte_type_id', 'concentration',
]
# Every other column of the merged frame is scheduled for removal
to_remove = list(set(aliquots_df.columns) - set(check_names))

In [58]:
# Drop the non-whitelisted columns, then order the rest like the whitelist
# (missing ones become NaN). errors="ignore" keeps the cell re-runnable: aliquots_df
# is rebound here, so a second run would otherwise raise KeyError for labels already dropped.
aliquots_df = aliquots_df.drop(columns=to_remove, errors="ignore")
aliquots = aliquots_df.reindex(columns=check_names)


# Study

In [59]:
# Same study query as in the Program - Project section, sent again and re-parsed here
study_data = query_pdc(variables=variables, query=query_study_info)
payload = json.loads(study_data.content)
matrix = payload['data']['study']
study_df = pd.DataFrame(data=matrix)


Sending query.


In [60]:
# Columns of the 'Study' sheet, in output order
check_names = [
    'study_submitter_id', 'analytical_fraction', 'experiment_type',
    'acquisition_type', 'study_description', 'embargo_date',
]
# Every other column of the study frame is scheduled for removal
to_remove = list(set(study_df.columns) - set(check_names))

In [61]:
# Whitelisted study columns in whitelist order; study_df itself is left untouched
study = study_df.drop(columns=to_remove).reindex(columns=check_names)


# Protocol

In [62]:
# Request the protocol records of the study and load them into a DataFrame
protocol_Data = query_pdc(variables=variables, query=query_protocol)
payload = json.loads(protocol_Data.content)
matrix = payload['data']['protocolPerStudy']
protocol_df = pd.DataFrame(data=matrix)


Sending query.


In [63]:
# Columns of the 'Protocol' sheet, in output order
check_names = [
    'study_submitter_id', 'protocol_name', 'protocol_date', 'document_name',
    'quantitation_strategy', 'experiment_type', 'label_free_quantitation',
    'labeled_quantitation', 'isobaric_labeling_reagent',
    'reporter_ion_ms_level', 'starting_amount', 'starting_amount_uom',
    'digestion_reagent', 'alkylation_reagent', 'enrichment_strategy',
    'enrichment', 'chromatography_dimensions_count',
    '1d_chromatography_type', '2d_chromatography_type',
    'fractions_analyzed_count', 'column_type', 'amount_on_column',
    'amount_on_column_uom', 'column_length', 'column_length_uom',
    'column_inner_diameter', 'column_inner_diameter_uom', 'particle_size',
    'particle_size_uom', 'particle_type', 'gradient_length',
    'gradient_length_uom', 'instrument_make', 'instrument_model',
    'serial_number', 'dissociation_type', 'ms1_resolution',
    'ms2_resolution', 'dda_topn', 'normalized_collision_energy',
    'acquistion_type', 'dia_multiplexing', 'dia_ims',
]
# Every other column of the protocol frame is scheduled for removal
to_remove = list(set(protocol_df.columns) - set(check_names))

In [64]:
# Whitelisted protocol columns in whitelist order; protocol_df itself is left untouched
protocol = protocol_df.drop(columns=to_remove).reindex(columns=check_names)


# Exp_Metadata

In [65]:
# Request the experimental design of the study; the name is first bound to the
# response and then rebound to the parsed DataFrame
expMetadat_data_2 = query_pdc(variables=variables, query=query_expMetadata_2)
payload = json.loads(expMetadat_data_2.content)
matrix = payload['data']["studyExperimentalDesign"]
expMetadat_data_2 = pd.DataFrame(data=matrix)


Sending query.


In [66]:
# Columns of the 'Exp_Metadata' sheet, in output order
check_names = [
    'study_submitter_id', 'experiment_type', 'experiment_number',
    'plex_or_folder_name', 'fraction', 'date', 'operator',
    'replicate_number', 'condition', 'label_free', 'itraq_113', 'itraq_114',
    'itraq_115', 'itraq_116', 'itraq_117', 'itraq_118', 'itraq_119',
    'itraq_121', 'tmt_126', 'tmt_127n', 'tmt_127c', 'tmt_128n', 'tmt_128c',
    'tmt_129n', 'tmt_129c', 'tmt_130n', 'tmt_130c', 'tmt_131', 'tmt_131c',
    'tmt_132n', 'tmt_132c', 'tmt_133n', 'tmt_133c', 'tmt_134n', 'tmt_134c',
    'tmt_135n',
]
# Every other column of the experimental-design frame is scheduled for removal
to_remove = list(set(expMetadat_data_2.columns) - set(check_names))

In [67]:
# Whitelisted experimental-design columns in whitelist order; the source frame is left untouched
Exp_Metadata = expMetadat_data_2.drop(columns=to_remove).reindex(columns=check_names)


# File Metadata

In [68]:
# Request the file records of the study and load them into a DataFrame
file_metadata_data = query_pdc(variables=variables, query=query_file_metadata)
payload = json.loads(file_metadata_data.content)
matrix = payload['data']["filesPerStudy"]
file_metadata_df = pd.DataFrame(data=matrix)


Sending query.


In [69]:
# Columns of the 'File-Metadata' sheet, in output order
check_names = [
    'study_submitter_id', 'plex_or_folder_name', 'file_name',
    'fraction_number', 'data_category', 'file_type', 'file_format',
    'md5sum', 'file_size', "sha1",
]
# Every other column of the file frame is scheduled for removal
to_remove = list(set(file_metadata_df.columns) - set(check_names))

In [70]:
# Whitelisted file columns in whitelist order; file_metadata_df itself is left untouched
file_metada = file_metadata_df.drop(columns=to_remove).reindex(columns=check_names)

# Quantitative data

In [71]:
# Request the quantitative data matrix; it is turned into a DataFrame in the next cell
quantitative_data = query_pdc(variables=variables, query=query_quantitative)
payload = json.loads(quantitative_data.content)
matrix = payload['data']["quantDataMatrix"]


Sending query.


In [72]:
# The matrix is a list of rows whose first row carries the column labels.
# `not matrix` covers both a missing payload (None) and an empty list; the latter
# used to reach matrix[0] and raise IndexError.
if not matrix:
    # An empty frame is the "no data" signal the export cell tests with df.empty.
    # (fillna on an empty frame has nothing to fill, so no placeholder text is stored here.)
    quantitative_log2 = pd.DataFrame()
else:
    quantitative_log2 = pd.DataFrame(matrix[1:], columns=matrix[0])
quantitative_log2.head()

Unnamed: 0,Gene/Aliquot,008202d2-207b-11e9-b7f8-0a80fada099c:CPT0026410003,00b60c96-207d-11e9-b7f8-0a80fada099c:CPT0002370001,01d04930-207b-11e9-b7f8-0a80fada099c:CPT0000790001,01fb1ceb-207d-11e9-b7f8-0a80fada099c:QC5,03131a81-207b-11e9-b7f8-0a80fada099c:CPT0019130003,03426a4e-207d-11e9-b7f8-0a80fada099c:CPT0088570001,0456d990-207b-11e9-b7f8-0a80fada099c:CPT0065820001,048b302f-207d-11e9-b7f8-0a80fada099c:CPT0025880003,059e2744-207b-11e9-b7f8-0a80fada099c:NCI7-3,...,f7e70788-207c-11e9-b7f8-0a80fada099c:CPT0000890001,f9267c07-207c-11e9-b7f8-0a80fada099c:CPT0086370003,f94c2e40-207a-11e9-b7f8-0a80fada099c:NCI7-1,fa672cb4-207c-11e9-b7f8-0a80fada099c:CPT0025580004,fbac3dee-207c-11e9-b7f8-0a80fada099c:CPT0081990003,fc3cdfd4-207a-11e9-b7f8-0a80fada099c:CPT0025060001,fcea1e53-207c-11e9-b7f8-0a80fada099c:CPT0075130003,fe2c0e45-207c-11e9-b7f8-0a80fada099c:CPT0000660001,fef82450-207a-11e9-b7f8-0a80fada099c:QC1,ff704795-207c-11e9-b7f8-0a80fada099c:CPT0086820003
0,A1BG,-1.0967,0.1012,-0.6064,-1.0423,0.5464,-0.4046,-0.5677,-0.2304,-1.9938,...,-0.1866,0.0551,-2.2924,-0.1332,0.8185,-0.2213,0.1086,-0.837,-1.1657,-0.4811
1,A1CF,0.245,-0.3086,0.5539,-2.1297,0.6212,0.0107,0.1948,-0.2769,-1.2396,...,0.4798,0.4191,-1.3478,0.1615,0.2765,0.8436,-1.7144,0.0556,-1.9917,-0.9
2,A2M,-0.7675,0.0001,-0.6975,-0.7303,0.5146,-0.4792,-0.3842,-0.0805,-2.77,...,-0.418,0.1071,-3.368,0.3744,2.1494,-0.3169,0.2321,-1.3295,-0.765,0.0294
3,AAAS,0.0772,0.0306,0.0378,0.0259,0.0803,0.0001,-0.1606,0.2808,1.4716,...,0.1004,-0.1249,1.6549,0.2525,-0.0135,-0.0489,0.3626,-0.1593,-0.0927,0.242
4,AACS,-0.2396,0.4141,0.291,0.1962,0.0242,-0.1468,-0.2609,-1.2109,1.0009,...,-0.0668,0.0661,0.6848,0.004,0.0743,0.0743,0.4906,-0.1487,0.2545,-0.2298


# Create an Excel workbook with the study information

In [73]:
# Sheet name -> table, in workbook order. Every value is wrapped in pd.DataFrame;
# None yields the same empty frame as pd.DataFrame().
study_information = {
    sheet_name: pd.DataFrame(table)
    for sheet_name, table in [
        ('Readme', readme),
        ("Project-Program", program_project),
        ("Case-Matrix", case_matrix),
        ("Case", case),
        ("Demographic", demographic),
        ("Diagnosis", diagnosis),
        ("Exposure", exposure),
        ("Family-History", None),  # need to find the data
        ("Treatments", treatments),
        ("Follow-up", follow_ups),
        ("Sample", samples),  # need to redo
        ("Aliquots", aliquots),  # need to redo
        ("Study", study),
        ("Protocol", protocol),
        ("Exp_Metadata", Exp_Metadata),
        ("File-Metadata", file_metada),
        ("Protein-Levels", quantitative_log2),
    ]
}

In [74]:
# Write one sheet per entry of study_information into study_info.xlsx.
# Sheets without data get a visible placeholder instead of being left blank.
placeholder_text = 'data not available'
with pd.ExcelWriter('study_info.xlsx', engine='xlsxwriter') as writer:
    for sheet_name, df in study_information.items():
        if not df.empty:
            sheet_df = df
        elif len(df.columns) > 0:
            # Known columns but no rows: keep the header and fill ten rows with the placeholder.
            sheet_df = pd.DataFrame(placeholder_text, index=range(10), columns=df.columns)
        else:
            # No columns at all (e.g. the Family-History stub): filling a column-less
            # frame writes nothing, so emit a single-cell note instead.
            sheet_df = pd.DataFrame({'note': [placeholder_text]})
        sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)


In [75]:

# Elapsed wall-clock time since start_time was taken at the top of the notebook.
end_time = time.time()
total_time = end_time - start_time
print(f"Total time spent running the notebook: {total_time} seconds")


Total time spent running the notebook: 70.36057686805725 seconds
