In [3]:
import argparse
import pandas as pd

# Load the PBTA histologies table; the file is tab-separated and the path is
# relative to the directory the notebook is run from.
pbta_histologies = pd.read_csv(
    "../../data/pbta-histologies.tsv",
    delimiter="\t",
)

In [4]:
def group_disease(primary_site):
    """Classify a primary-site annotation as infra- or supratentorial.

    Matching is a case-insensitive substring search, so compound values such
    as "Cerebellum/Posterior Fossa;Occipital Lobe" match on any of their
    parts. Infratentorial keywords are tested first and therefore win when a
    value mentions sites from both groups.

    Parameters
    ----------
    primary_site : str
        Primary-site text to classify. Non-string values (``None``, the float
        ``NaN`` that pandas uses for missing cells, numbers, ...) are accepted
        and classified as "undetermined" instead of raising.

    Returns
    -------
    str
        "infratentorial", "supratentorial", or "undetermined" when no keyword
        from either group occurs in the input.
    """
    infra = ("posterior fossa",
             "optic",
             "spinal",
             "tectum",
             "spine")
    supra = ("frontal lobe",
             "parietal lobe",
             "occipital lobe",
             "temporal lobe")

    # Missing values have no .lower(); treat them like any unrecognised site
    # so the function is safe to call before (or without) a fillna step.
    if not isinstance(primary_site, str):
        return "undetermined"

    primary = primary_site.lower()  # avoid misses caused by case mismatches

    if any(site in primary for site in infra):
        return "infratentorial"
    if any(site in primary for site in supra):
        return "supratentorial"
    # Only reached when the primary site matched neither keyword group.
    return "undetermined"

In [5]:
# Show the loaded histologies table as the cell output (the rendered view
# below is truncated to the first and last rows/columns).
pbta_histologies

Unnamed: 0,Kids_First_Biospecimen_ID,CNS_region,sample_id,aliquot_id,Kids_First_Participant_ID,experimental_strategy,sample_type,composition,tumor_descriptor,primary_site,...,age_last_update_days,seq_center,normal_fraction,tumor_fraction,tumor_ploidy,parent_aliquot_id,cancer_predispositions,molecular_subtype,pathology_free_text_diagnosis,cohort_participant_id
0,BS_W36RZSFA,Hemispheric,7316-431,570128,PT_7TRGHZBK,RNA-Seq,Tumor,Solid Tissue,Initial CNS Tumor,Parietal Lobe,...,3159.0,NantOmics,,,,7316-431-T-312975.RNA-Seq,None documented,,choroid plexus carcinoma,C97539
1,BS_40QCA1MC,,7316-2682,564187,PT_Y7F2JZMQ,WGS,Normal,Peripheral Whole Blood,,Peripheral Whole Blood,...,6928.0,NantOmics,,,,7316-2682-N-485610.WGS,NF-1,,na,C702699
2,BS_CYM7BP13,,7316-1107,564205,PT_WKNKNYHH,WGS,Normal,Peripheral Whole Blood,,Peripheral Whole Blood,...,1392.0,NantOmics,,,,7316-1107-N-353844.WGS,None documented,,na,C334191
3,BS_Z56904ZW,,7316-449,550075,PT_8FWP7BR3,WGS,Normal,Peripheral Whole Blood,,Peripheral Whole Blood,...,6256.0,NantOmics,,,,7316-449-N-317755.WGS,None documented,,na,C112422
4,BS_5DPMQQVG,,A08691,A08710,PT_NK8A49X5,WXS,Normal,Peripheral Whole Blood,,Peripheral Whole Blood,...,5600.0,TGEN,,,,A08691-N.WXS,,,,C3079428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2828,BS_1HQWNZCB,,7316-362,549992,PT_S21ZTKPS,WGS,Normal,Peripheral Whole Blood,,Peripheral Whole Blood,...,5683.0,NantOmics,,,,7316-362-N-242606.WGS,None documented,,na,C75522
2829,BS_C0YDDQKB,Hemispheric,7316-1076,571322,PT_1EB5KHZX,WGS,Tumor,Solid Tissue,Initial CNS Tumor,Temporal Lobe,...,666.0,NantOmics,0.0,1.0,2.0,7316-1076-T-353385.WGS,None documented,,cortical dysplasia,C291387
2830,BS_X16PENSA,Posterior fossa,7316-1942,654211,PT_50ZFWMZE,RNA-Seq,Tumor,Solid Tissue,Initial CNS Tumor,Cerebellum/Posterior Fossa,...,1570.0,NantOmics,,,,7316-1942-T-389204.RNA-Seq,None documented,"LGG, BRAF fusion",pilocytic astrocytoma,C458175
2831,BS_45N760PZ,,7316-1077,564222,PT_3NM45TQ8,WGS,Normal,Peripheral Whole Blood,,Peripheral Whole Blood,...,2716.0,NantOmics,,,,7316-1077-N-353394.WGS,None documented,,na,C291510


In [17]:
# Keep only the rows whose pathology diagnosis is exactly "Ependymoma".
EP = pbta_histologies.loc[
    pbta_histologies["pathology_diagnosis"] == "Ependymoma"
]


In [23]:

# RNA-Seq ependymoma rows, reduced to the ID / site columns. The biospecimen
# ID column gets an "_RNA" suffix so it does not collide with the DNA one
# when the two tables are merged.
EP_rnaseq_samples = (
    EP.loc[
        EP["experimental_strategy"] == "RNA-Seq",
        ["Kids_First_Biospecimen_ID", "primary_site", "Kids_First_Participant_ID", "sample_id"],
    ]
    .rename(columns={"Kids_First_Biospecimen_ID": "Kids_First_Biospecimen_ID_RNA"})
)

# WGS (DNA) ependymoma rows, same idea with a "_DNA" suffix on the
# biospecimen ID column.
WGS_dnaseqsamples = (
    EP.loc[
        EP["experimental_strategy"] == "WGS",
        ["Kids_First_Biospecimen_ID", "Kids_First_Participant_ID", "sample_id", "primary_site"],
    ]
    .rename(columns={"Kids_First_Biospecimen_ID": "Kids_First_Biospecimen_ID_DNA"})
)



In [24]:
# sample_id is shared by the RNA and DNA dataframes, so it (together with the
# participant ID and primary site) is used as the join key.
# The join is an outer one because some RNA samples have no corresponding DNA
# biospecimen ID; cells left empty by the join are filled with the string 'NA'.
EP_rnaseq_WGS = (
    EP_rnaseq_samples
    .merge(
        WGS_dnaseqsamples,
        how="outer",
        on=["sample_id", "Kids_First_Participant_ID", "primary_site"],
    )
    .fillna('NA')
)

In [26]:
# Label every row with its disease group, derived from the primary site text.
EP_rnaseq_WGS["disease_group"] = EP_rnaseq_WGS["primary_site"].map(group_disease)

# Display the merged table with the new column as the cell output.
EP_rnaseq_WGS

Unnamed: 0,Kids_First_Biospecimen_ID_RNA,primary_site,Kids_First_Participant_ID,sample_id,Kids_First_Biospecimen_ID_DNA,disease_group
0,BS_196BBNEJ,Cerebellum/Posterior Fossa,PT_BNARR0N6,7316-1954,BS_99PPRCW4,infratentorial
1,BS_0WQJP6ZG,Frontal Lobe,PT_Y6Y9JJ9P,7316-425,,supratentorial
2,BS_NH7K4CD9,Cerebellum/Posterior Fossa,PT_ZD45GXZ0,7316-2109,BS_J33YVP27,infratentorial
3,BS_N8J1DP02,Cerebellum/Posterior Fossa,PT_VE0Q731Y,7316-1641,BS_BBHFKBE7,infratentorial
4,BS_99B1XRZQ,Cerebellum/Posterior Fossa;Occipital Lobe;Pari...,PT_3VCS1PPF,7316-490,BS_QMY84KF4,infratentorial
...,...,...,...,...,...,...
92,BS_XQYHPBFS,Ventricles,PT_AZQ230WT,7316-443,,undetermined
93,,Spinal Cord- Lumbar/Thecal Sac,PT_N9W4GT6D,7316-174,BS_AVS4DSZW,infratentorial
94,,Cerebellum/Posterior Fossa,PT_6TMT65YA,7316-1642,BS_BSM3ZHW4,infratentorial
95,,Cerebellum/Posterior Fossa,PT_EZW3S4F1,7316-1961,BS_QBZDQX7A,infratentorial
