In [1]:
import requests
import json
import pandas as pd

In [2]:
files_endpt = "https://api.gdc.cancer.gov/files"

# Fields requested from the GDC `files` endpoint. The API takes them as one
# comma-separated string, so the list is joined right after it is defined.
# Every entry ends with a comma on purpose: Python silently concatenates
# adjacent string literals, which would merge two field names into a bogus one.
fields = [
    "file_name",
    #"analysis.workflow_type",
    #"experimental_strategy",
    "cases.project.primary_site",
    "cases.project.disease_type",
    "cases.diagnoses.primary_diagnosis",
    "cases.submitter_id",
    "cases.samples.portions.analytes.aliquots.submitter_id",
    "cases.diagnoses.tumor_stage",
    "cases.diagnoses.tumor_grade",
    "cases.diagnoses.progression_or_recurrence",
    "cases.diagnoses.vital_status",
    "cases.diagnoses.days_to_birth",
    "cases.diagnoses.days_to_death",
    "cases.diagnoses.morphology",
    "cases.diagnoses.tissue_or_organ_of_origin",
    "cases.samples.longest_dimension",
    "cases.samples.shortest_dimension",
    "cases.samples.pathology_report_uuid",
    #"cases.project.project_id"
]

fields = ','.join(fields)

## miRNA

In [3]:
# miRNA filters: every clause is an "in" test on one field, and the outer
# "and" requires all of them to hold.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": allowed_values}}
        for field_name, allowed_values in [
            ("files.data_type", ["miRNA Expression Quantification"]),
            ("files.experimental_strategy", ["miRNA-Seq"]),
            ("files.data_format", ["TXT"]),
            ("cases.project.project_id", ["TCGA-LUAD", "TCGA-LUSC"]),
            ("cases.project.program.name", ["TCGA"]),
        ]
    ],
}

In [4]:
# Body of the manifest request. It is sent with POST as JSON, so `filters` can
# stay a plain dict instead of being serialized by hand.
# Remove "return_type" to get the requested file fields instead of a manifest.
params = dict(
    return_type="manifest",
    filters=filters,
    fields=fields,
    format="TSV",
    size="15000",
)

In [5]:
# The parameters are passed to 'json' (request body) rather than 'params' in this case
response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)
# Fail loudly on an HTTP error instead of letting an error payload be saved as the manifest.
response.raise_for_status()

In [6]:
# Number of characters in the UTF-8 decoded response body (quick sanity check).
len(str(response.content, "utf-8"))

172247

In [7]:
# The payload is decoded as UTF-8, so write it back with the same encoding
# instead of whatever the platform default happens to be.
with open("manifest.txt", "w", encoding="utf-8") as manifest:
    manifest.write(response.content.decode("utf-8"))

In [8]:
# Second query with the same filters, this time without "return_type", so the
# response is a TSV table of the requested fields.
# With GET the query travels in the URL, hence the filters are serialized to JSON text.
params = {
    "filters": json.dumps(filters),
    "fields": "primary_site,cases.project.project_id,file_name,cases.demographic.vital_status,cases.demographic.gender,cases.diagnoses.age_at_diagnosis,cases.diagnoses.days_to_last_follow_up,cases.demographic.days_to_death,cases.demographic.days_to_birth,cases.submitter_id,samples.portions.analytes.aliquots.submitter_id,cases.diagnoses.last_known_disease_status,cases.diagnoses.tumor_stage,cases.exposures.years_smoked,cases.exposures.cigarettes_per_day,cases.samples.portions.analytes.aliquots.submitter_id,cases.samples.longest_dimension,cases.samples.shortest_dimension,cases.samples.tumor_descriptor",
    "format": "TSV",
    "size": "50000"
    }
# Same endpoint as the manifest request, so reuse the constant instead of repeating the URL.
response = requests.get(files_endpt, headers = {"Content-Type": "application/json"}, params = params)
# Stop on an HTTP error rather than writing the error body to files.txt.
response.raise_for_status()
# Written explicitly as UTF-8: pd.read_csv assumes UTF-8 when the file is read back.
with open("files.txt", "w", encoding="utf-8") as files:
    files.write(response.content.decode("utf-8"))

In [9]:
# Load the downloaded TSV, drop the "id" column, index the rows by file name
# and give the aliquot barcode column a short, miRNA-specific name.
df_files = (
    pd.read_csv("files.txt", sep="\t")
    .drop(columns="id")
    .set_index("file_name")
    .rename(columns={"cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id": "sample_submitter_id_mirna"})
)
df_files.head(2)

Unnamed: 0_level_0,cases.0.demographic.days_to_birth,cases.0.demographic.days_to_death,cases.0.demographic.gender,cases.0.demographic.vital_status,cases.0.diagnoses.0.age_at_diagnosis,cases.0.diagnoses.0.days_to_last_follow_up,cases.0.diagnoses.0.last_known_disease_status,cases.0.exposures.0.cigarettes_per_day,cases.0.exposures.0.years_smoked,cases.0.project.project_id,cases.0.samples.0.longest_dimension,sample_submitter_id_mirna,cases.0.samples.0.shortest_dimension,cases.0.samples.0.tumor_descriptor,cases.0.submitter_id
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
03f54564-65bb-4561-ad33-ec78b883c217.mirbase21.mirnas.quantification.txt,-26451.0,,female,Alive,26451.0,652.0,not reported,1.643836,24.0,TCGA-LUAD,0.8,TCGA-55-7726-11A-01H-2169-13,0.4,,TCGA-55-7726
28aeff63-ff53-417c-868c-156b9ee973c9.mirbase21.mirnas.quantification.txt,-19200.0,,female,Alive,19200.0,719.0,not reported,1.09589,,TCGA-LUAD,1.5,TCGA-44-7672-01A-11H-2065-13,0.8,,TCGA-44-7672


In [10]:
# Distinct non-missing tumor descriptors (an empty array means none were returned).
pd.unique(df_files["cases.0.samples.0.tumor_descriptor"].dropna())

array([], dtype=float64)

In [11]:
# Persist the miRNA file table; the file_name index is written as the first column.
df_files.to_csv("files_miRNA.dat", header=True, index=True)

## FPKM

In [12]:
# Tissue project filters: every clause is an "in" test on one field, and the
# outer "and" requires all of them to hold.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": allowed_values}}
        for field_name, allowed_values in [
            ("files.data_type", ["Gene Expression Quantification"]),
            ("files.analysis.workflow_type", ["HTSeq - FPKM"]),
            ("files.data_format", ["TXT"]),
            ("cases.project.project_id", ["TCGA-LUAD", "TCGA-LUSC"]),
        ]
    ],
}

In [13]:
# Body of the manifest request. It is sent with POST as JSON, so `filters` can
# stay a plain dict instead of being serialized by hand.
# Remove "return_type" to get the requested file fields instead of a manifest.
params = dict(
    return_type="manifest",
    filters=filters,
    fields=fields,
    format="TSV",
    size="15000",
)

In [14]:
# The parameters are passed to 'json' (request body) rather than 'params' in this case
response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)
# Fail loudly on an HTTP error instead of letting an error payload be saved as the manifest.
response.raise_for_status()

In [15]:
# Number of characters in the UTF-8 decoded response body (quick sanity check).
len(str(response.content, "utf-8"))

154602

In [16]:
# The payload is decoded as UTF-8, so write it back with the same encoding
# instead of whatever the platform default happens to be.
# NOTE(review): this reuses the name manifest.txt, so the miRNA manifest saved
# earlier in the notebook is overwritten - confirm that is intended.
with open("manifest.txt", "w", encoding="utf-8") as manifest:
    manifest.write(response.content.decode("utf-8"))

In [17]:
# Second query with the same filters, this time without "return_type", so the
# response is a TSV table of the requested fields.
# With GET the query travels in the URL, hence the filters are serialized to JSON text.
params = {
    "filters": json.dumps(filters),
    "fields": "primary_site,cases.project.project_id,file_name,cases.demographic.vital_status,cases.demographic.gender,cases.diagnoses.age_at_diagnosis,cases.diagnoses.days_to_last_follow_up,cases.demographic.days_to_death,cases.demographic.days_to_birth,cases.submitter_id,samples.portions.analytes.aliquots.submitter_id,cases.diagnoses.last_known_disease_status,cases.diagnoses.tumor_stage,cases.exposures.years_smoked,cases.exposures.cigarettes_per_day,cases.samples.portions.analytes.aliquots.submitter_id",
    "format": "TSV",
    "size": "50000"
    }
# Same endpoint as the manifest request, so reuse the constant instead of repeating the URL.
response = requests.get(files_endpt, headers = {"Content-Type": "application/json"}, params = params)
# Stop on an HTTP error rather than writing the error body to files.txt.
response.raise_for_status()
# Written explicitly as UTF-8: pd.read_csv assumes UTF-8 when the file is read back.
with open("files.txt", "w", encoding="utf-8") as files:
    files.write(response.content.decode("utf-8"))

In [18]:
# Load the downloaded TSV, drop the "id" column, index the rows by file name
# and give the aliquot barcode column a short name.
df_files = (
    pd.read_csv("files.txt", sep="\t")
    .drop(columns="id")
    .set_index("file_name")
    .rename(columns={"cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id": "sample_submitter_id"})
)
df_files.head(2)

Unnamed: 0_level_0,cases.0.demographic.days_to_birth,cases.0.demographic.days_to_death,cases.0.demographic.gender,cases.0.demographic.vital_status,cases.0.diagnoses.0.age_at_diagnosis,cases.0.diagnoses.0.days_to_last_follow_up,cases.0.diagnoses.0.last_known_disease_status,cases.0.exposures.0.cigarettes_per_day,cases.0.exposures.0.years_smoked,cases.0.project.project_id,sample_submitter_id,cases.0.submitter_id
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
12fff07f-74d6-4ee0-9f4f-36a4d194d24f.FPKM.txt.gz,-25069.0,161.0,male,Dead,25069.0,,not reported,1.369863,15.0,TCGA-LUAD,TCGA-MP-A4T8-01A-11R-A24X-07,TCGA-MP-A4T8
05d2b1e9-22a4-4369-b622-63846d672875.FPKM.txt.gz,-27993.0,1790.0,female,Dead,27993.0,,not reported,2.739726,,TCGA-LUAD,TCGA-MP-A4T6-01A-32R-A262-07,TCGA-MP-A4T6


In [19]:
# Persist the FPKM file table; the file_name index is written as the first column.
df_files.to_csv("files_fpkm.dat", header=True, index=True)

# Merge

In [20]:
# Reload the FPKM file table; read this way, file_name is an ordinary column.
df_messangers = pd.read_csv("files_fpkm.dat", sep=",")
df_messangers.head(n=2)

Unnamed: 0,file_name,cases.0.demographic.days_to_birth,cases.0.demographic.days_to_death,cases.0.demographic.gender,cases.0.demographic.vital_status,cases.0.diagnoses.0.age_at_diagnosis,cases.0.diagnoses.0.days_to_last_follow_up,cases.0.diagnoses.0.last_known_disease_status,cases.0.exposures.0.cigarettes_per_day,cases.0.exposures.0.years_smoked,cases.0.project.project_id,sample_submitter_id,cases.0.submitter_id
0,12fff07f-74d6-4ee0-9f4f-36a4d194d24f.FPKM.txt.gz,-25069.0,161.0,male,Dead,25069.0,,not reported,1.369863,15.0,TCGA-LUAD,TCGA-MP-A4T8-01A-11R-A24X-07,TCGA-MP-A4T8
1,05d2b1e9-22a4-4369-b622-63846d672875.FPKM.txt.gz,-27993.0,1790.0,female,Dead,27993.0,,not reported,2.739726,,TCGA-LUAD,TCGA-MP-A4T6-01A-32R-A262-07,TCGA-MP-A4T6


In [21]:
# Reload the miRNA file table; read this way, file_name is an ordinary column.
df_miRNA = pd.read_csv("files_miRNA.dat", sep=",")
df_miRNA.head(n=2)

Unnamed: 0,file_name,cases.0.demographic.days_to_birth,cases.0.demographic.days_to_death,cases.0.demographic.gender,cases.0.demographic.vital_status,cases.0.diagnoses.0.age_at_diagnosis,cases.0.diagnoses.0.days_to_last_follow_up,cases.0.diagnoses.0.last_known_disease_status,cases.0.exposures.0.cigarettes_per_day,cases.0.exposures.0.years_smoked,cases.0.project.project_id,cases.0.samples.0.longest_dimension,sample_submitter_id_mirna,cases.0.samples.0.shortest_dimension,cases.0.samples.0.tumor_descriptor,cases.0.submitter_id
0,03f54564-65bb-4561-ad33-ec78b883c217.mirbase21...,-26451.0,,female,Alive,26451.0,652.0,not reported,1.643836,24.0,TCGA-LUAD,0.8,TCGA-55-7726-11A-01H-2169-13,0.4,,TCGA-55-7726
1,28aeff63-ff53-417c-868c-156b9ee973c9.mirbase21...,-19200.0,,female,Alive,19200.0,719.0,not reported,1.09589,,TCGA-LUAD,1.5,TCGA-44-7672-01A-11H-2065-13,0.8,,TCGA-44-7672


In [22]:
# Keep a single file per case (the first one encountered) and rename the
# file-name columns so they remain distinguishable once the tables are combined.
df_messangers = (
    df_messangers
    .drop_duplicates(subset="cases.0.submitter_id", keep="first")
    .rename(columns={"file_name": "file_fpkm"})
)
df_miRNA = (
    df_miRNA
    .drop_duplicates(subset="cases.0.submitter_id", keep="first")
    .rename(columns={"file_name": "file_miRNA"})
)

In [23]:
# Put the miRNA and FPKM tables side by side, one row per case.
# pd.concat(axis=1) aligns the rows on the case id and keeps cases present in
# only one of the tables (outer join). It replaces the former
# transpose/append/transpose idiom: DataFrame.append was removed in pandas 2.0,
# and the double transpose also turned every column into object dtype.
# Columns present in both tables still appear twice, matching the previous layout.
df_files = pd.concat(
    [
        df_miRNA.set_index("cases.0.submitter_id"),
        df_messangers.set_index("cases.0.submitter_id"),
    ],
    axis=1,
    sort=False,
)
df_files.head(2)

Unnamed: 0_level_0,file_miRNA,cases.0.demographic.days_to_birth,cases.0.demographic.days_to_death,cases.0.demographic.gender,cases.0.demographic.vital_status,cases.0.diagnoses.0.age_at_diagnosis,cases.0.diagnoses.0.days_to_last_follow_up,cases.0.diagnoses.0.last_known_disease_status,cases.0.exposures.0.cigarettes_per_day,cases.0.exposures.0.years_smoked,...,cases.0.demographic.days_to_death,cases.0.demographic.gender,cases.0.demographic.vital_status,cases.0.diagnoses.0.age_at_diagnosis,cases.0.diagnoses.0.days_to_last_follow_up,cases.0.diagnoses.0.last_known_disease_status,cases.0.exposures.0.cigarettes_per_day,cases.0.exposures.0.years_smoked,cases.0.project.project_id,sample_submitter_id
cases.0.submitter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-55-7726,03f54564-65bb-4561-ad33-ec78b883c217.mirbase21...,-26451.0,,female,Alive,26451.0,652.0,not reported,1.643836,24.0,...,,female,Alive,26451.0,652.0,not reported,1.643836,24.0,TCGA-LUAD,TCGA-55-7726-01A-11R-2170-07
TCGA-44-7672,28aeff63-ff53-417c-868c-156b9ee973c9.mirbase21...,-19200.0,,female,Alive,19200.0,719.0,not reported,1.09589,,...,,female,Alive,19200.0,719.0,not reported,1.09589,,TCGA-LUAD,TCGA-44-7672-01A-11R-2066-07


In [24]:
# Persist the per-case table that pairs the miRNA and FPKM files.
df_files.to_csv("files_manifest.dat", index=True)

Discordant LUSC samples are described in [Papaleo et al.; *BMC Cancer* (**2019**)](https://bmccancer.biomedcentral.com/articles/10.1186/s12885-019-5965-x).

In [25]:
# Table of discordant LUSC samples, read without a header row.
df_discordant = pd.read_csv(
    "https://raw.githubusercontent.com/ELELAB/LUAD_LUSC_TCGA_comparison/master/6-recount/LUSC/discordant_samples.txt",
    header=None,
)

# Subtype annotations restricted to the LUAD/LUSC rows and re-keyed by the
# first three dash-separated parts of pan.samplesID.
df_biolinks = pd.read_csv("TCGA_biolinks.csv", index_col=0)
df_biolinks = (
    df_biolinks
    .loc[df_biolinks["cancer.type"].isin(["LUAD", "LUSC"])]
    .set_index("pan.samplesID")
)
# Assigning a plain list (rather than a mapped Index) leaves the new index unnamed.
df_biolinks.index = ["-".join(barcode.split("-")[:3]) for barcode in df_biolinks.index]
df_biolinks.head(2)

Unnamed: 0,cancer.type,Subtype_mRNA,Subtype_DNAmeth,Subtype_protein,Subtype_miRNA,Subtype_CNA,Subtype_Integrative,Subtype_other,Subtype_Selected
TCGA-05-4249,LUAD,,,,,,5,,LUAD.5
TCGA-05-4382,LUAD,,,,,,6,,LUAD.6


In [26]:
def get_tissue_type(sample_id):
    """Classify a TCGA barcode as "tumor" or "normal" from its sample-type code.

    The fourth dash-separated field of the barcode starts with a two-digit
    sample-type code, optionally followed by a vial letter (e.g. "01A").
    In the TCGA scheme codes 01-09 are tumors, 10-19 normals and 20-29
    controls. As before, every non-tumor code is reported as "normal".

    Parameters
    ----------
    sample_id : str
        A barcode such as "TCGA-55-7726-11A-01H-2169-13".

    Returns
    -------
    str or object
        "tumor" or "normal". When no code can be parsed (missing value,
        barcode with fewer than four fields, non-numeric code) the input is
        returned unchanged, so missing values stay missing.
    """
    try:
        # Take the two leading digits instead of stripping the last character,
        # so barcodes without a vial letter ("...-11") are parsed correctly too.
        sample_type_code = int(sample_id.split("-")[3][:2])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Not a string (e.g. NaN), too few fields, or a non-numeric code.
        return sample_id
    # Code 10 is already a normal sample, hence ">=" rather than ">".
    return "normal" if sample_type_code >= 10 else "tumor"

In [27]:
# Derive a tissue-type column from each of the two aliquot barcode columns.
df_files["sample_submitter_id_type"] = [
    get_tissue_type(barcode) for barcode in df_files["sample_submitter_id"]
]
df_files["sample_submitter_id_mirna_type"] = [
    get_tissue_type(barcode) for barcode in df_files["sample_submitter_id_mirna"]
]

In [28]:
# Attach the subtype annotations by index and discard rows that are entirely empty.
df_files = df_files.join(df_biolinks)
df_files = df_files.dropna(how="all", axis=0)
# Relabel the rows whose "sample_submitter_id" barcode occurs anywhere in the
# discordant table. A single vectorized membership test replaces the former
# per-row scan of the whole table. Missing values are dropped from the lookup
# so that rows without a barcode are never matched.
discordant_ids = pd.Series(df_discordant.values.ravel()).dropna()
is_discordant = df_files["sample_submitter_id"].isin(discordant_ids)
df_files.loc[is_discordant, "cancer.type"] = "Discordant_LUSC"
df_files.to_csv("files.dat")
df_files.head(2)

Unnamed: 0_level_0,file_miRNA,cases.0.demographic.days_to_birth,cases.0.demographic.days_to_death,cases.0.demographic.gender,cases.0.demographic.vital_status,cases.0.diagnoses.0.age_at_diagnosis,cases.0.diagnoses.0.days_to_last_follow_up,cases.0.diagnoses.0.last_known_disease_status,cases.0.exposures.0.cigarettes_per_day,cases.0.exposures.0.years_smoked,...,sample_submitter_id_mirna_type,cancer.type,Subtype_mRNA,Subtype_DNAmeth,Subtype_protein,Subtype_miRNA,Subtype_CNA,Subtype_Integrative,Subtype_other,Subtype_Selected
cases.0.submitter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-55-7726,03f54564-65bb-4561-ad33-ec78b883c217.mirbase21...,-26451.0,,female,Alive,26451.0,652.0,not reported,1.643836,24.0,...,normal,LUAD,,low,,,,6,,LUAD.6
TCGA-44-7672,28aeff63-ff53-417c-868c-156b9ee973c9.mirbase21...,-19200.0,,female,Alive,19200.0,719.0,not reported,1.09589,,...,tumor,LUAD,,intermediate,,,,6,,LUAD.6


In [29]:
# Side-by-side view of both aliquot barcodes and the tissue type derived from each.
df_files.loc[
    :,
    [
        "sample_submitter_id",
        "sample_submitter_id_type",
        "sample_submitter_id_mirna",
        "sample_submitter_id_mirna_type",
    ],
]

Unnamed: 0_level_0,sample_submitter_id,sample_submitter_id_type,sample_submitter_id_mirna,sample_submitter_id_mirna_type
cases.0.submitter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-55-7726,TCGA-55-7726-01A-11R-2170-07,tumor,TCGA-55-7726-11A-01H-2169-13,normal
TCGA-44-7672,TCGA-44-7672-01A-11R-2066-07,tumor,TCGA-44-7672-01A-11H-2065-13,tumor
TCGA-50-5072,TCGA-50-5072-01A-21R-1858-07,tumor,TCGA-50-5072-01A-21H-1857-13,tumor
TCGA-05-4425,TCGA-05-4425-01A-01R-1755-07,tumor,TCGA-05-4425-01A-01T-1754-13,tumor
TCGA-69-A59K,TCGA-69-A59K-01A-11R-A262-07,tumor,TCGA-69-A59K-01A-11H-A263-13,tumor
...,...,...,...,...
TCGA-22-4599,TCGA-22-4599-01A-01R-1443-07,tumor,,
TCGA-39-5016,TCGA-39-5016-01A-01R-1443-07,tumor,,
TCGA-21-1071,TCGA-21-1071-01A-01R-0692-07,tumor,,
TCGA-22-1002,TCGA-22-1002-01A-01R-0692-07,tumor,,


In [30]:
# Distinct labels in the cancer-type column, missing values included.
pd.unique(df_files["cancer.type"])

array(['LUAD', nan, 'LUSC', 'Discordant_LUSC'], dtype=object)