In [14]:
import requests
import json
import pandas as pd
from io import StringIO
from tqdm import tqdm
import datetime
from pathlib import Path

import re

In [15]:
def build_filter_files(project_id, data_type=None):
    """Build a GDC filter selecting files by project and data type.

    Parameters
    ----------
    project_id : str or iterable of str
        Value(s) matched against the ``cases.project.project_id`` field.
    data_type : str or iterable of str, optional
        Value(s) matched against the ``data_type`` field. Defaults to
        ``["Masked Somatic Mutation"]``.

    Returns
    -------
    dict
        An ``"and"`` filter combining two ``"in"`` clauses. Every call
        returns freshly built lists, so callers may mutate the result
        without affecting later calls.
    """
    # A mutable default list would be shared between calls (and embedded in
    # every returned filter), so the default is materialised per call.
    if data_type is None:
        data_type = ["Masked Somatic Mutation"]

    def _as_list(value):
        # A bare string is a single value, not an iterable of characters.
        return [value] if isinstance(value, str) else list(value)

    filters = {
        "op": "and",
        "content": [
            {
                "op": "in",
                "content": {
                    "field": "data_type",
                    "value": _as_list(data_type),
                },
            },
            {
                "op": "in",
                "content": {
                    "field": "cases.project.project_id",
                    "value": _as_list(project_id),
                },
            },
        ],
    }
    return filters

In [16]:
def get_file_ids(filters, size=60, timeout=None):
    """Query the GDC ``files`` endpoint and return the matching file IDs.

    Parameters
    ----------
    filters : dict
        GDC filter expression; it is JSON-encoded into the ``filters``
        query parameter.
    size : int or str, optional
        Value of the ``size`` query parameter, i.e. the maximum number of
        records requested. Defaults to 60 (the previously hard-coded
        value); matches beyond this limit are not returned, so raise it
        when a query can match more files.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. ``None`` (the default) keeps the
        library default of waiting indefinitely.

    Returns
    -------
    list
        Values of the ``file_id`` column of the TSV response, in response
        order.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    pandas.errors.EmptyDataError
        If the response body is empty (presumably what the API sends when
        nothing matches -- confirm against the GDC documentation).
    KeyError
        If the response has no ``file_id`` column.
    """
    files_endpt = "https://api.gdc.cancer.gov/files"

    fields = [
        "file_id",
        "cases.project.project_id",
        "data_format",
        "data_category",
        "data_type",
        "annotations.case_submitter_id"
    ]

    params = {
        "filters": json.dumps(filters),
        "fields": ','.join(fields),
        "format": "TSV",
        "size": str(size)
    }

    response = requests.get(files_endpt, params=params, timeout=timeout)
    # Fail on HTTP errors here instead of trying to parse an error body as TSV.
    response.raise_for_status()

    out_df = pd.read_csv(StringIO(response.content.decode("utf-8")), sep='\t')

    return out_df['file_id'].tolist()

In [17]:
def download_data(file_ids, timeout=None):
    """Request the given files from the GDC ``data`` endpoint.

    Parameters
    ----------
    file_ids : str or iterable of str
        File ID(s) to download. A single string is treated as one ID.
    timeout : float or tuple, optional
        Forwarded to ``requests.post``. ``None`` (the default) keeps the
        library default of waiting indefinitely.

    Returns
    -------
    requests.Response
        The successful response; the payload is in ``response.content``.

    Raises
    ------
    ValueError
        If ``file_ids`` is empty (no request is sent).
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    """
    # Normalise first: a bare string must not be split into characters, and
    # generators / pandas objects must become a JSON-serialisable list.
    ids = [file_ids] if isinstance(file_ids, str) else list(file_ids)
    if not ids:
        raise ValueError("file_ids is empty; nothing to download")

    data_endpt = "https://api.gdc.cancer.gov/data"

    response = requests.post(
        data_endpt,
        data=json.dumps({"ids": ids}),
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )
    # Surface HTTP errors here rather than handing an error body to the caller.
    response.raise_for_status()

    return response

In [18]:
# TCGA study abbreviations and their descriptions, kept as explicit pairs so
# that parsing never depends on the exact whitespace inside a text blob.
# In the recorded run of this notebook, file-ID retrieval failed for the
# LCML, CNTL, FPPP and MISC entries.
_tcga_studies = [
    ("LAML", "Acute Myeloid Leukemia"),
    ("ACC", "Adrenocortical carcinoma"),
    ("BLCA", "Bladder Urothelial Carcinoma"),
    ("LGG", "Brain Lower Grade Glioma"),
    ("BRCA", "Breast invasive carcinoma"),
    ("CESC", "Cervical squamous cell carcinoma and endocervical adenocarcinoma"),
    ("CHOL", "Cholangiocarcinoma"),
    ("LCML", "Chronic Myelogenous Leukemia"),
    ("COAD", "Colon adenocarcinoma"),
    ("CNTL", "Controls"),
    ("ESCA", "Esophageal carcinoma"),
    ("FPPP", "FFPE Pilot Phase II"),
    ("GBM", "Glioblastoma multiforme"),
    ("HNSC", "Head and Neck squamous cell carcinoma"),
    ("KICH", "Kidney Chromophobe"),
    ("KIRC", "Kidney renal clear cell carcinoma"),
    ("KIRP", "Kidney renal papillary cell carcinoma"),
    ("LIHC", "Liver hepatocellular carcinoma"),
    ("LUAD", "Lung adenocarcinoma"),
    ("LUSC", "Lung squamous cell carcinoma"),
    ("DLBC", "Lymphoid Neoplasm Diffuse Large B-cell Lymphoma"),
    ("MESO", "Mesothelioma"),
    ("MISC", "Miscellaneous"),
    ("OV", "Ovarian serous cystadenocarcinoma"),
    ("PAAD", "Pancreatic adenocarcinoma"),
    ("PCPG", "Pheochromocytoma and Paraganglioma"),
    ("PRAD", "Prostate adenocarcinoma"),
    ("READ", "Rectum adenocarcinoma"),
    ("SARC", "Sarcoma"),
    ("SKCM", "Skin Cutaneous Melanoma"),
    ("STAD", "Stomach adenocarcinoma"),
    ("TGCT", "Testicular Germ Cell Tumors"),
    ("THYM", "Thymoma"),
    ("THCA", "Thyroid carcinoma"),
    ("UCS", "Uterine Carcinosarcoma"),
    ("UCEC", "Uterine Corpus Endometrial Carcinoma"),
    ("UVM", "Uveal Melanoma"),
]

# Tab-separated "CODE<TAB>description" text, one study per line. Every row now
# uses exactly one tab (the GBM row previously had extra padding after it).
tcga_name_descr = '\n'.join('\t'.join(pair) for pair in _tcga_studies)

# Study abbreviations only, in table order.
tcga_project_ids = [code for code, _ in _tcga_studies]

In [19]:
# main test
# For every TCGA study code: look up the matching file IDs, request them from
# the data endpoint in one call, and save the payload into a fresh,
# time-stamped directory. A failure for one project is reported and skipped.
projects_ids = [
    '-'.join(['TCGA', study_code]) for study_code in tcga_project_ids
]

current_time = datetime.datetime.now().strftime("%Y_%m_%d__%I_%M_%S_%p")

# Relative to the working directory; no hard-coded Windows path separator,
# so the same directory is created on every platform.
target_dir = 'MAF_raw_' + current_time

dir_path = Path(target_dir)
dir_path.mkdir(exist_ok=True)

for project_id in tqdm(projects_ids):

    file_filter = build_filter_files(project_id)
    try:
        file_ids = get_file_ids(file_filter)
    except Exception as exc:
        # `except Exception` lets Ctrl-C / kernel interrupt through, and the
        # cause is reported instead of being discarded. tqdm.write keeps the
        # progress bar intact.
        tqdm.write('{}  file ids retrieval failed: {!r}'.format(project_id, exc))
        continue

    if not file_ids:
        tqdm.write('{}  no matching files'.format(project_id))
        continue

    try:
        response = download_data(file_ids)
        # Never write an HTTP error body to disk as if it were data.
        response.raise_for_status()

        response_head_cd = response.headers["Content-Disposition"]

        name_match = re.search("filename=(.+)", response_head_cd)
        if name_match is None:
            raise ValueError(
                'no filename in Content-Disposition: {!r}'.format(response_head_cd)
            )
        # Drop optional quoting, and keep only the final path component so a
        # header value can never point outside dir_path.
        served_name = Path(name_match.group(1).strip().strip('"')).name

        file_name = '_'.join([project_id, served_name])

        file_path = dir_path / file_name

        with open(file_path, "wb") as output_file:
            output_file.write(response.content)
    except Exception as exc:
        tqdm.write('{}  download failed: {!r}'.format(project_id, exc))




 22%|██▏       | 8/37 [03:55<11:21, 23.50s/it]

TCGA-LCML  file ids retriveal failed


 27%|██▋       | 10/37 [04:35<08:57, 19.90s/it]

TCGA-CNTL  file ids retriveal failed


 32%|███▏      | 12/37 [05:12<07:21, 17.65s/it]

TCGA-FPPP  file ids retriveal failed


 62%|██████▏   | 23/37 [10:51<05:45, 24.67s/it]

TCGA-MISC  file ids retriveal failed


100%|██████████| 37/37 [19:14<00:00, 31.20s/it]
