This notebook is a step-by-step breakdown of a Python script that aims to retrieve the TCGA exome capture kit information from the GDC files API endpoint, so that we can use the correct BED file for processes such as somatic calling and TMB calculation.

We downloaded the TCGA data based on a requested manifest. Ideally, all file UUIDs would be captured in that manifest; instead, this script uses the released [TCGA manifest](https://s3.amazonaws.com/kf-openaccess-us-east-1-prd-pbta/data/release-v14-20200203/pbta-tcga-manifest.tsv) as the base to retrieve each file's GDC file UUID and then track down the file metadata to get the actual target_capture_kit info.

In [1]:
import requests
import json
import pandas as pd
import os

# 1. get TCGA manifest from the data release
## set data bucket base url and version
PBTA_BUCKET = 'https://s3.amazonaws.com/kf-openaccess-us-east-1-prd-pbta/data/'
RELEASE = 'release-v14-20200203/'

## GET and load manifest
manifest_response = requests.get(PBTA_BUCKET + RELEASE + "pbta-tcga-manifest.tsv")
## stop here on a failed download instead of parsing an error page as a manifest
manifest_response.raise_for_status()
## .text is the decoded str; .content is bytes, and splitting bytes with a
## str separator raises TypeError on Python 3
tcga_manifest = manifest_response.text

## iterate TCGA manifest, to get all the file names (first tab-separated column)
## blank lines (e.g. after the final newline) are skipped so that no empty
## file name is sent to the GDC filter
## NOTE(review): if the manifest starts with a header row, its first column
## name is collected as well -- confirm whether it should be dropped
tcga_manifest_lines = tcga_manifest.splitlines()
tcga_filenames = [
    line.split("\t")[0] for line in tcga_manifest_lines if line.strip()
]


In [2]:
# 2. hit GDC file API endpoint to get the details of the capture kit
## set GDC API base url and request headers
gdc_url = 'https://api.gdc.cancer.gov/files'
headers = {'Content-Type': 'application/json'}

## API request field, removed "analysis.metadata.read_groups.read_group_name"
## can add that back for details
field_names = [
    'file_name',
    'analysis.metadata.read_groups.target_capture_kit_name',
    'analysis.metadata.read_groups.target_capture_kit_target_region'
]
## the request carries the fields as one comma-separated string
fields = ','.join(field_names)

## API request body: match files whose file_name is in the manifest list
## NOTE(review): 'size' presumably caps the number of hits returned; verify
## the manifest holds fewer than 5000 files, otherwise results get truncated
request_body = {
    'filters': {
        'op': '=',
        'content': {
            'field': 'file_name',
            'value': tcga_filenames}},
    'format': 'json',
    'fields': fields,
    'size': 5000
}
## kept under a separate name so `payload` is only ever the serialized string
payload = json.dumps(request_body)

## hit GDC API file endpoint
gdc_response = requests.post(gdc_url, headers=headers, data=payload)
## surface HTTP errors here rather than as a KeyError when the body is parsed
gdc_response.raise_for_status()

In [3]:
# 3. handle GDC API return to find out capture kit url
## parse into a new name: gdc_response stays the raw response object, so this
## cell can be re-run without re-sending the request
gdc_hits = gdc_response.json()['data']['hits']
capture_kits = []

## iterate .data.hits entity manifest
## an example of the return body
# {
#   "data": {
#     "hits": [
#       {
#         "analysis": {
#           "metadata": {
#             "read_groups": [
#               {
#                 "read_group_name": "0.2",
#                 "target_capture_kit_target_region": "...",
#                 "target_capture_kit_name": "..."
#               },
#               ...
#             ]
#           }
#         },
#         "file_name": "C282.TCGA-12-0773-10A-01W.8_gdc_realn.bam",
#         "id": "b858579f-99c6-4802-afcd-a91dcbb28dc3",
#         "experimental_strategy": "WXS"
#       }
#     ],
# ...
# }
## one row per (file, read group): [file_name, kit name, kit target region]
for hit in gdc_hits:
    read_groups = (
        hit.get('analysis', {})
           .get('metadata', {})
           .get('read_groups', [])
    )
    ## a hit without read groups still gets one row (with empty kit values)
    ## so the file stays visible in the output instead of raising KeyError
    for read_group in read_groups or [{}]:
        capture_kits.append([
            hit['file_name'],
            read_group.get('target_capture_kit_name'),
            read_group.get('target_capture_kit_target_region')
        ])
    

In [4]:
# 4. load capture kit into data frame, find unique kit download url
## columns are named in the constructor so an empty capture_kits list gives an
## empty 3-column frame; assigning df.columns afterwards raises a length
## mismatch on the zero-column frame built from an empty list
## drop_duplicates() collapses the repeated per-read-group rows of a file
df = pd.DataFrame(
    capture_kits, columns=['filename', 'kit_name', 'kit_url']
).drop_duplicates()
df.head()

Unnamed: 0,filename,kit_name,kit_url
0,C494.TCGA-DU-5855-10A-01D-1705-08.5_gdc_realn.bam,"Custom V2 Exome Bait, 48 RXN X 16 tubes",https://bitbucket.org/cghub/cghub-capture-kit-...
21,C494.TCGA-DU-5847-10A-01D-1705-08.5_gdc_realn.bam,"Custom V2 Exome Bait, 48 RXN X 16 tubes",https://bitbucket.org/cghub/cghub-capture-kit-...
42,C494.TCGA-HT-7681-10C-01D-2396-08.1_gdc_realn.bam,"Custom V2 Exome Bait, 48 RXN X 16 tubes",https://bitbucket.org/cghub/cghub-capture-kit-...
61,C494.TCGA-P5-A737-10A-01D-A329-08.1_gdc_realn.bam,"Custom V2 Exome Bait, 48 RXN X 16 tubes",https://bitbucket.org/cghub/cghub-capture-kit-...
79,C494.TCGA-DB-A4XG-10A-01D-A27N-08.4_gdc_realn.bam,"Custom V2 Exome Bait, 48 RXN X 16 tubes",https://bitbucket.org/cghub/cghub-capture-kit-...


In [5]:
# 5. output the capture kit information
## to_csv is a DataFrame method (pandas has no top-level pd.to_csv); the file
## is written into the current working directory
## index=False: the row labels left over after drop_duplicates() carry no
## information, so they are not written as an extra unnamed column
df.to_csv(os.path.join(os.getcwd(), 'tcga-capture-kit.csv'), index=False)

https://bitbucket.org/cghub/cghub-capture-kit-info/raw/c38c4b9cb500b724de46546fd52f8d532fd9eba9/BI/vendor/Agilent/whole_exome_agilent_1.1_refseq_plus_3_boosters.targetIntervals.bed
