This repository has been archived by the owner on Jun 21, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 67
/
get-tcga-capture_kit.py
79 lines (67 loc) · 2.53 KB
/
get-tcga-capture_kit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
################## PURPOSE #################
#
# retrieve the TCGA exome capture kit information from GDC's file API endpoint
# so that we can use the correct BED file for processes like somatic calling,
# TMB calculation, or other related analyses
#
############################################
import requests
import json
import pandas as pd
import os
# 1. get TCGA manifest from the data release
## set data bucket base url and version
PBTA_BUCKET = 'https://s3.amazonaws.com/kf-openaccess-us-east-1-prd-pbta/data/'
RELEASE = 'release-v14-20200203/'
## GET and load manifest; stop on an HTTP error so that an error page
## is never parsed as if it were the manifest
manifest_response = requests.get(
    PBTA_BUCKET + RELEASE + "pbta-tcga-manifest.tsv")
manifest_response.raise_for_status()
tcga_manifest = manifest_response.content.decode("utf-8")
## iterate TCGA manifest, to get all the file names:
## skip the first line (header) and keep the first tab-separated column
tcga_manifest_lines = tcga_manifest.splitlines()
tcga_filenames = [
    line.split("\t")[0]
    for line in tcga_manifest_lines[1:]
    if line.strip()  # a blank line would otherwise become an empty file name
]
# 2. hit GDC file API endpoint to get the details of the capture kit
## set GDC API base url and request headers
gdc_url = 'https://api.gdc.cancer.gov/files'
headers = {'Content-Type': 'application/json'}
## API request field, removed "analysis.metadata.read_groups.read_group_name"
## can add that back for details
fields = [
    'file_name',
    'analysis.metadata.read_groups.target_capture_kit_name',
    'analysis.metadata.read_groups.target_capture_kit_target_region',
]
fields = ','.join(fields)
## API request body
## NOTE(review): the GDC documentation shows the `in` operator for a list of
## values; `=` is kept here unchanged -- confirm it matches every file name.
payload = {
    'filters': {
        'op': '=',
        'content': {
            'field': 'file_name',
            'value': tcga_filenames,
        },
    },
    'format': 'json',
    'fields': fields,
    # make sure we get all the returns: never ask for fewer hits than the
    # number of file names queried, so a manifest with more than 5000 files
    # is not silently truncated
    'size': max(5000, len(tcga_filenames)),
}
payload = json.dumps(payload)
## hit GDC API file endpoint
gdc_response = requests.post(gdc_url, headers=headers, data=payload)
# 3. handle GDC API return to find out capture kit url
## stop on an HTTP error before trying to read the body as a query result
gdc_response.raise_for_status()
gdc_response = gdc_response.json()
capture_kits = []
files_without_read_groups = []
## iterate .data.hits entity manifest, one output row per read group
for hit in gdc_response['data']['hits']:
    # the nested metadata may be absent from a hit; treat it as "no read
    # groups" instead of aborting the whole run with a KeyError
    analysis = hit.get('analysis') or {}
    metadata = analysis.get('metadata') or {}
    read_groups = metadata.get('read_groups') or []
    if not read_groups:
        files_without_read_groups.append(hit['file_name'])
        continue
    for read_group in read_groups:
        capture_kits.append([
            hit['file_name'],
            # a read group lacking a kit field yields an empty value (None)
            read_group.get('target_capture_kit_name'),
            read_group.get('target_capture_kit_target_region'),
        ])
## report skipped files so that missing kit information is not silent
if files_without_read_groups:
    print("WARNING: no read group metadata returned for {} file(s): {}".format(
        len(files_without_read_groups), ", ".join(files_without_read_groups)))
# 4. load capture kit into data frame, find unique kit download url
## name the columns at construction time so that an empty result still gives
## a header-only table instead of a column length-mismatch error
df = pd.DataFrame(
    capture_kits, columns=['filename', 'kit_name', 'kit_url']
).drop_duplicates()
# 5. output the capture kit data frame
## the output path is resolved relative to this script, not the working dir
py_path = os.path.dirname(os.path.realpath(__file__))
output_csv_path = os.path.join(py_path, '../results/tcga-capture_kit-info.tsv')
## create the results directory when it does not exist yet, otherwise the
## write below fails after all the network work has already been done
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
df.to_csv(output_csv_path, sep='\t', index=False)