Skip to content
Merged

4dn #41

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions scripts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,36 @@ optional arguments:
-o json_fn, --output json_fn
the output json filename
```
## get_metadata_from_4DNucleosome.py

Retrieves the following information keyed on the BAM file accession (4DNFIXXXXXXX) using the 4D Nucleosome API.
- experiment accession (ENSRXXXXXX)
- assay name
- biosample accession (ENCBSXXXXXX)
- strain info, run type (single/paired end)
- target ("None" if not applicable)
- file size
- total reads
- read length
- genome assembly

```
usage: get_metadata_from_4DNucleosome.py [-h] -i input_fn -o json_fn

Retrieve 4D Nucleosome metadata from API for plotter.

optional arguments:
-h, --help show this help message and exit
-i input_fn, --input input_fn
the tab-delimited file with 4DNFI accessions of BAM
files in the first column
-o json_fn, --output json_fn
the output json filename
```

## Run tests
```
python get_metadata_from_ENCODE.py -i testdata/encode_samples.txt -o testdata/encode_samples.json
python get_metadata_from_TABfile.py -i testdata/samples.tab -o testdata/samples.json
python3 get_metadata_from_4DNucleosome.py -i testdata/4dnucleosome_sample.txt -o testdata/4dnucleosome_sample.json
```
209 changes: 209 additions & 0 deletions scripts/get_metadata_from_4DNucleosome.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
import argparse
import json
import requests

def getParams():
'''Parse parameters from the command line'''
parser = argparse.ArgumentParser(description='Retrieve 4D Nucleosome metadata from API for plotter.')

parser.add_argument('-i','--input', metavar='input_fn', required=True, help='the tab-delimited file with 4DNFI accessions of BAM files in the first column')
parser.add_argument('-o','--output', metavar='json_fn', required=True, help='the output json filename')

args = parser.parse_args()
return(args)


# Helper: 4DNFI to URL to payload
def fetch_data(url):
# Force return from the server in JSON format
headers = {'accept': 'application/json'}

# GET the search result
response = requests.get(url, headers=headers)

# Extract the JSON response as a python dictionary
search_results = response.json()
return(search_results)


# Main program which takes in input parameters
if __name__ == '__main__':

# Get params
args = getParams()

# Parse list of accessions
sample_list = []
reader = open(args.input, 'r')
for line in reader:
sample_list.append(line.strip().split('\t')[0])
reader.close()

# Initialize metadata storage dict
metadata = {}

# Parse payload for each accession
for bam_4DNFI in sample_list:
# Get payload for accession
url = 'https://data.4dnucleome.org/files-processed/%s/?format=json' % bam_4DNFI
data = fetch_data(url)

# Confirm payload accession
accession = data.get('accession', '4DNFIXXXXXXX').strip()
if (accession != bam_4DNFI):
print("Error: mismatched ENCFF (%s != %s)" % (accession, bam_4DNFI))
continue
experiments = data.get('experiments', [])
track_facet_info = data.get("track_and_facet_info", None)



# Get Library accession
# Okay that it's None
ENCLB = None

# Get Experiment Accession
ENCSR = None
for experiment in experiments:
if '@id' in experiment:
ENCSR = experiment['@id']
else:
print("No experiments or accession not in experiments")

# Get Experiment-dependent info
ENCBS = None
for experiment in experiments:
if 'biosample' in experiment:
biosample = experiment['biosample']
biosource = biosample["biosource"]
for id in biosource:
if "@id" in id:
ENCBS = id["@id"]
else:
print("No biosource or ENCBS not in biosource")
else:
print("No experiments or biosample not in experiments")

# Get Target
target = None
if track_facet_info is not None:
target = track_facet_info["assay_info"]
else:
print("No track_and_facet_info, can't find experiment_type")

# Get Biosample name
strain = None
for experiment in experiments:
if 'biosample' in experiment:
biosample = experiment['biosample']
biosource = biosample["biosource"]
for bio in biosource:
if "cell_line" in bio:
cell_line = bio["cell_line"]
else:
print("No biosource or cell_line not in biosource")
strain = cell_line["term_name"]
else:
print("No experiments or biosample not in experiments")

# Get Treatment (N/A for now)

# Get Assay
assay_title = None
if track_facet_info is not None:
assay_title = track_facet_info["experiment_type"]
else:
print("No track_and_facet_info, can't find experiment_type")

# Get Read Info
assembly = data.get("genome_assembly", None)

file_size = data.get('file_size', None)

# Get Total Reads
# CUT&RUN doens't have total reads
total_reads = None
if assay_title == "in situ Hi-C":
total_reads = None
quality_metric = data.get("quality_metric", [])
quality_metric_summary = quality_metric.get("quality_metric_summary", [])
for metric in quality_metric_summary:
if metric["title"] == "Total Reads":
total_reads = metric["value"]
break
else:
print("No quality_metric_summary, can't find total reads")

# Get all Fastq's from the json
fastq_list = []
if assay_title == "CUT&RUN":
workflow_run_outputs = data.get('workflow_run_outputs', [])
for w in workflow_run_outputs:
if "input_files" in w:
for input_file in w["input_files"]:
if "value" in input_file:
value = input_file["value"]
if "@id" in value:
id = value["@id"]
if "/files-fastq/" in id:
parts = id.split('/')
fastq = parts[-2]
fastq_list.append(fastq)
else:
print("@id not in value section")
else:
print("value not in input_files section")
else:
print("input_files not in workflow_run_outputs section")

fastq_read_length_dict = {}
fastq_run_type_dict = {}
for f in fastq_list:
fastq_url = 'https://data.4dnucleome.org/files-fastq/%s/?format=json' % f
fastq_data = fetch_data(fastq_url)
read_length = fastq_data.get("read_length", None)
key = "/files-fastq/" + f
fastq_read_length_dict[key] = read_length
if "paired_end" in fastq_data:
run_type = "pair-ended"
else:
run_type = "single-ended"
fastq_run_type_dict[key] = run_type

# Get Read Length
mapped_read_length = None
if assay_title == "CUT&RUN":
mapped_read_length = [fastq_read_length_dict]
if mapped_read_length is None:
mapped_read_length = "None"

# Get Run Type
# May need to double check if this is extracted from the right place
mapped_run_type = None
if assay_title == "CUT&RUN":
mapped_run_type = [fastq_run_type_dict]
if mapped_run_type is None:
mapped_run_type = "None"

# Future work: add audit information

# Udate metadata with new accession info
metadata.update({
accession: {
'ENCSR': str(ENCSR),
'ENCLB': str(ENCLB),
'target': str(target),
'ENCBS': str(ENCBS),
'strain': str(strain),
'assay': str(assay_title),
'assembly': str(assembly),
'file_size': str(file_size),
'total_reads': str(total_reads),
'read_length': (mapped_read_length),
'run_type': (mapped_run_type)
}
})

# Writing to sample.json
with open(args.output, "w") as outfile:
outfile.write(json.dumps(metadata, indent=4))
114 changes: 114 additions & 0 deletions scripts/testdata/4dnucleosome_sample.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
{
"4DNFIK734P7Z": {
"ENCSR": "/experiments-hi-c/4DNEXJCUBTM2/",
"ENCLB": "None",
"target": "Arima - A1, A2",
"ENCBS": "/biosources/4DNSRCCM5D5D/",
"strain": "HUES8",
"assay": "in situ Hi-C",
"assembly": "GRCh38",
"file_size": "59552912307",
"total_reads": "321592041",
"read_length": "None",
"run_type": "None"
},
"4DNFIKSORPB9": {
"ENCSR": "/experiments-hi-c/4DNEXW6T5QSA/",
"ENCLB": "None",
"target": "HindIII",
"ENCBS": "/biosources/4DNSRLAXYUCU/",
"strain": "GM19204",
"assay": "Dilution Hi-C",
"assembly": "GRCh38",
"file_size": "92909712242",
"total_reads": "None",
"read_length": "None",
"run_type": "None"
},
"4DNFIP6DJ98P": {
"ENCSR": "/experiments-repliseq/4DNEXOLHMWYM/",
"ENCLB": "None",
"target": "late fraction of 2 fractions",
"ENCBS": "/biosources/4DNSRIOTVJ4X/",
"strain": "pluripotent stem cell",
"assay": "2-stage Repli-seq",
"assembly": "GRCh38",
"file_size": "606626982",
"total_reads": "None",
"read_length": "None",
"run_type": "None"
},
"4DNFI66KS84H": {
"ENCSR": "/experiments-repliseq/4DNEXOA9VFCD/",
"ENCLB": "None",
"target": "P2 of 16 fractions",
"ENCBS": "/biosources/4DNSRJ3TG8FL/",
"strain": "HCT116",
"assay": "Multi-stage Repli-seq",
"assembly": "GRCh38",
"file_size": "1222308231",
"total_reads": "None",
"read_length": "None",
"run_type": "None"
},
"4DNFI61TAGXP": {
"ENCSR": "/experiments-seq/4DNEXHKQPX6M/",
"ENCLB": "None",
"target": "H2A.Z protein",
"ENCBS": "/biosources/4DNSRV3SKQ8M/",
"strain": "H1-hESC",
"assay": "CUT&RUN",
"assembly": "GRCh38",
"file_size": "10229433099",
"total_reads": "None",
"read_length": [
{
"/files-fastq/4DNFIOXB4NOH": 25,
"/files-fastq/4DNFIMTMXANT": 25,
"/files-fastq/4DNFIHKEPRLT": 25,
"/files-fastq/4DNFIW2Y8BBQ": 25,
"/files-fastq/4DNFIABI5ARW": 25,
"/files-fastq/4DNFI5TBKNYX": 25,
"/files-fastq/4DNFITUXPJN2": 25,
"/files-fastq/4DNFILMHOUZC": 25,
"/files-fastq/4DNFIPSB3Z5A": 25,
"/files-fastq/4DNFIZ9HJHMH": 25,
"/files-fastq/4DNFIBNA7Y2C": 25,
"/files-fastq/4DNFIT91ZD5W": 25,
"/files-fastq/4DNFI7MS4DBN": 25,
"/files-fastq/4DNFI2YHB4ZG": 25
}
],
"run_type": [
{
"/files-fastq/4DNFIOXB4NOH": "pair-ended",
"/files-fastq/4DNFIMTMXANT": "pair-ended",
"/files-fastq/4DNFIHKEPRLT": "pair-ended",
"/files-fastq/4DNFIW2Y8BBQ": "pair-ended",
"/files-fastq/4DNFIABI5ARW": "pair-ended",
"/files-fastq/4DNFI5TBKNYX": "pair-ended",
"/files-fastq/4DNFITUXPJN2": "pair-ended",
"/files-fastq/4DNFILMHOUZC": "pair-ended",
"/files-fastq/4DNFIPSB3Z5A": "pair-ended",
"/files-fastq/4DNFIZ9HJHMH": "pair-ended",
"/files-fastq/4DNFIBNA7Y2C": "pair-ended",
"/files-fastq/4DNFIT91ZD5W": "pair-ended",
"/files-fastq/4DNFI7MS4DBN": "pair-ended",
"/files-fastq/4DNFI2YHB4ZG": "pair-ended"
}
]
},
"4DNFICHIIXAT": {
"ENCSR": "/experiments-damid/4DNEXJ6SOGOE/",
"ENCLB": "None",
"target": "LMNB1 protein",
"ENCBS": "/biosources/4DNSRHGVFSRJ/",
"strain": "RPE-hTERT",
"assay": "pA-DamID",
"assembly": "GRCh38",
"file_size": "507523701",
"total_reads": "None",
"read_length": "None",
"run_type": "None"
}
}
6 changes: 6 additions & 0 deletions scripts/testdata/4dnucleosome_sample.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
4DNFIK734P7Z
4DNFIKSORPB9
4DNFIP6DJ98P
4DNFI66KS84H
4DNFI61TAGXP
4DNFICHIIXAT