diff --git a/scripts/README.md b/scripts/README.md index 9196b07..ba26760 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -46,10 +46,36 @@ optional arguments: -o json_fn, --output json_fn the output json filename ``` +## get_metadata_from_4DNucleosome.py +Retrieves the following information keyed on the BAM file accession (4DNFIXXXXXXX) using the 4D Nucleosome API. +- experiment accession (ENSRXXXXXX) +- assay name +- biosample accession (ENCBSXXXXXX) +- strain info, run type (single/paired end) +- target ("None" if not applicable) +- file size +- total reads +- read length +- genome assembly + +``` +usage: get_metadata_from_4DNucleosome.py [-h] -i input_fn -o json_fn + +Retrieve 4D Nucleosome metadata from API for plotter. + +optional arguments: + -h, --help show this help message and exit + -i input_fn, --input input_fn + the tab-delimited file with 4DNFI accessions of BAM + files in the first column + -o json_fn, --output json_fn + the output json filename +``` ## Run tests ``` python get_metadata_from_ENCODE.py -i testdata/encode_samples.txt -o testdata/encode_samples.json python get_metadata_from_TABfile.py -i testdata/samples.tab -o testdata/samples.json +python3 get_metadata_from_4DNucleosome.py -i testdata/4dnucleosome_sample.txt -o testdata/4dnucleosome_sample.json ``` diff --git a/scripts/get_metadata_from_4DNucleosome.py b/scripts/get_metadata_from_4DNucleosome.py new file mode 100644 index 0000000..553f505 --- /dev/null +++ b/scripts/get_metadata_from_4DNucleosome.py @@ -0,0 +1,209 @@ +import argparse +import json +import requests + +def getParams(): + '''Parse parameters from the command line''' + parser = argparse.ArgumentParser(description='Retrieve 4D Nucleosome metadata from API for plotter.') + + parser.add_argument('-i','--input', metavar='input_fn', required=True, help='the tab-delimited file with 4DNFI accessions of BAM files in the first column') + parser.add_argument('-o','--output', metavar='json_fn', required=True, help='the output json filename') + + args = parser.parse_args() + return(args) + + +# Helper: 4DNFI to URL to payload +def fetch_data(url): + # Force return from the server in JSON format + headers = {'accept': 'application/json'} + + # GET the search result + response = requests.get(url, headers=headers) + + # Extract the JSON response as a python dictionary + search_results = response.json() + return(search_results) + + +# Main program which takes in input parameters +if __name__ == '__main__': + + # Get params + args = getParams() + + # Parse list of accessions + sample_list = [] + reader = open(args.input, 'r') + for line in reader: + sample_list.append(line.strip().split('\t')[0]) + reader.close() + + # Initialize metadata storage dict + metadata = {} + + # Parse payload for each accession + for bam_4DNFI in sample_list: + # Get payload for accession + url = 'https://data.4dnucleome.org/files-processed/%s/?format=json' % bam_4DNFI + data = fetch_data(url) + + # Confirm payload accession + accession = data.get('accession', '4DNFIXXXXXXX').strip() + if (accession != bam_4DNFI): + print("Error: mismatched ENCFF (%s != %s)" % (accession, bam_4DNFI)) + continue + experiments = data.get('experiments', []) + track_facet_info = data.get("track_and_facet_info", None) + + + + # Get Library accession + # Okay that it's None + ENCLB = None + + # Get Experiment Accession + ENCSR = None + for experiment in experiments: + if '@id' in experiment: + ENCSR = experiment['@id'] + else: + print("No experiments or accession not in experiments") + + # Get Experiment-dependent info + ENCBS = None + for experiment in experiments: + if 'biosample' in experiment: + biosample = experiment['biosample'] + biosource = biosample["biosource"] + for id in biosource: + if "@id" in id: + ENCBS = id["@id"] + else: + print("No biosource or ENCBS not in biosource") + else: + print("No experiments or biosample not in experiments") + + # Get Target + target = None + if track_facet_info is not None: + target = track_facet_info["assay_info"] + else: + print("No track_and_facet_info, can't find experiment_type") + + # Get Biosample name + strain = None + for experiment in experiments: + if 'biosample' in experiment: + biosample = experiment['biosample'] + biosource = biosample["biosource"] + for bio in biosource: + if "cell_line" in bio: + cell_line = bio["cell_line"] + else: + print("No biosource or cell_line not in biosource") + strain = cell_line["term_name"] + else: + print("No experiments or biosample not in experiments") + + # Get Treatment (N/A for now) + + # Get Assay + assay_title = None + if track_facet_info is not None: + assay_title = track_facet_info["experiment_type"] + else: + print("No track_and_facet_info, can't find experiment_type") + + # Get Read Info + assembly = data.get("genome_assembly", None) + + file_size = data.get('file_size', None) + + # Get Total Reads + # CUT&RUN doens't have total reads + total_reads = None + if assay_title == "in situ Hi-C": + total_reads = None + quality_metric = data.get("quality_metric", []) + quality_metric_summary = quality_metric.get("quality_metric_summary", []) + for metric in quality_metric_summary: + if metric["title"] == "Total Reads": + total_reads = metric["value"] + break + else: + print("No quality_metric_summary, can't find total reads") + + # Get all Fastq's from the json + fastq_list = [] + if assay_title == "CUT&RUN": + workflow_run_outputs = data.get('workflow_run_outputs', []) + for w in workflow_run_outputs: + if "input_files" in w: + for input_file in w["input_files"]: + if "value" in input_file: + value = input_file["value"] + if "@id" in value: + id = value["@id"] + if "/files-fastq/" in id: + parts = id.split('/') + fastq = parts[-2] + fastq_list.append(fastq) + else: + print("@id not in value section") + else: + print("value not in input_files section") + else: + print("input_files not in workflow_run_outputs section") + + fastq_read_length_dict = {} + fastq_run_type_dict = {} + for f in fastq_list: + fastq_url = 'https://data.4dnucleome.org/files-fastq/%s/?format=json' % f + fastq_data = fetch_data(fastq_url) + read_length = fastq_data.get("read_length", None) + key = "/files-fastq/" + f + fastq_read_length_dict[key] = read_length + if "paired_end" in fastq_data: + run_type = "pair-ended" + else: + run_type = "single-ended" + fastq_run_type_dict[key] = run_type + + # Get Read Length + mapped_read_length = None + if assay_title == "CUT&RUN": + mapped_read_length = [fastq_read_length_dict] + if mapped_read_length is None: + mapped_read_length = "None" + + # Get Run Type + # May need to double check if this is extracted from the right place + mapped_run_type = None + if assay_title == "CUT&RUN": + mapped_run_type = [fastq_run_type_dict] + if mapped_run_type is None: + mapped_run_type = "None" + + # Future work: add audit information + + # Udate metadata with new accession info + metadata.update({ + accession: { + 'ENCSR': str(ENCSR), + 'ENCLB': str(ENCLB), + 'target': str(target), + 'ENCBS': str(ENCBS), + 'strain': str(strain), + 'assay': str(assay_title), + 'assembly': str(assembly), + 'file_size': str(file_size), + 'total_reads': str(total_reads), + 'read_length': (mapped_read_length), + 'run_type': (mapped_run_type) + } + }) + + # Writing to sample.json + with open(args.output, "w") as outfile: + outfile.write(json.dumps(metadata, indent=4)) diff --git a/scripts/testdata/4dnucleosome_sample.json b/scripts/testdata/4dnucleosome_sample.json new file mode 100644 index 0000000..d174300 --- /dev/null +++ b/scripts/testdata/4dnucleosome_sample.json @@ -0,0 +1,114 @@ +{ + "4DNFIK734P7Z": { + "ENCSR": "/experiments-hi-c/4DNEXJCUBTM2/", + "ENCLB": "None", + "target": "Arima - A1, A2", + "ENCBS": "/biosources/4DNSRCCM5D5D/", + "strain": "HUES8", + "assay": "in situ Hi-C", + "assembly": "GRCh38", + "file_size": "59552912307", + "total_reads": "321592041", + "read_length": "None", + "run_type": "None" + }, + "4DNFIKSORPB9": { + "ENCSR": "/experiments-hi-c/4DNEXW6T5QSA/", + "ENCLB": "None", + "target": "HindIII", + "ENCBS": "/biosources/4DNSRLAXYUCU/", + "strain": "GM19204", + "assay": "Dilution Hi-C", + "assembly": "GRCh38", + "file_size": "92909712242", + "total_reads": "None", + "read_length": "None", + "run_type": "None" + }, + "4DNFIP6DJ98P": { + "ENCSR": "/experiments-repliseq/4DNEXOLHMWYM/", + "ENCLB": "None", + "target": "late fraction of 2 fractions", + "ENCBS": "/biosources/4DNSRIOTVJ4X/", + "strain": "pluripotent stem cell", + "assay": "2-stage Repli-seq", + "assembly": "GRCh38", + "file_size": "606626982", + "total_reads": "None", + "read_length": "None", + "run_type": "None" + }, + "4DNFI66KS84H": { + "ENCSR": "/experiments-repliseq/4DNEXOA9VFCD/", + "ENCLB": "None", + "target": "P2 of 16 fractions", + "ENCBS": "/biosources/4DNSRJ3TG8FL/", + "strain": "HCT116", + "assay": "Multi-stage Repli-seq", + "assembly": "GRCh38", + "file_size": "1222308231", + "total_reads": "None", + "read_length": "None", + "run_type": "None" + }, + "4DNFI61TAGXP": { + "ENCSR": "/experiments-seq/4DNEXHKQPX6M/", + "ENCLB": "None", + "target": "H2A.Z protein", + "ENCBS": "/biosources/4DNSRV3SKQ8M/", + "strain": "H1-hESC", + "assay": "CUT&RUN", + "assembly": "GRCh38", + "file_size": "10229433099", + "total_reads": "None", + "read_length": [ + { + "/files-fastq/4DNFIOXB4NOH": 25, + "/files-fastq/4DNFIMTMXANT": 25, + "/files-fastq/4DNFIHKEPRLT": 25, + "/files-fastq/4DNFIW2Y8BBQ": 25, + "/files-fastq/4DNFIABI5ARW": 25, + "/files-fastq/4DNFI5TBKNYX": 25, + "/files-fastq/4DNFITUXPJN2": 25, + "/files-fastq/4DNFILMHOUZC": 25, + "/files-fastq/4DNFIPSB3Z5A": 25, + "/files-fastq/4DNFIZ9HJHMH": 25, + "/files-fastq/4DNFIBNA7Y2C": 25, + "/files-fastq/4DNFIT91ZD5W": 25, + "/files-fastq/4DNFI7MS4DBN": 25, + "/files-fastq/4DNFI2YHB4ZG": 25 + } + ], + "run_type": [ + { + "/files-fastq/4DNFIOXB4NOH": "pair-ended", + "/files-fastq/4DNFIMTMXANT": "pair-ended", + "/files-fastq/4DNFIHKEPRLT": "pair-ended", + "/files-fastq/4DNFIW2Y8BBQ": "pair-ended", + "/files-fastq/4DNFIABI5ARW": "pair-ended", + "/files-fastq/4DNFI5TBKNYX": "pair-ended", + "/files-fastq/4DNFITUXPJN2": "pair-ended", + "/files-fastq/4DNFILMHOUZC": "pair-ended", + "/files-fastq/4DNFIPSB3Z5A": "pair-ended", + "/files-fastq/4DNFIZ9HJHMH": "pair-ended", + "/files-fastq/4DNFIBNA7Y2C": "pair-ended", + "/files-fastq/4DNFIT91ZD5W": "pair-ended", + "/files-fastq/4DNFI7MS4DBN": "pair-ended", + "/files-fastq/4DNFI2YHB4ZG": "pair-ended" + } + ] + }, + "4DNFICHIIXAT": { + "ENCSR": "/experiments-damid/4DNEXJ6SOGOE/", + "ENCLB": "None", + "target": "LMNB1 protein", + "ENCBS": "/biosources/4DNSRHGVFSRJ/", + "strain": "RPE-hTERT", + "assay": "pA-DamID", + "assembly": "GRCh38", + "file_size": "507523701", + "total_reads": "None", + "read_length": "None", + "run_type": "None" + } +} \ No newline at end of file diff --git a/scripts/testdata/4dnucleosome_sample.txt b/scripts/testdata/4dnucleosome_sample.txt new file mode 100644 index 0000000..e18d131 --- /dev/null +++ b/scripts/testdata/4dnucleosome_sample.txt @@ -0,0 +1,6 @@ +4DNFIK734P7Z +4DNFIKSORPB9 +4DNFIP6DJ98P +4DNFI66KS84H +4DNFI61TAGXP +4DNFICHIIXAT \ No newline at end of file