From 5a0df17f8ef75f01d261be7570a0d5649b34688b Mon Sep 17 00:00:00 2001 From: DeanZ3 <67203662+DeanZ3@users.noreply.github.com> Date: Wed, 11 Oct 2023 21:27:34 -0400 Subject: [PATCH 1/7] code for getting 4D Nucleosome metadata --- scripts/get_metadata_from_4DNucleosome.py | 205 ++++++++++++++++++++++ 1 file changed, 205 insertions(+) create mode 100644 scripts/get_metadata_from_4DNucleosome.py diff --git a/scripts/get_metadata_from_4DNucleosome.py b/scripts/get_metadata_from_4DNucleosome.py new file mode 100644 index 0000000..60fb05b --- /dev/null +++ b/scripts/get_metadata_from_4DNucleosome.py @@ -0,0 +1,205 @@ +import argparse +import json +import requests + +def getParams(): + '''Parse parameters from the command line''' + parser = argparse.ArgumentParser(description='Retrieve 4D Nucleosome metadata from API for plotter.') + + parser.add_argument('-i','--input', metavar='input_fn', required=True, help='the tab-delimited file with 4DNFI accessions of BAM files in the first column') + parser.add_argument('-o','--output', metavar='json_fn', required=True, help='the output json filename') + + args = parser.parse_args() + return(args) + + +# Helper: 4DNFI to URL to payload +def fetch_data(url): + # Force return from the server in JSON format + headers = {'accept': 'application/json'} + + # GET the search result + response = requests.get(url, headers=headers) + + # Extract the JSON response as a python dictionary + search_results = response.json() + return(search_results) + + +# Main program which takes in input parameters +if __name__ == '__main__': + + # Get params + args = getParams() + + # Parse list of accessions + sample_list = [] + reader = open(args.input, 'r') + for line in reader: + sample_list.append(line.strip().split('\t')[0]) + reader.close() + + # Initialize metadata storage dict + metadata = {} + + # Parse payload for each accession + for bam_4DNFI in sample_list: + # Get payload for accession + url = 'https://data.4dnucleome.org/files-processed/%s/?format=json' % bam_4DNFI + data = fetch_data(url) + + # Confirm payload accession + accession = data.get('accession', '4DNFIXXXXXXX').strip() + if (accession != bam_4DNFI): + print("Error: mismatched ENCFF (%s != %s)" % (accession, bam_4DNFI)) + continue + experiments = data.get('experiments', []) + track_facet_info = data.get("track_and_facet_info", None) + + + + # Get Library accession + # Okay that it's None + ENCLB = None + + # Get Experiment Accession + ENCSR = None + for experiment in experiments: + if '@id' in experiment: + ENCSR = experiment['@id'] + else: + print("No experiments or accession not in experiments") + + # Get Experiment-dependent info + ENCBS = None + for experiment in experiments: + if 'biosample' in experiment: + biosample = experiment['biosample'] + biosource = biosample["biosource"] + for id in biosource: + if "@id" in id: + ENCBS = id["@id"] + else: + print("No biosource or ENCBS not in biosource") + else: + print("No experiments or biosample not in experiments") + + # Get Target + target = None + if track_facet_info is not None: + target = track_facet_info["assay_info"] + else: + print("No track_and_facet_info, can't find experiment_type") + + # Get Biosample name + strain = None + for experiment in experiments: + if 'biosample' in experiment: + biosample = experiment['biosample'] + biosource = biosample["biosource"] + for bio in biosource: + if "cell_line" in bio: + cell_line = bio["cell_line"] + else: + print("No biosource or cell_line not in biosource") + strain = cell_line["term_name"] + else: + print("No experiments or biosample not in experiments") + + # Get Treatment (N/A for now) + + # Get Assay + assay_title = None + if track_facet_info is not None: + assay_title = track_facet_info["experiment_type"] + else: + print("No track_and_facet_info, can't find experiment_type") + + # Get Read Info + assembly = data.get("genome_assembly", None) + + file_size = data.get('file_size', None) + + # Get Total Reads + # CUT&RUN doens't have total reads + total_reads = None + if assay_title == "in situ Hi-C": + total_reads = None + quality_metric = data.get("quality_metric", []) + quality_metric_summary = quality_metric.get("quality_metric_summary", []) + for metric in quality_metric_summary: + if metric["title"] == "Total Reads": + total_reads = metric["value"] + break + else: + print("No quality_metric_summary, can't find total reads") + + # Get all Fastq's from the json + fastq_list = [] + if assay_title == "CUT&RUN": + workflow_run_outputs = data.get('workflow_run_outputs', []) + for w in workflow_run_outputs: + if "input_files" in w: + for input_file in w["input_files"]: + if "value" in input_file: + value = input_file["value"] + if "@id" in value: + id = value["@id"] + if "/files-fastq/" in id: + parts = id.split('/') + fastq = parts[-2] + fastq_list.append(fastq) + else: + print("@id not in value section") + else: + print("value not in input_files section") + else: + print("input_files not in workflow_run_outputs section") + + fastq_read_length_dict = {} + fastq_run_type_dict = {} + for f in fastq_list: + fastq_url = 'https://data.4dnucleome.org/files-fastq/%s/?format=json' % f + fastq_data = fetch_data(fastq_url) + read_length = fastq_data.get("read_length", None) + key = "/files-fastq/" + f + fastq_read_length_dict[key] = read_length + if "paired_end" in fastq_data: + run_type = "pair-ended" + else: + run_type = "single-ended" + fastq_read_length_dict[key] = run_type + + # Get Read Length + mapped_read_length = None + if assay_title == "CUT&RUN": + mapped_read_length = [fastq_read_length_dict] + + # Get Run Type + # May need to double check if this is extracted from the right place + mapped_run_type = None + if assay_title == "CUT&RUN": + mapped_read_length = [fastq_read_length_dict] + + # Future work: add audit information + + # Udate metadata with new accession info + metadata.update({ + accession: { + 'ENCSR': str(ENCSR), + 'ENCLB': str(ENCLB), + 'target': str(target), + 'ENCBS': str(ENCBS), + 'strain': str(strain), + 'assay': str(assay_title), + 'assembly': str(assembly), + 'file_size': str(file_size), + 'total_reads': str(total_reads), + 'read_length': (mapped_read_length), + 'run_type': str(mapped_run_type) + } + }) + + # Writing to sample.json + with open(args.output, "w") as outfile: + outfile.write(json.dumps(metadata, indent=4)) \ No newline at end of file From cfdf2cc4233db95db2034e5cc1ee1441be5c47d2 Mon Sep 17 00:00:00 2001 From: DeanZ3 <67203662+DeanZ3@users.noreply.github.com> Date: Wed, 11 Oct 2023 21:39:20 -0400 Subject: [PATCH 2/7] test data for 4D Nucleosome --- scripts/testdata/4dnucleosome_sample.json | 114 ++++++++++++++++++++++ scripts/testdata/4dnucleosome_sample.txt | 6 ++ 2 files changed, 120 insertions(+) create mode 100644 scripts/testdata/4dnucleosome_sample.json create mode 100644 scripts/testdata/4dnucleosome_sample.txt diff --git a/scripts/testdata/4dnucleosome_sample.json b/scripts/testdata/4dnucleosome_sample.json new file mode 100644 index 0000000..d174300 --- /dev/null +++ b/scripts/testdata/4dnucleosome_sample.json @@ -0,0 +1,114 @@ +{ + "4DNFIK734P7Z": { + "ENCSR": "/experiments-hi-c/4DNEXJCUBTM2/", + "ENCLB": "None", + "target": "Arima - A1, A2", + "ENCBS": "/biosources/4DNSRCCM5D5D/", + "strain": "HUES8", + "assay": "in situ Hi-C", + "assembly": "GRCh38", + "file_size": "59552912307", + "total_reads": "321592041", + "read_length": "None", + "run_type": "None" + }, + "4DNFIKSORPB9": { + "ENCSR": "/experiments-hi-c/4DNEXW6T5QSA/", + "ENCLB": "None", + "target": "HindIII", + "ENCBS": "/biosources/4DNSRLAXYUCU/", + "strain": "GM19204", + "assay": "Dilution Hi-C", + "assembly": "GRCh38", + "file_size": "92909712242", + "total_reads": "None", + "read_length": "None", + "run_type": "None" + }, + "4DNFIP6DJ98P": { + "ENCSR": "/experiments-repliseq/4DNEXOLHMWYM/", + "ENCLB": "None", + "target": "late fraction of 2 fractions", + "ENCBS": "/biosources/4DNSRIOTVJ4X/", + "strain": "pluripotent stem cell", + "assay": "2-stage Repli-seq", + "assembly": "GRCh38", + "file_size": "606626982", + "total_reads": "None", + "read_length": "None", + "run_type": "None" + }, + "4DNFI66KS84H": { + "ENCSR": "/experiments-repliseq/4DNEXOA9VFCD/", + "ENCLB": "None", + "target": "P2 of 16 fractions", + "ENCBS": "/biosources/4DNSRJ3TG8FL/", + "strain": "HCT116", + "assay": "Multi-stage Repli-seq", + "assembly": "GRCh38", + "file_size": "1222308231", + "total_reads": "None", + "read_length": "None", + "run_type": "None" + }, + "4DNFI61TAGXP": { + "ENCSR": "/experiments-seq/4DNEXHKQPX6M/", + "ENCLB": "None", + "target": "H2A.Z protein", + "ENCBS": "/biosources/4DNSRV3SKQ8M/", + "strain": "H1-hESC", + "assay": "CUT&RUN", + "assembly": "GRCh38", + "file_size": "10229433099", + "total_reads": "None", + "read_length": [ + { + "/files-fastq/4DNFIOXB4NOH": 25, + "/files-fastq/4DNFIMTMXANT": 25, + "/files-fastq/4DNFIHKEPRLT": 25, + "/files-fastq/4DNFIW2Y8BBQ": 25, + "/files-fastq/4DNFIABI5ARW": 25, + "/files-fastq/4DNFI5TBKNYX": 25, + "/files-fastq/4DNFITUXPJN2": 25, + "/files-fastq/4DNFILMHOUZC": 25, + "/files-fastq/4DNFIPSB3Z5A": 25, + "/files-fastq/4DNFIZ9HJHMH": 25, + "/files-fastq/4DNFIBNA7Y2C": 25, + "/files-fastq/4DNFIT91ZD5W": 25, + "/files-fastq/4DNFI7MS4DBN": 25, + "/files-fastq/4DNFI2YHB4ZG": 25 + } + ], + "run_type": [ + { + "/files-fastq/4DNFIOXB4NOH": "pair-ended", + "/files-fastq/4DNFIMTMXANT": "pair-ended", + "/files-fastq/4DNFIHKEPRLT": "pair-ended", + "/files-fastq/4DNFIW2Y8BBQ": "pair-ended", + "/files-fastq/4DNFIABI5ARW": "pair-ended", + "/files-fastq/4DNFI5TBKNYX": "pair-ended", + "/files-fastq/4DNFITUXPJN2": "pair-ended", + "/files-fastq/4DNFILMHOUZC": "pair-ended", + "/files-fastq/4DNFIPSB3Z5A": "pair-ended", + "/files-fastq/4DNFIZ9HJHMH": "pair-ended", + "/files-fastq/4DNFIBNA7Y2C": "pair-ended", + "/files-fastq/4DNFIT91ZD5W": "pair-ended", + "/files-fastq/4DNFI7MS4DBN": "pair-ended", + "/files-fastq/4DNFI2YHB4ZG": "pair-ended" + } + ] + }, + "4DNFICHIIXAT": { + "ENCSR": "/experiments-damid/4DNEXJ6SOGOE/", + "ENCLB": "None", + "target": "LMNB1 protein", + "ENCBS": "/biosources/4DNSRHGVFSRJ/", + "strain": "RPE-hTERT", + "assay": "pA-DamID", + "assembly": "GRCh38", + "file_size": "507523701", + "total_reads": "None", + "read_length": "None", + "run_type": "None" + } +} \ No newline at end of file diff --git a/scripts/testdata/4dnucleosome_sample.txt b/scripts/testdata/4dnucleosome_sample.txt new file mode 100644 index 0000000..e18d131 --- /dev/null +++ b/scripts/testdata/4dnucleosome_sample.txt @@ -0,0 +1,6 @@ +4DNFIK734P7Z +4DNFIKSORPB9 +4DNFIP6DJ98P +4DNFI66KS84H +4DNFI61TAGXP +4DNFICHIIXAT \ No newline at end of file From b71321b480a8e866d23daacb0c5308f1e8a68f60 Mon Sep 17 00:00:00 2001 From: DeanZ3 <67203662+DeanZ3@users.noreply.github.com> Date: Wed, 11 Oct 2023 21:40:34 -0400 Subject: [PATCH 3/7] Update README.md --- scripts/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/README.md b/scripts/README.md index 9196b07..eebf7bd 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -13,6 +13,7 @@ Retrieves the following information keyed on the BAM file accession (ENCFFXXXXXX - strain info, run type (single/paired end) - target ("None" if not applicable) - file size +- total reads - read length - genome assembly @@ -50,6 +51,5 @@ optional arguments: ## Run tests ``` -python get_metadata_from_ENCODE.py -i testdata/encode_samples.txt -o testdata/encode_samples.json -python get_metadata_from_TABfile.py -i testdata/samples.tab -o testdata/samples.json +python3 get_metadata_from_4DNucleosome.py -i testdata/4dnucleosome_sample.txt -o testdata/4dnucleosome_sample.json ``` From 4a17c9d7782b3d9c32f30fef82f35c28b4742bfb Mon Sep 17 00:00:00 2001 From: DeanZ3 <67203662+DeanZ3@users.noreply.github.com> Date: Thu, 12 Oct 2023 14:58:26 -0400 Subject: [PATCH 4/7] Update README.md --- scripts/README.md | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/scripts/README.md b/scripts/README.md index eebf7bd..25dbc40 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -13,7 +13,6 @@ Retrieves the following information keyed on the BAM file accession (ENCFFXXXXXX - strain info, run type (single/paired end) - target ("None" if not applicable) - file size -- total reads - read length - genome assembly @@ -47,9 +46,38 @@ optional arguments: -o json_fn, --output json_fn the output json filename ``` +## get_metadata_from_4DNucleosome.py + +** Needs reformatting for different `schema_version` codes but uses None types for info not found** + +Retrieves the following information keyed on the BAM file accession (ENCFFXXXXXX) using the ENCODE API. +- experiment accession (ENSRXXXXXX) +- assay name +- biosample accession (ENCBSXXXXXX) +- strain info, run type (single/paired end) +- target ("None" if not applicable) +- file size +- total reads +- read length +- genome assembly +``` +usage: get_metadata_from_ENCODE.py [-h] -i input_fn -o json_fn + +Retrieve ENCODE metadata from API for plotter. + +optional arguments: + -h, --help show this help message and exit + -i input_fn, --input input_fn + the tab-delimited file with ENCFF accessions of BAM + files in the first column + -o json_fn, --output json_fn + the output json filename +``` ## Run tests ``` +python get_metadata_from_ENCODE.py -i testdata/encode_samples.txt -o testdata/encode_samples.json +python get_metadata_from_TABfile.py -i testdata/samples.tab -o testdata/samples.json python3 get_metadata_from_4DNucleosome.py -i testdata/4dnucleosome_sample.txt -o testdata/4dnucleosome_sample.json ``` From 66161d461d7072e9be248c7a2ab9203b54931fe9 Mon Sep 17 00:00:00 2001 From: DeanZ3 <67203662+DeanZ3@users.noreply.github.com> Date: Thu, 12 Oct 2023 15:01:03 -0400 Subject: [PATCH 5/7] Update README.md --- scripts/README.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/README.md b/scripts/README.md index 25dbc40..4c8e9a8 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -48,9 +48,7 @@ optional arguments: ``` ## get_metadata_from_4DNucleosome.py -** Needs reformatting for different `schema_version` codes but uses None types for info not found** - -Retrieves the following information keyed on the BAM file accession (ENCFFXXXXXX) using the ENCODE API. +Retrieves the following information keyed on the BAM file accession (4DNFIXXXXXXX) using the 4D Nucleosome API. - experiment accession (ENSRXXXXXX) - assay name - biosample accession (ENCBSXXXXXX) @@ -62,9 +60,9 @@ Retrieves the following information keyed on the BAM file accession (ENCFFXXXXXX - genome assembly ``` -usage: get_metadata_from_ENCODE.py [-h] -i input_fn -o json_fn +usage: get_metadata_from_4DNucleosome.py [-h] -i input_fn -o json_fn -Retrieve ENCODE metadata from API for plotter. +Retrieve 4D Nucleosome metadata from API for plotter. optional arguments: -h, --help show this help message and exit From 0a0bbede333d5329b00691b5000654751f57c217 Mon Sep 17 00:00:00 2001 From: DeanZ3 <67203662+DeanZ3@users.noreply.github.com> Date: Thu, 12 Oct 2023 15:01:46 -0400 Subject: [PATCH 6/7] Update README.md --- scripts/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/README.md b/scripts/README.md index 4c8e9a8..ba26760 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -67,7 +67,7 @@ Retrieve 4D Nucleosome metadata from API for plotter. optional arguments: -h, --help show this help message and exit -i input_fn, --input input_fn - the tab-delimited file with ENCFF accessions of BAM + the tab-delimited file with 4DNFI accessions of BAM files in the first column -o json_fn, --output json_fn the output json filename From 1dc085bfa00ce71bbaf9f0269be87799829f901f Mon Sep 17 00:00:00 2001 From: DeanZ3 <67203662+DeanZ3@users.noreply.github.com> Date: Thu, 12 Oct 2023 15:04:56 -0400 Subject: [PATCH 7/7] Update get_metadata_from_4DNucleosome.py --- scripts/get_metadata_from_4DNucleosome.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/get_metadata_from_4DNucleosome.py b/scripts/get_metadata_from_4DNucleosome.py index 60fb05b..553f505 100644 --- a/scripts/get_metadata_from_4DNucleosome.py +++ b/scripts/get_metadata_from_4DNucleosome.py @@ -168,18 +168,22 @@ def fetch_data(url): run_type = "pair-ended" else: run_type = "single-ended" - fastq_read_length_dict[key] = run_type + fastq_run_type_dict[key] = run_type # Get Read Length mapped_read_length = None if assay_title == "CUT&RUN": mapped_read_length = [fastq_read_length_dict] + if mapped_read_length is None: + mapped_read_length = "None" # Get Run Type # May need to double check if this is extracted from the right place mapped_run_type = None if assay_title == "CUT&RUN": - mapped_read_length = [fastq_read_length_dict] + mapped_run_type = [fastq_run_type_dict] + if mapped_run_type is None: + mapped_run_type = "None" # Future work: add audit information @@ -196,10 +200,10 @@ def fetch_data(url): 'file_size': str(file_size), 'total_reads': str(total_reads), 'read_length': (mapped_read_length), - 'run_type': str(mapped_run_type) + 'run_type': (mapped_run_type) } }) # Writing to sample.json with open(args.output, "w") as outfile: - outfile.write(json.dumps(metadata, indent=4)) \ No newline at end of file + outfile.write(json.dumps(metadata, indent=4))