From 5a0df17f8ef75f01d261be7570a0d5649b34688b Mon Sep 17 00:00:00 2001
From: DeanZ3 <67203662+DeanZ3@users.noreply.github.com>
Date: Wed, 11 Oct 2023 21:27:34 -0400
Subject: [PATCH 1/7] code for getting 4D Nucleosome metadata

---
 scripts/get_metadata_from_4DNucleosome.py | 205 ++++++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 scripts/get_metadata_from_4DNucleosome.py

diff --git a/scripts/get_metadata_from_4DNucleosome.py b/scripts/get_metadata_from_4DNucleosome.py
new file mode 100644
index 0000000..60fb05b
--- /dev/null
+++ b/scripts/get_metadata_from_4DNucleosome.py
@@ -0,0 +1,205 @@
+import argparse
+import json
+import requests
+
+def getParams():
+    '''Parse parameters from the command line'''
+    parser = argparse.ArgumentParser(description='Retrieve 4D Nucleosome metadata from API for plotter.')
+
+    parser.add_argument('-i','--input', metavar='input_fn', required=True, help='the tab-delimited file with 4DNFI accessions of BAM files in the first column')
+    parser.add_argument('-o','--output', metavar='json_fn', required=True, help='the output json filename')
+
+    args = parser.parse_args()
+    return(args)
+
+
+# Helper: fetch the JSON payload served at the given URL
+def fetch_data(url):
+    # Force return from the server in JSON format
+    headers = {'accept': 'application/json'}
+
+    # GET the search result
+    response = requests.get(url, headers=headers)
+
+    # Extract the JSON response as a python dictionary
+    search_results = response.json()
+    return(search_results)
+
+
+# Main program which takes in input parameters
+if __name__ == '__main__':
+
+    # Get params
+    args = getParams()
+
+    # Parse list of accessions
+    sample_list = []
+    reader = open(args.input, 'r')
+    for line in reader:
+        sample_list.append(line.strip().split('\t')[0])
+    reader.close()
+
+    # Initialize metadata storage dict
+    metadata = {}
+
+    # Parse payload for each accession
+    for bam_4DNFI in sample_list:
+        # Get payload for accession
+        url = 'https://data.4dnucleome.org/files-processed/%s/?format=json' % bam_4DNFI
+        data = fetch_data(url)
+
+        # Confirm payload accession
+        accession = data.get('accession', '4DNFIXXXXXXX').strip()
+        if (accession != bam_4DNFI):
+            print("Error: mismatched ENCFF (%s != %s)" % (accession, bam_4DNFI))
+            continue
+        experiments = data.get('experiments', [])
+        track_facet_info = data.get("track_and_facet_info", None)
+
+
+
+        # Get Library accession
+        # Okay that it's None
+        ENCLB = None
+
+        # Get Experiment Accession
+        ENCSR = None
+        for experiment in experiments:
+            if '@id' in experiment:
+                ENCSR = experiment['@id']
+            else:
+                print("No experiments or accession not in experiments")
+
+        # Get Experiment-dependent info
+        ENCBS = None
+        for experiment in experiments:
+            if 'biosample' in experiment:
+                biosample = experiment['biosample']
+                biosource = biosample["biosource"]
+                for id in biosource:
+                    if "@id" in id:
+                        ENCBS = id["@id"]
+                    else:
+                        print("No biosource or ENCBS not in biosource")
+            else:
+                print("No experiments or biosample not in experiments")
+
+        # Get Target
+        target = None
+        if track_facet_info is not None:
+            target = track_facet_info["assay_info"]
+        else:
+            print("No track_and_facet_info, can't find experiment_type")
+
+        # Get Biosample name
+        strain = None
+        for experiment in experiments:
+            if 'biosample' in experiment:
+                biosample = experiment['biosample']
+                biosource = biosample["biosource"]
+                for bio in biosource:
+                    if "cell_line" in bio:
+                        cell_line = bio["cell_line"]
+                    else:
+                        print("No biosource or cell_line not in biosource")
+                strain = cell_line["term_name"]
+            else:
+                print("No experiments or biosample not in experiments")
+
+        # Get Treatment (N/A for now)
+
+        # Get Assay
+        assay_title = None
+        if track_facet_info is not None:
+            assay_title = track_facet_info["experiment_type"]
+        else:
+            print("No track_and_facet_info, can't find experiment_type")
+
+        # Get genome assembly and file size
+        assembly = data.get("genome_assembly", None)
+
+        file_size = data.get('file_size', None)
+
+        # Get Total Reads
+        # CUT&RUN doesn't have total reads
+        total_reads = None 
+        if assay_title == "in situ Hi-C":
+            total_reads = None
+            quality_metric = data.get("quality_metric", [])
+            quality_metric_summary = quality_metric.get("quality_metric_summary", [])
+            for metric in quality_metric_summary:
+                if metric["title"] == "Total Reads":
+                    total_reads = metric["value"]
+                    break
+                else:
+                    print("No quality_metric_summary, can't find total reads")
+
+        # Get all Fastq's from the json
+        fastq_list = []
+        if assay_title == "CUT&RUN":
+            workflow_run_outputs = data.get('workflow_run_outputs', [])
+            for w in workflow_run_outputs:
+                if "input_files" in w:
+                    for input_file in w["input_files"]:
+                        if "value" in input_file:
+                            value = input_file["value"]
+                            if "@id" in value:
+                                id = value["@id"]
+                                if "/files-fastq/" in id:
+                                    parts = id.split('/')
+                                    fastq = parts[-2]   
+                                    fastq_list.append(fastq)
+                            else:
+                                print("@id not in value section")
+                        else:
+                            print("value not in input_files section")
+                else:
+                    print("input_files not in workflow_run_outputs section")
+        
+        fastq_read_length_dict = {}
+        fastq_run_type_dict = {}
+        for f in fastq_list:
+            fastq_url = 'https://data.4dnucleome.org/files-fastq/%s/?format=json' % f
+            fastq_data = fetch_data(fastq_url)
+            read_length = fastq_data.get("read_length", None)
+            key = "/files-fastq/" + f
+            fastq_read_length_dict[key] = read_length
+            if "paired_end" in fastq_data:
+                run_type = "pair-ended"
+            else:
+                run_type = "single-ended"
+            fastq_read_length_dict[key] = run_type
+
+        # Get Read Length
+        mapped_read_length = None  
+        if assay_title == "CUT&RUN":
+            mapped_read_length = [fastq_read_length_dict]
+
+        # Get Run Type
+        # May need to double check if this is extracted from the right place
+        mapped_run_type = None
+        if assay_title == "CUT&RUN":
+            mapped_read_length = [fastq_read_length_dict]
+
+        # Future work: add audit information
+
+        # Update metadata with new accession info
+        metadata.update({
+            accession: {
+                'ENCSR': str(ENCSR),
+                'ENCLB': str(ENCLB),
+                'target': str(target),
+                'ENCBS': str(ENCBS),
+                'strain': str(strain),
+                'assay': str(assay_title),
+                'assembly': str(assembly),
+                'file_size': str(file_size),
+                'total_reads': str(total_reads),
+                'read_length': (mapped_read_length),
+                'run_type': str(mapped_run_type)
+            }
+        })
+
+    # Writing to sample.json
+    with open(args.output, "w") as outfile:
+        outfile.write(json.dumps(metadata, indent=4))
\ No newline at end of file

From cfdf2cc4233db95db2034e5cc1ee1441be5c47d2 Mon Sep 17 00:00:00 2001
From: DeanZ3 <67203662+DeanZ3@users.noreply.github.com>
Date: Wed, 11 Oct 2023 21:39:20 -0400
Subject: [PATCH 2/7] test data for 4D Nucleosome

---
 scripts/testdata/4dnucleosome_sample.json | 114 ++++++++++++++++++++++
 scripts/testdata/4dnucleosome_sample.txt  |   6 ++
 2 files changed, 120 insertions(+)
 create mode 100644 scripts/testdata/4dnucleosome_sample.json
 create mode 100644 scripts/testdata/4dnucleosome_sample.txt

diff --git a/scripts/testdata/4dnucleosome_sample.json b/scripts/testdata/4dnucleosome_sample.json
new file mode 100644
index 0000000..d174300
--- /dev/null
+++ b/scripts/testdata/4dnucleosome_sample.json
@@ -0,0 +1,114 @@
+{
+    "4DNFIK734P7Z": {
+        "ENCSR": "/experiments-hi-c/4DNEXJCUBTM2/",
+        "ENCLB": "None",
+        "target": "Arima - A1, A2",
+        "ENCBS": "/biosources/4DNSRCCM5D5D/",
+        "strain": "HUES8",
+        "assay": "in situ Hi-C",
+        "assembly": "GRCh38",
+        "file_size": "59552912307",
+        "total_reads": "321592041",
+        "read_length": "None",
+        "run_type": "None"
+    },
+    "4DNFIKSORPB9": {
+        "ENCSR": "/experiments-hi-c/4DNEXW6T5QSA/",
+        "ENCLB": "None",
+        "target": "HindIII",
+        "ENCBS": "/biosources/4DNSRLAXYUCU/",
+        "strain": "GM19204",
+        "assay": "Dilution Hi-C",
+        "assembly": "GRCh38",
+        "file_size": "92909712242",
+        "total_reads": "None",
+        "read_length": "None",
+        "run_type": "None"
+    },
+    "4DNFIP6DJ98P": {
+        "ENCSR": "/experiments-repliseq/4DNEXOLHMWYM/",
+        "ENCLB": "None",
+        "target": "late fraction of 2 fractions",
+        "ENCBS": "/biosources/4DNSRIOTVJ4X/",
+        "strain": "pluripotent stem cell",
+        "assay": "2-stage Repli-seq",
+        "assembly": "GRCh38",
+        "file_size": "606626982",
+        "total_reads": "None",
+        "read_length": "None",
+        "run_type": "None"
+    },
+    "4DNFI66KS84H": {
+        "ENCSR": "/experiments-repliseq/4DNEXOA9VFCD/",
+        "ENCLB": "None",
+        "target": "P2 of 16 fractions",
+        "ENCBS": "/biosources/4DNSRJ3TG8FL/",
+        "strain": "HCT116",
+        "assay": "Multi-stage Repli-seq",
+        "assembly": "GRCh38",
+        "file_size": "1222308231",
+        "total_reads": "None",
+        "read_length": "None",
+        "run_type": "None"
+    },
+    "4DNFI61TAGXP": {
+        "ENCSR": "/experiments-seq/4DNEXHKQPX6M/",
+        "ENCLB": "None",
+        "target": "H2A.Z protein",
+        "ENCBS": "/biosources/4DNSRV3SKQ8M/",
+        "strain": "H1-hESC",
+        "assay": "CUT&RUN",
+        "assembly": "GRCh38",
+        "file_size": "10229433099",
+        "total_reads": "None",
+        "read_length": [
+            {
+                "/files-fastq/4DNFIOXB4NOH": 25,
+                "/files-fastq/4DNFIMTMXANT": 25,
+                "/files-fastq/4DNFIHKEPRLT": 25,
+                "/files-fastq/4DNFIW2Y8BBQ": 25,
+                "/files-fastq/4DNFIABI5ARW": 25,
+                "/files-fastq/4DNFI5TBKNYX": 25,
+                "/files-fastq/4DNFITUXPJN2": 25,
+                "/files-fastq/4DNFILMHOUZC": 25,
+                "/files-fastq/4DNFIPSB3Z5A": 25,
+                "/files-fastq/4DNFIZ9HJHMH": 25,
+                "/files-fastq/4DNFIBNA7Y2C": 25,
+                "/files-fastq/4DNFIT91ZD5W": 25,
+                "/files-fastq/4DNFI7MS4DBN": 25,
+                "/files-fastq/4DNFI2YHB4ZG": 25
+            }
+        ],
+        "run_type": [
+            {
+                "/files-fastq/4DNFIOXB4NOH": "pair-ended",
+                "/files-fastq/4DNFIMTMXANT": "pair-ended",
+                "/files-fastq/4DNFIHKEPRLT": "pair-ended",
+                "/files-fastq/4DNFIW2Y8BBQ": "pair-ended",
+                "/files-fastq/4DNFIABI5ARW": "pair-ended",
+                "/files-fastq/4DNFI5TBKNYX": "pair-ended",
+                "/files-fastq/4DNFITUXPJN2": "pair-ended",
+                "/files-fastq/4DNFILMHOUZC": "pair-ended",
+                "/files-fastq/4DNFIPSB3Z5A": "pair-ended",
+                "/files-fastq/4DNFIZ9HJHMH": "pair-ended",
+                "/files-fastq/4DNFIBNA7Y2C": "pair-ended",
+                "/files-fastq/4DNFIT91ZD5W": "pair-ended",
+                "/files-fastq/4DNFI7MS4DBN": "pair-ended",
+                "/files-fastq/4DNFI2YHB4ZG": "pair-ended"
+            }
+        ]
+    },
+    "4DNFICHIIXAT": {
+        "ENCSR": "/experiments-damid/4DNEXJ6SOGOE/",
+        "ENCLB": "None",
+        "target": "LMNB1 protein",
+        "ENCBS": "/biosources/4DNSRHGVFSRJ/",
+        "strain": "RPE-hTERT",
+        "assay": "pA-DamID",
+        "assembly": "GRCh38",
+        "file_size": "507523701",
+        "total_reads": "None",
+        "read_length": "None",
+        "run_type": "None"
+    }
+}
\ No newline at end of file
diff --git a/scripts/testdata/4dnucleosome_sample.txt b/scripts/testdata/4dnucleosome_sample.txt
new file mode 100644
index 0000000..e18d131
--- /dev/null
+++ b/scripts/testdata/4dnucleosome_sample.txt
@@ -0,0 +1,6 @@
+4DNFIK734P7Z
+4DNFIKSORPB9
+4DNFIP6DJ98P
+4DNFI66KS84H
+4DNFI61TAGXP
+4DNFICHIIXAT
\ No newline at end of file

From b71321b480a8e866d23daacb0c5308f1e8a68f60 Mon Sep 17 00:00:00 2001
From: DeanZ3 <67203662+DeanZ3@users.noreply.github.com>
Date: Wed, 11 Oct 2023 21:40:34 -0400
Subject: [PATCH 3/7] Update README.md

---
 scripts/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/README.md b/scripts/README.md
index 9196b07..eebf7bd 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -13,6 +13,7 @@ Retrieves the following information keyed on the BAM file accession (ENCFFXXXXXX
 - strain info, run type (single/paired end)
 - target ("None" if not applicable)
 - file size
+- total reads
 - read length
 - genome assembly
 
@@ -50,6 +51,5 @@ optional arguments:
 
 ## Run tests
 ```
-python get_metadata_from_ENCODE.py -i testdata/encode_samples.txt -o testdata/encode_samples.json
-python get_metadata_from_TABfile.py -i testdata/samples.tab -o testdata/samples.json
+python3 get_metadata_from_4DNucleosome.py -i testdata/4dnucleosome_sample.txt -o testdata/4dnucleosome_sample.json 
 ```

From 4a17c9d7782b3d9c32f30fef82f35c28b4742bfb Mon Sep 17 00:00:00 2001
From: DeanZ3 <67203662+DeanZ3@users.noreply.github.com>
Date: Thu, 12 Oct 2023 14:58:26 -0400
Subject: [PATCH 4/7] Update README.md

---
 scripts/README.md | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/scripts/README.md b/scripts/README.md
index eebf7bd..25dbc40 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -13,7 +13,6 @@ Retrieves the following information keyed on the BAM file accession (ENCFFXXXXXX
 - strain info, run type (single/paired end)
 - target ("None" if not applicable)
 - file size
-- total reads
 - read length
 - genome assembly
 
@@ -47,9 +46,38 @@ optional arguments:
   -o json_fn, --output json_fn
                         the output json filename
 ```
+## get_metadata_from_4DNucleosome.py
+
+** Needs reformatting for different `schema_version` codes but uses None types for info not found**
+
+Retrieves the following information keyed on the BAM file accession (ENCFFXXXXXX) using the ENCODE API.
+- experiment accession (ENSRXXXXXX)
+- assay name
+- biosample accession (ENCBSXXXXXX)
+- strain info, run type (single/paired end)
+- target ("None" if not applicable)
+- file size
+- total reads
+- read length
+- genome assembly
 
+```
+usage: get_metadata_from_ENCODE.py [-h] -i input_fn -o json_fn
+
+Retrieve ENCODE metadata from API for plotter.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -i input_fn, --input input_fn
+                        the tab-delimited file with ENCFF accessions of BAM
+                        files in the first column
+  -o json_fn, --output json_fn
+                        the output json filename
+```
 
 ## Run tests
 ```
+python get_metadata_from_ENCODE.py -i testdata/encode_samples.txt -o testdata/encode_samples.json
+python get_metadata_from_TABfile.py -i testdata/samples.tab -o testdata/samples.json
 python3 get_metadata_from_4DNucleosome.py -i testdata/4dnucleosome_sample.txt -o testdata/4dnucleosome_sample.json 
 ```

From 66161d461d7072e9be248c7a2ab9203b54931fe9 Mon Sep 17 00:00:00 2001
From: DeanZ3 <67203662+DeanZ3@users.noreply.github.com>
Date: Thu, 12 Oct 2023 15:01:03 -0400
Subject: [PATCH 5/7] Update README.md

---
 scripts/README.md | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/scripts/README.md b/scripts/README.md
index 25dbc40..4c8e9a8 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -48,9 +48,7 @@ optional arguments:
 ```
 ## get_metadata_from_4DNucleosome.py
 
-** Needs reformatting for different `schema_version` codes but uses None types for info not found**
-
-Retrieves the following information keyed on the BAM file accession (ENCFFXXXXXX) using the ENCODE API.
+Retrieves the following information keyed on the BAM file accession (4DNFIXXXXXXX) using the 4D Nucleome API.
 - experiment accession (ENSRXXXXXX)
 - assay name
 - biosample accession (ENCBSXXXXXX)
@@ -62,9 +60,9 @@ Retrieves the following information keyed on the BAM file accession (ENCFFXXXXXX
 - genome assembly
 
 ```
-usage: get_metadata_from_ENCODE.py [-h] -i input_fn -o json_fn
+usage: get_metadata_from_4DNucleosome.py [-h] -i input_fn -o json_fn
 
-Retrieve ENCODE metadata from API for plotter.
+Retrieve 4D Nucleosome metadata from API for plotter.
 
 optional arguments:
   -h, --help            show this help message and exit

From 0a0bbede333d5329b00691b5000654751f57c217 Mon Sep 17 00:00:00 2001
From: DeanZ3 <67203662+DeanZ3@users.noreply.github.com>
Date: Thu, 12 Oct 2023 15:01:46 -0400
Subject: [PATCH 6/7] Update README.md

---
 scripts/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/README.md b/scripts/README.md
index 4c8e9a8..ba26760 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -67,7 +67,7 @@ Retrieve 4D Nucleosome metadata from API for plotter.
 optional arguments:
   -h, --help            show this help message and exit
   -i input_fn, --input input_fn
-                        the tab-delimited file with ENCFF accessions of BAM
+                        the tab-delimited file with 4DNFI accessions of BAM
                         files in the first column
   -o json_fn, --output json_fn
                         the output json filename

From 1dc085bfa00ce71bbaf9f0269be87799829f901f Mon Sep 17 00:00:00 2001
From: DeanZ3 <67203662+DeanZ3@users.noreply.github.com>
Date: Thu, 12 Oct 2023 15:04:56 -0400
Subject: [PATCH 7/7] Update get_metadata_from_4DNucleosome.py

---
 scripts/get_metadata_from_4DNucleosome.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/scripts/get_metadata_from_4DNucleosome.py b/scripts/get_metadata_from_4DNucleosome.py
index 60fb05b..553f505 100644
--- a/scripts/get_metadata_from_4DNucleosome.py
+++ b/scripts/get_metadata_from_4DNucleosome.py
@@ -168,18 +168,22 @@ def fetch_data(url):
                 run_type = "pair-ended"
             else:
                 run_type = "single-ended"
-            fastq_read_length_dict[key] = run_type
+            fastq_run_type_dict[key] = run_type
 
         # Get Read Length
         mapped_read_length = None  
         if assay_title == "CUT&RUN":
             mapped_read_length = [fastq_read_length_dict]
+        if mapped_read_length is None:
+            mapped_read_length = "None"
 
         # Get Run Type
         # May need to double check if this is extracted from the right place
         mapped_run_type = None
         if assay_title == "CUT&RUN":
-            mapped_read_length = [fastq_read_length_dict]
+            mapped_run_type = [fastq_run_type_dict]
+        if mapped_run_type is None:
+            mapped_run_type = "None"
 
         # Future work: add audit information
 
@@ -196,10 +200,10 @@ def fetch_data(url):
                 'file_size': str(file_size),
                 'total_reads': str(total_reads),
                 'read_length': (mapped_read_length),
-                'run_type': str(mapped_run_type)
+                'run_type': (mapped_run_type)
             }
         })
 
     # Writing to sample.json
     with open(args.output, "w") as outfile:
-        outfile.write(json.dumps(metadata, indent=4))
\ No newline at end of file
+        outfile.write(json.dumps(metadata, indent=4))