Filegen wes2 (#112)

* added working example of how to merge metadata * started drafting function to simplify merge * more work on merging * first working version of function to merge input files into existing metadata * working function to merge wes input artifacts into metadata model * implemented fix to logic which was clobbering old data. addressed concerns from PR
CIMAC-CIDC · Aug 6, 2019 · 414073a · curlup · Aug 6, 2019 · 414073a
1 parent faaff71
commit 414073a
Show file tree

Hide file tree

Showing 10 changed files with 437 additions and 111 deletions.
diff --git a/cidc_schemas/prism.py b/cidc_schemas/prism.py
diff --git a/cidc_schemas/schemas/artifacts/artifact_core.json b/cidc_schemas/schemas/artifacts/artifact_core.json
@@ -21,7 +21,7 @@
       "description": "The name of the file with extension",
       "type": "string"
     },
-    "bucket_url": {
+    "object_url": {
       "description": "URL to artifact within Google Bucket.",
       "type": "string"
     },
@@ -75,13 +75,11 @@
     }
   },
   "required": [
-    "uuid",
     "file_name",
-    "bucket_url",
+    "object_url",
     "uploaded_timestamp",
     "file_size_bytes",
     "md5_hash",
-    "visible",
     "artifact_category",
     "assay_category",
     "file_type"

diff --git a/cidc_schemas/schemas/artifacts/artifact_text.json b/cidc_schemas/schemas/artifacts/artifact_text.json
@@ -15,5 +15,6 @@
         }
       }
     }
-  ]
+  ],
+  "mergeStrategy": "objectMerge"
 }
diff --git a/cidc_schemas/schemas/assays/components/ngs/ngs_entry.json b/cidc_schemas/schemas/assays/components/ngs/ngs_entry.json
@@ -5,6 +5,10 @@
   "type": "object",
   "description": "A single data record from an NGS assay",
   "properties": {
+    "entry_id": {
+        "description": "Identifier which is unique to this entry; this is the clinical trial, participant, and aliquot IDs",
+        "type": "string"
+    },
     "library_kit_lot": {
       "description": "Lot number for the library construction kit",
       "type": "string"
@@ -35,6 +39,7 @@
   },
   "required": [
     "library_kit_lot",
-    "library_prep_date"
+    "library_prep_date",
+    "entry_id"
   ]
 }
diff --git a/cidc_schemas/schemas/assays/components/ngs/wes_entry.json b/cidc_schemas/schemas/assays/components/ngs/wes_entry.json
@@ -21,5 +21,8 @@
       "$ref": "assays/components/ngs/wes_input.json"
     }
   },
-  "required": ["enrichment_vendor_lot"]
+  "mergeStrategy": "objectMerge",
+  "required": [
+      "enrichment_vendor_lot"
+    ]
 }
diff --git a/cidc_schemas/schemas/assays/components/ngs/wes_input.json b/cidc_schemas/schemas/assays/components/ngs/wes_input.json
@@ -6,23 +6,17 @@
   "description": "Whole Exome Sequencing Assay Input Files",
   "properties": {
     "tumor": {
-      "allOf": [
-        { "$ref": "assays/components/fastq_pairs.json" }
-      ]
+        "$ref": "assays/components/fastq_pairs.json",
+        "mergeStrategy": "objectMerge"
     },
     "normal": {
-      "allOf": [
-        { "$ref": "assays/components/fastq_pairs.json" }
-      ]
+        "$ref": "assays/components/fastq_pairs.json",
+        "mergeStrategy": "objectMerge"
     },
     "read_group_mapping_file": {
       "$ref": "artifacts/artifact_text.json",
       "description": "Stores read group information for each read in the fastq files.  Needed for when samples are run across multiple lanes."
     }
   },
-  "required": [
-    "tumor",
-    "normal",
-    "read_group_mapping_file"
-  ]
+  "required": []
 }
diff --git a/cidc_schemas/schemas/assays/wes_assay.json b/cidc_schemas/schemas/assays/wes_assay.json
@@ -18,6 +18,10 @@
       "type": "array",
       "items": {
         "$ref": "assays/components/ngs/wes_entry.json"
+      },
+      "mergeStrategy": "arrayMergeById",
+      "mergeOptions": {
+        "idRef": "entry_id"
       }
     },
     "analysis":{

diff --git a/tests/test_artifacts.py b/tests/test_artifacts.py
@@ -16,7 +16,7 @@
     "artifact_category": "Manifest File",
     "artifact_creator": "DFCI",
     "assay_category": "Whole Exome Sequencing (WES)",
-    "bucket_url": "dummy",
+    "object_url": "dummy",
     "file_name": "dummy.txt",
     "file_size_bytes": 1,
     "file_type": "FASTA",

diff --git a/tests/test_assays.py b/tests/test_assays.py
@@ -16,7 +16,7 @@
     "artifact_category": "Manifest File",
     "artifact_creator": "DFCI",
     "assay_category": "Whole Exome Sequencing (WES)",
-    "bucket_url": "dummy",
+    "object_url": "dummy",
     "file_name": "dummy.txt",
     "file_size_bytes": 1,
     "file_type": "FASTA",
@@ -94,6 +94,7 @@ def test_wes():
         "library_kit_lot": "dummy_value",
         "library_prep_date": "01/01/2001",
         "capture_date": "01/01/2001",
+        "entry_id": "xyz",
         "files": {
             "tumor": {
                 "fastq_1": fastq_1,
@@ -175,6 +176,7 @@ def test_rna_expression():
         "library_kit_lot": "dummy_value",
         "library_prep_date": "01/01/2001",
         "capture_date": "01/01/2001",
+        "entry_id": "abc",
         "files": {
             "fastq_1": fastq_1,
             "fastq_2": fastq_1,

diff --git a/tests/test_prism.py b/tests/test_prism.py
@@ -8,17 +8,95 @@
 import pytest
 import jsonschema
 import json
+from deepdiff import grep
 from pprint import pprint
 from jsonmerge import Merger
 
-from cidc_schemas.prism import prismify, filepath_gen
+from cidc_schemas.prism import prismify, merge_artifact
 from cidc_schemas.json_validation import load_and_validate_schema
 from cidc_schemas.template import Template
 from cidc_schemas.template_writer import RowType
 from cidc_schemas.template_reader import XlTemplateReader
 
 from .constants import ROOT_DIR, SCHEMA_DIR, TEMPLATE_EXAMPLES_DIR
 from .test_templates import template_paths
+from .test_assays import ARTIFACT_OBJ
+
+
+CLINICAL_TRIAL = {
+        "lead_organization_study_id": "10021",
+        "participants": [
+            {
+                "samples": [
+                    {
+                        "aliquots": [
+                            {
+                                "assay": {
+                                    "wes": {
+                                        "assay_creator": "Mount Sinai",
+                                        "assay_category": "Whole Exome Sequencing (WES)",
+                                        "enrichment_vendor_kit": "Twist",
+                                        "library_vendor_kit": "KAPA - Hyper Prep",
+                                        "sequencer_platform": "Illumina - NextSeq 550",
+                                        "paired_end_reads": "Paired",
+                                        "read_length": 100,
+                                        "records": [
+                                            {
+                                                "library_kit_lot": "lot abc",
+                                                "enrichment_vendor_lot": "lot 123",
+                                                "library_prep_date": "2019-05-01 00:00:00",
+                                                "capture_date": "2019-05-02 00:00:00",
+                                                "input_ng": 100,
+                                                "library_yield_ng": 700,
+                                                "average_insert_size": 250,
+                                                "entry_id": "abc1"
+                                            }
+                                        ]
+                                    }
+                                },
+                                "cimac_aliquot_id": "aliquot 1"
+                            },
+                        ],
+                        "cimac_sample_id": "sample 1",
+                        "genomic_source": "Tumor"
+                    },
+                    {
+                        "aliquots": [
+                            {
+                                "assay": {
+                                    "wes": {
+                                        "assay_creator": "Mount Sinai",
+                                        "assay_category": "Whole Exome Sequencing (WES)",
+                                        "enrichment_vendor_kit": "Twist",
+                                        "library_vendor_kit": "KAPA - Hyper Prep",
+                                        "sequencer_platform": "Illumina - NextSeq 550",
+                                        "paired_end_reads": "Paired",
+                                        "read_length": 100,
+                                        "records": [
+                                            {
+                                                "library_kit_lot": "lot abc",
+                                                "enrichment_vendor_lot": "lot 123",
+                                                "library_prep_date": "2019-05-01 00:00:00",
+                                                "capture_date": "2019-05-02 00:00:00",
+                                                "input_ng": 100,
+                                                "library_yield_ng": 700,
+                                                "average_insert_size": 250,
+                                                "entry_id": "abc2"
+                                            }
+                                        ]
+                                    }
+                                },
+                                "cimac_aliquot_id": "aliquot 2"
+                            }
+                        ],
+                        "cimac_sample_id": "sample 2",
+                        "genomic_source": "Normal"
+                    }
+                ],
+                "cimac_participant_id": "patient 1"
+            }
+        ]
+    }
 
 
 def test_merge_core():
@@ -112,7 +190,8 @@ def test_assay_merge():
                                                 "capture_date": "2019-05-02 00:00:00",
                                                 "input_ng": 100,
                                                 "library_yield_ng": 700,
-                                                "average_insert_size": 250
+                                                "average_insert_size": 250,
+                                                "entry_id": "abc"
                                             }
                                         ],
                                     }
@@ -202,4 +281,64 @@ def test_filepath_gen():
             assert 2 == sum([1 for x in file_maps if x['gs_key'].count("txt") > 0])
 
         # assert works
-        validator.validate(ct)
+        validator.validate(ct)
+
+
+def test_wes():
+
+    # create validators
+    validator = load_and_validate_schema("clinical_trial.json", return_validator=True)
+    schema = validator.schema
+
+    # create the example template.
+    temp_path = os.path.join(SCHEMA_DIR, 'templates', 'metadata', 'wes_template.json')
+    xlsx_path = os.path.join(TEMPLATE_EXAMPLES_DIR, "wes_template.xlsx")
+    hint = 'wes'
+
+    # parse the spreadsheet and get the file maps
+    ct, file_maps = prismify(xlsx_path, temp_path, assay_hint=hint)
+
+    # assert works
+    validator.validate(ct)
+
+
+def test_snippet_wes():
+
+    # create the clinical trial.
+    ct = copy.deepcopy(CLINICAL_TRIAL)
+
+    # define list of gs_urls.
+    urls = [
+        '10021/Patient 1/sample 1/aliquot 1/wes_forward.fastq',
+        '10021/Patient 1/sample 1/aliquot 1/wes_reverse.fastq',
+        '10021/Patient 1/sample 1/aliquot 1/wes_read_group.txt',
+        '10021/Patient 1/sample 1/aliquot 2/wes_forward.fastq',
+        '10021/Patient 1/sample 1/aliquot 2/wes_reverse.fastq',
+        '10021/Patient 1/sample 1/aliquot 2/wes_read_group.txt'
+    ]
+
+    # create validator
+    validator = load_and_validate_schema("clinical_trial.json", return_validator=True)
+
+    # loop over each url
+    searched_urls = []
+    for gs_url in urls:
+
+        # attempt to merge
+        ct = merge_artifact(
+                ct,
+                object_url=gs_url,
+                file_size_bytes=14,
+                uploaded_timestamp="01/01/2001",
+                md5_hash="hash1234"
+            )
+
+        # assert we still have a good clinical trial object.
+        validator.validate(ct)
+
+        # search for this url and all previous (no clobber)
+        searched_urls.append(gs_url)
+        for url in searched_urls:
+            ds = ct | grep(url)
+            assert 'matched_values' in ds
+            assert len(ds['matched_values']) > 0