From 414073a7ee230a6fb3239ee6e07e917659abcf7b Mon Sep 17 00:00:00 2001 From: James Lindsay Date: Tue, 6 Aug 2019 09:00:48 -0400 Subject: [PATCH] Filegen wes2 (#112) * added working example of how to merge metadata * started drafting function to simplify merge * more work on merging * first working version of function to merge input files into existing metadata * working function to merge wes input artifacts into metadata model * implemented fix to logic which was clobbering old data. addressed concerns from PR --- cidc_schemas/prism.py | 356 +++++++++++++----- .../schemas/artifacts/artifact_core.json | 6 +- .../schemas/artifacts/artifact_text.json | 3 +- .../assays/components/ngs/ngs_entry.json | 7 +- .../assays/components/ngs/wes_entry.json | 5 +- .../assays/components/ngs/wes_input.json | 16 +- cidc_schemas/schemas/assays/wes_assay.json | 4 + tests/test_artifacts.py | 2 +- tests/test_assays.py | 4 +- tests/test_prism.py | 145 ++++++- 10 files changed, 437 insertions(+), 111 deletions(-) diff --git a/cidc_schemas/prism.py b/cidc_schemas/prism.py index 41266a084..669cb3980 100644 --- a/cidc_schemas/prism.py +++ b/cidc_schemas/prism.py @@ -4,6 +4,7 @@ import copy import jsonschema from deepdiff import grep +import datetime from jsonmerge import merge, Merger from cidc_schemas.json_validation import load_and_validate_schema @@ -28,7 +29,8 @@ def _get_coerce(ref: str): """ # get the entry - resolver = jsonschema.RefResolver(f'file://{SCHEMA_DIR}/schemas', {'$ref': ref}) + resolver = jsonschema.RefResolver( + f'file://{SCHEMA_DIR}/schemas', {'$ref': ref}) _, entry = resolver.resolve(ref) # add our own type conversion @@ -103,6 +105,12 @@ def populate_lu(ref: str, key_lu: dict, xlsx_key: str): # populate lookup. populate_lu(ref, key_lu, data_key) + # special case for wes keys. + if 'wes' in template_path: + ref = "assays/components/ngs/ngs_entry.json#properties/entry_id" + data_key = "entry_id" + populate_lu(ref, key_lu, data_key) + return key_lu @@ -121,7 +129,7 @@ def _find_key(schema_key: str, schema: dict, assay_hint: str = "") -> str: choice. I've introduced the assay_hint string to help disambuguate - a path to a key when there are multiple possibilities. + a path to a key when there are multiple possibilities. Consider "assay_creator" a property in assay_core.json which is associated with every assay. Searching the schema for assay_creator will return multiple hits, the hint lets @@ -195,9 +203,9 @@ def _set_val(path: str, val: object, trial: dict, verbose=False): consumed: path = "['items'][0]['properties']['prop1']" - + Next we see an 'item' property which in json-schema - denotes an array. So the implication + denotes an array. So the implication is that the value of 'participants' is list. { "participants": [...] @@ -206,7 +214,7 @@ def _set_val(path: str, val: object, trial: dict, verbose=False): path = "['properties']['prop1']" Next we 'properties' so we know we are entering an object - with *prop1* as a property. This is the + with *prop1* as a property. This is the final piece of the *path* so we can assign the val: { "participants": [{ @@ -220,7 +228,7 @@ def _set_val(path: str, val: object, trial: dict, verbose=False): For each token we test for its json-schema modifier, 'items', 'properties', 'allOf'. If we see items we need - to add a list, assuming it doesn't exist, if we see properties + to add a list, assuming it doesn't exist, if we see properties we need to create a dictionary if it doesn't exist. 
*One limitation* of this code is that no list can have @@ -237,7 +245,7 @@ def _set_val(path: str, val: object, trial: dict, verbose=False): For our purposes we need to treat the 'allOf' followed by the array entry and subsequent object properties as properties of the previous object 'prop2'. This - is why there are "skip" blocks in the code which advance + is why there are "skip" blocks in the code which advance to the next token while keeping the pointer of the current object on 'prop2'. @@ -353,7 +361,7 @@ def _set_val(path: str, val: object, trial: dict, verbose=False): elif key2 == 'properties': curp[key] = {} - # also a dictionary, just will be proceeded by a nunber + # also a dictionary, just will be preceeded by a nunber elif key2 == 'allOf': # this assume allOf always creates object, maybe not true? curp[key] = {} @@ -394,13 +402,13 @@ def _get_recursively(search_dict, field): def _process_property( - row: list, - key_lu: dict, - schema: dict, - data_obj: dict, - assay_hint: str, - fp_lu: dict, - verb: bool): + row: list, + key_lu: dict, + schema: dict, + data_obj: dict, + assay_hint: str, + fp_lu: dict, + verb: bool): """ Takes a single property (key, val) from spreadsheet, determines where it needs to go in the final object, then inserts it. @@ -435,7 +443,7 @@ def _process_property( gs_key = f'{gs_key}/{_get_recursively(data_obj, "cimac_sample_id")[0]}' gs_key = f'{gs_key}/{_get_recursively(data_obj, "cimac_aliquot_id")[0]}' gs_key = f'{gs_key}/{assay_hint}' - gs_key = gs_key.replace(" ", "_") + #gs_key = gs_key.replace(" ", "_") # do the suffix tmp = key.lower().split(" ") @@ -471,7 +479,8 @@ def _process_property( def _build_fplu(assay_hint: str): # get the un resolved schema - template_path = os.path.join(TEMPLATE_DIR, 'metadata', f'{assay_hint}_template.json') + template_path = os.path.join( + TEMPLATE_DIR, 'metadata', f'{assay_hint}_template.json') with open(template_path) as fin: schema = json.load(fin) @@ -507,7 +516,7 @@ def prismify(xlsx_path: str, template_path: str, assay_hint: str = "", verb: boo e.g. file list [ { - 'local_path': '/path/to/fwd.fastq', + 'local_path': '/path/to/fwd.fastq', 'gs_key': '10021/Patient_1/sample_1/aliquot_1/wes_forward.fastq' } ] @@ -515,11 +524,11 @@ def prismify(xlsx_path: str, template_path: str, assay_hint: str = "", verb: boo Args: xlsx_path: file on file system to excel file. - template_path: path on file system relative to schema root of the + template_path: path on file system relative to schema root of the temaplate - - assay_hint: string used to help idnetify properties in template. Must - be the the root of the template filename i.e. + + assay_hint: string used to help idnetify properties in template. Must + be the the root of the template filename i.e. wes_template.json would be wes. 
verb: boolean indicating verbosity @@ -543,7 +552,6 @@ def prismify(xlsx_path: str, template_path: str, assay_hint: str = "", verb: boo # add a special key to track the files fp_lu['special'] = list() - # read the excel file t = XlTemplateReader.from_excel(xlsx_path) @@ -562,12 +570,19 @@ def prismify(xlsx_path: str, template_path: str, assay_hint: str = "", verb: boo for row in ws[RowType.PREAMBLE]: # process this property - _process_property(row, key_lu, schema, root, assay_hint, fp_lu, verb) - + _process_property(row, key_lu, schema, root, + assay_hint, fp_lu, verb) # move to headers headers = ws[RowType.HEADER][0] + # track these identifiers + potential_ids = { + "CIMAC PARTICIPANT ID": "", + "CIMAC SAMPLE ID": "", + "CIMAC ALIQUOT ID": "" + } + # get the data. data = ws[RowType.DATA] for row in data: @@ -580,10 +595,29 @@ def prismify(xlsx_path: str, template_path: str, assay_hint: str = "", verb: boo _process_property([key, val], key_lu, schema, curd, assay_hint, fp_lu, verb) + # track ids + if key in potential_ids: + potential_ids[key] = val # save the entry data_rows.append(curd) + # data rows will require a unique identifier + if assay_hint == "wes": + + # create a unique key + unique_key = potential_ids['CIMAC PARTICIPANT ID'] + unique_key = f'{unique_key}_{potential_ids["CIMAC SAMPLE ID"]}' + unique_key = f'{unique_key}_{potential_ids["CIMAC ALIQUOT ID"]}' + + # add this to the most recent payload + _process_property(['entry_id', unique_key], key_lu, schema, + curd, assay_hint, fp_lu, verb) + + else: + raise NotImplementedError(f'only WES is supported, please add additional support \ + for {assay_hint}') + # create the merger merger = Merger(schema) @@ -597,7 +631,7 @@ def prismify(xlsx_path: str, template_path: str, assay_hint: str = "", verb: boo def _deep_get(obj: dict, key: str): - """ + """ returns value of they supplied key gotten via deepdif """ @@ -614,87 +648,233 @@ def _deep_get(obj: dict, key: str): return cur_obj, tokens[-2] -def filepath_gen(xlsx_path: str, schema: dict, assay_hint: str, verb: bool = False): +def _get_path(ct: dict, key: str) -> str: """ - This is a python generator which yields the paths of local files we are expecting - to recieve alongsdie the supplied metadata xlsx file. + find the path to the given key in the dictionary - There is bespoke assay specific logic encoded in this function and it will - likely change if conventions around what files are expected in a given - folder, or what files an assay is expecting. + Args: + ct: clinical_trial object to be modified + key: the identifier we are looking for in the dictionary + + Returns: + arg1: string describing the location of the key + """ + + # first look for key as is + ds1 = ct | grep(key, match_string=True) + count1 = 0 + if 'matched_values' in ds1: + count1 = len(ds1['matched_values']) + + # the hack fails if both work... probably need to deal with this + if count1 == 0: + raise NotImplementedError(f"key: {key} not found in dictionary") + + # get the keypath + return ds1['matched_values'].pop() + + +def _get_source(ct: dict, key: str, level="sample") -> dict: + """ + extract the object in the dicitionary specified by + the supplied key (or one of its parents.) Args: - xlsx_path: file on file system to excel file. - schema: json schema with all ref resolved - assay_hint: string used to help idnetify properties in template. Must - be the the root of the template filename i.e. - wes_template.json would be wes. 
- verb: boolean indicating verbosity + ct: clinical_trial object to be searched + key: the identifier we are looking for in the dictionary, + level: a keyword describing which level in the key path + (trial, participants, sample, aliquot) we want to return Returns: - None, data_obj is modified in place + arg1: string describing the location of the key """ - # get the un resolved schema - template_path = os.path.join(TEMPLATE_DIR, 'metadata', f'{assay_hint}_template.json') - with open(template_path) as fin: - schema = json.load(fin) + # tokenize. + key = key.replace("root", "").replace("'", "") + tokens = re.findall(r"\[(.*?)\]", key) - # find key in the schema, this notation is - # recommended usage of deepdif grep. assuming they - # overload the pipe operator to simulate cmd line - schema_key = 'artifact_link' - ds = schema | grep(schema_key) - if 'matched_paths' not in ds: - raise KeyError(f'{schema_key} not found in schema') + # this will get us to the object we have the key for + if level == "sample": + tokens = tokens[0:-3] + elif level == "aliquot": + tokens = tokens[0:-1] + else: + raise NotImplementedError( + f'the following level is not supported: {level}') - # sort potential matches, shortest is what we want. - choices = sorted(ds['matched_paths'], key=len) + # keep getting based on the key. + cur_obj = ct + for token in tokens: + try: + token = int(token) + except ValueError: + pass - # create tuples - key_lu = {} - for c in choices: + cur_obj = cur_obj[token] - # get the value and parent of the file link. - val, pkey = _deep_get(schema, c) - pkey = pkey.upper() - key_lu[pkey] = val + return cur_obj - def _do_stuff(key, val, lu): - if key in lu: - # make the accession key - tmp = lu[key][1] - print(tmp) - gs_key = tmp["lead_organization_study_id"] - gs_key = f'{gs_key}/{tmp["cimac_participant_id"]}' - gs_key = f'{gs_key}/{tmp["cimac_sample_id"]}' - gs_key = f'{gs_key}/{tmp["cimac_aliquot_id"]}' - #print("stuff", key, val, lu[key]) - print(gs_key) - # read the excel file - t = XlTemplateReader.from_excel(xlsx_path) +def _merge_artifact_wes( + ct: dict, + object_url: str, + file_size_bytes: int, + uploaded_timestamp: str, + md5_hash: str +): + """ + create and merge an artifact into the WES assay metadata. + The artifacts currently supported are only the input + fastq files and read mapping group file. - # loop over spreadsheet - worksheet_names = t.grouped_rows.keys() - for name in worksheet_names: + Args: + ct: clinical_trial object to be searched + object_url: the gs url pointing to the object being added + file_size_bytes: integer specifying the numebr of bytes in the file + uploaded_timestamp: time stamp associated with this object + md5_hash: hash of the uploaded object, usually provided by + object storage - # get the worksheat. - ws = t.grouped_rows[name] + """ - # Compare preamble rows - for row in ws[RowType.PREAMBLE]: + # replace gs prfix if exists. + object_url, lead_organization_study_id, \ + cimac_participant_id, cimac_sample_id, cimac_aliquot_id, \ + file_name = _split_objurl(object_url) + + # get the genomic source. + keypath = _get_path(ct, cimac_aliquot_id) + sample_obj = _get_source(ct, keypath) + genomic_source = sample_obj['genomic_source'] + + # create the artifact. 
+ artifact = { + "artifact_category": "Assay Artifact from CIMAC", + "assay_category": "Whole Exome Sequencing (WES)", + "object_url": object_url, + "file_name": file_name, + "file_size_bytes": 1, + "md5_hash": md5_hash, + "uploaded_timestamp": str(datetime.datetime.now()).split('.')[0] + } + + # create the wes input object which will be added to existing data + obj = {} + + # check if we are adding read group mapping file. + if "wes_read_group" in file_name: + + # set the artifact type and save + artifact["file_type"] = "Other" + obj['read_group_mapping_file'] = artifact - _do_stuff(row[0], row[1], key_lu) + else: - # move to headers - headers = ws[RowType.HEADER][0] + # set the artifact type + artifact["file_type"] = "FASTQ" + + # determine how to craft the artifact + obj[genomic_source] = {} + if "wes_forward" in file_name: + obj[genomic_source]['fastq_1'] = artifact + + elif "wes_reverse" in file_name: + obj[genomic_source]['fastq_2'] = artifact + + # copy the metadata and add this a new record. + # note this will clobber whatever is here. This is + # OK because the original copy of ct will have the + # clobbered data, while the new copy will have + # the new entry which will get appended to the + # "records" list by the merge by ID strategy + # specified in the json-schema for records + ct_copy = copy.deepcopy(ct) + aliquot_obj = _get_source(ct_copy, keypath, level="aliquot") + aliquot_obj['assay']['wes']['records'][0]['files'] = obj + + # merge the copy with the original. + validator = load_and_validate_schema( + "clinical_trial.json", return_validator=True) + schema = validator.schema + merger = Merger(schema) - # get the data. - data = ws[RowType.DATA] - for row in data: + ct_new = merger.merge(ct, ct_copy) - # create dictionary per row - for key, val in zip(headers, row): + # validate the new data + validator.validate(ct_new) + + # return the new dictionary + return ct_new + + +def _split_objurl(obj_url: str) -> (str, str, str, str, str, str): + """ + splits gs_url into components and returns them + + Args: + obj_url: gs://url/to/file + + Returns: + arg1: tuple of the components + """ + + # replace gs prfix if exists. + obj_url = obj_url.replace("gs://", "") + + # parse the url to get key identifiers + tokens = obj_url.split("/") + lead_organization_study_id = tokens[0] + cimac_participant_id = tokens[1] + cimac_sample_id = tokens[2] + cimac_aliquot_id = tokens[3] + file_name = tokens[4] + + return obj_url, lead_organization_study_id, cimac_participant_id, \ + cimac_sample_id, cimac_aliquot_id, file_name + + +def merge_artifact( + ct: dict, + object_url: str, + file_size_bytes: int, + uploaded_timestamp: str, + md5_hash: str +): + """ + create and merge an artifact into the metadata blob + for a clinical trial. The merging process is automatically + determined by inspecting the gs url path. + + Args: + ct: clinical_trial object to be searched + object_url: the gs url pointing to the object being added + file_size_bytes: integer specifying the numebr of bytes in the file + uploaded_timestamp: time stamp associated with this object + md5_hash: hash of the uploaded object, usually provided by + object storage + + """ + + # replace gs prfix if exists. + object_url, lead_organization_study_id, \ + cimac_participant_id, cimac_sample_id, cimac_aliquot_id, \ + file_name = _split_objurl(object_url) + + # define criteria. + wes_names = {'wes_forward', 'wes_reverse', 'wes_read_group'} + + # test criteria. 
+    if any(wes_name in file_name for wes_name in wes_names):
+        new_ct = _merge_artifact_wes(
+            ct,
+            object_url,
+            file_size_bytes,
+            uploaded_timestamp,
+            md5_hash
+        )
+    else:
+        raise NotImplementedError(
+            f'the following file_name is not supported: {file_name}')
 
-            _do_stuff(key, val, key_lu)
+    # return new object
+    return new_ct
diff --git a/cidc_schemas/schemas/artifacts/artifact_core.json b/cidc_schemas/schemas/artifacts/artifact_core.json
index aaf114a58..dcada3a1e 100644
--- a/cidc_schemas/schemas/artifacts/artifact_core.json
+++ b/cidc_schemas/schemas/artifacts/artifact_core.json
@@ -21,7 +21,7 @@
             "description": "The name of the file with extension",
             "type": "string"
         },
-        "bucket_url": {
+        "object_url": {
             "description": "URL to artifact within Google Bucket.",
             "type": "string"
         },
@@ -75,13 +75,11 @@
         }
     },
     "required": [
-        "uuid",
         "file_name",
-        "bucket_url",
+        "object_url",
         "uploaded_timestamp",
         "file_size_bytes",
         "md5_hash",
-        "visible",
         "artifact_category",
         "assay_category",
         "file_type"
diff --git a/cidc_schemas/schemas/artifacts/artifact_text.json b/cidc_schemas/schemas/artifacts/artifact_text.json
index 834e96f88..d0bf9efc4 100644
--- a/cidc_schemas/schemas/artifacts/artifact_text.json
+++ b/cidc_schemas/schemas/artifacts/artifact_text.json
@@ -15,5 +15,6 @@
                 }
             }
         }
-    ]
+    ],
+    "mergeStrategy": "objectMerge"
 }
\ No newline at end of file
diff --git a/cidc_schemas/schemas/assays/components/ngs/ngs_entry.json b/cidc_schemas/schemas/assays/components/ngs/ngs_entry.json
index 956eb1fcc..9f9b557e9 100644
--- a/cidc_schemas/schemas/assays/components/ngs/ngs_entry.json
+++ b/cidc_schemas/schemas/assays/components/ngs/ngs_entry.json
@@ -5,6 +5,10 @@
     "type": "object",
     "description": "A single data record from an NGS assay",
     "properties": {
+        "entry_id": {
+            "description": "Identifier which is unique to this entry, composed of the clinical trial, participant, and aliquot IDs",
+            "type": "string"
+        },
         "library_kit_lot": {
             "description": "Lot number for the library construction kit",
             "type": "string"
@@ -35,6 +39,7 @@
     },
     "required": [
         "library_kit_lot",
-        "library_prep_date"
+        "library_prep_date",
+        "entry_id"
     ]
 }
diff --git a/cidc_schemas/schemas/assays/components/ngs/wes_entry.json b/cidc_schemas/schemas/assays/components/ngs/wes_entry.json
index e60c9a0fa..38cb67bb4 100644
--- a/cidc_schemas/schemas/assays/components/ngs/wes_entry.json
+++ b/cidc_schemas/schemas/assays/components/ngs/wes_entry.json
@@ -21,5 +21,8 @@
             "$ref": "assays/components/ngs/wes_input.json"
         }
     },
-    "required": ["enrichment_vendor_lot"]
+    "mergeStrategy": "objectMerge",
+    "required": [
+        "enrichment_vendor_lot"
+    ]
 }
diff --git a/cidc_schemas/schemas/assays/components/ngs/wes_input.json b/cidc_schemas/schemas/assays/components/ngs/wes_input.json
index ac7f43fb1..976a50386 100644
--- a/cidc_schemas/schemas/assays/components/ngs/wes_input.json
+++ b/cidc_schemas/schemas/assays/components/ngs/wes_input.json
@@ -6,23 +6,17 @@
     "description": "Whole Exome Sequencing Assay Input Files",
     "properties": {
         "tumor": {
-            "allOf": [
-                { "$ref": "assays/components/fastq_pairs.json" }
-            ]
+            "$ref": "assays/components/fastq_pairs.json",
+            "mergeStrategy": "objectMerge"
         },
         "normal": {
-            "allOf": [
-                { "$ref": "assays/components/fastq_pairs.json" }
-            ]
+            "$ref": "assays/components/fastq_pairs.json",
+            "mergeStrategy": "objectMerge"
         },
         "read_group_mapping_file": {
             "$ref": "artifacts/artifact_text.json",
             "description": "Stores read group information for each read in the fastq files. Needed for when samples are run across multiple lanes."
} }, - "required": [ - "tumor", - "normal", - "read_group_mapping_file" - ] + "required": [] } diff --git a/cidc_schemas/schemas/assays/wes_assay.json b/cidc_schemas/schemas/assays/wes_assay.json index b56adb82d..23202f0ea 100644 --- a/cidc_schemas/schemas/assays/wes_assay.json +++ b/cidc_schemas/schemas/assays/wes_assay.json @@ -18,6 +18,10 @@ "type": "array", "items": { "$ref": "assays/components/ngs/wes_entry.json" + }, + "mergeStrategy": "arrayMergeById", + "mergeOptions": { + "idRef": "entry_id" } }, "analysis":{ diff --git a/tests/test_artifacts.py b/tests/test_artifacts.py index 49c1ee25c..f8a4ccd24 100644 --- a/tests/test_artifacts.py +++ b/tests/test_artifacts.py @@ -16,7 +16,7 @@ "artifact_category": "Manifest File", "artifact_creator": "DFCI", "assay_category": "Whole Exome Sequencing (WES)", - "bucket_url": "dummy", + "object_url": "dummy", "file_name": "dummy.txt", "file_size_bytes": 1, "file_type": "FASTA", diff --git a/tests/test_assays.py b/tests/test_assays.py index 216878152..00ff1de31 100644 --- a/tests/test_assays.py +++ b/tests/test_assays.py @@ -16,7 +16,7 @@ "artifact_category": "Manifest File", "artifact_creator": "DFCI", "assay_category": "Whole Exome Sequencing (WES)", - "bucket_url": "dummy", + "object_url": "dummy", "file_name": "dummy.txt", "file_size_bytes": 1, "file_type": "FASTA", @@ -94,6 +94,7 @@ def test_wes(): "library_kit_lot": "dummy_value", "library_prep_date": "01/01/2001", "capture_date": "01/01/2001", + "entry_id": "xyz", "files": { "tumor": { "fastq_1": fastq_1, @@ -175,6 +176,7 @@ def test_rna_expression(): "library_kit_lot": "dummy_value", "library_prep_date": "01/01/2001", "capture_date": "01/01/2001", + "entry_id": "abc", "files": { "fastq_1": fastq_1, "fastq_2": fastq_1, diff --git a/tests/test_prism.py b/tests/test_prism.py index fbe76131a..899308e65 100644 --- a/tests/test_prism.py +++ b/tests/test_prism.py @@ -8,10 +8,11 @@ import pytest import jsonschema import json +from deepdiff import grep from pprint import pprint from jsonmerge import Merger -from cidc_schemas.prism import prismify, filepath_gen +from cidc_schemas.prism import prismify, merge_artifact from cidc_schemas.json_validation import load_and_validate_schema from cidc_schemas.template import Template from cidc_schemas.template_writer import RowType @@ -19,6 +20,83 @@ from .constants import ROOT_DIR, SCHEMA_DIR, TEMPLATE_EXAMPLES_DIR from .test_templates import template_paths +from .test_assays import ARTIFACT_OBJ + + +CLINICAL_TRIAL = { + "lead_organization_study_id": "10021", + "participants": [ + { + "samples": [ + { + "aliquots": [ + { + "assay": { + "wes": { + "assay_creator": "Mount Sinai", + "assay_category": "Whole Exome Sequencing (WES)", + "enrichment_vendor_kit": "Twist", + "library_vendor_kit": "KAPA - Hyper Prep", + "sequencer_platform": "Illumina - NextSeq 550", + "paired_end_reads": "Paired", + "read_length": 100, + "records": [ + { + "library_kit_lot": "lot abc", + "enrichment_vendor_lot": "lot 123", + "library_prep_date": "2019-05-01 00:00:00", + "capture_date": "2019-05-02 00:00:00", + "input_ng": 100, + "library_yield_ng": 700, + "average_insert_size": 250, + "entry_id": "abc1" + } + ] + } + }, + "cimac_aliquot_id": "aliquot 1" + }, + ], + "cimac_sample_id": "sample 1", + "genomic_source": "Tumor" + }, + { + "aliquots": [ + { + "assay": { + "wes": { + "assay_creator": "Mount Sinai", + "assay_category": "Whole Exome Sequencing (WES)", + "enrichment_vendor_kit": "Twist", + "library_vendor_kit": "KAPA - Hyper Prep", + "sequencer_platform": "Illumina - NextSeq 
550", + "paired_end_reads": "Paired", + "read_length": 100, + "records": [ + { + "library_kit_lot": "lot abc", + "enrichment_vendor_lot": "lot 123", + "library_prep_date": "2019-05-01 00:00:00", + "capture_date": "2019-05-02 00:00:00", + "input_ng": 100, + "library_yield_ng": 700, + "average_insert_size": 250, + "entry_id": "abc2" + } + ] + } + }, + "cimac_aliquot_id": "aliquot 2" + } + ], + "cimac_sample_id": "sample 2", + "genomic_source": "Normal" + } + ], + "cimac_participant_id": "patient 1" + } + ] + } def test_merge_core(): @@ -112,7 +190,8 @@ def test_assay_merge(): "capture_date": "2019-05-02 00:00:00", "input_ng": 100, "library_yield_ng": 700, - "average_insert_size": 250 + "average_insert_size": 250, + "entry_id": "abc" } ], } @@ -202,4 +281,64 @@ def test_filepath_gen(): assert 2 == sum([1 for x in file_maps if x['gs_key'].count("txt") > 0]) # assert works - validator.validate(ct) \ No newline at end of file + validator.validate(ct) + + +def test_wes(): + + # create validators + validator = load_and_validate_schema("clinical_trial.json", return_validator=True) + schema = validator.schema + + # create the example template. + temp_path = os.path.join(SCHEMA_DIR, 'templates', 'metadata', 'wes_template.json') + xlsx_path = os.path.join(TEMPLATE_EXAMPLES_DIR, "wes_template.xlsx") + hint = 'wes' + + # parse the spreadsheet and get the file maps + ct, file_maps = prismify(xlsx_path, temp_path, assay_hint=hint) + + # assert works + validator.validate(ct) + + +def test_snippet_wes(): + + # create the clinical trial. + ct = copy.deepcopy(CLINICAL_TRIAL) + + # define list of gs_urls. + urls = [ + '10021/Patient 1/sample 1/aliquot 1/wes_forward.fastq', + '10021/Patient 1/sample 1/aliquot 1/wes_reverse.fastq', + '10021/Patient 1/sample 1/aliquot 1/wes_read_group.txt', + '10021/Patient 1/sample 1/aliquot 2/wes_forward.fastq', + '10021/Patient 1/sample 1/aliquot 2/wes_reverse.fastq', + '10021/Patient 1/sample 1/aliquot 2/wes_read_group.txt' + ] + + # create validator + validator = load_and_validate_schema("clinical_trial.json", return_validator=True) + + # loop over each url + searched_urls = [] + for gs_url in urls: + + # attempt to merge + ct = merge_artifact( + ct, + object_url=gs_url, + file_size_bytes=14, + uploaded_timestamp="01/01/2001", + md5_hash="hash1234" + ) + + # assert we stull have a good clinical trial object. + validator.validate(ct) + + # search for this url and all previous (no clobber) + searched_urls.append(gs_url) + for url in searched_urls: + ds = ct | grep(url) + assert 'matched_values' in ds + assert len(ds['matched_values']) > 0