From 414073a7ee230a6fb3239ee6e07e917659abcf7b Mon Sep 17 00:00:00 2001 From: James Lindsay Date: Tue, 6 Aug 2019 09:00:48 -0400 Subject: [PATCH] Filegen wes2 (#112) * added working example of how to merge metadata * started drafting function to simplify merge * more work on merging * first working version of function to merge input files into existing metadata * working function to merge wes input artifacts into metadata model * implemented fix to logic which was clobbering old data. addressed concerns from PR --- cidc_schemas/prism.py | 356 +++++++++++++----- .../schemas/artifacts/artifact_core.json | 6 +- .../schemas/artifacts/artifact_text.json | 3 +- .../assays/components/ngs/ngs_entry.json | 7 +- .../assays/components/ngs/wes_entry.json | 5 +- .../assays/components/ngs/wes_input.json | 16 +- cidc_schemas/schemas/assays/wes_assay.json | 4 + tests/test_artifacts.py | 2 +- tests/test_assays.py | 4 +- tests/test_prism.py | 145 ++++++- 10 files changed, 437 insertions(+), 111 deletions(-) diff --git a/cidc_schemas/prism.py b/cidc_schemas/prism.py index 41266a084..669cb3980 100644 --- a/cidc_schemas/prism.py +++ b/cidc_schemas/prism.py @@ -4,6 +4,7 @@ import copy import jsonschema from deepdiff import grep +import datetime from jsonmerge import merge, Merger from cidc_schemas.json_validation import load_and_validate_schema @@ -28,7 +29,8 @@ def _get_coerce(ref: str): """ # get the entry - resolver = jsonschema.RefResolver(f'file://{SCHEMA_DIR}/schemas', {'$ref': ref}) + resolver = jsonschema.RefResolver( + f'file://{SCHEMA_DIR}/schemas', {'$ref': ref}) _, entry = resolver.resolve(ref) # add our own type conversion @@ -103,6 +105,12 @@ def populate_lu(ref: str, key_lu: dict, xlsx_key: str): # populate lookup. populate_lu(ref, key_lu, data_key) + # special case for wes keys. + if 'wes' in template_path: + ref = "assays/components/ngs/ngs_entry.json#properties/entry_id" + data_key = "entry_id" + populate_lu(ref, key_lu, data_key) + return key_lu @@ -121,7 +129,7 @@ def _find_key(schema_key: str, schema: dict, assay_hint: str = "") -> str: choice. I've introduced the assay_hint string to help disambuguate - a path to a key when there are multiple possibilities. + a path to a key when there are multiple possibilities. Consider "assay_creator" a property in assay_core.json which is associated with every assay. Searching the schema for assay_creator will return multiple hits, the hint lets @@ -195,9 +203,9 @@ def _set_val(path: str, val: object, trial: dict, verbose=False): consumed: path = "['items'][0]['properties']['prop1']" - + Next we see an 'item' property which in json-schema - denotes an array. So the implication + denotes an array. So the implication is that the value of 'participants' is list. { "participants": [...] @@ -206,7 +214,7 @@ def _set_val(path: str, val: object, trial: dict, verbose=False): path = "['properties']['prop1']" Next we 'properties' so we know we are entering an object - with *prop1* as a property. This is the + with *prop1* as a property. This is the final piece of the *path* so we can assign the val: { "participants": [{ @@ -220,7 +228,7 @@ def _set_val(path: str, val: object, trial: dict, verbose=False): For each token we test for its json-schema modifier, 'items', 'properties', 'allOf'. If we see items we need - to add a list, assuming it doesn't exist, if we see properties + to add a list, assuming it doesn't exist, if we see properties we need to create a dictionary if it doesn't exist. 
*One limitation* of this code is that no list can have @@ -237,7 +245,7 @@ def _set_val(path: str, val: object, trial: dict, verbose=False): For our purposes we need to treat the 'allOf' followed by the array entry and subsequent object properties as properties of the previous object 'prop2'. This - is why there are "skip" blocks in the code which advance + is why there are "skip" blocks in the code which advance to the next token while keeping the pointer of the current object on 'prop2'. @@ -353,7 +361,7 @@ def _set_val(path: str, val: object, trial: dict, verbose=False): elif key2 == 'properties': curp[key] = {} - # also a dictionary, just will be proceeded by a nunber + # also a dictionary, just will be preceeded by a nunber elif key2 == 'allOf': # this assume allOf always creates object, maybe not true? curp[key] = {} @@ -394,13 +402,13 @@ def _get_recursively(search_dict, field): def _process_property( - row: list, - key_lu: dict, - schema: dict, - data_obj: dict, - assay_hint: str, - fp_lu: dict, - verb: bool): + row: list, + key_lu: dict, + schema: dict, + data_obj: dict, + assay_hint: str, + fp_lu: dict, + verb: bool): """ Takes a single property (key, val) from spreadsheet, determines where it needs to go in the final object, then inserts it. @@ -435,7 +443,7 @@ def _process_property( gs_key = f'{gs_key}/{_get_recursively(data_obj, "cimac_sample_id")[0]}' gs_key = f'{gs_key}/{_get_recursively(data_obj, "cimac_aliquot_id")[0]}' gs_key = f'{gs_key}/{assay_hint}' - gs_key = gs_key.replace(" ", "_") + #gs_key = gs_key.replace(" ", "_") # do the suffix tmp = key.lower().split(" ") @@ -471,7 +479,8 @@ def _process_property( def _build_fplu(assay_hint: str): # get the un resolved schema - template_path = os.path.join(TEMPLATE_DIR, 'metadata', f'{assay_hint}_template.json') + template_path = os.path.join( + TEMPLATE_DIR, 'metadata', f'{assay_hint}_template.json') with open(template_path) as fin: schema = json.load(fin) @@ -507,7 +516,7 @@ def prismify(xlsx_path: str, template_path: str, assay_hint: str = "", verb: boo e.g. file list [ { - 'local_path': '/path/to/fwd.fastq', + 'local_path': '/path/to/fwd.fastq', 'gs_key': '10021/Patient_1/sample_1/aliquot_1/wes_forward.fastq' } ] @@ -515,11 +524,11 @@ def prismify(xlsx_path: str, template_path: str, assay_hint: str = "", verb: boo Args: xlsx_path: file on file system to excel file. - template_path: path on file system relative to schema root of the + template_path: path on file system relative to schema root of the temaplate - - assay_hint: string used to help idnetify properties in template. Must - be the the root of the template filename i.e. + + assay_hint: string used to help idnetify properties in template. Must + be the the root of the template filename i.e. wes_template.json would be wes. 
verb: boolean indicating verbosity @@ -543,7 +552,6 @@ def prismify(xlsx_path: str, template_path: str, assay_hint: str = "", verb: boo # add a special key to track the files fp_lu['special'] = list() - # read the excel file t = XlTemplateReader.from_excel(xlsx_path) @@ -562,12 +570,19 @@ def prismify(xlsx_path: str, template_path: str, assay_hint: str = "", verb: boo for row in ws[RowType.PREAMBLE]: # process this property - _process_property(row, key_lu, schema, root, assay_hint, fp_lu, verb) - + _process_property(row, key_lu, schema, root, + assay_hint, fp_lu, verb) # move to headers headers = ws[RowType.HEADER][0] + # track these identifiers + potential_ids = { + "CIMAC PARTICIPANT ID": "", + "CIMAC SAMPLE ID": "", + "CIMAC ALIQUOT ID": "" + } + # get the data. data = ws[RowType.DATA] for row in data: @@ -580,10 +595,29 @@ def prismify(xlsx_path: str, template_path: str, assay_hint: str = "", verb: boo _process_property([key, val], key_lu, schema, curd, assay_hint, fp_lu, verb) + # track ids + if key in potential_ids: + potential_ids[key] = val # save the entry data_rows.append(curd) + # data rows will require a unique identifier + if assay_hint == "wes": + + # create a unique key + unique_key = potential_ids['CIMAC PARTICIPANT ID'] + unique_key = f'{unique_key}_{potential_ids["CIMAC SAMPLE ID"]}' + unique_key = f'{unique_key}_{potential_ids["CIMAC ALIQUOT ID"]}' + + # add this to the most recent payload + _process_property(['entry_id', unique_key], key_lu, schema, + curd, assay_hint, fp_lu, verb) + + else: + raise NotImplementedError(f'only WES is supported, please add additional support \ + for {assay_hint}') + # create the merger merger = Merger(schema) @@ -597,7 +631,7 @@ def prismify(xlsx_path: str, template_path: str, assay_hint: str = "", verb: boo def _deep_get(obj: dict, key: str): - """ + """ returns value of they supplied key gotten via deepdif """ @@ -614,87 +648,233 @@ def _deep_get(obj: dict, key: str): return cur_obj, tokens[-2] -def filepath_gen(xlsx_path: str, schema: dict, assay_hint: str, verb: bool = False): +def _get_path(ct: dict, key: str) -> str: """ - This is a python generator which yields the paths of local files we are expecting - to recieve alongsdie the supplied metadata xlsx file. + find the path to the given key in the dictionary - There is bespoke assay specific logic encoded in this function and it will - likely change if conventions around what files are expected in a given - folder, or what files an assay is expecting. + Args: + ct: clinical_trial object to be modified + key: the identifier we are looking for in the dictionary + + Returns: + arg1: string describing the location of the key + """ + + # first look for key as is + ds1 = ct | grep(key, match_string=True) + count1 = 0 + if 'matched_values' in ds1: + count1 = len(ds1['matched_values']) + + # the hack fails if both work... probably need to deal with this + if count1 == 0: + raise NotImplementedError(f"key: {key} not found in dictionary") + + # get the keypath + return ds1['matched_values'].pop() + + +def _get_source(ct: dict, key: str, level="sample") -> dict: + """ + extract the object in the dicitionary specified by + the supplied key (or one of its parents.) Args: - xlsx_path: file on file system to excel file. - schema: json schema with all ref resolved - assay_hint: string used to help idnetify properties in template. Must - be the the root of the template filename i.e. - wes_template.json would be wes. 
- verb: boolean indicating verbosity + ct: clinical_trial object to be searched + key: the identifier we are looking for in the dictionary, + level: a keyword describing which level in the key path + (trial, participants, sample, aliquot) we want to return Returns: - None, data_obj is modified in place + arg1: string describing the location of the key """ - # get the un resolved schema - template_path = os.path.join(TEMPLATE_DIR, 'metadata', f'{assay_hint}_template.json') - with open(template_path) as fin: - schema = json.load(fin) + # tokenize. + key = key.replace("root", "").replace("'", "") + tokens = re.findall(r"\[(.*?)\]", key) - # find key in the schema, this notation is - # recommended usage of deepdif grep. assuming they - # overload the pipe operator to simulate cmd line - schema_key = 'artifact_link' - ds = schema | grep(schema_key) - if 'matched_paths' not in ds: - raise KeyError(f'{schema_key} not found in schema') + # this will get us to the object we have the key for + if level == "sample": + tokens = tokens[0:-3] + elif level == "aliquot": + tokens = tokens[0:-1] + else: + raise NotImplementedError( + f'the following level is not supported: {level}') - # sort potential matches, shortest is what we want. - choices = sorted(ds['matched_paths'], key=len) + # keep getting based on the key. + cur_obj = ct + for token in tokens: + try: + token = int(token) + except ValueError: + pass - # create tuples - key_lu = {} - for c in choices: + cur_obj = cur_obj[token] - # get the value and parent of the file link. - val, pkey = _deep_get(schema, c) - pkey = pkey.upper() - key_lu[pkey] = val + return cur_obj - def _do_stuff(key, val, lu): - if key in lu: - # make the accession key - tmp = lu[key][1] - print(tmp) - gs_key = tmp["lead_organization_study_id"] - gs_key = f'{gs_key}/{tmp["cimac_participant_id"]}' - gs_key = f'{gs_key}/{tmp["cimac_sample_id"]}' - gs_key = f'{gs_key}/{tmp["cimac_aliquot_id"]}' - #print("stuff", key, val, lu[key]) - print(gs_key) - # read the excel file - t = XlTemplateReader.from_excel(xlsx_path) +def _merge_artifact_wes( + ct: dict, + object_url: str, + file_size_bytes: int, + uploaded_timestamp: str, + md5_hash: str +): + """ + create and merge an artifact into the WES assay metadata. + The artifacts currently supported are only the input + fastq files and read mapping group file. - # loop over spreadsheet - worksheet_names = t.grouped_rows.keys() - for name in worksheet_names: + Args: + ct: clinical_trial object to be searched + object_url: the gs url pointing to the object being added + file_size_bytes: integer specifying the numebr of bytes in the file + uploaded_timestamp: time stamp associated with this object + md5_hash: hash of the uploaded object, usually provided by + object storage - # get the worksheat. - ws = t.grouped_rows[name] + """ - # Compare preamble rows - for row in ws[RowType.PREAMBLE]: + # replace gs prfix if exists. + object_url, lead_organization_study_id, \ + cimac_participant_id, cimac_sample_id, cimac_aliquot_id, \ + file_name = _split_objurl(object_url) + + # get the genomic source. + keypath = _get_path(ct, cimac_aliquot_id) + sample_obj = _get_source(ct, keypath) + genomic_source = sample_obj['genomic_source'] + + # create the artifact. 
+ artifact = { + "artifact_category": "Assay Artifact from CIMAC", + "assay_category": "Whole Exome Sequencing (WES)", + "object_url": object_url, + "file_name": file_name, + "file_size_bytes": 1, + "md5_hash": md5_hash, + "uploaded_timestamp": str(datetime.datetime.now()).split('.')[0] + } + + # create the wes input object which will be added to existing data + obj = {} + + # check if we are adding read group mapping file. + if "wes_read_group" in file_name: + + # set the artifact type and save + artifact["file_type"] = "Other" + obj['read_group_mapping_file'] = artifact - _do_stuff(row[0], row[1], key_lu) + else: - # move to headers - headers = ws[RowType.HEADER][0] + # set the artifact type + artifact["file_type"] = "FASTQ" + + # determine how to craft the artifact + obj[genomic_source] = {} + if "wes_forward" in file_name: + obj[genomic_source]['fastq_1'] = artifact + + elif "wes_reverse" in file_name: + obj[genomic_source]['fastq_2'] = artifact + + # copy the metadata and add this a new record. + # note this will clobber whatever is here. This is + # OK because the original copy of ct will have the + # clobbered data, while the new copy will have + # the new entry which will get appended to the + # "records" list by the merge by ID strategy + # specified in the json-schema for records + ct_copy = copy.deepcopy(ct) + aliquot_obj = _get_source(ct_copy, keypath, level="aliquot") + aliquot_obj['assay']['wes']['records'][0]['files'] = obj + + # merge the copy with the original. + validator = load_and_validate_schema( + "clinical_trial.json", return_validator=True) + schema = validator.schema + merger = Merger(schema) - # get the data. - data = ws[RowType.DATA] - for row in data: + ct_new = merger.merge(ct, ct_copy) - # create dictionary per row - for key, val in zip(headers, row): + # validate the new data + validator.validate(ct_new) + + # return the new dictionary + return ct_new + + +def _split_objurl(obj_url: str) -> (str, str, str, str, str, str): + """ + splits gs_url into components and returns them + + Args: + obj_url: gs://url/to/file + + Returns: + arg1: tuple of the components + """ + + # replace gs prfix if exists. + obj_url = obj_url.replace("gs://", "") + + # parse the url to get key identifiers + tokens = obj_url.split("/") + lead_organization_study_id = tokens[0] + cimac_participant_id = tokens[1] + cimac_sample_id = tokens[2] + cimac_aliquot_id = tokens[3] + file_name = tokens[4] + + return obj_url, lead_organization_study_id, cimac_participant_id, \ + cimac_sample_id, cimac_aliquot_id, file_name + + +def merge_artifact( + ct: dict, + object_url: str, + file_size_bytes: int, + uploaded_timestamp: str, + md5_hash: str +): + """ + create and merge an artifact into the metadata blob + for a clinical trial. The merging process is automatically + determined by inspecting the gs url path. + + Args: + ct: clinical_trial object to be searched + object_url: the gs url pointing to the object being added + file_size_bytes: integer specifying the numebr of bytes in the file + uploaded_timestamp: time stamp associated with this object + md5_hash: hash of the uploaded object, usually provided by + object storage + + """ + + # replace gs prfix if exists. + object_url, lead_organization_study_id, \ + cimac_participant_id, cimac_sample_id, cimac_aliquot_id, \ + file_name = _split_objurl(object_url) + + # define criteria. + wes_names = {'wes_forward', 'wes_reverse', 'wes_read_group'} + + # test criteria. 
+    if any(wes_name in file_name for wes_name in wes_names):
+        new_ct = _merge_artifact_wes(
+            ct,
+            object_url,
+            file_size_bytes,
+            uploaded_timestamp,
+            md5_hash
+        )
+    else:
+        raise NotImplementedError(
+            f'the following file_name is not supported: {file_name}')
 
-            _do_stuff(key, val, key_lu)
+    # return new object
+    return new_ct
diff --git a/cidc_schemas/schemas/artifacts/artifact_core.json b/cidc_schemas/schemas/artifacts/artifact_core.json
index aaf114a58..dcada3a1e 100644
--- a/cidc_schemas/schemas/artifacts/artifact_core.json
+++ b/cidc_schemas/schemas/artifacts/artifact_core.json
@@ -21,7 +21,7 @@
             "description": "The name of the file with extension",
             "type": "string"
         },
-        "bucket_url": {
+        "object_url": {
             "description": "URL to artifact within Google Bucket.",
             "type": "string"
         },
@@ -75,13 +75,11 @@
         }
     },
     "required": [
-        "uuid",
         "file_name",
-        "bucket_url",
+        "object_url",
         "uploaded_timestamp",
         "file_size_bytes",
         "md5_hash",
-        "visible",
         "artifact_category",
         "assay_category",
         "file_type"
diff --git a/cidc_schemas/schemas/artifacts/artifact_text.json b/cidc_schemas/schemas/artifacts/artifact_text.json
index 834e96f88..d0bf9efc4 100644
--- a/cidc_schemas/schemas/artifacts/artifact_text.json
+++ b/cidc_schemas/schemas/artifacts/artifact_text.json
@@ -15,5 +15,6 @@
                 }
             }
         }
-    ]
+    ],
+    "mergeStrategy": "objectMerge"
 }
\ No newline at end of file
diff --git a/cidc_schemas/schemas/assays/components/ngs/ngs_entry.json b/cidc_schemas/schemas/assays/components/ngs/ngs_entry.json
index 956eb1fcc..9f9b557e9 100644
--- a/cidc_schemas/schemas/assays/components/ngs/ngs_entry.json
+++ b/cidc_schemas/schemas/assays/components/ngs/ngs_entry.json
@@ -5,6 +5,10 @@
     "type": "object",
     "description": "A single data record from an NGS assay",
     "properties": {
+        "entry_id": {
+            "description": "Identifier which is unique to this entry, composed of the clinical trial, participant, and aliquot IDs",
+            "type": "string"
+        },
         "library_kit_lot": {
             "description": "Lot number for the library construction kit",
             "type": "string"
@@ -35,6 +39,7 @@
     },
     "required": [
         "library_kit_lot",
-        "library_prep_date"
+        "library_prep_date",
+        "entry_id"
     ]
 }
diff --git a/cidc_schemas/schemas/assays/components/ngs/wes_entry.json b/cidc_schemas/schemas/assays/components/ngs/wes_entry.json
index e60c9a0fa..38cb67bb4 100644
--- a/cidc_schemas/schemas/assays/components/ngs/wes_entry.json
+++ b/cidc_schemas/schemas/assays/components/ngs/wes_entry.json
@@ -21,5 +21,8 @@
             "$ref": "assays/components/ngs/wes_input.json"
         }
     },
-    "required": ["enrichment_vendor_lot"]
+    "mergeStrategy": "objectMerge",
+    "required": [
+        "enrichment_vendor_lot"
+    ]
 }
diff --git a/cidc_schemas/schemas/assays/components/ngs/wes_input.json b/cidc_schemas/schemas/assays/components/ngs/wes_input.json
index ac7f43fb1..976a50386 100644
--- a/cidc_schemas/schemas/assays/components/ngs/wes_input.json
+++ b/cidc_schemas/schemas/assays/components/ngs/wes_input.json
@@ -6,23 +6,17 @@
     "description": "Whole Exome Sequencing Assay Input Files",
     "properties": {
         "tumor": {
-            "allOf": [
-                { "$ref": "assays/components/fastq_pairs.json" }
-            ]
+            "$ref": "assays/components/fastq_pairs.json",
+            "mergeStrategy": "objectMerge"
         },
         "normal": {
-            "allOf": [
-                { "$ref": "assays/components/fastq_pairs.json" }
-            ]
+            "$ref": "assays/components/fastq_pairs.json",
+            "mergeStrategy": "objectMerge"
         },
         "read_group_mapping_file": {
             "$ref": "artifacts/artifact_text.json",
             "description": "Stores read group information for each read in the fastq files. Needed for when samples are run across multiple lanes."
} }, - "required": [ - "tumor", - "normal", - "read_group_mapping_file" - ] + "required": [] } diff --git a/cidc_schemas/schemas/assays/wes_assay.json b/cidc_schemas/schemas/assays/wes_assay.json index b56adb82d..23202f0ea 100644 --- a/cidc_schemas/schemas/assays/wes_assay.json +++ b/cidc_schemas/schemas/assays/wes_assay.json @@ -18,6 +18,10 @@ "type": "array", "items": { "$ref": "assays/components/ngs/wes_entry.json" + }, + "mergeStrategy": "arrayMergeById", + "mergeOptions": { + "idRef": "entry_id" } }, "analysis":{ diff --git a/tests/test_artifacts.py b/tests/test_artifacts.py index 49c1ee25c..f8a4ccd24 100644 --- a/tests/test_artifacts.py +++ b/tests/test_artifacts.py @@ -16,7 +16,7 @@ "artifact_category": "Manifest File", "artifact_creator": "DFCI", "assay_category": "Whole Exome Sequencing (WES)", - "bucket_url": "dummy", + "object_url": "dummy", "file_name": "dummy.txt", "file_size_bytes": 1, "file_type": "FASTA", diff --git a/tests/test_assays.py b/tests/test_assays.py index 216878152..00ff1de31 100644 --- a/tests/test_assays.py +++ b/tests/test_assays.py @@ -16,7 +16,7 @@ "artifact_category": "Manifest File", "artifact_creator": "DFCI", "assay_category": "Whole Exome Sequencing (WES)", - "bucket_url": "dummy", + "object_url": "dummy", "file_name": "dummy.txt", "file_size_bytes": 1, "file_type": "FASTA", @@ -94,6 +94,7 @@ def test_wes(): "library_kit_lot": "dummy_value", "library_prep_date": "01/01/2001", "capture_date": "01/01/2001", + "entry_id": "xyz", "files": { "tumor": { "fastq_1": fastq_1, @@ -175,6 +176,7 @@ def test_rna_expression(): "library_kit_lot": "dummy_value", "library_prep_date": "01/01/2001", "capture_date": "01/01/2001", + "entry_id": "abc", "files": { "fastq_1": fastq_1, "fastq_2": fastq_1, diff --git a/tests/test_prism.py b/tests/test_prism.py index fbe76131a..899308e65 100644 --- a/tests/test_prism.py +++ b/tests/test_prism.py @@ -8,10 +8,11 @@ import pytest import jsonschema import json +from deepdiff import grep from pprint import pprint from jsonmerge import Merger -from cidc_schemas.prism import prismify, filepath_gen +from cidc_schemas.prism import prismify, merge_artifact from cidc_schemas.json_validation import load_and_validate_schema from cidc_schemas.template import Template from cidc_schemas.template_writer import RowType @@ -19,6 +20,83 @@ from .constants import ROOT_DIR, SCHEMA_DIR, TEMPLATE_EXAMPLES_DIR from .test_templates import template_paths +from .test_assays import ARTIFACT_OBJ + + +CLINICAL_TRIAL = { + "lead_organization_study_id": "10021", + "participants": [ + { + "samples": [ + { + "aliquots": [ + { + "assay": { + "wes": { + "assay_creator": "Mount Sinai", + "assay_category": "Whole Exome Sequencing (WES)", + "enrichment_vendor_kit": "Twist", + "library_vendor_kit": "KAPA - Hyper Prep", + "sequencer_platform": "Illumina - NextSeq 550", + "paired_end_reads": "Paired", + "read_length": 100, + "records": [ + { + "library_kit_lot": "lot abc", + "enrichment_vendor_lot": "lot 123", + "library_prep_date": "2019-05-01 00:00:00", + "capture_date": "2019-05-02 00:00:00", + "input_ng": 100, + "library_yield_ng": 700, + "average_insert_size": 250, + "entry_id": "abc1" + } + ] + } + }, + "cimac_aliquot_id": "aliquot 1" + }, + ], + "cimac_sample_id": "sample 1", + "genomic_source": "Tumor" + }, + { + "aliquots": [ + { + "assay": { + "wes": { + "assay_creator": "Mount Sinai", + "assay_category": "Whole Exome Sequencing (WES)", + "enrichment_vendor_kit": "Twist", + "library_vendor_kit": "KAPA - Hyper Prep", + "sequencer_platform": "Illumina - NextSeq 
550", + "paired_end_reads": "Paired", + "read_length": 100, + "records": [ + { + "library_kit_lot": "lot abc", + "enrichment_vendor_lot": "lot 123", + "library_prep_date": "2019-05-01 00:00:00", + "capture_date": "2019-05-02 00:00:00", + "input_ng": 100, + "library_yield_ng": 700, + "average_insert_size": 250, + "entry_id": "abc2" + } + ] + } + }, + "cimac_aliquot_id": "aliquot 2" + } + ], + "cimac_sample_id": "sample 2", + "genomic_source": "Normal" + } + ], + "cimac_participant_id": "patient 1" + } + ] + } def test_merge_core(): @@ -112,7 +190,8 @@ def test_assay_merge(): "capture_date": "2019-05-02 00:00:00", "input_ng": 100, "library_yield_ng": 700, - "average_insert_size": 250 + "average_insert_size": 250, + "entry_id": "abc" } ], } @@ -202,4 +281,64 @@ def test_filepath_gen(): assert 2 == sum([1 for x in file_maps if x['gs_key'].count("txt") > 0]) # assert works - validator.validate(ct) \ No newline at end of file + validator.validate(ct) + + +def test_wes(): + + # create validators + validator = load_and_validate_schema("clinical_trial.json", return_validator=True) + schema = validator.schema + + # create the example template. + temp_path = os.path.join(SCHEMA_DIR, 'templates', 'metadata', 'wes_template.json') + xlsx_path = os.path.join(TEMPLATE_EXAMPLES_DIR, "wes_template.xlsx") + hint = 'wes' + + # parse the spreadsheet and get the file maps + ct, file_maps = prismify(xlsx_path, temp_path, assay_hint=hint) + + # assert works + validator.validate(ct) + + +def test_snippet_wes(): + + # create the clinical trial. + ct = copy.deepcopy(CLINICAL_TRIAL) + + # define list of gs_urls. + urls = [ + '10021/Patient 1/sample 1/aliquot 1/wes_forward.fastq', + '10021/Patient 1/sample 1/aliquot 1/wes_reverse.fastq', + '10021/Patient 1/sample 1/aliquot 1/wes_read_group.txt', + '10021/Patient 1/sample 1/aliquot 2/wes_forward.fastq', + '10021/Patient 1/sample 1/aliquot 2/wes_reverse.fastq', + '10021/Patient 1/sample 1/aliquot 2/wes_read_group.txt' + ] + + # create validator + validator = load_and_validate_schema("clinical_trial.json", return_validator=True) + + # loop over each url + searched_urls = [] + for gs_url in urls: + + # attempt to merge + ct = merge_artifact( + ct, + object_url=gs_url, + file_size_bytes=14, + uploaded_timestamp="01/01/2001", + md5_hash="hash1234" + ) + + # assert we stull have a good clinical trial object. + validator.validate(ct) + + # search for this url and all previous (no clobber) + searched_urls.append(gs_url) + for url in searched_urls: + ds = ct | grep(url) + assert 'matched_values' in ds + assert len(ds['matched_values']) > 0