Olink unique gcs filepaths #157

Merged: 2 commits, Sep 4, 2019
cidc_schemas/prism.py (6 changes: 3 additions & 3 deletions)

@@ -3,7 +3,7 @@
 import os
 import copy
 import uuid
-from typing import Union
+from typing import Union, BinaryIO
 import jsonschema
 from deepdiff import grep
 import datetime
@@ -302,7 +302,7 @@ def _process_property(


 SUPPORTED_ASSAYS = ["wes", "olink"]
-def prismify(xlsx_path: str, template_path: str, assay_hint: str, verb: bool = False) -> (dict, dict):
+def prismify(xlsx_path: Union[str, BinaryIO], template_path: str, assay_hint: str, verb: bool = False) -> (dict, dict):
     """
     Converts excel file to json object. It also identifies local files
     which need to be uploaded to a google bucket and provides some logic
@@ -318,7 +318,7 @@ def prismify(xlsx_path, template_path, assay_hint, verb=False):

     Args:
-        xlsx_path: path on file system to the excel file.
+        xlsx_path: path on file system to the excel file, or the open file itself
         template_path: path on file system relative to schema root of the
             template
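
With this change prismify accepts either a filesystem path or an already-open binary file handle. A minimal usage sketch under that assumption; the spreadsheet and template paths below are illustrative, not taken from this PR:

    from cidc_schemas.prism import prismify

    # 1) The original behavior: pass a path on the file system.
    ct_patch, file_maps = prismify(
        "template_examples/olink_template.xlsx",
        "templates/metadata/olink_template.json",  # illustrative path, relative to schema root
        assay_hint="olink",
    )

    # 2) New with this PR: pass an already-open binary file object.
    with open("template_examples/olink_template.xlsx", "rb") as xlsx:
        ct_patch, file_maps = prismify(
            xlsx,
            "templates/metadata/olink_template.json",
            assay_hint="olink",
        )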
cidc_schemas/schemas/templates/metadata/olink_template.json (10 changes: 6 additions & 4 deletions)

@@ -45,14 +45,20 @@
   },
   "data_columns": {
     "Olink Assay details": {
+      "chip barcode": {
+        "merge_pointer": "0/chip_barcode",
+        "type_ref": "assays/components/olink/olink_entry.json#properties/chip_barcode"
+      },
       "npx file": {
         "merge_pointer": "0/files/assay_npx",
         "is_artifact" : 1,
+        "gcs_prefix_format": "{chip_barcode}",
         "type_ref": "assays/components/local_file.json"
       },
       "raw ct file": {
         "merge_pointer": "0/files/assay_raw_ct",
         "is_artifact" : 1,
+        "gcs_prefix_format": "{chip_barcode}",
         "type_ref": "assays/components/local_file.json"
       },
       "run date": {
@@ -75,10 +81,6 @@
         "merge_pointer": "0/fludigm_application_build",
         "type_ref": "assays/components/olink/olink_entry.json#properties/fludigm_application_build"
       },
-      "chip barcode": {
-        "merge_pointer": "0/chip_barcode",
-        "type_ref": "assays/components/olink/olink_entry.json#properties/chip_barcode"
-      },
       "probe type": {
         "merge_pointer": "0/probe_type",
         "type_ref": "assays/components/olink/olink_entry.json#properties/probe_type"
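
The new "gcs_prefix_format" property is what makes the Olink GCS filepaths unique: the {chip_barcode} placeholder is filled from the "chip barcode" column of the same row, presumably why that column now comes before the file columns that reference it. A hypothetical sketch of how such a format string could be expanded; build_gcs_key and row_context are illustrative names, not the actual prism.py internals:

    def build_gcs_key(gcs_prefix_format: str, row_context: dict, artifact_name: str) -> str:
        # Expand placeholders like "{chip_barcode}" using values already
        # parsed from the other columns of the same spreadsheet row.
        prefix = gcs_prefix_format.format(**row_context)
        return f"{prefix}/{artifact_name}"

    # Two rows with different barcodes now land on distinct GCS keys:
    build_gcs_key("{chip_barcode}", {"chip_barcode": "1111"}, "assay_npx")  # -> "1111/assay_npx"
    build_gcs_key("{chip_barcode}", {"chip_barcode": "1112"}, "assay_npx")  # -> "1112/assay_npx"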
setup.py (2 changes: 1 addition & 1 deletion)

@@ -40,7 +40,7 @@
     packages=find_packages(include=['cidc_schemas']),
     test_suite='tests',
     url='https://github.com/CIMAC-CIDC/schemas',
-    version='0.4.1',
+    version='0.4.2',
     zip_safe=False,
     entry_points={
         'console_scripts': ['cidc_schemas=cidc_schemas.cli:main']
template_examples/csvs/olink_template__olink.csv (8 changes: 4 additions & 4 deletions)

@@ -1,12 +1,12 @@
 "#t","METADATA FILE FOR OLINK"
-"#p","LEAD ORGANIZATION STUDY ID","10021"
+"#p","LEAD ORGANIZATION STUDY ID","test_prism_trial_id"
 "#p","ASSAY CREATOR","DFCI"
 "#p","PANEL","Olink INFLAMMATION(v.3004)"
 "#p","ASSAY PANEL LOT","1"
 "#p","COMBINED FILE","/local/path/combined.xlsx"
 "#p","COMBINED FILE NPX MANAGER VERSION","Olink NPX Manager 0.0.82.0"

 "","Olink Assay details"
-"#h","NPX FILE","RAW CT FILE","RUN DATE","RUN TIME","INSTRUMENT","FLUDIGM APPLICATION VERSION","FLUDIGM APPLICATION BUILD","CHIP BARCODE","PROBE TYPE","PASSIVE REFERENCE","QUALITY THRESHOLD","BASELINE CORRECTION","NUMBER OF SAMPLES","NUMBER OF SAMPLES FAILED","NPX MANAGER VERSION"
-"#d","Olink_assay_1","Olink_assay_1","2019-12-12 00:00:00","10:11:00","MIOMARKHD411","4.1.3","20140305.43","13456777","FAM-MGB","ROX","0.5","Linear","90","5","Olink NPX Manager 0.0.82.0"
-"#d","Olink_assay_2","Olink_assay_2","2019-12-12 00:00:00","10:11:00","MIOMARKHD411","4.1.3","20140305.43","13456777","FAM-MGB","ROX","0.5","Linear","80","10","Olink NPX Manager 0.0.82.0"
+"#h","CHIP BARCODE","NPX FILE","RAW CT FILE","RUN DATE","RUN TIME","INSTRUMENT","FLUDIGM APPLICATION VERSION","FLUDIGM APPLICATION BUILD","PROBE TYPE","PASSIVE REFERENCE","QUALITY THRESHOLD","BASELINE CORRECTION","NUMBER OF SAMPLES","NUMBER OF SAMPLES FAILED","NPX MANAGER VERSION"
+"#d","1111","Olink_assay_1","Olink_assay_1","2019-12-12 00:00:00","10:11:00","MIOMARKHD411","4.1.3","20140305.43","FAM-MGB","ROX","0.5","Linear","90","5","Olink NPX Manager 0.0.82.0"
+"#d","1112","Olink_assay_2","Olink_assay_2","2019-12-12 00:00:00","10:11:00","MIOMARKHD411","4.1.3","20140305.43","FAM-MGB","ROX","0.5","Linear","80","10","Olink NPX Manager 0.0.82.0"
template_examples/olink_template.xlsx (binary file modified, not shown)
tests/test_prism.py (93 changes: 74 additions & 19 deletions)

@@ -13,7 +13,8 @@
 from jsonmerge import Merger

 from cidc_schemas.prism import prismify, merge_artifact, \
-    merge_clinical_trial_metadata, InvalidMergeTargetException
+    merge_clinical_trial_metadata, InvalidMergeTargetException, \
+    SUPPORTED_ASSAYS
 from cidc_schemas.json_validation import load_and_validate_schema
 from cidc_schemas.template import Template
 from cidc_schemas.template_writer import RowType
@@ -296,14 +297,17 @@ def test_prism(schema_path, xlsx_path):
     # extract hint.
     hint = schema_path.split("/")[-1].replace("_template.json", "")

-    # TODO: only implemented WES parsing...
-    if hint != "wes":
+    # TODO: every other assay
+    if hint not in SUPPORTED_ASSAYS:
         return

     # turn into object.
     ct, file_maps = prismify(xlsx_path, schema_path, assay_hint=hint)

-    assert len(ct['assays'][hint]) == 1
+    # olink is different - it will never have an array of assay "runs" - only one
+    if hint != 'olink':
+        assert len(ct['assays'][hint]) == 1


     # we merge it with a preexisting one
     # 1. we get all 'required' fields from this preexisting
@@ -316,17 +320,19 @@

     if hint == 'wes':
         assert merged["lead_organization_study_id"] == "10021"
+    elif hint == 'olink':
+        assert merged["lead_organization_study_id"] == "test_prism_trial_id"
     else:
         assert MINIMAL_CT_1PA1SA1AL["lead_organization_study_id"] == merged["lead_organization_study_id"]


 @pytest.mark.parametrize('schema_path, xlsx_path', template_paths())
-def test_filepath_gen_wes_only(schema_path, xlsx_path):
+def test_filepath_gen(schema_path, xlsx_path):
     # extract hint.
     hint = schema_path.split("/")[-1].replace("_template.json", "")

-    # TODO: only implemented WES parsing...
-    if hint != "wes":
+    # TODO: every other assay
+    if hint not in SUPPORTED_ASSAYS:
         return

     # create validators
@@ -338,11 +344,15 @@
     # we ignore and do not validate 'ct'
     # because it's only a ct patch not a full ct

-    # assert we have the right counts.
-    if hint == "wes":
+    local_to_gcs_mapping = {}
+    for fmap_entry in file_maps:
+        local_to_gcs_mapping[fmap_entry.gs_key] = fmap_entry

-        # check the number of files present.
-        assert len(file_maps) == 6
+    assert len(local_to_gcs_mapping) == len(file_maps), "gcs_key/url collision"


+    # assert we have the right file counts etc.
+    if hint == "wes":

         # we should have 2 fastq per sample.
         # we should have 2 tot forward.
@@ -356,13 +366,38 @@
         assert 2 == sum([1 for x in file_maps if "/read_group_mapping_file" in x.gs_key])
         assert 2 == sum([1 for x in file_maps if x.local_path.endswith(".txt")])

+        # 6 in total
+        assert len(file_maps) == 6

         # all that with
         # 2 participants
         assert 2 == len(set([x.gs_key.split("/")[0] for x in file_maps]))
         # 2 samples
         assert 2 == len(set([x.gs_key.split("/")[1] for x in file_maps]))
         # 2 aliquots
         assert 2 == len(set([x.gs_key.split("/")[2] for x in file_maps]))

+    elif hint == 'olink':
+
+        # we should have 2 npx files
+        assert 2 == sum([1 for x in file_maps if "assay_npx" in x.gs_key])
+
+        # we should have 2 raw_ct files
+        assert 2 == sum([1 for x in file_maps if "assay_raw_ct" in x.gs_key])
+
+        # 4 assay-level files in total
+        assert 4 == sum([1 for x in file_maps if x.local_path.startswith("Olink_assay")])
+
+        # we should have 1 study level npx
+        assert 1 == sum([1 for x in file_maps if "study_npx" in x.gs_key])
+
+        # check the number of files - 1 study + 2*(npx + ct raw)
+        assert len(file_maps) == 5
+
+    else:
+        assert False, f"add {hint} assay specific asserts"

@@ -514,7 +549,7 @@ def test_end_to_end_wes_olink(schema_path, xlsx_path):
     hint = schema_path.split("/")[-1].replace("_template.json", "")

     # TODO: implement other assays
-    if hint not in ["wes", "olink"]:
+    if hint not in SUPPORTED_ASSAYS:
         return

     # create validators
@@ -523,6 +558,8 @@
     # parse the spreadsheet and get the file maps
     prism_patch, file_maps = prismify(xlsx_path, schema_path, assay_hint=hint)

+
+    # olink is different in structure - no array of assays, only one.
     if hint != 'olink':
         assert len(prism_patch['assays'][hint]) == 1
         assert len(prism_patch['assays'][hint][0]['records']) == 2
@@ -532,9 +569,16 @@
     for f in file_maps:
         assert f'{hint}/' in f.gs_key, f"No {hint} hint found"

-    # assert we still have a good clinical trial object, so we can save it
-    # but we need to merge it, because "prismify" provides only a patch
-    full_after_prism = merge_clinical_trial_metadata(prism_patch, WES_TEMPLATE_EXAMPLE_CT)
+    original_ct = copy.deepcopy(WES_TEMPLATE_EXAMPLE_CT)
+    # And we need to set lead_organization_study_id to be the same for testing
+    if hint == "olink":
+        original_ct['lead_organization_study_id'] = 'test_prism_trial_id'
+
+    # "prismify" provides only a patch, so we need to merge it into a "full" ct
+    full_after_prism = merge_clinical_trial_metadata(prism_patch, original_ct)
+
+    # Assert we still have a good clinical trial object, so we can save it.
     validator.validate(full_after_prism)

     patch_copy_4_artifacts = copy.deepcopy(prism_patch)
@@ -555,31 +599,41 @@
     )

     # assert we still have a good clinical trial object, so we can save it
-    validator.validate(merge_clinical_trial_metadata(patch_copy_4_artifacts, WES_TEMPLATE_EXAMPLE_CT))
+    validator.validate(merge_clinical_trial_metadata(patch_copy_4_artifacts, original_ct))

     # we will then search for this url in the resulting ct,
     # to check all artifacts were indeed merged
     merged_gs_keys.append(fmap_entry.gs_key)

     # `merge_artifact` modifies ct in-place, so
-    full_ct = merge_clinical_trial_metadata(patch_copy_4_artifacts, WES_TEMPLATE_EXAMPLE_CT)
+    full_ct = merge_clinical_trial_metadata(patch_copy_4_artifacts, original_ct)

-    assert len(merged_gs_keys) == 3*2 # 3 files per entry in xlsx
-    stripped_uuid_WES = [u[:-len("/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")] for u in WES_TEMPLATE_EXAMPLE_GS_URLS]
-    assert merged_gs_keys == stripped_uuid_WES
+    if hint == 'wes':
+        assert len(merged_gs_keys) == 3*2 # 3 files per entry in xlsx
+
+        stripped_uuid_WES = [u[:-len("/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")] for u in WES_TEMPLATE_EXAMPLE_GS_URLS]
+        assert merged_gs_keys == stripped_uuid_WES
+
+    elif hint == 'olink':
+        assert len(merged_gs_keys) == 5 # 2 files per entry in xlsx + 1 file in preamble
+
+    else:
+        assert False, f"add {hint} assay specific asserts"

     for file_map_entry in file_maps:
         assert len((full_ct | grep(fmap_entry.gs_key))['matched_values']) == 1 # each gs_url only once

     # olink is special - it's not an array
     if hint == "olink":
         assert len(full_ct['assays'][hint]['records']) == 2, "More records than expected"
-    else:
+    elif hint == 'wes':
         assert len(full_ct['assays'][hint]) == 1+len(WES_TEMPLATE_EXAMPLE_CT['assays'][hint]), f"Multiple {hint}-assays created instead of merging into one"
         assert len(full_ct['assays'][hint][0]['records']) == 2, "More records than expected"
+    else:
+        assert False, f"add {hint} assay specific asserts"

     dd = DeepDiff(full_after_prism, full_ct)

     if hint=='wes':
@@ -599,4 +653,5 @@
         assert len(dd['dictionary_item_added']) == 6*(2*2+1), "Unexpected CT changes"

     else:
-        assert list(dd.keys()) == ['dictionary_item_added'], "Unexpected CT changes"
+        assert False, f"add {hint} assay specific asserts"