Skip to content

Commit

Permalink
Filegen wes2 (#112)
Browse files Browse the repository at this point in the history
* added working example of how to merge metadata

* started drafting function to simplify merge

* more work on merging

* first working version of function to merge input files into existing metadata

* working function to merge wes input artifacts into metadata model

* implemented fix to logic which was clobbering old data. addressed concerns from PR
  • Loading branch information
jim-bo committed Aug 6, 2019
1 parent faaff71 commit 414073a
Show file tree
Hide file tree
Showing 10 changed files with 437 additions and 111 deletions.
356 changes: 268 additions & 88 deletions cidc_schemas/prism.py

Large diffs are not rendered by default.

6 changes: 2 additions & 4 deletions cidc_schemas/schemas/artifacts/artifact_core.json
Expand Up @@ -21,7 +21,7 @@
"description": "The name of the file with extension",
"type": "string"
},
"bucket_url": {
"object_url": {
"description": "URL to artifact within Google Bucket.",
"type": "string"
},
Expand Down Expand Up @@ -75,13 +75,11 @@
}
},
"required": [
"uuid",
"file_name",
"bucket_url",
"object_url",
"uploaded_timestamp",
"file_size_bytes",
"md5_hash",
"visible",
"artifact_category",
"assay_category",
"file_type"
Expand Down
3 changes: 2 additions & 1 deletion cidc_schemas/schemas/artifacts/artifact_text.json
Expand Up @@ -15,5 +15,6 @@
}
}
}
]
],
"mergeStrategy": "objectMerge"
}
7 changes: 6 additions & 1 deletion cidc_schemas/schemas/assays/components/ngs/ngs_entry.json
Expand Up @@ -5,6 +5,10 @@
"type": "object",
"description": "A single data record from an NGS assay",
"properties": {
"entry_id": {
"description": "Identifier which is unique to this entry, this is clincal trial, participant, aliquot id's",
"type": "string"
},
"library_kit_lot": {
"description": "Lot number for the library construction kit",
"type": "string"
Expand Down Expand Up @@ -35,6 +39,7 @@
},
"required": [
"library_kit_lot",
"library_prep_date"
"library_prep_date",
"entry_id"
]
}
5 changes: 4 additions & 1 deletion cidc_schemas/schemas/assays/components/ngs/wes_entry.json
Expand Up @@ -21,5 +21,8 @@
"$ref": "assays/components/ngs/wes_input.json"
}
},
"required": ["enrichment_vendor_lot"]
"mergeStrategy": "objectMerge",
"required": [
"enrichment_vendor_lot"
]
}
16 changes: 5 additions & 11 deletions cidc_schemas/schemas/assays/components/ngs/wes_input.json
Expand Up @@ -6,23 +6,17 @@
"description": "Whole Exome Sequencing Assay Input Files",
"properties": {
"tumor": {
"allOf": [
{ "$ref": "assays/components/fastq_pairs.json" }
]
"$ref": "assays/components/fastq_pairs.json",
"mergeStrategy": "objectMerge"
},
"normal": {
"allOf": [
{ "$ref": "assays/components/fastq_pairs.json" }
]
"$ref": "assays/components/fastq_pairs.json",
"mergeStrategy": "objectMerge"
},
"read_group_mapping_file": {
"$ref": "artifacts/artifact_text.json",
"description": "Stores read group information for each read in the fastq files. Needed for when samples are run across multiple lanes."
}
},
"required": [
"tumor",
"normal",
"read_group_mapping_file"
]
"required": []
}
4 changes: 4 additions & 0 deletions cidc_schemas/schemas/assays/wes_assay.json
Expand Up @@ -18,6 +18,10 @@
"type": "array",
"items": {
"$ref": "assays/components/ngs/wes_entry.json"
},
"mergeStrategy": "arrayMergeById",
"mergeOptions": {
"idRef": "entry_id"
}
},
"analysis":{

This comment has been minimized.

Copy link
@curlup

curlup Aug 6, 2019

Member

`analysis` is an array, as `records` is, but no merge strategy is specified?

Expand Down
2 changes: 1 addition & 1 deletion tests/test_artifacts.py
Expand Up @@ -16,7 +16,7 @@
"artifact_category": "Manifest File",
"artifact_creator": "DFCI",
"assay_category": "Whole Exome Sequencing (WES)",
"bucket_url": "dummy",
"object_url": "dummy",
"file_name": "dummy.txt",
"file_size_bytes": 1,
"file_type": "FASTA",
Expand Down
4 changes: 3 additions & 1 deletion tests/test_assays.py
Expand Up @@ -16,7 +16,7 @@
"artifact_category": "Manifest File",
"artifact_creator": "DFCI",
"assay_category": "Whole Exome Sequencing (WES)",
"bucket_url": "dummy",
"object_url": "dummy",
"file_name": "dummy.txt",
"file_size_bytes": 1,
"file_type": "FASTA",
Expand Down Expand Up @@ -94,6 +94,7 @@ def test_wes():
"library_kit_lot": "dummy_value",
"library_prep_date": "01/01/2001",
"capture_date": "01/01/2001",
"entry_id": "xyz",
"files": {
"tumor": {
"fastq_1": fastq_1,
Expand Down Expand Up @@ -175,6 +176,7 @@ def test_rna_expression():
"library_kit_lot": "dummy_value",
"library_prep_date": "01/01/2001",
"capture_date": "01/01/2001",
"entry_id": "abc",
"files": {
"fastq_1": fastq_1,
"fastq_2": fastq_1,
Expand Down
145 changes: 142 additions & 3 deletions tests/test_prism.py
Expand Up @@ -8,17 +8,95 @@
import pytest
import jsonschema
import json
from deepdiff import grep
from pprint import pprint
from jsonmerge import Merger

from cidc_schemas.prism import prismify, filepath_gen
from cidc_schemas.prism import prismify, merge_artifact
from cidc_schemas.json_validation import load_and_validate_schema
from cidc_schemas.template import Template
from cidc_schemas.template_writer import RowType
from cidc_schemas.template_reader import XlTemplateReader

from .constants import ROOT_DIR, SCHEMA_DIR, TEMPLATE_EXAMPLES_DIR
from .test_templates import template_paths
from .test_assays import ARTIFACT_OBJ


def _wes_sample(sample_id, aliquot_id, entry_id, genomic_source):
    """Build one sample fixture holding a single aliquot with one WES record.

    Only the sample id, aliquot id, record ``entry_id`` and genomic source
    vary between samples; every other field is the same constant test value.
    A fresh set of dicts/lists is created on every call, so the samples never
    share mutable state.
    """
    wes_record = {
        "library_kit_lot": "lot abc",
        "enrichment_vendor_lot": "lot 123",
        "library_prep_date": "2019-05-01 00:00:00",
        "capture_date": "2019-05-02 00:00:00",
        "input_ng": 100,
        "library_yield_ng": 700,
        "average_insert_size": 250,
        "entry_id": entry_id,
    }
    wes_assay = {
        "assay_creator": "Mount Sinai",
        "assay_category": "Whole Exome Sequencing (WES)",
        "enrichment_vendor_kit": "Twist",
        "library_vendor_kit": "KAPA - Hyper Prep",
        "sequencer_platform": "Illumina - NextSeq 550",
        "paired_end_reads": "Paired",
        "read_length": 100,
        "records": [wes_record],
    }
    aliquot = {
        "assay": {"wes": wes_assay},
        "cimac_aliquot_id": aliquot_id,
    }
    return {
        "aliquots": [aliquot],
        "cimac_sample_id": sample_id,
        "genomic_source": genomic_source,
    }


# Clinical trial fixture: one participant with a tumor sample and a normal
# sample, each holding one aliquot whose WES assay has a single record.
CLINICAL_TRIAL = {
    "lead_organization_study_id": "10021",
    "participants": [
        {
            "samples": [
                _wes_sample("sample 1", "aliquot 1", "abc1", "Tumor"),
                _wes_sample("sample 2", "aliquot 2", "abc2", "Normal"),
            ],
            "cimac_participant_id": "patient 1",
        }
    ],
}


def test_merge_core():
Expand Down Expand Up @@ -112,7 +190,8 @@ def test_assay_merge():
"capture_date": "2019-05-02 00:00:00",
"input_ng": 100,
"library_yield_ng": 700,
"average_insert_size": 250
"average_insert_size": 250,
"entry_id": "abc"
}
],
}
Expand Down Expand Up @@ -202,4 +281,64 @@ def test_filepath_gen():
assert 2 == sum([1 for x in file_maps if x['gs_key'].count("txt") > 0])

# assert works
validator.validate(ct)
validator.validate(ct)


def test_wes():
    """Prismify the example WES spreadsheet and validate the result.

    Parses ``wes_template.xlsx`` against the ``wes_template.json`` template
    schema with the ``wes`` assay hint, then checks the resulting clinical
    trial object with the ``clinical_trial.json`` validator.
    """
    # validator for the top-level clinical trial schema
    validator = load_and_validate_schema("clinical_trial.json", return_validator=True)

    # locate the template schema and the example spreadsheet
    temp_path = os.path.join(SCHEMA_DIR, 'templates', 'metadata', 'wes_template.json')
    xlsx_path = os.path.join(TEMPLATE_EXAMPLES_DIR, "wes_template.xlsx")

    # parse the spreadsheet; the returned file maps are not inspected here
    ct, _file_maps = prismify(xlsx_path, temp_path, assay_hint='wes')

    # the test passes only if validation of the parsed object succeeds
    validator.validate(ct)


def test_snippet_wes():
    """Merge WES artifacts into a clinical trial one object URL at a time.

    After every merge the trial must still pass schema validation, and the
    URL just merged together with every previously merged URL must still be
    found in the trial (i.e. a later merge did not clobber earlier data).
    """
    # work on a private copy of the module-level fixture
    trial = copy.deepcopy(CLINICAL_TRIAL)

    schema_validator = load_and_validate_schema("clinical_trial.json", return_validator=True)

    # object urls to merge, in order
    object_urls = [
        '10021/Patient 1/sample 1/aliquot 1/wes_forward.fastq',
        '10021/Patient 1/sample 1/aliquot 1/wes_reverse.fastq',
        '10021/Patient 1/sample 1/aliquot 1/wes_read_group.txt',
        '10021/Patient 1/sample 1/aliquot 2/wes_forward.fastq',
        '10021/Patient 1/sample 1/aliquot 2/wes_reverse.fastq',
        '10021/Patient 1/sample 1/aliquot 2/wes_read_group.txt'
    ]

    for merged_count, gs_url in enumerate(object_urls, start=1):

        # merge this artifact into the trial
        trial = merge_artifact(
            trial,
            object_url=gs_url,
            file_size_bytes=14,
            uploaded_timestamp="01/01/2001",
            md5_hash="hash1234"
        )

        # the merged object must still be a valid clinical trial
        schema_validator.validate(trial)

        # every url merged so far (this one included) must still be present
        for merged_url in object_urls[:merged_count]:
            hits = trial | grep(merged_url)
            assert 'matched_values' in hits
            assert len(hits['matched_values']) > 0

0 comments on commit 414073a

Please sign in to comment.