Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle type refs in _map_refs #166

Merged
merged 9 commits into from Sep 6, 2019
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
55 changes: 33 additions & 22 deletions cidc_schemas/json_validation.py
Expand Up @@ -60,28 +60,39 @@ def _map_refs(node: dict, on_refs: Callable[[str], dict]) -> dict:
Note: _map_refs is shallow, i.e., if calling `on_refs` on a node produces
a new node that contains refs, those refs will not be resolved.
"""
if isinstance(node, collections.abc.Mapping) and '$ref' in node:
extra_keys = set(node.keys()).difference({'$ref', '$comment'})
if extra_keys:
# As for json-schema.org:
# "... You will always use $ref as the only key in an object:
# any other keys you put there will be ignored by the validator."
# So we raise on that, to notify schema creator that s/he should not
# expect those additional keys to be verified by schema validator.
raise Exception(f"Schema node with '$ref' should not contain anything else (besides '$comment' for docs). \
\nOn: {node} \nOffending keys {extra_keys}")

# We found a ref, so return it mapped through `on_refs`
new_node = on_refs(node['$ref'])
# Plus concatenated new and old '$comment' fields
# which should be just ignored anyways.
if '$comment' in new_node or '$comment' in node:
new_node['$comment'] = new_node.get('$comment', '') + node.get('$comment', '')
return new_node
elif isinstance(node, collections.abc.Mapping):
# Look for all refs in this mapping
for k, v in node.items():
node[k] = _map_refs(v, on_refs)
if isinstance(node, collections.abc.Mapping):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure how this works — I thought _map_refs would be called whenever the JSON schema walker walks into a node with '$ref'. From that understanding, it would not be called at all on a node like {"type_ref": "..."}

Copy link
Contributor Author

@jacoblurye jacoblurye Sep 6, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think _map_refs is the JSON schema walker you're referring to. On the next line, we look for $refs and type_refs, then handle each case slightly differently. The test test_json_validation.test_map_refs might be a bit helpful, too.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

never mind

if '$ref' in node or 'type_ref' in node:
ref_key = '$ref' if '$ref' in node else 'type_ref'

if ref_key == '$ref':
extra_keys = set(node.keys()).difference({'$ref', '$comment'})
if extra_keys:
# As for json-schema.org:
# "... You will always use $ref as the only key in an object:
# any other keys you put there will be ignored by the validator."
# So we raise on that, to notify schema creator that s/he should not
# expect those additional keys to be verified by schema validator.
raise Exception(f"Schema node with '$ref' should not contain anything else (besides '$comment' for docs). \
\nOn: {node} \nOffending keys {extra_keys}")

# We found a ref, so return it mapped through `on_refs`
new_node = on_refs(node[ref_key])

if ref_key == 'type_ref':
# For type_ref's, we don't want to clobber the other properties in node,
# so merge new_node and node.
new_node.update(node)

# Plus concatenated new and old '$comment' fields
# which should be just ignored anyways.
if '$comment' in new_node or '$comment' in node:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As comments contain development-related TODOs and the like, and we're now sending links to these docs to people, I think we should keep comments to ourselves and not surface them visibly in the docs.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nevermind

new_node['$comment'] = new_node.get(
'$comment', '') + node.get('$comment', '')
return new_node
else:
# Look for all refs further down in this mapping
for k, v in node.items():
node[k] = _map_refs(v, on_refs)
elif isinstance(node, (list, tuple)):
# Look for all refs in this list
for i in range(len(node)):
Expand Down
3 changes: 2 additions & 1 deletion cidc_schemas/schemas/assays/components/local_file.json
Expand Up @@ -6,8 +6,9 @@
"description": "A type to store local file paths",
"properties": {
"file_path": {
"$id": "local_file_path",
"type": "string",
"description": "Path to a file on a users computer."
"description": "Path to a file on a user's computer."
}
},
"required": []
Expand Down
6 changes: 3 additions & 3 deletions cidc_schemas/schemas/templates/metadata/olink_template.json
Expand Up @@ -36,7 +36,7 @@
"combined file": {
"merge_pointer": "0/study/study_npx",
"is_artifact" : 1,
"type_ref": "assays/components/local_file.json"
"type_ref": "assays/components/local_file.json#properties/file_path"
},
"combined file npx manager version": {
"merge_pointer": "0/study/npx_manager_version",
Expand All @@ -53,13 +53,13 @@
"merge_pointer": "0/files/assay_npx",
"is_artifact" : 1,
"gcs_prefix_format": "{chip_barcode}",
"type_ref": "assays/components/local_file.json"
"type_ref": "assays/components/local_file.json#properties/file_path"
},
"raw ct file": {
"merge_pointer": "0/files/assay_raw_ct",
"is_artifact" : 1,
"gcs_prefix_format": "{chip_barcode}",
"type_ref": "assays/components/local_file.json"
"type_ref": "assays/components/local_file.json#properties/file_path"
},
"run date": {
"merge_pointer": "0/run_date",
Expand Down
6 changes: 3 additions & 3 deletions cidc_schemas/schemas/templates/metadata/wes_template.json
Expand Up @@ -87,19 +87,19 @@
"merge_pointer": "0/files/fastq_1",
"gcs_prefix_format": "{cimac_participant_id}/{cimac_sample_id}/{cimac_aliquot_id}",
"is_artifact" : 1,
"type_ref": "assays/components/local_file.json"
"type_ref": "assays/components/local_file.json#properties/file_path"
},
"reverse fastq": {
"merge_pointer": "0/files/fastq_2",
"gcs_prefix_format": "{cimac_participant_id}/{cimac_sample_id}/{cimac_aliquot_id}",
"is_artifact" : 1,
"type_ref": "assays/components/local_file.json"
"type_ref": "assays/components/local_file.json#properties/file_path"
},
"read group mapping file": {
"merge_pointer": "0/files/read_group_mapping_file",
"gcs_prefix_format": "{cimac_participant_id}/{cimac_sample_id}/{cimac_aliquot_id}",
"is_artifact" : 1,
"type_ref": "assays/components/local_file.json"
"type_ref": "assays/components/local_file.json#properties/file_path"
}
}
}
Expand Down
42 changes: 21 additions & 21 deletions cidc_schemas/template.py
Expand Up @@ -25,7 +25,8 @@ def _get_template_path_map() -> dict:
if os.path.isdir(abs_type_dir):
for template_file in os.listdir(abs_type_dir):
template_type = template_file.replace('_template.json', '')
template_schema_path = os.path.join(abs_type_dir, template_file)
template_schema_path = os.path.join(
abs_type_dir, template_file)
path_map[template_type] = template_schema_path

return path_map
Expand Down Expand Up @@ -65,7 +66,6 @@ def generate_all_templates(target_dir: str):
target_path = os.path.join(target_subdir, template_xlsx_file)
generate_empty_template(schema_path, target_path)


class Template:
"""
Configuration describing a manifest or assay template
Expand Down Expand Up @@ -137,7 +137,6 @@ def process_fields(schema: dict) -> dict:

return processed_worksheet


@staticmethod
def _get_ref_coerce(ref: str):
"""
Expand Down Expand Up @@ -174,7 +173,10 @@ def _get_coerce(t: str, object_id = None):
and determines the best python
function to type the value.
"""

# if it's an artifact that will be loaded through local file
# we just return uuid as value
if object_id == "local_file_path":
return lambda _: str(uuid.uuid4())
if t == 'string':
return str
elif t == 'integer':
Expand All @@ -183,15 +185,9 @@ def _get_coerce(t: str, object_id = None):
return float
elif t == 'boolean':
return bool

# if it's an artifact that will be loaded through local file
# we just return uuid as value
elif t == 'object' and object_id == "local_file":
return lambda _: str(uuid.uuid4())
else:
raise NotImplementedError(f"no coercion available for type:{t}")


def _load_keylookup(self) -> dict:
"""
The excel spreadsheet uses human friendly (no _) names
Expand All @@ -209,14 +205,19 @@ def _load_keylookup(self) -> dict:
# create a key lookup dictionary
key_lu = {}

def _add_coerce(field_def:dict) -> dict:
def _add_coerce(field_def: dict) -> dict:
# checks if we have a cast func for that 'type_ref'
if 'value_template_format' in field_def:
coerce = lambda x: field_def['value_template_format'].format_map(x)
coerce = lambda x: field_def['value_template_format'].format_map(x)
if 'type' in field_def:
coerce = self._get_coerce(field_def['type'])
if '$id' in field_def:
coerce = self._get_coerce(
field_def['type'], field_def['$id'])
else:
coerce = self._get_coerce(field_def['type'])
else:
coerce = self._get_ref_coerce(field_def.get('type_ref') or field_def['$ref'])
coerce = self._get_ref_coerce(
field_def.get('type_ref') or field_def['$ref'])

return dict(coerce=coerce, **field_def)

Expand All @@ -232,7 +233,7 @@ def _add_coerce(field_def:dict) -> dict:
# (as for template.schema) - "merge_pointer" and "type_ref"

# load the data columns
for section_key, section_def in ws_schema.get('data_columns', {}).items():
for section_key, section_def in ws_schema.get('data_columns', {}).items():
for column_key, column_def in section_def.items():

# populate lookup
Expand All @@ -244,10 +245,10 @@ def _add_coerce(field_def:dict) -> dict:

# XlTemplateReader only knows how to format these types of sections
VALID_WS_SECTIONS = set(['preamble_rows',
'data_columns',
'prism_preamble_object_pointer',
'prism_data_object_pointer',
'prism_preamble_object_schema'])
'data_columns',
'prism_preamble_object_pointer',
'prism_data_object_pointer',
'prism_preamble_object_schema'])

@staticmethod
def _validate_worksheet(ws_title: str, ws_schema: dict):
Expand Down Expand Up @@ -276,8 +277,7 @@ def from_json(template_schema_path: str, schema_root: str = SCHEMA_DIR):
template_schema_path {str} -- path to the template schema file
schema_root {str} -- path to the directory where all schemas are stored
"""
template_schema = load_and_validate_schema(
template_schema_path, schema_root)
template_schema = load_and_validate_schema(template_schema_path, schema_root)

return Template(template_schema, name=template_schema_path)

Expand Down
49 changes: 48 additions & 1 deletion docs/docs/artifacts.artifact_bam.html
Expand Up @@ -87,6 +87,8 @@ <h3>Properties</h3>








Expand Down Expand Up @@ -119,6 +121,29 @@ <h3>Properties</h3>



</td>
</tr>



<tr>
<td>

<span id="data_format">
data_format
</span>


</td>
<td width=55%>



</td>
<td>

&lt; const: <strong>BAM</strong> &gt;

</td>
</tr>

Expand All @@ -145,7 +170,8 @@ <h3>artifact_bam.full.json</h3>
"uploaded_timestamp",
"file_size_bytes",
"md5_hash",
"artifact_category"
"artifact_category",
"data_format"
]
},
{
Expand Down Expand Up @@ -175,6 +201,24 @@ <h3>artifact_bam.full.json</h3>
],
"type": "string"
},
"data_format": {
"description": "Data Format.",
"enum": [
"FASTA",
"FASTQ",
"IMAGE",
"VCF",
"CSV",
"XLSX",
"NPX",
"BAM",
"MAF",
"BINARY",
"TEXT",
"[NOT SET]"
],
"type": "string"
},
"file_name": {
"$comment": "_____!!!_____ What if file name contains PHI? _____!!!_____",
"description": "The name of the file with extension",
Expand Down Expand Up @@ -218,6 +262,9 @@ <h3>artifact_bam.full.json</h3>
},
{
"properties": {
"data_format": {
"const": "BAM"
},
"info": {
"type": "object"
}
Expand Down