In [None]:
!pip install gen3_validator

In [None]:
# import dictionaryutils as du
# dd = du.DataDictionary(root_dir="../examples/schema/yaml")
# # getting resolved schema
# dd.schema.get("lipidomics_file")

In [None]:
from gen3_validator.resolve_schema import ResolveSchema

resolver = ResolveSchema(schema_path="../examples/schema/json/acdc_schema.json")
resolver.resolve_schema()


In [6]:
resolver.schema_resolved['demographic.yaml']

{'$schema': 'http://json-schema.org/draft-04/schema#',
 'id': 'demographic',
 'title': 'Demographic',
 'type': 'object',
 'namespace': 'https://data.test.biocommons.org.au/',
 'category': 'clinical',
 'program': '*',
 'project': '*',
 'description': 'Data for the characterization of the patient by means of segementing the population (e.g. characterization by age, sex, or race).',
 'additionalProperties': False,
 'submittable': True,
 'validators': None,
 'systemProperties': ['id',
  'project_id',
  'state',
  'created_datetime',
  'updated_datetime'],
 'links': [{'name': 'timepoints',
   'backref': 'demographics',
   'label': 'describes',
   'target_type': 'timepoint',
   'multiplicity': 'one_to_one',
   'required': True}],
 'required': ['type', 'submitter_id', 'timepoints'],
 'uniqueKeys': [['id'], ['project_id', 'submitter_id']],
 'properties': {'type': {'type': 'string'},
  'id': {'term': {'description': 'A 128-bit identifier. Depending on the mechanism used to generate it, it is ei

In [5]:
# get the node order
resolver.node_order

['program',
 'project',
 'acknowledgement',
 'publication',
 'subject',
 'timepoint',
 'demographic',
 'imaging_file',
 'medical_history',
 'blood_pressure_test',
 'sample',
 'exposure',
 'lab_result',
 'medication',
 'serum_marker_assay',
 'lipidomics_assay',
 'genomics_assay',
 'proteomics_assay',
 'metabolomics_assay',
 'serum_marker_file',
 'lipidomics_file',
 'unaligned_reads_file',
 'proteomics_file',
 'metabolomics_file',
 'lipidomics_mapping_file',
 'alignment_workflow',
 'aligned_reads_file',
 'aligned_reads_index_file',
 'variant_workflow',
 'variant_file',
 'core_metadata_collection']

In [50]:
schema = resolver.schema_resolved['proteomics_file.yaml']
schema['properties']['data_file_properties.md5sum']

KeyError: 'data_file_properties.md5sum'

In [91]:
# class to extract and store property, data type, and description

import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

schema = resolver.schema_resolved['unaligned_reads_file.yaml']

class NodeProps:
    def __init__(self, resolved_schema: dict):
        self.resolved_schema = resolved_schema
    
    def get_schema_name(self) -> str:
        schema_name = self.resolved_schema['title']
        return schema_name
    
    def get_prop_names(self) -> list:
        prop_names = list(self.resolved_schema['properties'].keys())
        return prop_names
    
    def get_data_file_prop_names(self) -> list:
        """
        Returns a list of property names under 'data_file_properties' if it exists,
        otherwise returns an empty list.
        """
        prop_names = self.get_prop_names()
        if "data_file_properties" in prop_names:
            return list(self.resolved_schema['properties']['data_file_properties'].keys())
        logger.warning(f"No data_file_properties found in {self.get_schema_name()}")
        return None
        

    def get_prop_info(self, prop_name: str) -> dict:
        prop_names  = self.get_prop_names()
        prop_data_file_names = self.get_data_file_prop_names()
        prop_info = None

        if prop_name in prop_names:
            prop_info = self.resolved_schema['properties'][prop_name]
        elif prop_data_file_names is not None and prop_name in prop_data_file_names:
            prop_info = self.resolved_schema['properties']['data_file_properties'][prop_name]
        else:
            logger.warning(f"Property '{prop_name}' not found in {self.get_schema_name()}")
            
        return prop_info

    def get_data_type(self, prop_name: str) -> str:
        prop_info = self.get_prop_info(prop_name)
        if prop_info is None:
            logger.warning(
                f"Property '{prop_name}' not found in {self.get_schema_name()}, could not pull type"
            )
            return None

        if "type" in prop_info and "pattern" in prop_info:
            prop_type = f"string | pattern = {prop_info['pattern']}"
        elif "type" in prop_info:
            prop_type = prop_info["type"]
        elif "enum" in prop_info:
            prop_type = "enum"
        else:
            logger.warning(
                f"Property '{prop_name}' has no 'type' or 'enum' key. "
                f"Could be an injected property, usually don't need "
                f"these in the template | prop_info = {prop_info}"
            )
            return None

        if not isinstance(prop_type, str):
            try:
                joined_types = ", ".join(prop_type)
                logger.warning(
                    f"Property type '{prop_type}' is not string, converting to string: {joined_types}"
                )
                return joined_types
            except TypeError:
                logger.warning(
                    f"Property type '{prop_type}' is not string and could not be joined."
                )
                return str(prop_type)

        return prop_type

    def get_description(self, prop_name: str) -> str:
        prop_info = self.get_prop_info(prop_name)
        if prop_info is None:
            logger.warning(f"""Property '{prop_name}' not found in {self.get_schema_name()}, could not pull description"""
            )
            return None
        prop_description = None
        
        if "description" in prop_info:
            prop_description = prop_info['description']
        if "term" in prop_info:
            prop_description = prop_info['term']["description"]
        
        if prop_description is None:
            logger.warning(f"""Property '{prop_name}' has no description key. 
                Could be an injected property, usually don't need these in the
                template | prop_info = {prop_info}"""
            )
        
        return prop_description
        
# usage
nodeprops = NodeProps(schema)
# nodeprops.get_description("md5sum")
nodeprops.get_data_file_prop_names()



In [95]:
prop_names = nodeprops.get_prop_names()
data_file_prop_names = nodeprops.get_data_file_prop_names()
if data_file_prop_names is not None:
    prop_names = prop_names.append(data_file_prop_names)
prop_names



['type',
 'id',
 'submitter_id',
 'state',
 'project_id',
 'created_datetime',
 'updated_datetime',
 'file_name',
 'file_size',
 'file_format',
 'md5sum',
 'object_id',
 'file_state',
 'error_type',
 'ga4gh_drs_uri',
 'genomics_assay',
 'core_metadata_collections',
 'data_category',
 'data_format',
 'data_type',
 'run_id']

In [96]:
descriptions = {}
for prop_name in prop_names:
    descriptions[prop_name] = nodeprops.get_description(prop_name)
descriptions

                Could be an injected property, usually don't need these in the
                template | prop_info = {'type': 'string'}
                Could be an injected property, usually don't need these in the
                template | prop_info = {'anyOf': [{'type': 'array', 'items': {'type': 'object', 'additionalProperties': True, 'properties': {'id': {'term': {'description': 'A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUIDs/GUIDs generated until 3400 AD or extremely likely to be different. Its relatively small size lends itself well to sorting, ordering, and hashing of all sorts, storing in databases, simple allocation, and ease of programming in general.\n', 'termDef': {'term': 'Universally Unique Identifier', 'source': 'NCIt', 'cde_id': 'C54100', 'cde_version': None, 'term_url': 'https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&version=16.02d&ns=NCI_Thesaurus&cod

{'type': None,
 'id': 'A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUIDs/GUIDs generated until 3400 AD or extremely likely to be different. Its relatively small size lends itself well to sorting, ordering, and hashing of all sorts, storing in databases, simple allocation, and ease of programming in general.\n',
 'submitter_id': 'A project-specific identifier for a node. This property is the calling card/nickname/alias for a unit of submission. It can be used in place of the UUID for identifying or recalling a node.\n',
 'state': 'The current state of the object.\n',
 'project_id': 'Unique ID for any specific defined piece of work that is undertaken or attempted to meet a single requirement.\n',
 'created_datetime': 'A combination of date and time of day in the form [-]CCYY-MM-DDThh:mm:ss[Z|(+|-)hh:mm]\n',
 'updated_datetime': 'A combination of date and time of day in the form [-]CCYY-MM-DDThh:mm:ss[Z|(+|-)

In [97]:
types = {}
for prop_name in prop_names:
    types[prop_name] = nodeprops.get_data_type(prop_name)
types



{'type': 'string',
 'id': 'string | pattern = ^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$',
 'submitter_id': 'string',
 'state': None,
 'project_id': 'string',
 'created_datetime': None,
 'updated_datetime': None,
 'file_name': 'string',
 'file_size': 'integer',
 'file_format': 'string',
 'md5sum': 'string | pattern = ^[a-f0-9]{32}$',
 'object_id': 'string',
 'file_state': 'enum',
 'error_type': 'enum',
 'ga4gh_drs_uri': 'string',
 'genomics_assay': None,
 'core_metadata_collections': None,
 'data_category': 'enum',
 'data_format': 'enum',
 'data_type': 'enum',
 'run_id': 'string'}

In [None]:
import pytest
from gen3_validator.resolve_schema import ResolveSchema

@pytest.fixture
def fixture_res_schema():
    resolver = ResolveSchema(schema_path="../examples/schema/json/acdc_schema.json")
    resolver.resolve_schema()
    return resolver.schema_resolved['unaligned_reads_file.yaml']

def test_init_NodeProps(fixture_res_schema):
    nodeprops = NodeProps(fixture_res_schema)
    assert fixture_res_schema == nodeprops.resolved_schema

In [22]:
t = ", ".join({"one": "test", "two": "test"})
t

'one, two'