In [None]:
!pip install gen3_validator

In [None]:
# import dictionaryutils as du
# dd = du.DataDictionary(root_dir="../examples/schema/yaml")
# # getting resolved schema
# dd.schema.get("lipidomics_file")

In [None]:
from gen3_validator.resolve_schema import ResolveSchema

# Point the resolver at the example ACDC JSON schema and run the resolution
# step; later cells read the per-node results from `resolver.schema_resolved`.
resolver = ResolveSchema(schema_path="../examples/schema/json/acdc_schema.json")
resolver.resolve_schema()


In [None]:
# Display the resolved schema stored under the 'demographic.yaml' key.
resolver.schema_resolved['demographic.yaml']

In [None]:
# Display the node order stored on the resolver.
resolver.node_order

In [None]:
# Pull the resolved schema for the proteomics file node.
schema = resolver.schema_resolved['proteomics_file.yaml']
# NOTE(review): this looks up one flat key named 'data_file_properties.md5sum'
# under 'properties'. Whether the resolved schema really uses dotted keys is not
# visible here -- confirm, otherwise this raises KeyError.
schema['properties']['data_file_properties.md5sum']

In [None]:
# class to extract and store property, data type, and description

import logging
from typing import Optional

# Set up logging: INFO level via basicConfig, plus a module-level `logger`
# that the PropExtractor methods below use for their warnings.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Resolved schema for the unaligned reads file node; passed to PropExtractor
# in the usage lines after the class definition.
schema = resolver.schema_resolved['unaligned_reads_file.yaml']

class PropExtractor:
    """Look up property names, data types and descriptions in one resolved schema.

    Args:
        resolved_schema: Resolved schema dict for a single node. The methods read
            its 'title' key and its 'properties' mapping.
    """

    def __init__(self, resolved_schema: dict):
        self.resolved_schema = resolved_schema

    def get_schema_name(self) -> str:
        """Return the value of the schema's 'title' key."""
        return self.resolved_schema['title']

    def get_prop_names(self) -> list:
        """Return the keys of the schema's 'properties' mapping as a list."""
        return list(self.resolved_schema['properties'].keys())

    def get_data_file_prop_names(self) -> Optional[list]:
        """Return the keys found under the 'data_file_properties' property.

        Returns:
            The keys of ``properties['data_file_properties']`` as a list, or
            ``None`` (after logging a warning) when the schema has no such
            property. Callers test the result with ``is not None``.
        """
        properties = self.resolved_schema['properties']
        if "data_file_properties" in properties:
            return list(properties['data_file_properties'].keys())
        logger.warning(f"No data_file_properties found in {self.get_schema_name()}")
        return None

    def get_prop_info(self, prop_name: str) -> Optional[dict]:
        """Return the definition stored for ``prop_name``.

        Top-level properties are checked first; only when the name is not found
        there is 'data_file_properties' consulted.

        Returns:
            The stored definition, or ``None`` (after logging a warning) when the
            name is found in neither place.
        """
        properties = self.resolved_schema['properties']
        if prop_name in properties:
            return properties[prop_name]

        # Looked up lazily so that a schema without 'data_file_properties' does
        # not log a warning for every top-level property that is requested.
        data_file_prop_names = self.get_data_file_prop_names()
        if data_file_prop_names is not None and prop_name in data_file_prop_names:
            return properties['data_file_properties'][prop_name]

        logger.warning(f"Property '{prop_name}' not found in {self.get_schema_name()}")
        return None

    def get_data_type(self, prop_name: str) -> Optional[str]:
        """Return a display string for the data type of ``prop_name``.

        Returns:
            * ``"string | pattern = <pattern>"`` when the definition has both
              'type' and 'pattern';
            * the 'type' value when only 'type' is present (a non-string value
              such as a list of types is joined with ``", "``);
            * ``"enum"`` when there is no 'type' but an 'enum';
            * ``None`` (after logging a warning) when the property is unknown or
              has neither 'type' nor 'enum'.
        """
        prop_info = self.get_prop_info(prop_name)
        if prop_info is None:
            logger.warning(
                f"Property '{prop_name}' not found in {self.get_schema_name()}, could not pull type"
            )
            return None

        if "type" in prop_info and "pattern" in prop_info:
            prop_type = f"string | pattern = {prop_info['pattern']}"
        elif "type" in prop_info:
            prop_type = prop_info["type"]
        elif "enum" in prop_info:
            prop_type = "enum"
        else:
            logger.warning(
                f"Property '{prop_name}' has no 'type' or 'enum' key. "
                f"Could be an injected property, usually don't need "
                f"these in the template | prop_info = {prop_info}"
            )
            return None

        if isinstance(prop_type, str):
            return prop_type

        # 'type' may hold several type names (e.g. a list); flatten to one string.
        try:
            joined_types = ", ".join(prop_type)
        except TypeError:
            logger.warning(
                f"Property type '{prop_type}' is not string and could not be joined."
            )
            return str(prop_type)
        logger.warning(
            f"Property type '{prop_type}' is not string, converting to string: {joined_types}"
        )
        return joined_types

    def get_description(self, prop_name: str) -> Optional[str]:
        """Return the description of ``prop_name``.

        A description found under the property's 'term' entry takes precedence
        over the property's own 'description'. A 'term' entry without a
        description is ignored instead of raising ``KeyError``.

        Returns:
            The description, or ``None`` (after logging a warning) when the
            property is unknown or carries no description at all.
        """
        prop_info = self.get_prop_info(prop_name)
        if prop_info is None:
            logger.warning(
                f"Property '{prop_name}' not found in {self.get_schema_name()}, could not pull description"
            )
            return None

        prop_description = None
        if "description" in prop_info:
            prop_description = prop_info['description']
        if "term" in prop_info:
            term = prop_info['term']
            # Only override when the term really provides a description.
            if isinstance(term, dict) and "description" in term:
                prop_description = term["description"]

        if prop_description is None:
            # Built from adjacent f-strings so the log line carries no embedded
            # newlines or source indentation.
            logger.warning(
                f"Property '{prop_name}' has no description key. "
                f"Could be an injected property, usually don't need "
                f"these in the template | prop_info = {prop_info}"
            )

        return prop_description
        
# usage
# Bind the instance to its own name so the class `PropExtractor` stays callable
# when this cell is run again.
prop_extractor = PropExtractor(schema)
# prop_extractor.get_description("md5sum")
prop_extractor.get_data_file_prop_names()

In [None]:
# Import PropExtractor from the package; this rebinds the name if a class of the
# same name was defined earlier in the session.
from gen3_metadata_templates.props import PropExtractor
schema = resolver.schema_resolved['unaligned_reads_file.yaml']
# NOTE(review): this rebinds the name `PropExtractor` from the imported class to
# an instance. The following cells call methods through this name, so it is left
# as is; running this line again without re-running the import would call the
# instance instead of the class.
PropExtractor = PropExtractor(schema)
# Data type reported for the 'md5sum' property.
PropExtractor.get_data_type('md5sum')

In [None]:
# Combine the top-level property names with the names found under
# 'data_file_properties' (when there are any). `PropExtractor` is the instance
# bound in the previous cell.
prop_names = PropExtractor.get_prop_names()
data_file_prop_names = PropExtractor.get_data_file_prop_names()
if data_file_prop_names is not None:
    # list.append() returns None (and would nest the list as a single element),
    # so build the combined list by concatenation instead.
    prop_names = prop_names + list(data_file_prop_names)
prop_names

In [None]:
# Map each collected property name to the description reported by the extractor.
descriptions = {
    prop_name: PropExtractor.get_description(prop_name) for prop_name in prop_names
}
descriptions

In [None]:
# Map each collected property name to the data type reported by the extractor.
types = {
    prop_name: PropExtractor.get_data_type(prop_name) for prop_name in prop_names
}
types

In [None]:
import pytest
from gen3_validator.resolve_schema import ResolveSchema

@pytest.fixture
def fixture_res_schema():
    """Resolved 'unaligned_reads_file.yaml' schema from the example ACDC JSON schema."""
    schema_resolver = ResolveSchema(schema_path="../examples/schema/json/acdc_schema.json")
    schema_resolver.resolve_schema()
    resolved_by_node = schema_resolver.schema_resolved
    return resolved_by_node['unaligned_reads_file.yaml']

def test_init_PropExtractor(fixture_res_schema):
    PropExtractor = PropExtractor(fixture_res_schema)
    assert fixture_res_schema == PropExtractor.resolved_schema

In [None]:
# Iterating a dict yields its keys, so the joined string holds the key names only.
t = ", ".join(key for key in {"one": "test", "two": "test"})
t

In [None]:
from gen3_metadata_templates.props import PropExtractor


In [None]:
from gen3_validator.resolve_schema import ResolveSchema
from gen3_metadata_templates.props import PropExtractor

resolver = ResolveSchema(schema_path="../examples/schema/json/acdc_schema.json")
resolver.resolve_schema()

# Wrap one resolved node schema so its properties can be queried.
node_props = PropExtractor(resolver.schema_resolved['unaligned_reads_file.yaml'])

# Not the last expression of the cell, so the returned title is not displayed.
node_props.get_schema_name()

# Property names defined on the schema.
prop_names = node_props.get_prop_names()
print(prop_names)

# Data type for each property name.
types = {prop_name: node_props.get_data_type(prop_name) for prop_name in prop_names}
print(types)

# Description for each property name.
descriptions = {
    prop_name: node_props.get_description(prop_name) for prop_name in prop_names
}
print(descriptions)

# Above is old code

In [None]:
from gen3_validator.resolve_schema import ResolveSchema
from gen3_metadata_templates.props import PropExtractor

# Build the resolver for the example ACDC JSON schema and run the resolution
# step; the cells below read `resolver.node_order` and `resolver.schema_resolved`.
resolver = ResolveSchema(schema_path="../examples/schema/json/acdc_schema.json")
resolver.resolve_schema()


INFO:gen3_validator.resolve_schema:Initializing ResolveSchema with schema path: ../examples/schema/json/acdc_schema.json
INFO:gen3_validator.resolve_schema:Starting schema resolution process.
INFO:gen3_validator.resolve_schema:Reading JSON file from path: ../examples/schema/json/acdc_schema.json
INFO:gen3_validator.resolve_schema:Successfully read JSON schema.
INFO:gen3_validator.resolve_schema:Retrieving node names from schema.
INFO:gen3_validator.resolve_schema:Retrieved 34 nodes from schema.
INFO:gen3_validator.resolve_schema:Retrieving all node pairs, excluding specified nodes.
INFO:gen3_validator.resolve_schema:Finding upstream and downstream nodes for: demographic.yaml
INFO:gen3_validator.resolve_schema:Retrieving links and ID for node: demographic.yaml
INFO:gen3_validator.resolve_schema:Finding upstream and downstream nodes for: project.yaml
INFO:gen3_validator.resolve_schema:Retrieving links and ID for node: project.yaml
INFO:gen3_validator.resolve_schema:Finding upstream and d

['demographic.yaml',
 'project.yaml',
 'serum_marker_assay.yaml',
 'alignment_workflow.yaml',
 'imaging_file.yaml',
 'lipidomics_assay.yaml',
 'metabolomics_file.yaml',
 'acknowledgement.yaml',
 'medical_history.yaml',
 '_definitions.yaml',
 '_settings.yaml',
 'blood_pressure_test.yaml',
 'genomics_assay.yaml',
 'variant_file.yaml',
 'timepoint.yaml',
 'program.yaml',
 'serum_marker_file.yaml',
 'proteomics_assay.yaml',
 'sample.yaml',
 'unaligned_reads_file.yaml',
 '_terms.yaml',
 'aligned_reads_index_file.yaml',
 'variant_workflow.yaml',
 'proteomics_file.yaml',
 'exposure.yaml',
 'metabolomics_assay.yaml',
 'lipidomics_mapping_file.yaml',
 'lipidomics_file.yaml',
 'aligned_reads_file.yaml',
 'lab_result.yaml',
 'medication.yaml',
 'publication.yaml',
 'subject.yaml',
 'core_metadata_collection.yaml']

In [14]:
import pandas as pd

# One DataFrame per node in `resolver.node_order`; each row is the attribute
# dict of one object returned by `extract_properties()`.
node_templates = {}
for node in resolver.node_order:
    node_props = PropExtractor(resolver.schema_resolved[f"{node}.yaml"])
    props_list = node_props.extract_properties()
    node_templates[node] = pd.DataFrame([vars(p) for p in props_list])

node_templates



{'program':   node_name               prop_name  \
 0   Program                    type   
 1   Program                      id   
 2   Program                    name   
 3   Program  dbgap_accession_number   
 
                                            data_type  \
 0                                             string   
 1  string | pattern = ^[a-fA-F0-9]{8}-[a-fA-F0-9]...   
 2                                             string   
 3                                             string   
 
                                          description  
 0                                               None  
 1  A 128-bit identifier. Depending on the mechani...  
 2                    Full name/title of the program.  
 3  The dbgap accession number provided for the pr...  ,
 'project':    node_name               prop_name  \
 0    Project                    type   
 1    Project                      id   
 2    Project                    name   
 3    Project                    code   
 4 

In [16]:
# Display the template DataFrame built for the 'blood_pressure_test' node.
node_templates["blood_pressure_test"]

Unnamed: 0,node_name,prop_name,data_type,description
0,Blood Pressure Test,type,string,
1,Blood Pressure Test,id,string | pattern = ^[a-fA-F0-9]{8}-[a-fA-F0-9]...,A 128-bit identifier. Depending on the mechani...
2,Blood Pressure Test,submitter_id,string,A project-specific identifier for a node. This...
3,Blood Pressure Test,state,,The current state of the object.\n
4,Blood Pressure Test,project_id,string,Unique ID for any specific defined piece of wo...
5,Blood Pressure Test,created_datetime,,A combination of date and time of day in the f...
6,Blood Pressure Test,updated_datetime,,A combination of date and time of day in the f...
7,Blood Pressure Test,timepoints,,
8,Blood Pressure Test,bp_diastolic,integer,Resting diastolic blood pressure from the uppe...
9,Blood Pressure Test,bp_systolic,integer,Resting systolic blood pressure from the upper...
