In [None]:
import gen3_validator
# loading gen3 bundled jsonschema into dictionary class
dd = gen3_validator.dict.DataDictionary('../examples/schema/json/schema_dev.json')
dd.parse_schema()

In [None]:
# getting list of nodes
dd.get_nodes()

In [None]:
# Returning the jsonschema for a given node
dd.return_schema('lipidomics_file.yaml')

In [None]:
!pip install dictionaryutils

In [None]:

import dictionaryutils
import json
import yaml
dd = dictionaryutils.DataDictionary('/Users/harrijh/projects/gen3schemadev/examples/schema/yaml')
dd.load_data(directory='/Users/harrijh/projects/gen3schemadev/examples/schema/yaml')
dd_resolved = dd.schema

for k, v in dd_resolved.items():
    with open(f'/Users/harrijh/projects/gen3schemadev/examples/schema/yaml/resolved/{k}.yaml', 'w') as f:
        yaml.safe_dump(v, f)


with open(f'/Users/harrijh/projects/gen3schemadev/examples/schema/json/gen3_bundled_schema_resolved.json', 'w') as f:
    json.dump(dd_resolved, f)

In [None]:
dd = dictionaryutils.load_schemas_from_file('/Users/harrijh/projects/gen3schemadev/examples/schema/json/schema_dev.json')
dd

# To validate a gen3schemadev input yaml with the input schema


In [None]:
# !pip install check-jsonschema

In [None]:
!check-jsonschema --schemafile ../src/gen3schemadev/schema/input_schema.yml ../src/gen3schemadev/schema/input_example.yml --verbose

# To validate a single gen3 schema yaml with the metaschema
- note, the schema file needs to be resolved

In [None]:
!check-jsonschema --schemafile ../src/gen3schemadev/schema/gen3_metaschema.yml ../examples/schema/yaml/resolved/lipidomics_file.yaml --verbose

if you give a non-resolved schema, it will not validate

In [None]:
!check-jsonschema --schemafile ../src/gen3schemadev/schema/gen3_metaschema.yml ../examples/schema/yaml/lipidomics_file.yaml --verbose

# Compiling gen3schemadev input yaml to gen3 yamls
## Steps:
1. Validate input yaml against the input schema
2. Load the input yaml into python dictionary
3. The compiler should start with an empty data structure, with default values for a gen3 schema. This yaml is defined in the `src/gen3schemadev/schema/gen3_schema_template.yml`
4. 

In [None]:
from gen3schemadev.schema.gen3_template import *
from gen3schemadev.utils import *
from gen3schemadev.schema.input_schema import DataModel
import logging

# Set up basic logging configuration
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)

metaschema_path = "../src/gen3schemadev/schema/gen3_metaschema.yml"
converter_template = generate_gen3_template(metaschema_path)
metaschema = load_yaml(metaschema_path)

# loading input example
data = load_yaml('../tests/input_example.yml')
validated_model = DataModel.model_validate(data)


In [None]:
converter_template

In [None]:
metaschema

In [None]:
validated_model.model_dump()

Now we need to read to:
1. validate the input yaml against the input metaschema
2. now we know the input yaml is validated, we can extract the data from the yaml into a data class which has the structure `input.node.properties.links`
3. We then use the data class to populated the converter template for each node
4. The populated templates are then written to the output directory


In [None]:
from dataclasses import dataclass
class node:
    name: str
    description: str
    category: str
    properties: list
    links: list

from typing import Any

def get_node_data(node: str, data: Any) -> node:
    try:
        for ent in data.nodes:
            if ent.name == node:
                return ent
        raise ValueError(f"node '{node}' not found in data.nodes")
    except AttributeError as e:
        raise AttributeError(f"Invalid data structure: {e}")
    except Exception as e:
        raise Exception(f"An error occurred while retrieving node data: {e}")

def get_node_links(node: str, data: Any) -> list[dict]:
    links = data.links
    node_links = []
    for link in links:
        if link.child == node:
            node_links.append(link.model_dump())
    return node_links



from dataclasses import dataclass, asdict
from typing import Optional, List, Union

@dataclass
class LinkObj:
    name: str
    backref: str
    label: Optional[str]
    target_type: str
    multiplicity: str
    required: bool

    def to_dict(self):
        return asdict(self)

@dataclass
class LinkGroup:
    exclusive: bool
    required: bool
    subgroup: List[dict]

    def to_dict(self):
        return asdict(self)
    
def create_core_metadata_link(child_name: str) -> dict:
    link_obj = LinkObj(
        name=f"core_metadata_collections",
        backref=f"{child_name}s",
        label=None,
        target_type="core_metadata_collection",
        multiplicity="one_to_one",
        required=True 
    )
    return link_obj.to_dict()

def convert_node_links(links: dict, node_file: bool = False) -> dict:
    link_list = []
    for link in links:
        link_obj = LinkObj(
            name=f"{link['parent']}s",
            backref=f"{link['child']}s",
            label=None,
            target_type=link['parent'],
            multiplicity=link['multiplicity'],
            required=True  # TODO remove this hard code later, should pull from input yaml
        )
        link_list.append(link_obj.to_dict())

    if node_file:
        core_link = create_core_metadata_link(links[0]['child'])
        link_list.append(core_link)

    if len(link_list) > 1:
        group = LinkGroup(
            exclusive=False,
            required=True,
            subgroup=link_list
        )
        output = group.to_dict()
    else:
        output = link_list
    return output

### Note to self, you will now need to find a way to add in the _definitions and _terms references.
### You also need to define a standard for the _definitions and _terms references files to read from as templates

def get_properties(node_name: str, data: Any) -> list[dict]:
    output = []
    ent = get_node_data(node_name, data)
    props = ent.properties
    if props:
        for prop in props:
            pdict = {
                prop.name: {k: v for k, v in prop.model_dump().items() if k != "name"}
            }
            output.append(pdict)
    else:
        raise Exception(f'No properties found for node {node_name}')
    return output


def get_category(node_name: str, data: Any) -> str:
    ent = get_node_data(node_name, data)
    category = ent.category
    # If it's an Enum, get its value; otherwise, return as is
    if hasattr(category, "value"):
        return category.value
    return category

def get_node_value(node_name: str, key: str, data: Any):
    """
    Returns the value of a single key within an node object.

    Args:
        node_name (str): The name of the node to retrieve.
        key (str): The key whose value is to be returned.
        data (Any): The data structure containing nodes.

    Returns:
        The value associated with the specified key in the node object.
    """
    ent = get_node_data(node_name, data)
    return ent.model_dump()[key]


def populate_template(node_name: str, input_data, template) -> dict:
    """
    Populate a Gen3 schema template dictionary with values from a Pydantic data model.

    This function takes an node name, a Pydantic model instance containing node data,
    and a Gen3 schema template dictionary. It fills a copy of the template with values
    from the input data, applying special logic for certain keys (e.g., 'name', 'category',
    'properties', 'links'). If a key from the input data is not found in the template,
    it is added with a value of None and a warning is logged.

    Args:
        node_name (str): The name of the node to populate in the template.
        input_data: A Pydantic model instance containing the node's data.
        template (dict): A Gen3 schema template dictionary to be populated.

    Returns:
        dict: A new Gen3 schema template dictionary populated with values from the input data.

    Side Effects:
        Logs a warning if a key from the input data is not found in the template.
    """
    # ... function body ...
    ent = get_node_data(node_name, input_data)
    ent_dict = ent.model_dump()
    output_schema = template.copy()
    
    # Checking if node is file category
    file_cat = False
    if get_node_value(node_name, 'category', input_data) == 'file':
        file_cat = True
    
    for key, value in ent_dict.items():
        if key == 'name':
            output_schema['id'] = value
        elif key == 'category':
            output_schema[key] = get_category(node_name, input_data)
        elif key == 'properties':
            output_schema[key] = get_properties(node_name, input_data)
        elif key == 'links':
            links = get_node_links(node_name, input_data)
            output_schema[key] = convert_node_links(links, node_file=file_cat)
        elif key in output_schema:
            output_schema[key] = value
        else:
            logger.warning(f"Key '{key}' not found in template")
    return output_schema

validated_model = DataModel.model_validate(data)
# links = get_node_links('lipidomics_file', validated_model)
# convert_node_links(links, node_file=True)
# get_properties('sample', validated_model)
# get_category('sample', validated_model)

out_template = populate_template('lipidomics_file', validated_model, converter_template)
out_template
# write_yaml(out_template, 'output.yml')

# Testing converter


In [None]:
from gen3schemadev.schema.gen3_template import *
from gen3schemadev.utils import *
from gen3schemadev.schema.input_schema import DataModel
from gen3schemadev.converter import *



# Loading template and metaschema
metaschema = get_metaschema()
converter_template = generate_gen3_template(metaschema)

# loading input example
data = load_yaml('../tests/input_example.yml')
validated_model = DataModel.model_validate(data)


import json
print(json.dumps(construct_props('sample', validated_model), indent=4))

# construct_props('sample', validated_model)

In [None]:
prop = {'sample_tube_type': {'type': 'enum',
   'description': 'Sample tube type (enum)',
   'required': False,
   'enums': [{'name': 'EDTA'}, {'name': 'Heparin'}, {'name': 'Citrate'}]}}
first_key = next(iter(prop))
first_value = prop[first_key]
first_value


In [None]:
from gen3schemadev.schema.gen3_template import *
from gen3schemadev.utils import *
from gen3schemadev.schema.input_schema import DataModel
from gen3schemadev.converter import *

data = load_yaml('../tests/input_example.yml')
validated_model = DataModel.model_validate(data)

from typing import Dict, Any, List

def strip_required_field(props_list: list[dict]) -> list[dict]:
    """
    Remove the 'required' field from all property dicts in the input list.
    Can use the output of get_properties() for this function

    Args:
        props_list (list): A list of property dictionaries, where each dictionary has a single key
            (the property name) and its value is a dictionary describing the property. For example:
                [
                    {
                        "project_id": {
                            "type": "string",
                            "description": "Synthetic_Dataset_1",
                            "required": True,
                            "enums": None
                        }
                    },
                    ...
                ]

    Returns:
        list: A new list with the same structure as props_list, but with the 'required'
            field removed from each property's dictionary (if present).

    Note:
        This function expects a list of property definitions as typically returned by
        get_properties() in the Gen3 schema conversion workflow.
        If you are working with a DataSourceProtocol object, you should first extract the
        properties list using the appropriate function.
    """
    new_list = []
    for prop in props_list:
        if isinstance(prop, dict):
            # Each prop is {property_name: property_dict}
            new_prop = {}
            for k, v in prop.items():
                if isinstance(v, dict):
                    v = {key: val for key, val in v.items() if key != 'required'}
                new_prop[k] = v
            new_list.append(new_prop)
        else:
            new_list.append(prop)
    return new_list

def get_required_prop_names(props_list: list[dict]) -> List[str]:
    """
    Given a list of property dicts (as from get_properties), return a list of property names
    where the property dict has 'required': True.

    Args:
        props_list (list): List of property dictionaries.

    Returns:
        List[str]: List of property names with required True.
    """
    required_names = []
    for prop in props_list:
        if isinstance(prop, dict):
            for k, v in prop.items():
                if isinstance(v, dict) and v.get("required") is True:
                    required_names.append(k)
    return required_names

project_props = get_properties('sample', validated_model)
stripped_props = strip_required_field(project_props)
required_names = get_required_prop_names(project_props)
stripped_props, required_names

In [None]:
get_properties('lipidomics_file', validated_model)
construct_props('lipidomics_file', validated_model)


In [None]:
get_node_data('lipidomics_file', validated_model)

In [None]:
links = get_node_links('lipidomics_file', validated_model)
convert_node_links(links, node_file=True)
get_node_value('lipidomics_file', 'category', validated_model) == 'data_file'

***
## Trying to create template from metaschema


In [None]:
from gen3schemadev.schema.gen3_template import *

out_template = generate_gen3_template('../src/gen3schemadev/schema/gen3_metaschema.yml')

write_yaml(out_template, 'output.yml')

In [None]:
from gen3schemadev.schema.gen3_template import *
generate_def_template()
# generate_setting_template()
# generate_terms_template()
# generate_core_metadata_template()

# Testing yaml to bundle

In [None]:
from gen3schemadev.utils import bundle_yamls

bundle_yamls('../examples/schema/yaml')['subject']

In [None]:
import os
path = "../examples/schema/yaml/acknowledgement.yaml"

os.path.dirname(path)

# Working on validation


In [None]:
%load_ext autoreload
%autoreload 2

In [10]:
from gen3schemadev.utils import *
from gen3schemadev.schema.gen3_template import get_metaschema
import subprocess
import tempfile
from gen3_validator.resolve_schema import ResolveSchema

def bundled_schema_to_dict_list(file: str, return_aux: bool = False):
    """
    Reads a bundled Gen3 JSON schema file and returns a list of schema dictionaries.

    Args:
        file (str): Path to the bundled JSON file containing multiple schemas.
        return_aux (bool): If True, return only the auxiliary schemas (definitions/settings/terms).
                           If False, return only the main node schemas.

    Returns:
        list: A list of schema dictionaries extracted from the bundled file.
    """
    resolver = ResolveSchema(file)
    resolver.resolve_schema()
    bundled = resolver.schema_resolved  # This is a dict-like object
    schema_list = []
    aux_list = []
    aux_schema_names = ['_definitions.yaml', '_settings.yaml', '_terms.yaml']
    for k, v in bundled.items():
        print(f"key: {k}")
        if k in aux_schema_names:
            print(f"found auxiliary schema: {k}")
            aux_list.append(v)
        else:
            schema_list.append(v)

    if return_aux:
        return aux_list
    else:
        return schema_list


import jsonschema

def validate_schema_with_metaschema(schema, metaschema=None, verbose=False):
    """
    Validate a JSON Schema against a metaschema using the check-jsonschema CLI tool.

    This function writes the provided schema and metaschema to temporary files and
    invokes the external `check-jsonschema` command-line tool to perform validation.

    Args:
        schema (dict): The JSON Schema to validate.
        metaschema (dict, optional): The metaschema to validate against.
            If None, you must provide a metaschema explicitly.

    Raises:
        subprocess.CalledProcessError: If the check-jsonschema command fails.

    Returns:
        None. Raises an error if validation fails, otherwise completes silently.

    Note:
        This function does not return a value. It will raise an error if validation fails.
        It is intended for use in environments where the check-jsonschema CLI is available.
    """
    logger.info(f"Validating '{schema.get('id', '')}' with metaschema")
    # Create temp files for schema and metaschema
    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as schema_file, \
         tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as metaschema_file:
        json.dump(schema, schema_file)
        schema_file.flush()
        json.dump(metaschema, metaschema_file)
        metaschema_file.flush()
        schema_path = schema_file.name
        metaschema_path = metaschema_file.name
    
    if verbose:
        cmd = [
            "check-jsonschema", "--verbose",
            "--schemafile", metaschema_path,
            schema_path
        ]
    else:
        cmd = [
            "check-jsonschema",
            "--schemafile", metaschema_path,
            schema_path
        ]
    # Capture output and wait for the subprocess to complete before returning
    completed_process = subprocess.run(cmd, capture_output=True, text=True)
    if completed_process.returncode != 0:
        logger.error(f"check-jsonschema failed with exit code {completed_process.returncode}")
        # Log stdout/stderr if available
        if completed_process.stdout:
            logger.error(f"STDOUT: {completed_process.stdout}")
        if completed_process.stderr:
            logger.error(f"STDERR: {completed_process.stderr}")
        # Do not raise an error, just return
        return



schema_list = bundled_schema_to_dict_list('../examples//schema/json/schema_dev.json')
# schema_list = bundled_schema_to_dict_list('../output/test_schema_bundle.json')
metaschema = get_metaschema()

for schema in schema_list:
    validate_schema_with_metaschema(schema, metaschema=metaschema, verbose=True)

# validate_schema_with_metaschema(schema_list[5], metaschema=metaschema, verbose=False)

2025-10-07 16:44:37,281 [INFO] Initializing DataDictionary with schema path: ../examples//schema/json/schema_dev.json
2025-10-07 16:44:37,282 [INFO] Initializing ResolveSchema with schema path: ../examples//schema/json/schema_dev.json
2025-10-07 16:44:37,283 [INFO] Starting schema resolution process.
2025-10-07 16:44:37,283 [INFO] Reading JSON file from path: ../examples//schema/json/schema_dev.json
2025-10-07 16:44:37,285 [INFO] Successfully read JSON schema.
2025-10-07 16:44:37,285 [INFO] Retrieving node names from schema.
2025-10-07 16:44:37,286 [INFO] Splitting schema into individual node schemas.
2025-10-07 16:44:37,286 [INFO] Split schema into individual node schemas.
2025-10-07 16:44:37,286 [INFO] Retrieving schema for schema ID: _definitions.yaml
2025-10-07 16:44:37,286 [INFO] Retrieved definitions schema.
2025-10-07 16:44:37,286 [INFO] Retrieving schema for schema ID: _terms.yaml
2025-10-07 16:44:37,286 [INFO] Retrieved terms schema.
2025-10-07 16:44:37,287 [INFO] Resolving re

key: demographic.yaml
key: project.yaml
key: acknowledgement.yaml
key: medical_history.yaml
key: program.yaml
key: sample.yaml
key: lipidomics_file.yaml
key: publication.yaml
key: subject.yaml
key: core_metadata_collection.yaml


2025-10-07 16:44:37,528 [INFO] Validating 'project' with metaschema
2025-10-07 16:44:37,697 [INFO] Validating 'acknowledgement' with metaschema
2025-10-07 16:44:37,865 [INFO] Validating 'medical_history' with metaschema
2025-10-07 16:44:38,024 [INFO] Validating 'program' with metaschema
2025-10-07 16:44:38,190 [INFO] Validating 'sample' with metaschema
2025-10-07 16:44:38,348 [INFO] Validating 'lipidomics_file' with metaschema
2025-10-07 16:44:38,507 [INFO] Validating 'publication' with metaschema
2025-10-07 16:44:38,666 [INFO] Validating 'subject' with metaschema
2025-10-07 16:44:38,830 [INFO] Validating 'core_metadata_collection' with metaschema


In [11]:
schema_list = bundled_schema_to_dict_list('../examples//schema/json/schema_dev.json', return_aux=True)
schema_list

2025-10-07 16:44:41,133 [INFO] Initializing DataDictionary with schema path: ../examples//schema/json/schema_dev.json
2025-10-07 16:44:41,134 [INFO] Initializing ResolveSchema with schema path: ../examples//schema/json/schema_dev.json
2025-10-07 16:44:41,134 [INFO] Starting schema resolution process.
2025-10-07 16:44:41,134 [INFO] Reading JSON file from path: ../examples//schema/json/schema_dev.json
2025-10-07 16:44:41,136 [INFO] Successfully read JSON schema.
2025-10-07 16:44:41,136 [INFO] Retrieving node names from schema.
2025-10-07 16:44:41,137 [INFO] Splitting schema into individual node schemas.
2025-10-07 16:44:41,137 [INFO] Split schema into individual node schemas.
2025-10-07 16:44:41,137 [INFO] Retrieving schema for schema ID: _definitions.yaml
2025-10-07 16:44:41,137 [INFO] Retrieved definitions schema.
2025-10-07 16:44:41,137 [INFO] Retrieving schema for schema ID: _terms.yaml
2025-10-07 16:44:41,138 [INFO] Retrieved terms schema.
2025-10-07 16:44:41,138 [INFO] Resolving re

key: demographic.yaml
key: project.yaml
key: acknowledgement.yaml
key: medical_history.yaml
key: program.yaml
key: sample.yaml
key: lipidomics_file.yaml
key: publication.yaml
key: subject.yaml
key: core_metadata_collection.yaml


[]

In [None]:
schema_list

In [3]:
from gen3_validator.resolve_schema import ResolveSchema
file_input = '../output/test_schema_bundle.json'
resolver = ResolveSchema(file_input)
resolver.resolve_schema()
bundled = resolver.schema_resolved
bundled

{'project.yaml': {'$schema': 'http://json-schema.org/draft-04/schema#',
  'version': None,
  'id': 'project',
  'title': 'project',
  'type': 'object',
  'namespace': None,
  'category': 'administrative',
  'program': '*',
  'project': '*',
  'description': 'Gen3 Compulsary Node',
  'submittable': True,
  'validators': None,
  'systemProperties': ['id',
   'project_id',
   'state',
   'created_datetime',
   'updated_datetime'],
  'uniqueKeys': [['id'], ['project_id', 'submitter_id']],
  'required': ['project_id'],
  'links': [],
  'properties': {'project_id': {'type': 'string',
    'description': 'Synthetic_Dataset_1'},
   'description': {'type': 'string',
    'description': 'Project containing synthetic data'}}},
 'sample.yaml': {'$schema': 'http://json-schema.org/draft-04/schema#',
  'version': None,
  'id': 'sample',
  'title': 'sample',
  'type': 'object',
  'namespace': None,
  'category': 'clinical',
  'program': '*',
  'project': '*',
  'description': 'Info about sample',
  'sub

In [None]:
schema_list

In [None]:
from gen3schemadev.converter import format_multiplicity
format_multiplicity(multiplicity="many_to_one")

# Need to add function in converter that recognises the datetime data input and adds a reference to the _definitions.yaml#/datetime file


In [None]:
from gen3schemadev.schema.gen3_template import *
from gen3schemadev.utils import *
from gen3schemadev.schema.input_schema import DataModel
from gen3schemadev.converter import *



# Loading template and metaschema
metaschema = get_metaschema()
converter_template = generate_gen3_template(metaschema)

# loading input example
data = load_yaml('../tests/input_example.yml')
validated_model = DataModel.model_validate(data)


import json
print(json.dumps(construct_props('sample', validated_model), indent=4))


# construct_props('sample', validated_model)

In [2]:
from gen3schemadev.schema.gen3_template import *
from gen3schemadev.utils import bundled_schema_to_list_dict
from gen3schemadev.validators.metaschema_validator import validate_schema_with_metaschema
import os

metaschema = get_metaschema()
schema_file = os.path.join('.', '../tests/gen3_schema/schema_dev_fail.json')
resolve_schema = resolve_schema(schema_path=schema_file)
schema_list = bundled_schema_to_list_dict(resolve_schema)

for schema in schema_list:
    validate_schema_with_metaschema(schema, metaschema=metaschema, verbose=True)


2025-10-08 14:19:59,025 [INFO] Initializing DataDictionary with schema path: ./../tests/gen3_schema/schema_dev_fail.json
2025-10-08 14:19:59,026 [INFO] Initializing ResolveSchema with schema path: ./../tests/gen3_schema/schema_dev_fail.json
2025-10-08 14:19:59,026 [INFO] Starting schema resolution process.
2025-10-08 14:19:59,027 [INFO] Reading JSON file from path: ./../tests/gen3_schema/schema_dev_fail.json
2025-10-08 14:19:59,028 [INFO] Successfully read JSON schema.
2025-10-08 14:19:59,028 [INFO] Retrieving node names from schema.
2025-10-08 14:19:59,028 [INFO] Splitting schema into individual node schemas.
2025-10-08 14:19:59,028 [INFO] Split schema into individual node schemas.
2025-10-08 14:19:59,029 [INFO] Retrieving schema for schema ID: _definitions.yaml
2025-10-08 14:19:59,029 [INFO] Retrieved definitions schema.
2025-10-08 14:19:59,029 [INFO] Retrieving schema for schema ID: _terms.yaml
2025-10-08 14:19:59,029 [INFO] Retrieved terms schema.
2025-10-08 14:19:59,030 [INFO] Res

RuntimeError: check-jsonschema validation failed for schema 'demographic'. See logs for details.

In [2]:
demographic_schema

{'$schema': 'http://json-schema.org/draft-04/schema#',
 'id': 'demographic',
 'title': 'Demographic',
 'type': 'object',
 'namespace': 'http://commons.heartdata.baker.edu.au/',
 'category': 'a_random_category',
 'program': '*',
 'project': '*',
 'description': 'Data for the characterization of the patient by means of segementing the population (e.g. characterization by age, sex, or race).',
 'additionalProperties': False,
 'submittable': True,
 'validators': None,
 'systemProperties': ['id',
  'project_id',
  'state',
  'created_datetime',
  'updated_datetime'],
 'links': [{'names': 'subjects',
   'backref': 'demographics',
   'label': 'describes',
   'target_type': 'subject',
   'multiplicity': 'one_to_one',
   'required': True}],
 'required': ['type', 'submitter_id', 'subjects'],
 'uniqueKeys': [['id'], ['project_id', 'submitter_id']],
 'properties': {'type': {'type': 'string'},
  'id': {'term': {'description': 'A 128-bit identifier. Depending on the mechanism used to generate it, it

In [1]:
from gen3schemadev.utils import *
schema_file = os.path.join('.', '../tests/gen3_schema/schema_dev_pass.json')
resolved_schema = resolve_schema(schema_file)
len(resolved_schema)

2025-10-07 17:36:37,877 [INFO] Initializing DataDictionary with schema path: ./../tests/gen3_schema/schema_dev_pass.json
2025-10-07 17:36:37,878 [INFO] Initializing ResolveSchema with schema path: ./../tests/gen3_schema/schema_dev_pass.json
2025-10-07 17:36:37,878 [INFO] Starting schema resolution process.
2025-10-07 17:36:37,878 [INFO] Reading JSON file from path: ./../tests/gen3_schema/schema_dev_pass.json
2025-10-07 17:36:37,879 [INFO] Successfully read JSON schema.
2025-10-07 17:36:37,879 [INFO] Retrieving node names from schema.
2025-10-07 17:36:37,879 [INFO] Splitting schema into individual node schemas.
2025-10-07 17:36:37,880 [INFO] Split schema into individual node schemas.
2025-10-07 17:36:37,880 [INFO] Retrieving schema for schema ID: _definitions.yaml
2025-10-07 17:36:37,880 [INFO] Retrieved definitions schema.
2025-10-07 17:36:37,880 [INFO] Retrieving schema for schema ID: _terms.yaml
2025-10-07 17:36:37,880 [INFO] Retrieved terms schema.
2025-10-07 17:36:37,880 [INFO] Res

10

In [3]:
resolved_schema

[{'$schema': 'http://json-schema.org/draft-04/schema#',
  'id': 'demographic',
  'title': 'Demographic',
  'type': 'object',
  'namespace': 'http://commons.heartdata.baker.edu.au/',
  'category': 'clinical',
  'program': '*',
  'project': '*',
  'description': 'Data for the characterization of the patient by means of segementing the population (e.g. characterization by age, sex, or race).',
  'additionalProperties': False,
  'submittable': True,
  'validators': None,
  'systemProperties': ['id',
   'project_id',
   'state',
   'created_datetime',
   'updated_datetime'],
  'links': [{'name': 'subjects',
    'backref': 'demographics',
    'label': 'describes',
    'target_type': 'subject',
    'multiplicity': 'one_to_one',
    'required': True}],
  'required': ['type', 'submitter_id', 'subjects'],
  'uniqueKeys': [['id'], ['project_id', 'submitter_id']],
  'properties': {'type': {'type': 'string'},
   'id': {'term': {'description': 'A 128-bit identifier. Depending on the mechanism used t

In [2]:
demographic = [schema for schema in schema_list if schema['id'] == 'demographic']
demographic

[{'$schema': 'http://json-schema.org/draft-04/schema#',
  'id': 'demographic',
  'title': 'Demographic',
  'type': 'object',
  'namespace': 'http://commons.heartdata.baker.edu.au/',
  'category': 'clinical',
  'program': '*',
  'project': '*',
  'description': 'Data for the characterization of the patient by means of segementing the population (e.g. characterization by age, sex, or race).',
  'additionalProperties': False,
  'submittable': True,
  'validators': None,
  'systemProperties': ['id',
   'project_id',
   'state',
   'created_datetime',
   'updated_datetime'],
  'links': [{'name': 'subjects',
    'backref': 'demographics',
    'label': 'describes',
    'target_type': 'subject',
    'multiplicity': 'one_to_one',
    'required': True}],
  'required': ['type', 'submitter_id', 'subjects'],
  'uniqueKeys': [['id'], ['project_id', 'submitter_id']],
  'properties': {'type': {'type': 'string'},
   'id': {'term': {'description': 'A 128-bit identifier. Depending on the mechanism used t

In [15]:
schema_list

[{'$schema': 'http://json-schema.org/draft-04/schema#',
  'id': 'project',
  'title': 'Project',
  'type': 'object',
  'program': '*',
  'project': '*',
  'category': 'administrative',
  'description': 'The study the data is coming from',
  'additionalProperties': False,
  'submittable': True,
  'validators': None,
  'systemProperties': ['id',
   'state',
   'released',
   'releasable',
   'intended_release_date'],
  'required': ['code', 'name', 'programs', 'dbgap_accession_number'],
  'uniqueKeys': [['id'], ['code']],
  'links': [{'name': 'programs',
    'backref': 'projects',
    'label': 'member_of',
    'target_type': 'program',
    'multiplicity': 'many_to_one',
    'required': True}],
  'constraints': None,
  'properties': {'type': {'type': 'string'},
   'id': {'term': {'description': 'A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUIDs/GUIDs generated until 3400 AD or extremely likely to be different.

In [16]:
from gen3_validator.resolve_schema import ResolveSchema
from gen3schemadev.utils import *

# Generate a bundled json, a resolved bundled json, a dir of yamls and a dir of resolved yamls from a singled unresolved bundled_json

bundled_schema_path = '../examples/schema/json/schema_dev.json'
bundled_schema = read_json(bundled_schema_path)

# write to yamls
output_folder = "../tests/gen3_schema/examples/yaml/"
for k,v in bundled_schema.items():
    w_path = os.path.join(output_folder, k)
    write_yaml(v, w_path)

# Resolve the schema and write to yamls
resolved_schema = resolve_schema(schema_path=bundled_schema_path)
for v in resolved_schema:
    k = v['id'] + '.yaml'
    print(k)
    w_path = os.path.join(output_folder, 'resolved', k)
    write_yaml(v, w_path)


2025-10-09 10:46:43,128 [INFO] Successfully loaded JSON file: ../examples/schema/json/schema_dev.json
2025-10-09 10:46:43,131 [INFO] Successfully wrote YAML file: ../tests/gen3_schema/examples/yaml/demographic.yaml
2025-10-09 10:46:43,136 [INFO] Successfully wrote YAML file: ../tests/gen3_schema/examples/yaml/project.yaml
2025-10-09 10:46:43,137 [INFO] Successfully wrote YAML file: ../tests/gen3_schema/examples/yaml/acknowledgement.yaml
2025-10-09 10:46:43,139 [INFO] Successfully wrote YAML file: ../tests/gen3_schema/examples/yaml/medical_history.yaml
2025-10-09 10:46:43,146 [INFO] Successfully wrote YAML file: ../tests/gen3_schema/examples/yaml/_definitions.yaml
2025-10-09 10:46:43,146 [INFO] Successfully wrote YAML file: ../tests/gen3_schema/examples/yaml/_settings.yaml
2025-10-09 10:46:43,148 [INFO] Successfully wrote YAML file: ../tests/gen3_schema/examples/yaml/program.yaml
2025-10-09 10:46:43,149 [INFO] Successfully wrote YAML file: ../tests/gen3_schema/examples/yaml/sample.yaml


demographic.yaml
project.yaml
acknowledgement.yaml
medical_history.yaml
program.yaml
sample.yaml
lipidomics_file.yaml
publication.yaml
subject.yaml
core_metadata_collection.yaml


In [None]:
write_yaml()

{'demographic.yaml': {'$schema': 'http://json-schema.org/draft-04/schema#',
  'id': 'demographic',
  'title': 'Demographic',
  'type': 'object',
  'namespace': 'http://commons.heartdata.baker.edu.au/',
  'category': 'clinical',
  'program': '*',
  'project': '*',
  'description': 'Data for the characterization of the patient by means of segementing the population (e.g. characterization by age, sex, or race).',
  'additionalProperties': False,
  'submittable': True,
  'validators': None,
  'systemProperties': ['id',
   'project_id',
   'state',
   'created_datetime',
   'updated_datetime'],
  'links': [{'name': 'subjects',
    'backref': 'demographics',
    'label': 'describes',
    'target_type': 'subject',
    'multiplicity': 'one_to_one',
    'required': True}],
  'required': ['type', 'submitter_id', 'subjects'],
  'uniqueKeys': [['id'], ['project_id', 'submitter_id']],
  'properties': {'$ref': '_definitions.yaml#/ubiquitous_properties',
   'subjects': {'$ref': '_definitions.yaml#/to