In [None]:
import gen3_validator
# loading gen3 bundled jsonschema into dictionary class
dd = gen3_validator.dict.DataDictionary('../examples/schema/json/schema_dev.json')
dd.parse_schema()

In [None]:
# getting list of entities
dd.get_nodes()

In [None]:
# Returning the jsonschema for a given entity
dd.return_schema('lipidomics_file.yaml')

In [None]:
!pip install dictionaryutils

In [None]:

import dictionaryutils
import json
import yaml
dd = dictionaryutils.DataDictionary('/Users/harrijh/projects/gen3schemadev/examples/schema/yaml')
dd.load_data(directory='/Users/harrijh/projects/gen3schemadev/examples/schema/yaml')
dd_resolved = dd.schema

for k, v in dd_resolved.items():
    with open(f'/Users/harrijh/projects/gen3schemadev/examples/schema/yaml/resolved/{k}.yaml', 'w') as f:
        yaml.safe_dump(v, f)


with open(f'/Users/harrijh/projects/gen3schemadev/examples/schema/json/gen3_bundled_schema_resolved.json', 'w') as f:
    json.dump(dd_resolved, f)

In [None]:
dd = dictionaryutils.load_schemas_from_file('/Users/harrijh/projects/gen3schemadev/examples/schema/json/schema_dev.json')
dd

# To validate a gen3schemadev input yaml with the input schema


In [None]:
# !pip install check-jsonschema

In [None]:
!check-jsonschema --schemafile ../src/gen3schemadev/schema/input_schema.yml ../src/gen3schemadev/schema/input_example.yml --verbose

# To validate a single gen3 schema yaml with the metaschema
- note, the schema file needs to be resolved

In [None]:
!check-jsonschema --schemafile ../src/gen3schemadev/schema/gen3_metaschema.yml ../examples/schema/yaml/resolved/lipidomics_file.yaml --verbose

if you give a non-resolved schema, it will not validate

In [None]:
!check-jsonschema --schemafile ../src/gen3schemadev/schema/gen3_metaschema.yml ../examples/schema/yaml/lipidomics_file.yaml --verbose

# Compiling gen3schemadev input yaml to gen3 yamls
## Steps:
1. Validate input yaml against the input schema
2. Load the input yaml into python dictionary
3. The compiler should start with an empty data structure, with default values for a gen3 schema. This yaml is defined in the `src/gen3schemadev/schema/gen3_schema_template.yml`
4. 

In [2]:
from gen3schemadev.schema.gen3_template import *
from gen3schemadev.utils import *
from gen3schemadev.schema.input_schema import DataModel
import logging

# Set up basic logging configuration
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)

metaschema_path = "../src/gen3schemadev/schema/gen3_metaschema.yml"
converter_template = generate_gen3_template(metaschema_path)
metaschema = load_yaml(metaschema_path)

# loading input example
data = load_yaml('../tests/input_example.yml')
validated_model = DataModel.model_validate(data)


2025-09-30 13:23:47,210 [INFO] Successfully loaded YAML file: ../src/gen3schemadev/schema/gen3_metaschema.yml
2025-09-30 13:23:47,219 [INFO] Successfully loaded YAML file: ../src/gen3schemadev/schema/gen3_metaschema.yml
2025-09-30 13:23:47,223 [INFO] Successfully loaded YAML file: ../tests/input_example.yml


In [None]:
converter_template

In [None]:
metaschema

In [None]:
validated_model.model_dump()

Now we need to read to:
1. validate the input yaml against the input metaschema
2. now we know the input yaml is validated, we can extract the data from the yaml into a data class which has the structure `input.entity.properties.links`
3. We then use the data class to populated the converter template for each entity
4. The populated templates are then written to the output directory


In [6]:
from dataclasses import dataclass
class Entity:
    name: str
    description: str
    category: str
    properties: list
    links: list

from typing import Any

def get_entity_data(entity: str, data: Any) -> Entity:
    try:
        for ent in data.entities:
            if ent.name == entity:
                return ent
        raise ValueError(f"Entity '{entity}' not found in data.entities")
    except AttributeError as e:
        raise AttributeError(f"Invalid data structure: {e}")
    except Exception as e:
        raise Exception(f"An error occurred while retrieving entity data: {e}")

def get_entity_links(entity: str, data: Any) -> list[dict]:
    links = data.links
    entity_links = []
    for link in links:
        if link.child == entity:
            entity_links.append(link.model_dump())
    return entity_links



from dataclasses import dataclass, asdict
from typing import Optional, List, Union

@dataclass
class LinkObj:
    name: str
    backref: str
    label: Optional[str]
    target_type: str
    multiplicity: str
    required: bool

    def to_dict(self):
        return asdict(self)

@dataclass
class LinkGroup:
    exclusive: bool
    required: bool
    subgroup: List[dict]

    def to_dict(self):
        return asdict(self)
    
def create_core_metadata_link(child_name: str) -> dict:
    link_obj = LinkObj(
        name=f"core_metadata_collections",
        backref=f"{child_name}s",
        label=None,
        target_type="core_metadata_collection",
        multiplicity="one_to_one",
        required=True 
    )
    return link_obj.to_dict()

def convert_entity_links(links: dict, entity_file: bool = False) -> dict:
    link_list = []
    for link in links:
        link_obj = LinkObj(
            name=f"{link['parent']}s",
            backref=f"{link['child']}s",
            label=None,
            target_type=link['parent'],
            multiplicity=link['multiplicity'],
            required=True  # TODO remove this hard code later, should pull from input yaml
        )
        link_list.append(link_obj.to_dict())

    if entity_file:
        core_link = create_core_metadata_link(links[0]['child'])
        link_list.append(core_link)

    if len(link_list) > 1:
        group = LinkGroup(
            exclusive=False,
            required=True,
            subgroup=link_list
        )
        output = group.to_dict()
    else:
        output = link_list
    return output

### Note to self, you will now need to find a way to add in the _definitions and _terms references.
### You also need to define a standard for the _definitions and _terms references files to read from as templates

def get_properties(entity_name: str, data: Any) -> list[dict]:
    output = []
    ent = get_entity_data(entity_name, data)
    props = ent.properties
    if props:
        for prop in props:
            pdict = {
                prop.name: {k: v for k, v in prop.model_dump().items() if k != "name"}
            }
            output.append(pdict)
    else:
        raise Exception(f'No properties found for entity {entity_name}')
    return output


def get_category(entity_name: str, data: Any) -> str:
    ent = get_entity_data(entity_name, data)
    category = ent.category
    # If it's an Enum, get its value; otherwise, return as is
    if hasattr(category, "value"):
        return category.value
    return category

def get_entity_value(entity_name: str, key: str, data: Any):
    """
    Returns the value of a single key within an entity object.

    Args:
        entity_name (str): The name of the entity to retrieve.
        key (str): The key whose value is to be returned.
        data (Any): The data structure containing entities.

    Returns:
        The value associated with the specified key in the entity object.
    """
    ent = get_entity_data(entity_name, data)
    return ent.model_dump()[key]


def populate_template(entity_name: str, input_data, template) -> dict:
    """
    Populate a Gen3 schema template dictionary with values from a Pydantic data model.

    This function takes an entity name, a Pydantic model instance containing entity data,
    and a Gen3 schema template dictionary. It fills a copy of the template with values
    from the input data, applying special logic for certain keys (e.g., 'name', 'category',
    'properties', 'links'). If a key from the input data is not found in the template,
    it is added with a value of None and a warning is logged.

    Args:
        entity_name (str): The name of the entity to populate in the template.
        input_data: A Pydantic model instance containing the entity's data.
        template (dict): A Gen3 schema template dictionary to be populated.

    Returns:
        dict: A new Gen3 schema template dictionary populated with values from the input data.

    Side Effects:
        Logs a warning if a key from the input data is not found in the template.
    """
    # ... function body ...
    ent = get_entity_data(entity_name, input_data)
    ent_dict = ent.model_dump()
    output_schema = template.copy()
    
    # Checking if entity is file category
    file_cat = False
    if get_entity_value(entity_name, 'category', input_data) == 'file':
        file_cat = True
    
    for key, value in ent_dict.items():
        if key == 'name':
            output_schema['id'] = value
        elif key == 'category':
            output_schema[key] = get_category(entity_name, input_data)
        elif key == 'properties':
            output_schema[key] = get_properties(entity_name, input_data)
        elif key == 'links':
            links = get_entity_links(entity_name, input_data)
            output_schema[key] = convert_entity_links(links, entity_file=file_cat)
        elif key in output_schema:
            output_schema[key] = value
        else:
            logger.warning(f"Key '{key}' not found in template")
    return output_schema

validated_model = DataModel.model_validate(data)
# links = get_entity_links('lipidomics_file', validated_model)
# convert_entity_links(links, entity_file=True)
# get_properties('sample', validated_model)
# get_category('sample', validated_model)

out_template = populate_template('lipidomics_file', validated_model, converter_template)
out_template
# write_yaml(out_template, 'output.yml')

Exception: No properties found for entity lipidomics_file

# Testing converter


In [3]:
from gen3schemadev.schema.gen3_template import *
from gen3schemadev.utils import *
from gen3schemadev.schema.input_schema import DataModel
from gen3schemadev.converter import *



# Loading template and metaschema
metaschema_path = "../src/gen3schemadev/schema/gen3_metaschema.yml"
converter_template = generate_gen3_template(metaschema_path)
metaschema = load_yaml(metaschema_path)

# loading input example
data = load_yaml('../tests/input_example.yml')
validated_model = DataModel.model_validate(data)


out_template = populate_template('lipidomics_file', validated_model, converter_template)
write_yaml(out_template, 'output.yml')

2025-09-30 16:36:47,338 [INFO] Successfully loaded YAML file: ../src/gen3schemadev/schema/gen3_metaschema.yml


2025-09-30 16:36:47,355 [INFO] Successfully loaded YAML file: ../src/gen3schemadev/schema/gen3_metaschema.yml
2025-09-30 16:36:47,362 [INFO] Successfully loaded YAML file: ../tests/input_example.yml
2025-09-30 16:36:47,384 [INFO] Successfully wrote YAML file: output.yml


Key: name, Value: lipidomics_file
Key: description, Value: Info about lipidomics file
Key: category, Value: data_file
Key: properties, Value: []


In [2]:
get_properties('lipidomics_file', validated_model)
construct_props('lipidomics_file', validated_model)




{'samples': {'$ref': '_definitions.yaml#/one_to_many'},
 'assays': {'$ref': '_definitions.yaml#/one_to_many'}}

In [2]:
get_entity_data('lipidomics_file', validated_model)

Entity(name='lipidomics_file', description='Info about lipidomics file', category=<CategoryEnum.DATA_FILE: 'data_file'>, properties=[])

In [2]:
links = get_entity_links('lipidomics_file', validated_model)
convert_entity_links(links, entity_file=True)
get_entity_value('lipidomics_file', 'category', validated_model) == 'data_file'

True

***
## Trying to create template from metaschema


In [None]:
from gen3schemadev.schema.gen3_template import *

out_template = generate_gen3_template('../src/gen3schemadev/schema/gen3_metaschema.yml')

write_yaml(out_template, 'output.yml')