# Resolve Schema

**input**
```
+schema_path: str
```

**methods**
```
+read_json(schema_path: str): dict
+split_json(schema: dict): list
+resolver(entity1: dict, entity2: dict): dict
+resolve_defs(terms: dict, defs: dict): dict
+node_order(schema: dict): list
+resolve_nodes(nodeList: list, splitJsonList: list): list
+recombine_nodes(resolvedList: list): dict
```

In [30]:
import json
from collections import defaultdict, deque

class ResolveSchema:
    """
    Load a bundled JSON schema and resolve the ``$ref`` references in its nodes.

    The bundle is a JSON object keyed by node file name (e.g. ``"sample.yaml"``);
    each value is that node's schema. On construction the class reads the bundle,
    derives the link pairs and a topological node order, resolves the definitions
    node against the terms node, and then resolves every other node against the
    resolved definitions.
    """

    def __init__(self, schema_path: str):
        """
        Initialize the ResolveSchema class.

        Parameters:
        - schema_path (str): The path to the JSON schema file.
        """
        self.schema_path = schema_path
        # Whole bundle: {node file name: node schema}
        self.schema = self.read_json(self.schema_path)
        # Node file names, in bundle order
        self.nodes = self.get_nodes()
        # (upstream, downstream) pairs derived from each node's links
        self.node_pairs = self.get_all_node_pairs()
        # Node ids sorted so that upstream nodes come before downstream ones
        self.node_order = self.get_node_order(edges=self.node_pairs)
        # Node schemas as a list; return_schema() searches this list, so it
        # must be populated before the two return_schema() calls below.
        self.schema_list = self.split_json()
        self.schema_def = self.return_schema("_definitions.yaml")
        self.schema_term = self.return_schema("_terms.yaml")
        # Definitions are resolved against the terms first, because every
        # other node is resolved against the resolved definitions.
        self.schema_def_resolved = self.resolve_references(self.schema_def, self.schema_term)
        self.schema_list_resolved = self.resolve_all_references()

    def read_json(self, path: str) -> dict:
        """
        Read a JSON file and return its contents as a dictionary.

        Parameters:
        - path (str): The path to the JSON file.

        Returns:
        - dict: The contents of the JSON file.
        """
        # Explicit encoding so the result does not depend on the platform locale.
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    def get_nodes(self) -> list:
        """
        Retrieve all node names from the schema.

        Returns:
        - list: A list of node names (the top-level keys of the bundle).
        """
        return list(self.schema)

    def get_node_link(self, node_name: str) -> tuple:
        """
        Retrieve the ID and the flattened links for a given node.

        Every link entry that wraps its links in a ``subgroup`` list is replaced
        by the members of that list, wherever it appears in ``links``. A node
        with a missing or empty ``links`` entry yields an empty list instead of
        raising.

        Parameters:
        - node_name (str): The name of the node (a key of the bundle).

        Returns:
        - tuple: ``(node_id, links)`` where ``links`` is a list of link dicts
          and ``node_id`` is ``None`` if the node has no ``id``.

        Raises:
        - KeyError: If ``node_name`` is not in the schema.
        """
        node = self.schema[node_name]
        node_id = node.get('id')

        raw_links = node.get('links') or []
        # Tolerate a single link given as a dict rather than a list.
        if isinstance(raw_links, dict):
            raw_links = [raw_links]

        links = []
        for link in raw_links:
            if isinstance(link, dict) and 'subgroup' in link:
                links.extend(link['subgroup'])
            else:
                links.append(link)
        return node_id, links

    def find_upstream_downstream(self, node_name: str) -> list:
        """
        Takes a node name and returns the upstream and downstream nodes.

        Parameters:
        - node_name (str): The name of the node.

        Returns:
        - list: One ``(upstream, downstream)`` tuple per link, where upstream is
          the link's ``target_type`` and downstream is this node's id. A link
          that lacks the required keys contributes ``(None, None)``.
        """
        node_id, links = self.get_node_link(node_name)

        results = []
        for link in links:
            target_type = link.get("target_type")

            if not node_id or not target_type:
                print("Missing essential keys in link:", link)
                results.append((None, None))
                continue

            results.append((target_type, node_id))

        return results

    def get_all_node_pairs(self, excluded_nodes=("_definitions.yaml", "_terms.yaml", "_settings.yaml", "program.yaml")) -> list:
        """
        Retrieve all node pairs, excluding specified nodes.

        Parameters:
        - excluded_nodes (iterable of str): Node names to exclude. The default
          is an immutable tuple so it cannot be mutated between calls.

        Returns:
        - list: A list of ``(upstream, downstream)`` node pairs.
        """
        excluded = set(excluded_nodes)
        node_pairs = []
        for node in self.nodes:
            if node not in excluded:
                node_pairs.extend(self.find_upstream_downstream(node))
        return node_pairs

    def get_node_order(self, edges: list, last_node: str = "core_metadata_collection") -> list:
        """
        Determine the order of nodes based on their dependencies.

        Parameters:
        - edges (list): A list of ``(upstream, downstream)`` tuples. Pairs
          containing ``None`` (placeholders for malformed links) are ignored.
        - last_node (str): Node that is moved to the end of the order when it
          is present.

        Returns:
        - list: A list of nodes in topological order. Nodes that sit on a
          cycle cannot be ordered and are left out (a message is printed).
        """
        # Build graph representation
        graph = defaultdict(list)
        in_degree = defaultdict(int)

        for upstream, downstream in edges:
            if upstream is None or downstream is None:
                continue
            graph[upstream].append(downstream)
            in_degree[downstream] += 1
            if upstream not in in_degree:
                in_degree[upstream] = 0

        # Perform Topological Sorting (Kahn's Algorithm)
        sorted_order = []
        zero_in_degree = deque([node for node in in_degree if in_degree[node] == 0])

        while zero_in_degree:
            node = zero_in_degree.popleft()
            sorted_order.append(node)

            for neighbor in graph[node]:
                in_degree[neighbor] -= 1
                if in_degree[neighbor] == 0:
                    zero_in_degree.append(neighbor)

        # Nodes whose in-degree never reached zero are part of a cycle.
        ordered = set(sorted_order)
        leftover = [node for node in in_degree if node not in ordered]
        if leftover:
            print(f"Could not order nodes (cyclic links?): {leftover}")

        # Ensure last_node is last, but only if it is actually in the graph.
        if last_node in sorted_order:
            sorted_order.remove(last_node)
            sorted_order.append(last_node)

        return sorted_order

    def split_json(self) -> list:
        """
        Split the schema into a list of individual node schemas.

        Returns:
        - list: A list of node schemas, in the same order as ``self.nodes``.
        """
        return [self.schema[node] for node in self.nodes]

    def return_schema(self, target_id: str) -> dict:
        """
        Retrieves the first dictionary from ``self.schema_list`` whose 'id' key matches the target_id.

        Parameters:
        - target_id (str): The value of the 'id' key to match. A trailing
          ``.yaml`` is removed before matching.

        Returns:
        - dict: The dictionary that matches the target_id, or None if not found.
        """
        if target_id.endswith('.yaml'):
            target_id = target_id[:-5]

        result = next((item for item in self.schema_list if item.get('id') == target_id), None)
        if result is None:
            print(f"{target_id} not found")
        return result

    def resolve_references(self, schema: dict, reference: dict) -> dict:
        """
        Takes a gen3 jsonschema draft 4 as a dictionary and recursively resolves any references using a reference schema which has no references.

        A ``$ref`` that names a file (text before ``#``) is looked up in
        ``reference``; a ``$ref`` with no file part is looked up in ``schema``
        itself. An empty pointer refers to the whole document. When the target
        is a dict it is merged with the sibling keys of the ``$ref`` (siblings
        win); when it is not a dict it replaces the node as-is.

        Parameters:
        - schema (dict): The JSON node to resolve references in.
        - reference (dict): The schema containing the references.

        Returns:
        - dict: The resolved JSON node with references resolved.

        Raises:
        - KeyError: If a pointer segment does not exist in the target document.
        - ValueError: If a reference refers back to itself (circular ``$ref``).
        """

        def lookup(ref_path):
            # partition() tolerates a $ref that has no '#', unlike split('#').
            ref_file, _, ref_key = ref_path.partition('#')
            target = reference if ref_file.strip() else schema
            ref_key = ref_key.strip('/')
            if ref_key:
                for part in ref_key.split('/'):
                    target = target[int(part)] if isinstance(target, list) else target[part]
            return target

        def resolve_node(node, active=()):
            if isinstance(node, list):
                return [resolve_node(item, active) for item in node]
            if not isinstance(node, dict):
                return node
            if '$ref' not in node:
                return {k: resolve_node(v, active) for k, v in node.items()}

            ref_path = node['$ref']
            # `active` holds the refs currently being expanded; seeing one
            # again means the expansion would never terminate.
            if ref_path in active:
                raise ValueError(f"Circular $ref detected: {ref_path}")

            resolved_content = resolve_node(lookup(ref_path), active + (ref_path,))
            if not isinstance(resolved_content, dict):
                # Lists and scalars cannot be merged with sibling keys.
                return resolved_content
            # Merge resolved content with the current node, excluding the $ref key
            siblings = {k: resolve_node(v, active) for k, v in node.items() if k != '$ref'}
            return {**resolved_content, **siblings}

        return resolve_node(schema)

    def resolve_all_references(self) -> list:
        """
        Resolves references in all other schema dictionaries using the resolved definitions schema.

        Resolution is best-effort: a node that fails to resolve is reported
        with a printed message and skipped, so one bad node does not stop
        the rest.

        Returns:
        - list: A list of resolved schema dictionaries.
        """
        resolved_schema_list = []
        for node in self.nodes:

            # The definitions and terms nodes are the references themselves.
            if node in ("_definitions.yaml", "_terms.yaml"):
                continue

            try:
                resolved_schema = self.resolve_references(self.schema[node], self.schema_def_resolved)
                resolved_schema_list.append(resolved_schema)
                print(f"Resolved {node}")
            except KeyError as e:
                print(f"Error resolving {node}: Missing key {e}")
            except Exception as e:
                print(f"Error resolving {node}: {e}")

        return resolved_schema_list

In [32]:
# Build the resolver for the test bundle. The constructor prints one
# "Resolved <node>" line for each node it resolves (see the output below).
resolved_schema = ResolveSchema("../schema/gen3_test_schema.json")

Resolved demographic.yaml
Resolved project.yaml
Resolved serum_marker_assay.yaml
Resolved alignment_workflow.yaml
Resolved imaging_file.yaml
Resolved lipidomics_assay.yaml
Resolved metabolomics_file.yaml
Resolved acknowledgement.yaml
Resolved medical_history.yaml
Resolved _settings.yaml
Resolved blood_pressure_test.yaml
Resolved genomics_assay.yaml
Resolved variant_file.yaml
Resolved program.yaml
Resolved serum_marker_file.yaml
Resolved proteomics_assay.yaml
Resolved sample.yaml
Resolved unaligned_reads_file.yaml
Resolved aligned_reads_index_file.yaml
Resolved variant_workflow.yaml
Resolved proteomics_file.yaml
Resolved exposure.yaml
Resolved metabolomics_assay.yaml
Resolved lipidomics_mapping_file.yaml
Resolved lipidomics_file.yaml
Resolved aligned_reads_file.yaml
Resolved lab_result.yaml
Resolved medication.yaml
Resolved publication.yaml
Resolved subject.yaml
Resolved core_metadata_collection.yaml


In [13]:
# Load the bundle once, then pull out the individual node schemas that the
# rest of this cell works with.
resolved_schema = ResolveSchema("../schema/gen3_test_schema.json")
def_schema, terms_schema, sample_schema, demographic_schema = (
    resolved_schema.return_schema(node_file)
    for node_file in ('_definitions.yaml', '_terms.yaml', 'sample.yaml', 'demographic.yaml')
)


def resolve_references(schema: dict, reference: dict) -> dict:
    """
    Takes a gen3 jsonschema draft 4 as a dictionary and recursively resolves any references using a reference schema which has no references.

    A ``$ref`` that names a file (text before ``#``) is looked up in
    ``reference``; a ``$ref`` with no file part is looked up in ``schema``
    itself. An empty pointer refers to the whole document. When the target is
    a dict it is merged with the sibling keys of the ``$ref`` (siblings win);
    when it is not a dict it replaces the node as-is.

    Parameters:
    - schema (dict): The JSON node to resolve references in.
    - reference (dict): the schema containing the references

    Returns:
    - dict: The resolved JSON node with references resolved.

    Raises:
    - KeyError: If a pointer segment does not exist in the target document.
    - ValueError: If a reference refers back to itself (circular ``$ref``).
    """

    def lookup(ref_path):
        # partition() tolerates a $ref that has no '#', unlike split('#').
        ref_file, _, ref_key = ref_path.partition('#')
        # A file part selects the pre-defined reference; otherwise the schema
        # itself is the document being pointed into.
        target = reference if ref_file.strip() else schema
        ref_key = ref_key.strip('/')
        if ref_key:
            for part in ref_key.split('/'):
                target = target[int(part)] if isinstance(target, list) else target[part]
        return target

    def resolve_node(node, active=()):
        if isinstance(node, list):
            return [resolve_node(item, active) for item in node]
        if not isinstance(node, dict):
            return node
        if '$ref' not in node:
            return {k: resolve_node(v, active) for k, v in node.items()}

        ref_path = node['$ref']
        # `active` holds the refs currently being expanded; seeing one again
        # means the expansion would never terminate.
        if ref_path in active:
            raise ValueError(f"Circular $ref detected: {ref_path}")

        resolved_content = resolve_node(lookup(ref_path), active + (ref_path,))
        if not isinstance(resolved_content, dict):
            # Lists and scalars cannot be merged with sibling keys.
            return resolved_content
        # Merge resolved content with the current node, excluding the $ref key
        siblings = {k: resolve_node(v, active) for k, v in node.items() if k != '$ref'}
        return {**resolved_content, **siblings}

    return resolve_node(schema)

# Resolve the definitions against the terms first; the node schema below is
# then resolved against that result.
def_resolved = resolve_references(def_schema, terms_schema)
def_resolved
# Resolve the sample node using the already-resolved definitions as reference.
resolve_references(sample_schema, def_resolved)

{'$schema': 'http://json-schema.org/draft-04/schema#',
 'additionalProperties': False,
 'category': 'biospecimen',
 'description': "Biospecimen information that links subjects to samples including sample's provider and source.",
 'id': 'sample',
 'links': [{'backref': 'samples',
   'label': 'taken_from',
   'multiplicity': 'many_to_one',
   'name': 'subjects',
   'required': True,
   'target_type': 'subject'}],
 'namespace': 'https://data.test.biocommons.org.au/',
 'program': '*',
 'project': '*',
 'properties': {'created_datetime': {'oneOf': [{'format': 'date-time',
     'type': 'string'},
    {'type': 'null'}],
   'term': {'description': 'A combination of date and time of day in the form [-]CCYY-MM-DDThh:mm:ss[Z|(+|-)hh:mm]\n'}},
  'id': {'pattern': '^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$',
   'term': {'description': 'A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUID

In [None]:
# Map the node name to its links. The links value is a list, which is
# unhashable and therefore cannot itself be used as the dictionary key.
ref_dict = {
    'demographic.yaml': resolved_schema.schema['demographic.yaml']['links']
    }


In [22]:
# NOTE(review): ResolveSchema.get_node_link as defined in this file returns a
# (node_id, links) tuple; the bare list recorded below appears to come from an
# earlier version of the method that returned only the links. Re-run to confirm.
resolved_schema.get_node_link('lipidomics_file.yaml')

[{'backref': 'lipidomics_files',
  'label': 'data_from',
  'multiplicity': 'many_to_one',
  'name': 'lipidomics_assays',
  'required': False,
  'target_type': 'lipidomics_assay'},
 {'backref': 'lipidomics_files',
  'label': 'data_from',
  'multiplicity': 'one_to_one',
  'name': 'core_metadata_collections',
  'required': False,
  'target_type': 'core_metadata_collection'}]

In [23]:
# for getting upstream downstream from links
def find_upstream_downstream(links):
    """
    Derive (upstream, downstream) node pairs from link definitions.

    Parameters:
    - links (list | dict): A list of link dictionaries, or a single link dictionary.

    Returns:
    - list: One tuple per link. Upstream is the link's ``target_type``;
      downstream is the link's ``backref`` with one trailing ``s`` removed.
      A link missing either key yields ``(None, None)``.
    """

    def describe(entry):
        source = entry.get("target_type")
        child = entry.get("backref")

        if not (child and source):
            print("Missing essential keys in link:", entry)
            return (None, None)

        # The backref is plural; drop the final 's' to get the node name.
        child = child[:-1] if child.endswith('s') else child

        print(f"Upstream: {source}, Downstream: {child}")
        return (source, child)

    # A lone link dictionary is treated as a one-element list.
    entries = [links] if isinstance(links, dict) else links
    return [describe(entry) for entry in entries]

In [25]:
# get_node_link returns (node_id, links); only the links list is a valid
# argument for find_upstream_downstream, so unpack the tuple first.
_node_id, link = resolved_schema.get_node_link('lipidomics_file.yaml')
find_upstream_downstream(link)

Upstream: lipidomics_assay, Downstream: lipidomics_file
Upstream: core_metadata_collection, Downstream: lipidomics_file


[('lipidomics_assay', 'lipidomics_file'),
 ('core_metadata_collection', 'lipidomics_file')]