# Resolve Schema

**input**
```
+schema_path: str
```

**methods**
```
+read_json(schema_path: str): dict
+split_json(schema: dict): list
+resolver(entity1: dict, entity2: dict): dict
+resolve_defs(terms: dict, defs: dict): dict
+node_order(schema: dict): list
+resolve_nodes(nodeList: list, splitJsonList: list): list
+recombine_nodes(resolvedList: list): dict
```

In [61]:
import json
from collections import defaultdict, deque

class ResolveSchema:
    """Load a bundled JSON schema and derive a dependency order for its nodes.

    The schema is read as a dict keyed by node file name (for example
    ``"demographic.yaml"``). Each non-excluded entry must provide an ``id``
    and a ``links`` value. On construction the file is parsed, every link is
    turned into an ``(upstream, downstream)`` pair and the pairs are
    topologically sorted.

    Attributes set by ``__init__``:
        schema_path: Path the schema was read from.
        schema: Parsed JSON content.
        nodes: Top-level keys of ``schema``.
        node_pairs: ``(upstream, downstream)`` tuples of the non-excluded nodes.
        node_order: Node ids as returned by ``get_node_order``.
    """

    # Schema entries skipped by default when collecting node pairs.
    DEFAULT_EXCLUDED_NODES = (
        "_definitions.yaml",
        "_terms.yaml",
        "_settings.yaml",
        "program.yaml",
    )

    def __init__(self, schema_path: str):
        """Read the schema at ``schema_path`` and precompute the node order."""
        self.schema_path = schema_path
        self.schema = self.read_json(self.schema_path)
        self.nodes = self.get_nodes()
        self.node_pairs = self.get_all_node_pairs()
        self.node_order = self.get_node_order(edges=self.node_pairs)

    def read_json(self, path: str):
        """Return the parsed content of the JSON file at ``path``."""
        # Explicit encoding so the result does not depend on the platform default.
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    def get_nodes(self):
        """Return the top-level keys of the schema as a list."""
        return list(self.schema)

    def get_node_link(self, node_name: str):
        """Return ``(node_id, links)`` for the schema entry ``node_name``.

        When the first link is a wrapper holding a ``subgroup`` key, the
        content of that subgroup is returned in place of the link list.

        Raises:
            KeyError: If ``node_name`` is unknown or the entry lacks ``links``
                or ``id``.
        """
        node = self.schema[node_name]
        links = node['links']
        node_id = node['id']
        # Only inspect the first element when there is one; an empty list or a
        # single link given as a dict is passed through unchanged.
        if isinstance(links, list) and links and 'subgroup' in links[0]:
            return node_id, links[0]['subgroup']
        return node_id, links

    def find_upstream_downstream(self, node_name: str):
        """Takes a node name and returns the upstream and downstream nodes.

        Returns:
            A list with one ``(upstream, downstream)`` tuple per link, where
            upstream is the link's ``target_type`` and downstream is the
            node's own ``id``. A link without ``target_type`` (or a node with
            an empty id) yields ``(None, None)`` and a printed warning.
        """
        node_id, links = self.get_node_link(node_name)

        # Ensure links is a list
        if isinstance(links, dict):
            links = [links]

        results = []

        for link in links:
            target_type = link.get("target_type")

            if not node_id or not target_type:
                print("Missing essential keys in link:", link)
                results.append((None, None))
                continue

            results.append((target_type, node_id))

        return results

    def get_all_node_pairs(self, excluded_nodes=None):
        """Collect the ``(upstream, downstream)`` pairs of every node.

        Args:
            excluded_nodes: Schema keys to skip. Defaults to
                ``DEFAULT_EXCLUDED_NODES`` when ``None``.

        Returns:
            The concatenated results of ``find_upstream_downstream`` for all
            nodes that are not excluded, in schema order.
        """
        if excluded_nodes is None:
            excluded_nodes = self.DEFAULT_EXCLUDED_NODES

        node_pairs = []
        for node in self.nodes:
            if node not in excluded_nodes:
                node_pairs.extend(self.find_upstream_downstream(node))
        return node_pairs

    def get_node_order(self, edges: list):
        """Topologically sort the nodes described by ``edges``.

        Args:
            edges: ``(upstream, downstream)`` tuples. Pairs containing
                ``None`` (the placeholder produced for incomplete links) are
                ignored.

        Returns:
            Node ids ordered so that every upstream node precedes its
            downstream nodes, except that ``"core_metadata_collection"``,
            when present, is always moved to the end. Nodes that are part of
            a cycle never reach in-degree zero and are therefore omitted.
        """
        # Build graph representation
        graph = defaultdict(list)
        in_degree = defaultdict(int)

        for upstream, downstream in edges:
            # Placeholder pairs carry no ordering information and would only
            # add a bogus ``None`` self-loop to the graph.
            if upstream is None or downstream is None:
                continue
            graph[upstream].append(downstream)
            in_degree[downstream] += 1
            if upstream not in in_degree:
                in_degree[upstream] = 0

        # Perform Topological Sorting (Kahn's Algorithm)
        sorted_order = []
        zero_in_degree = deque(node for node in in_degree if in_degree[node] == 0)

        while zero_in_degree:
            node = zero_in_degree.popleft()
            sorted_order.append(node)

            for neighbor in graph[node]:
                in_degree[neighbor] -= 1
                if in_degree[neighbor] == 0:
                    zero_in_degree.append(neighbor)

        # Ensure core_metadata_collection is last (only if the schema has it)
        if "core_metadata_collection" in sorted_order:
            sorted_order.remove("core_metadata_collection")
            sorted_order.append("core_metadata_collection")

        return sorted_order
        
    # def split_json(self):

In [65]:
# Build a ResolveSchema from the test schema file and display the parsed dict.
resolved_schema = ResolveSchema("../schema/gen3_test_schema.json")
resolved_schema.schema
# resolved_schema.get_node_link("serum_marker_assay.yaml")

{'demographic.yaml': {'$schema': 'http://json-schema.org/draft-04/schema#',
  'additionalProperties': False,
  'category': 'clinical',
  'description': 'Data for the characterization of the patient by means of segementing the population (e.g. characterization by age, sex, or race).',
  'id': 'demographic',
  'links': [{'backref': 'demographics',
    'label': 'describes',
    'multiplicity': 'one_to_one',
    'name': 'subjects',
    'required': True,
    'target_type': 'subject'}],
  'namespace': 'https://data.test.biocommons.org.au/',
  'program': '*',
  'project': '*',
  'properties': {'$ref': '_definitions.yaml#/ubiquitous_properties',
   'abs_cluster': {'description': 'Australian bureau of statistics: Name of the suburb where the participant lives',
    'type': 'string'},
   'abs_state': {'description': 'Australian bureau of statistics: Name of the state where participant lives',
    'type': 'string'},
   'abs_weight_2000': {'description': 'Australian bureau of statistics: Populatio

In [5]:
# NOTE: this cell fails with the TypeError recorded below — the 'links' value
# is a list, and lists are unhashable, so it cannot be used as a dict key.
ref_dict = {
    resolved_schema.schema['demographic.yaml']['links']: resolved_schema.schema['demographic.yaml']['links']
    }


TypeError: unhashable type: 'list'

In [22]:
# Inspect the links of a single node.
# NOTE(review): the recorded output below is a bare list of links, whereas
# ResolveSchema.get_node_link as defined above returns a (node_id, links)
# tuple — presumably this output predates that change; re-run to confirm.
resolved_schema.get_node_link('lipidomics_file.yaml')

[{'backref': 'lipidomics_files',
  'label': 'data_from',
  'multiplicity': 'many_to_one',
  'name': 'lipidomics_assays',
  'required': False,
  'target_type': 'lipidomics_assay'},
 {'backref': 'lipidomics_files',
  'label': 'data_from',
  'multiplicity': 'one_to_one',
  'name': 'core_metadata_collections',
  'required': False,
  'target_type': 'core_metadata_collection'}]

In [23]:
# for getting upstream downstream from links
def find_upstream_downstream(links):
    """Derive (upstream, downstream) node pairs from link definitions.

    Accepts either one link dict or a list of link dicts. For each link the
    upstream node is its ``target_type`` and the downstream node is its
    ``backref`` with one trailing ``s`` removed. A link lacking either key
    is reported on stdout and contributes ``(None, None)``.
    """
    # A lone dict is treated as a one-element list.
    link_list = [links] if isinstance(links, dict) else links

    pairs = []
    for entry in link_list:
        parent = entry.get("target_type")
        child = entry.get("backref")

        if not (child and parent):
            print("Missing essential keys in link:", entry)
            pairs.append((None, None))
        else:
            # backref is plural by convention here; drop the final 's'
            if child.endswith('s'):
                child = child[:-1]

            print(f"Upstream: {parent}, Downstream: {child}")
            pairs.append((parent, child))

    return pairs

In [25]:
# Feed one node's links into the standalone find_upstream_downstream helper.
# NOTE(review): this relies on get_node_link returning the links alone; the
# ResolveSchema.get_node_link defined above returns (node_id, links), so
# `link` would be a tuple there — confirm and unpack before re-running.
link = resolved_schema.get_node_link('lipidomics_file.yaml')
find_upstream_downstream(link)

Upstream: lipidomics_assay, Downstream: lipidomics_file
Upstream: core_metadata_collection, Downstream: lipidomics_file


[('lipidomics_assay', 'lipidomics_file'),
 ('core_metadata_collection', 'lipidomics_file')]