# Resolve Schema

**input**
```
+schema_path: str
```

**methods**
```
+read_json(schema_path: str): dict
+split_json(schema: dict): list
+resolver(entity1: dict, entity2: dict): dict
+resolve_defs(terms: dict, defs: dict): dict
+node_order(schema: dict): list
+resolve_nodes(nodeList: list, splitJsonList: list): list
+recombine_nodes(resolvedList: list): dict
```

In [1]:
import gen3_data_validator

In [2]:
# Build the schema resolver, the data parser for ../data/fail, and the
# linkage tester that is handed both of them.
ResolverClass = gen3_data_validator.ResolveSchema(
    schema_path="../schema/gen3_test_schema.json",
)
DataClass = gen3_data_validator.ParseData(
    data_folder_path="../data/fail",
)
LinkageClass = gen3_data_validator.TestLinkage(
    schema_resolver=ResolverClass,
    data_parser=DataClass,
)


=== Resolving Schema References ===
Resolved demographic.yaml
Resolved project.yaml
Resolved serum_marker_assay.yaml
Resolved alignment_workflow.yaml
Resolved imaging_file.yaml
Resolved lipidomics_assay.yaml
Resolved metabolomics_file.yaml
Resolved acknowledgement.yaml
Resolved medical_history.yaml
Resolved _settings.yaml
Resolved blood_pressure_test.yaml
Resolved genomics_assay.yaml
Resolved variant_file.yaml
Resolved program.yaml
Resolved serum_marker_file.yaml
Resolved proteomics_assay.yaml
Resolved sample.yaml
Resolved unaligned_reads_file.yaml
Resolved aligned_reads_index_file.yaml
Resolved variant_workflow.yaml
Resolved proteomics_file.yaml
Resolved exposure.yaml
Resolved metabolomics_assay.yaml
Resolved lipidomics_mapping_file.yaml
Resolved lipidomics_file.yaml
Resolved aligned_reads_file.yaml
Resolved lab_result.yaml
Resolved medication.yaml
Resolved publication.yaml
Resolved subject.yaml
Resolved core_metadata_collection.yaml
=== Validating Config Map ===
Root Node = ['subject

In [None]:
# Reuse the linkage tester (which already holds the resolved schema) with a
# hand-written config map and data map instead of the parsed files.
#
# config_map: for each entity, the field holding its own key and the field
# holding the key of the entity it links to ("project" links to nothing).
config_map = {
    "samples": dict(primary_key="sample_id", foreign_key="subject_id"),
    "files": dict(primary_key="file_id", foreign_key="sample_id"),
    "subjects": dict(primary_key="subject_id", foreign_key="project_id"),
    "project": dict(primary_key="project_id", foreign_key=None),
}

# data_map: the records per entity. Only subject_1/subject_2, sample_1..4 and
# project_1 exist, so the references flagged below point at missing records.
data_map = {
    "samples": [
        {"sample_id": sample_id, "subject_id": subject_id}
        for sample_id, subject_id in (
            ("sample_1", "subject_9"),  # subject_9 is not in "subjects"
            ("sample_2", "subject_3"),  # invalid FK
            ("sample_3", "subject_4"),  # invalid FK
            ("sample_4", "subject_5"),  # invalid FK
        )
    ],
    "files": [
        {"file_id": file_id, "sample_id": sample_id}
        for file_id, sample_id in (
            ("file_1", "sample_1"),
            ("file_2", "sample_27"),  # invalid FK
        )
    ],
    "subjects": [
        {"subject_id": subject_id, "project_id": project_id}
        for subject_id, project_id in (
            ("subject_1", "project_1"),
            ("subject_2", "project_2"),  # project_2 is not in "project"
        )
    ],
    "project": [
        {"project_id": "project_1"},
    ],
}

LinkageClass.validate_links(data_map, config_map)

In [3]:
# Re-create the schema resolver and open the lipid manifest workbook with
# the xlsx metadata parser (skip_rows=1 is passed through as given).
ResolverClass = gen3_data_validator.ResolveSchema(
    schema_path="../schema/gen3_test_schema.json",
)
xlsxData = gen3_data_validator.ParseXlsxMetadata(
    xlsx_path="../data/restricted/ausdiab_lipid_manifest.xlsx",
    skip_rows=1,
)


=== Resolving Schema References ===
Resolved demographic.yaml
Resolved project.yaml
Resolved serum_marker_assay.yaml
Resolved alignment_workflow.yaml
Resolved imaging_file.yaml
Resolved lipidomics_assay.yaml
Resolved metabolomics_file.yaml
Resolved acknowledgement.yaml
Resolved medical_history.yaml
Resolved _settings.yaml
Resolved blood_pressure_test.yaml
Resolved genomics_assay.yaml
Resolved variant_file.yaml
Resolved program.yaml
Resolved serum_marker_file.yaml
Resolved proteomics_assay.yaml
Resolved sample.yaml
Resolved unaligned_reads_file.yaml
Resolved aligned_reads_index_file.yaml
Resolved variant_workflow.yaml
Resolved proteomics_file.yaml
Resolved exposure.yaml
Resolved metabolomics_assay.yaml
Resolved lipidomics_mapping_file.yaml
Resolved lipidomics_file.yaml
Resolved aligned_reads_file.yaml
Resolved lab_result.yaml
Resolved medication.yaml
Resolved publication.yaml
Resolved subject.yaml
Resolved core_metadata_collection.yaml


In [4]:
# Write the parsed workbook out as JSON files into the metadata folder.
xlsxData.write_dict_to_json(
    output_dir="../data/restricted/ausdiab_lipid_metadata/",
)

JSON files written to ../data/restricted/ausdiab_lipid_metadata/


In [5]:
# Parse the JSON files written by the previous cell and test their links
# against the schema resolved earlier.
DataClass = gen3_data_validator.ParseData(
    data_folder_path="../data/restricted/ausdiab_lipid_metadata/",
)
LinkageClass = gen3_data_validator.TestLinkage(
    schema_resolver=ResolverClass,
    data_parser=DataClass,
)

=== Validating Config Map ===
Root Node = ['subject']
Config Map Validated
=== Validating Links ===
Entity 'lipidomics_assay' has 0 invalid foreign keys: []
Entity 'lipidomics_mapping_file' has 0 invalid foreign keys: []
Entity 'sample' has 0 invalid foreign keys: []
Entity 'subject' has 10339 invalid foreign keys: ['project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdiab-001', 'project-ausdia

# validation prototype


In [None]:
from jsonschema import Draft4Validator
from functools import wraps
from datetime import datetime
from functools import wraps
from time import time


class Validate:
    """
    Validates parsed data objects against a resolved JSON schema.

    Validation runs once, at construction time. The outcome is stored on
    ``self.validation_result`` with the shape::

        {entity: [{"index_0": [result, ...]}, {"index_1": [result, ...]}, ...]}

    where every ``result`` is a dict built by ``validate_object``.
    """

    def __init__(self, data_map, resolved_schema):
        """
        Stores the inputs and immediately validates them.

        Args:
            data_map (dict): Entity name -> list of JSON objects (dicts).
            resolved_schema (dict): Schema name (with or without a ".yaml"
                suffix) -> resolved JSON schema.
        """
        self.data_map = data_map
        self.resolved_schema = resolved_schema
        self.validation_result = self.validate_schema(self.data_map, self.resolved_schema)

    def validate_object(self, obj, idx, validator) -> list:
        """
        Validates a single JSON object against a provided JSON schema validator.

        Args:
            obj (dict): The JSON object to validate.
            idx (int): The index of the object in the dataset.
            validator (Draft4Validator): The JSON schema validator to use; only
                its ``iter_errors(obj)`` method is called.

        Returns:
            list: One "PASS" result dict when the validator reports no errors,
            otherwise one "FAIL" result dict per reported error.
        """
        errors = list(validator.iter_errors(obj))

        # Every reported error counts. Skipping the first one would turn an
        # object with exactly one error into a PASS.
        if not errors:
            return [
                {
                    "index": idx,
                    "validation_result": "PASS",
                    "invalid_key": None,
                    "schema_path": None,
                    "validator": None,
                    "validator_value": None,
                    "validation_error": None
                }
            ]

        validation_results = []
        for error in errors:
            # An empty path means the error applies to the object itself.
            invalid_key = ".".join(str(k) for k in error.path) if error.path else "root"
            schema_path = ".".join(str(k) for k in error.schema_path)

            validation_results.append(
                {
                    "index": idx,
                    "validation_result": "FAIL",
                    "invalid_key": invalid_key,
                    "schema_path": schema_path,
                    "validator": error.validator,
                    "validator_value": error.validator_value,
                    "validation_error": error.message
                }
            )

        return validation_results

    def validate_schema(self, data_map: dict, resolved_schema: dict) -> dict:
        """
        Validates every entity in ``data_map`` against its resolved schema.

        Entities without a matching schema are reported with a printed
        warning and left out of the result.

        Args:
            data_map (dict): A dictionary where keys are entity names and values
                are lists of JSON objects to be validated.
            resolved_schema (dict): A dict of resolved JSON schema objects to
                validate against, keyed by entity name with or without a
                ".yaml" suffix.

        Returns:
            dict: Entity name -> list of ``{"index_<n>": [result, ...]}`` dicts,
            one per object, in input order.
        """
        validation_results = {}

        data_nodes = list(data_map.keys())
        print(f"Data nodes: {data_nodes}")
        schema_keys = [key[:-5] if key.endswith('.yaml') else key for key in resolved_schema.keys()]
        print(f"Schema keys: {schema_keys}")
        known_schemas = set(schema_keys)

        for node in data_nodes:
            if node not in known_schemas:
                print(f"Warning: {node} not found in resolved schema keys.")
                continue

            # The membership test above ignores the ".yaml" suffix, so the
            # lookup has to accept both spellings of the key as well.
            yaml_key = f"{node}.yaml"
            if yaml_key in resolved_schema:
                schema = resolved_schema[yaml_key]
            else:
                schema = resolved_schema[node]
            validator = Draft4Validator(schema)

            validation_results[node] = [
                {"index_" + str(idx): self.validate_object(obj, idx, validator)}
                for idx, obj in enumerate(data_map[node])
            ]

        return validation_results

    def pull_entity(self, entity: str, result_type: str = "FAIL") -> list:
        """
        Retrieves the validation results for a specified entity.

        An object is classified by its first result, which is enough because
        an object holds either a single PASS result or only FAIL results.

        Args:
            entity (str): The name of the entity to retrieve validation results for.
            result_type (str, optional): The type of validation result to return. Either ["PASS", "FAIL", "ALL"]

        Returns:
            list: The ``{"index_<n>": [...]}`` dicts of the matching objects.

        Raises:
            KeyError: If ``entity`` has no validation results.
        """
        rows = self.validation_result[entity]
        if result_type == "ALL":
            return list(rows)

        return [
            row for row in rows
            if next(iter(row.values()))[0]["validation_result"] == result_type
        ]

    def pull_index_of_entity(self, entity: str, index_key: "int | str", result_type: str = "FAIL", return_failed: bool = True) -> list:
        """
        Retrieves the validation results for a specified entity and index key.

        Args:
            entity (str): The name of the entity to retrieve validation results for.
            index_key (int | str): Either the result key itself (e.g. "index_3")
                or the integer position of the object (e.g. 3).
            result_type (str, optional): The type of validation result to return. Either ["PASS", "FAIL", "ALL"]
            return_failed (bool, optional): Unused; kept so existing callers
                that pass it keep working.

        Returns:
            list: The matching result dicts; empty when the index key does not
            exist or nothing matches ``result_type``.

        Raises:
            KeyError: If ``entity`` has no validation results.
        """
        # Results are stored under "index_<n>"; accept the bare integer too.
        if isinstance(index_key, int):
            index_key = f"index_{index_key}"

        data = self.validation_result[entity]
        index_data = next((item[index_key] for item in data if index_key in item), None)
        if index_data is None:
            return []

        if result_type == "ALL":
            return list(index_data)

        return [obj for obj in index_data if obj.get("validation_result") == result_type]

    def list_entities(self) -> list:
        """
        Lists all entities present in the validation results.

        Returns:
            list: A list of entity names.
        """
        return list(self.validation_result.keys())

    def list_index_by_entity(self, entity: str) -> list:
        """
        Lists all index keys for a specified entity.

        Args:
            entity (str): The name of the entity to list index keys for.

        Returns:
            list: The "index_<n>" keys of the entity, in validation order.

        Raises:
            KeyError: If ``entity`` has no validation results.
        """
        # Each stored item is a single-key dict, so its first key is the index key.
        return [next(iter(obj)) for obj in self.validation_result[entity]]


class ValidateStats(Validate):
    """
    Summary statistics over the results held by an existing ``Validate``.

    Results are stored per entity as ``{"index_<n>": [result, ...]}`` dicts;
    the ``index_key`` arguments below are those "index_<n>" keys, as returned
    by ``list_index_by_entity``.
    """

    def __init__(self, validate_instance: Validate):
        """
        Copies the state of an already-validated ``Validate`` instance.

        ``Validate.__init__`` is not called here because it would run
        ``validate_schema`` a second time.

        Args:
            validate_instance (Validate): The instance whose data, schema and
                validation results are reused.
        """
        self.data_map = validate_instance.data_map
        self.resolved_schema = validate_instance.resolved_schema
        self.validation_result = validate_instance.validation_result

    def n_rows_with_errors(self, entity: str) -> int:
        """
        Returns the number of rows that have validation errors for a given entity.

        Args:
            entity (str): The name of the entity to check for validation errors.

        Returns:
            int: The number of rows with validation errors.
        """
        return len(self.pull_entity(entity, result_type="FAIL"))

    def count_results_by_index(self, entity: str, index_key: str, result_type: str = "FAIL", print_results: bool = False) -> int:
        """
        Counts the number of validation results based on a specified entity and index_key.
        For example the entity 'sample' will have an error in row 1 / index 1, which contains
        5 validation errors due to errors in 5 columns for that row. So the method will return
        5 validation errors.

        Args:
            entity (str): The name of the entity to count validation results for.
            index_key (str): The key/index to count validation results for, e.g. "index_1".
            result_type (str, optional): The type of validation result to count. Either ["PASS", "FAIL", "ALL"]
            print_results (bool, optional): Flag to print the results.

        Returns:
            int: The number of validation results for the specified key/index.
        """
        # pull_index_of_entity already filters by result_type, so the length
        # of what it returns is the count.
        validation_count = len(
            self.pull_index_of_entity(entity=entity, index_key=index_key, result_type=result_type)
        )

        if print_results:
            print(f"Number of {result_type} validations for '{entity}' at '{index_key}': {validation_count}")
        return validation_count

    def count_results_by_entity(self, entity: str, result_type: str = "FAIL", print_results: bool = False) -> int:
        """
        Counts the number of validation results for a specified entity. Each entry in the
        entity may produce more than one validation error, which will be counted. For
        example, one entry, in 'sample' may result in 5 validation errors. This function counts
        the total number of validation errors for a whole entity.

        Args:
            entity (str): The name of the entity to count validation results for.
            result_type (str, optional): The type of validation result to count. Either ["PASS", "FAIL", "ALL"]
            print_results (bool, optional): Flag to print the results.

        Returns:
            int: The number of matching validation results for the specified entity.
        """
        validation_count = sum(
            self.count_results_by_index(entity=entity, index_key=index_key, result_type=result_type)
            for index_key in self.list_index_by_entity(entity=entity)
        )

        if print_results:
            print(f"Number of total {result_type} validations for '{entity}': {validation_count}")
        return validation_count

    def n_errors_per_entity(self, entity: str) -> int:
        """
        Returns the number of entries of an entity that failed validation.

        This counts entries (rows), not individual errors; use
        ``count_results_by_entity`` for the total number of errors.

        Args:
            entity (str): The name of the entity to check for validation errors.

        Returns:
            int: The number of rows with validation errors.
        """
        # pull_entity selects by result_type; it has no return_failed parameter.
        return len(self.pull_entity(entity, result_type="FAIL"))

    def n_errors_per_entry(self, entity: str, index_key: str) -> int:
        """
        Returns the number of validation errors for a given entity and index.

        Args:
            entity (str): The name of the entity to check for validation errors.
            index_key (str): The index key of the row to check, e.g. "index_1".

        Returns:
            int: The number of validation errors for the given entity and index.
        """
        return len(self.pull_index_of_entity(entity, index_key))

    def total_validation_errors(self) -> int:
        """
        Calculates the total number of validation errors across all entities.

        Returns:
            int: The total number of validation errors.
        """
        return sum(
            self.count_results_by_entity(entity=entity, result_type="FAIL")
            for entity in self.list_entities()
        )

    def summary_stats(self):
        """
        Prints a summary of validation statistics including total validation errors and errors per entity.
        """
        print(f"Total validation errors: {self.total_validation_errors()}")
        for entity in self.list_entities():
            print(f"\nEntity: {entity}")
            print(f"Number of rows with errors: {self.n_rows_with_errors(entity)}")
            print(f"Number of errors per entity: {self.count_results_by_entity(entity, result_type='FAIL')}")
    



import gen3_data_validator
# End-to-end run of the prototype: resolve the schema, parse ../data/fail,
# validate every entity and print the summary statistics.
resolver = gen3_data_validator.ResolveSchema(
    schema_path="../schema/gen3_test_schema.json",
)
data = gen3_data_validator.ParseData(data_folder_path="../data/fail")
# Linkage testing is left out of this run:
# linkage = gen3_data_validator.TestLinkage(schema_resolver=resolver, data_parser=data)
validator = Validate(
    data_map=data.data_dict,
    resolved_schema=resolver.schema_resolved,
)
validate_stats = ValidateStats(validator)
validate_stats.summary_stats()

=== Resolving Schema References ===
Resolved demographic.yaml
Resolved project.yaml
Resolved serum_marker_assay.yaml
Resolved alignment_workflow.yaml
Resolved imaging_file.yaml
Resolved lipidomics_assay.yaml
Resolved metabolomics_file.yaml
Resolved acknowledgement.yaml
Resolved medical_history.yaml
Resolved _settings.yaml
Resolved blood_pressure_test.yaml
Resolved genomics_assay.yaml
Resolved variant_file.yaml
Resolved program.yaml
Resolved serum_marker_file.yaml
Resolved proteomics_assay.yaml
Resolved sample.yaml
Resolved unaligned_reads_file.yaml
Resolved aligned_reads_index_file.yaml
Resolved variant_workflow.yaml
Resolved proteomics_file.yaml
Resolved exposure.yaml
Resolved metabolomics_assay.yaml
Resolved lipidomics_mapping_file.yaml
Resolved lipidomics_file.yaml
Resolved aligned_reads_file.yaml
Resolved lab_result.yaml
Resolved medication.yaml
Resolved publication.yaml
Resolved subject.yaml
Resolved core_metadata_collection.yaml
Data nodes: ['metabolomics_file', 'medical_history'

In [34]:
# Fixture for writing tests: hand-written validation results for one entity,
# 'sample', covering four objects (indexes 0-3). Each inner list holds the
# result dicts of one object; objects 0 and 1 fail, objects 2 and 3 pass.
# NOTE(review): Validate.validate_schema wraps each object's list in a
# {"index_<n>": [...]} dict, whereas this fixture uses bare lists - confirm
# which shape the tests should target.

sample_validation_results = {
    'sample': [
        # Object 0: three failures (one type error, two enum errors).
        [
            {
                'index': 0,
                'validation_result': 'FAIL',
                'invalid_key': 'freeze_thaw_cycles',
                'schema_path': 'properties.freeze_thaw_cycles.type',
                'validator': 'type',
                'validator_value': 'integer',
                'validation_error': "'10' is not of type 'integer'"
            },
            {
                'index': 0,
                'validation_result': 'FAIL',
                'invalid_key': 'sample_provider',
                'schema_path': 'properties.sample_provider.enum',
                'validator': 'enum',
                'validator_value': ['Baker', 'USYD', 'UMELB', 'UQ'],
                'validation_error': "45 is not one of ['Baker', 'USYD', 'UMELB', 'UQ']"
            },
            {
                'index': 0,
                'validation_result': 'FAIL',
                'invalid_key': 'sample_storage_method',
                'schema_path': 'properties.sample_storage_method.enum',
                'validator': 'enum',
                'validator_value': [
                    'not stored',
                    'ambient temperature',
                    'cut slide',
                    'fresh',
                    'frozen, -70C freezer',
                    'frozen, -150C freezer',
                    'frozen, liquid nitrogen',
                    'frozen, vapor phase',
                    'paraffin block',
                    'RNAlater, frozen',
                    'TRIzol, frozen'
                ],
                'validation_error': "'Autoclave' is not one of ['not stored', 'ambient temperature', 'cut slide', 'fresh', 'frozen, -70C freezer', 'frozen, -150C freezer', 'frozen, liquid nitrogen', 'frozen, vapor phase', 'paraffin block', 'RNAlater, frozen', 'TRIzol, frozen']"
            }
        ],
        # Object 1: two failures (one type error, one enum error).
        [
            {
                'index': 1,
                'validation_result': 'FAIL',
                'invalid_key': 'freeze_thaw_cycles',
                'schema_path': 'properties.freeze_thaw_cycles.type',
                'validator': 'type',
                'validator_value': 'integer',
                'validation_error': "'76' is not of type 'integer'"
            },
            {
                'index': 1,
                'validation_result': 'FAIL',
                'invalid_key': 'sample_storage_method',
                'schema_path': 'properties.sample_storage_method.enum',
                'validator': 'enum',
                'validator_value': [
                    'not stored',
                    'ambient temperature',
                    'cut slide',
                    'fresh',
                    'frozen, -70C freezer',
                    'frozen, -150C freezer',
                    'frozen, liquid nitrogen',
                    'frozen, vapor phase',
                    'paraffin block',
                    'RNAlater, frozen',
                    'TRIzol, frozen'
                ],
                'validation_error': "'In the Pantry' is not one of ['not stored', 'ambient temperature', 'cut slide', 'fresh', 'frozen, -70C freezer', 'frozen, -150C freezer', 'frozen, liquid nitrogen', 'frozen, vapor phase', 'paraffin block', 'RNAlater, frozen', 'TRIzol, frozen']"
            }
        ],
        # Object 2: passes; a PASS entry carries None in every detail field.
        [
            {
                'index': 2,
                'validation_result': 'PASS',
                'invalid_key': None,
                'schema_path': None,
                'validator': None,
                'validator_value': None,
                'validation_error': None
            }
        ],
        # Object 3: passes.
        [
            {
                'index': 3,
                'validation_result': 'PASS',
                'invalid_key': None,
                'schema_path': None,
                'validator': None,
                'validator_value': None,
                'validation_error': None
            }
        ]
    ]
}




Number of objects: 4
Number of passes: 2
