In [1]:
import gen3_data_validator
from gen3_data_validator.logging_config import setup_logging
# Configure the package's logging; called with no arguments, so its defaults apply
setup_logging()

## Reading in xlsx data and writing to json
- xlsx data comes from xlsx manifest file created from acdc_submission_template

In [2]:
# Parse the xlsx manifest and write the parsed content out as JSON.
# Paths are relative to this notebook's directory (as in the other cells) so the
# example runs from any checkout instead of a single developer's home directory.
# NOTE(review): skip_rows=1 presumably skips the template's first row -- confirm.
xlsxData = gen3_data_validator.ParseXlsxMetadata(
    xlsx_path="../data/lipid_metadata_example.xlsx",
    skip_rows=1,
)
xlsxData.parse_metadata_template()
xlsxData.write_dict_to_json(
    xlsx_data_dict=xlsxData.xlsx_data_dict,
    output_dir="../data/restricted/lipid_metadata_example",
)

## Creating Resolver Instance
- This class reads in the gen3schema.json then resolves the schema for use in the other classes


In [3]:
# Load the bundled test schema, then resolve it so later cells can use the result
Resolver = gen3_data_validator.ResolveSchema(
    schema_path="../tests/schema/gen3_test_schema.json"
)
Resolver.resolve_schema()

In [4]:
# List the node names (one per schema YAML file) found in the resolved schema
Resolver.nodes

['demographic.yaml',
 'project.yaml',
 'serum_marker_assay.yaml',
 'alignment_workflow.yaml',
 'imaging_file.yaml',
 'lipidomics_assay.yaml',
 'metabolomics_file.yaml',
 'acknowledgement.yaml',
 'medical_history.yaml',
 '_definitions.yaml',
 '_settings.yaml',
 'blood_pressure_test.yaml',
 'genomics_assay.yaml',
 'variant_file.yaml',
 'program.yaml',
 'serum_marker_file.yaml',
 'proteomics_assay.yaml',
 'sample.yaml',
 'unaligned_reads_file.yaml',
 '_terms.yaml',
 'aligned_reads_index_file.yaml',
 'variant_workflow.yaml',
 'proteomics_file.yaml',
 'exposure.yaml',
 'metabolomics_assay.yaml',
 'lipidomics_mapping_file.yaml',
 'lipidomics_file.yaml',
 'aligned_reads_file.yaml',
 'lab_result.yaml',
 'medication.yaml',
 'publication.yaml',
 'subject.yaml',
 'core_metadata_collection.yaml']

You can return the resolved schema with

In [16]:
# Full resolved schema, keyed by node file name (e.g. 'demographic.yaml')
Resolver.schema_resolved

{'demographic.yaml': {'$schema': 'http://json-schema.org/draft-04/schema#',
  'additionalProperties': False,
  'category': 'clinical',
  'description': 'Data for the characterization of the patient by means of segementing the population (e.g. characterization by age, sex, or race).',
  'id': 'demographic',
  'links': [{'backref': 'demographics',
    'label': 'describes',
    'multiplicity': 'one_to_one',
    'name': 'subjects',
    'required': True,
    'target_type': 'subject'}],
  'namespace': 'https://data.test.biocommons.org.au/',
  'program': '*',
  'project': '*',
  'properties': {'created_datetime': {'oneOf': [{'format': 'date-time',
      'type': 'string'},
     {'type': 'null'}],
    'term': {'description': 'A combination of date and time of day in the form [-]CCYY-MM-DDThh:mm:ss[Z|(+|-)hh:mm]\n'}},
   'id': {'pattern': '^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$',
    'term': {'description': 'A 128-bit identifier. Depending on the mechanism 

## Parsing data
- The parse data class takes in a data folder path containing json files for each data node


In [5]:
# Read the JSON files from the passing test-data folder (used for the linkage checks)
Data = gen3_data_validator.ParseData(data_folder_path="../tests/data/pass")

To list the files read into the Data instance, you can use the following code:

In [6]:
# Paths of the JSON files that were read from the data folder
Data.file_path_list

['/Users/harrijh/projects/gen3-data-validator/tests/data/pass/metabolomics_file.json',
 '/Users/harrijh/projects/gen3-data-validator/tests/data/pass/medical_history.json',
 '/Users/harrijh/projects/gen3-data-validator/tests/data/pass/metabolomics_assay.json',
 '/Users/harrijh/projects/gen3-data-validator/tests/data/pass/sample.json',
 '/Users/harrijh/projects/gen3-data-validator/tests/data/pass/subject.json']

All of the read data is stored in Data.data_dict as a dictionary, where the key is the entity and the value is a list of json objects

In [7]:
# Parsed records keyed by entity name; each value is a list of record dicts
Data.data_dict

{'metabolomics_file': [{'alternate_timepoint': '1a914a1577',
   'baseline_timepoint': True,
   'cv': 56.94475432813319,
   'data_category': 'mass spec analysed',
   'data_format': 'wiff',
   'data_type': 'MS/MS',
   'file_format': 'e387cadce7',
   'file_name': 'dummy_metab',
   'file_size': 87,
   'ga4gh_drs_uri': '150bf4b457',
   'md5sum': '756c381b71c2a7d346c72998ab334c00',
   'metabolomic_unit': 'pmol/mL',
   'metabolomics_assays': {'submitter_id': 'metabolomics_assay_356580ff6d'},
   'submitter_id': 'metabolomics_file_547f3d4417',
   'type': 'metabolomics_file',
   'metabolomics_files': 'metabolomics_file_547f3d4417'},
  {'alternate_timepoint': '578a14ee53',
   'baseline_timepoint': True,
   'cv': 43.00152620641602,
   'data_category': 'mass spec analysed',
   'data_format': 'wiff',
   'data_type': 'MS/MS',
   'file_format': '47a60862ef',
   'file_name': 'dummy_metab',
   'file_size': 0,
   'ga4gh_drs_uri': '2beb8c16ea',
   'md5sum': '43640335849622369f4843b817c1da2e',
   'metabolo

The default link suffix is 's'
- This link suffix can be changed depending on the key name used for the linked information.

In [8]:
# Suffix used when building link key names from entity names (default shown in the output)
Data.link_suffix

's'

For example, in the JSON object below, we can see that the key "subjects" describes the link from sample to subject, since the value of 'subjects' is an object containing the key "submitter_id".
- Furthermore, the link key is called 'subjects' while the linked entity is called 'subject' (likewise, the record's own key is 'samples' while its entity is called 'sample')
- Therefore, the link suffix is 's'

In [9]:
# First 'sample' record; its 'subjects' key carries the submitter_id of the linked subject
Data.data_dict["sample"][0]

{'alternate_timepoint': '1f56770b0b',
 'baseline_timepoint': True,
 'freeze_thaw_cycles': 10,
 'sample_collection_method': '2fddbe7d09',
 'sample_id': 'd4f31f7bb6',
 'sample_in_preservation': 'snap Frozen',
 'sample_in_storage': 'yes',
 'sample_provider': 'USYD',
 'sample_source': 'UBERON:3781554',
 'sample_storage_method': 'not stored',
 'sample_type': '59a8fd8005',
 'storage_location': 'UMELB',
 'subjects': {'submitter_id': 'subject_e5616257f8'},
 'submitter_id': 'sample_efdbe56d20',
 'type': 'sample',
 'samples': 'sample_efdbe56d20'}

Finally, you can also check what the detected entities are below:

In [10]:
# Entity names detected from the loaded JSON files
Data.data_nodes

['metabolomics_file',
 'medical_history',
 'metabolomics_assay',
 'sample',
 'subject']

## Testing Linkage

The first thing you should do is create a linkage configuration map. The `.generate_config` method will do this for you, it will read in the data (stored in the `data_dict` attribute) and return a linkage configuration map.

The linkage configuration map is a dictionary that maps each entity to a dictionary of its primary and foreign keys, with the format:

```
{
    "entity_name": {
        "primary_key": "primary_key_field",
        "foreign_key": "foreign_key_field"
    }
}
```

Also, you can define the linkage configuration map yourself, but you need to make sure that the primary and foreign keys are defined for each entity.

In [None]:
import gen3_data_validator

# Load the passing test data and derive the primary/foreign key map from it
DataPass = gen3_data_validator.ParseData(data_folder_path="../tests/data/pass")
LinkagePass = gen3_data_validator.Linkage()
link_pass_config = LinkagePass.generate_config(DataPass.data_dict)

# Display the generated configuration map
link_pass_config

{'metabolomics_file': {'primary_key': 'metabolomics_files',
  'foreign_key': 'metabolomics_assays'},
 'medical_history': {'primary_key': 'medical_historys',
  'foreign_key': 'subjects'},
 'metabolomics_assay': {'primary_key': 'metabolomics_assays',
  'foreign_key': 'samples'},
 'sample': {'primary_key': 'samples', 'foreign_key': 'subjects'},
 'subject': {'primary_key': 'subjects', 'foreign_key': None}}

Once you have the linkage configuration map, you can validate the links. The `.validate_links` method will do this for you, it will read in the data and the linkage configuration map then return a dictionary of the linkage validation results.

As a reminder, the data passed to the `.validate_links` method as the `data_map` argument has the format:

```python
{
    "entity_name_1": [
        {
            "field_name": "field_value"
        },
        {
            "field_name": "field_value"
        }
    ],
    "entity_name_2": [
        {
            "field_name": "field_value"
        },
        {
            "field_name": "field_value"
        }
    ]
}
```
Where `entity_name_1` and `entity_name_2` are the names of the entities in the data, and value is a list of json objects, each representing a record in the entity.

In [None]:
import gen3_data_validator

# Build the inputs for link validation from the passing test data
DataPass = gen3_data_validator.ParseData(data_folder_path="../tests/data/pass")
LinkagePass = gen3_data_validator.Linkage()
link_pass_config = LinkagePass.generate_config(DataPass.data_dict)

# Validate the links using the generated config; 'subject' is declared as the root node
LinkagePass.validate_links(
    data_map=DataPass.data_dict,
    config=link_pass_config,
    root_node="subject",
)

=== Validating Config Map ===
Root Node = subject
Config Map Validated
=== Validating Links ===
Entity 'metabolomics_file' has 0 invalid foreign keys: []
Entity 'medical_history' has 0 invalid foreign keys: []
Entity 'metabolomics_assay' has 0 invalid foreign keys: []
Entity 'sample' has 0 invalid foreign keys: []
Entity 'subject' has 0 invalid foreign keys: []


{'metabolomics_file': [],
 'medical_history': [],
 'metabolomics_assay': [],
 'sample': [],
 'subject': []}

Testing linkage for test data that fails:
- Note that the `root_node` argument tells the `validate_links` method which entity is the root node and therefore will not have any upstream links.

In [None]:
# Same workflow against the failing test data, which contains broken links
DataFail = gen3_data_validator.ParseData(data_folder_path="../tests/data/fail")
LinkageFail = gen3_data_validator.Linkage()
link_fail_config = LinkageFail.generate_config(DataFail.data_dict)

# Invalid foreign keys are reported per entity; 'subject' is declared as the root node
LinkageFail.validate_links(
    data_map=DataFail.data_dict,
    config=link_fail_config,
    root_node="subject",
)

=== Validating Config Map ===
Root Node = subject
Config Map Validated
=== Validating Links ===
Entity 'metabolomics_file' has 1099 invalid foreign keys: ['metabolomics_assay_356580ff6d', 'metabolomics_assay_44f829fa47', 'metabolomics_assay_974d137216', 'metabolomics_assay_3d1f400b27', 'metabolomics_assay_d1cd2f492c', 'metabolomics_assay_c025b20da0', 'metabolomics_assay_439725d38f', 'metabolomics_assay_d0350804b1', 'metabolomics_assay_63cef60fa4', 'metabolomics_assay_78465fe5b1', 'metabolomics_assay_3754fe418d', 'metabolomics_assay_0cdd244c6e', 'metabolomics_assay_40a94ece37', 'metabolomics_assay_adc5f88af9', 'metabolomics_assay_b646004109', 'metabolomics_assay_69bcc995f0', 'metabolomics_assay_37be2a2136', 'metabolomics_assay_119df8af52', 'metabolomics_assay_8dedaeccc1', 'metabolomics_assay_b353a7f9b8', 'metabolomics_assay_ebe904af55', 'metabolomics_assay_5bed5ab90c', 'metabolomics_assay_1417aac36c', 'metabolomics_assay_4e133f8d44', 'metabolomics_assay_38d2765ae0', 'metabolomics_assay_

{'metabolomics_file': ['metabolomics_assay_356580ff6d',
  'metabolomics_assay_44f829fa47',
  'metabolomics_assay_974d137216',
  'metabolomics_assay_3d1f400b27',
  'metabolomics_assay_d1cd2f492c',
  'metabolomics_assay_c025b20da0',
  'metabolomics_assay_439725d38f',
  'metabolomics_assay_d0350804b1',
  'metabolomics_assay_63cef60fa4',
  'metabolomics_assay_78465fe5b1',
  'metabolomics_assay_3754fe418d',
  'metabolomics_assay_0cdd244c6e',
  'metabolomics_assay_40a94ece37',
  'metabolomics_assay_adc5f88af9',
  'metabolomics_assay_b646004109',
  'metabolomics_assay_69bcc995f0',
  'metabolomics_assay_37be2a2136',
  'metabolomics_assay_119df8af52',
  'metabolomics_assay_8dedaeccc1',
  'metabolomics_assay_b353a7f9b8',
  'metabolomics_assay_ebe904af55',
  'metabolomics_assay_5bed5ab90c',
  'metabolomics_assay_1417aac36c',
  'metabolomics_assay_4e133f8d44',
  'metabolomics_assay_38d2765ae0',
  'metabolomics_assay_c096f97680',
  'metabolomics_assay_e345b3e502',
  'metabolomics_assay_064911ed4c',

You can check the json files read into the DataFail instance

In [14]:
# JSON files that were read from the failing test-data folder
DataFail.file_path_list

['/Users/harrijh/projects/gen3-data-validator/tests/data/fail/metabolomics_file.json',
 '/Users/harrijh/projects/gen3-data-validator/tests/data/fail/medical_history.json',
 '/Users/harrijh/projects/gen3-data-validator/tests/data/fail/metabolomics_assay.json',
 '/Users/harrijh/projects/gen3-data-validator/tests/data/fail/sample.json',
 '/Users/harrijh/projects/gen3-data-validator/tests/data/fail/subject.json']

This returns all of the foreign keys that are not linked to a primary key

In [15]:
# Per-entity lists of foreign keys that did not match any primary key
LinkageFail.link_validation_results

{'metabolomics_file': ['metabolomics_assay_356580ff6d',
  'metabolomics_assay_44f829fa47',
  'metabolomics_assay_974d137216',
  'metabolomics_assay_3d1f400b27',
  'metabolomics_assay_d1cd2f492c',
  'metabolomics_assay_c025b20da0',
  'metabolomics_assay_439725d38f',
  'metabolomics_assay_d0350804b1',
  'metabolomics_assay_63cef60fa4',
  'metabolomics_assay_78465fe5b1',
  'metabolomics_assay_3754fe418d',
  'metabolomics_assay_0cdd244c6e',
  'metabolomics_assay_40a94ece37',
  'metabolomics_assay_adc5f88af9',
  'metabolomics_assay_b646004109',
  'metabolomics_assay_69bcc995f0',
  'metabolomics_assay_37be2a2136',
  'metabolomics_assay_119df8af52',
  'metabolomics_assay_8dedaeccc1',
  'metabolomics_assay_b353a7f9b8',
  'metabolomics_assay_ebe904af55',
  'metabolomics_assay_5bed5ab90c',
  'metabolomics_assay_1417aac36c',
  'metabolomics_assay_4e133f8d44',
  'metabolomics_assay_38d2765ae0',
  'metabolomics_assay_c096f97680',
  'metabolomics_assay_e345b3e502',
  'metabolomics_assay_064911ed4c',

# Data Validation
- Validating JSON data objects against the gen3 JSON schema


Creating the validation class
- You will need to preload the data under the `data_map` attribute and the resolved schema under the `resolved_schema` attribute in the `Validate` class.

In [17]:
import gen3_data_validator

# Resolve the test schema and load the failing test data
resolver = gen3_data_validator.ResolveSchema(
    schema_path="../tests/schema/gen3_test_schema.json"
)
resolver.resolve_schema()
data = gen3_data_validator.ParseData(data_folder_path="../tests/data/fail")

# The validator is given both the parsed data and the resolved schema up front
validator = gen3_data_validator.Validate(
    data_map=data.data_dict,
    resolved_schema=resolver.schema_resolved,
)


You can call the orchestrator method to run the validation pipeline with `.validate_schema`

In [18]:
# Run the validation pipeline; the result is nested by entity, then by row index
validator.validate_schema()

{'metabolomics_file': [{'index_0': [{'index': 0,
     'validation_result': 'FAIL',
     'invalid_key': 'data_format',
     'schema_path': 'properties.data_format.enum',
     'validator': 'enum',
     'validator_value': ['wiff'],
     'validation_error': "True is not one of ['wiff']"},
    {'index': 0,
     'validation_result': 'FAIL',
     'invalid_key': 'data_type',
     'schema_path': 'properties.data_type.enum',
     'validator': 'enum',
     'validator_value': ['MS', 'MS/MS'],
     'validation_error': "'1' is not one of ['MS', 'MS/MS']"}]},
  {'index_1': [{'index': 1,
     'validation_result': 'PASS',
     'invalid_key': None,
     'schema_path': None,
     'validator': None,
     'validator_value': None,
     'validation_error': None}]},
  {'index_2': [{'index': 2,
     'validation_result': 'PASS',
     'invalid_key': None,
     'schema_path': None,
     'validator': None,
     'validator_value': None,
     'validation_error': None}]},
  {'index_3': [{'index': 3,
     'validation_

What is returned is a data structure in the following format:

```python
{
    'entity_name': [
        {
            'row_index_number': [
                {
                    'index': 0, # this is the index of the row in the entity
                    'invalid_key': 'this_is_the_column_name',
                    'validation_result': 'FAIL',
                    'schema_path': 'this_is_the_path_to_the_property_in_the_schema',
                    'validator': 'the_target_data_type',
                    'validator_value': 'the_correct_value',
                    'validation_error': 'this_is_the_validation_error_message'
                },
                {
                    'index': 0, # this is the index of the row in the entity
                    'invalid_key': 'same_row_validation_error_in_another_column',
                    'validation_result': 'FAIL',
                    'schema_path': 'this_is_the_path_to_the_property_in_the_schema',
                    'validator': 'the_target_data_type',
                    'validator_value': 'the_correct_value',
                    'validation_error': 'this_is_the_validation_error_message'
                }
            ]
        }
    ],
    'metabolomics_file': [
        {
            'index_0': [
                {'index': 0, # error in first row
                'validation_result': 'FAIL',
                'invalid_key': 'data_format', # error in column called data_format
                'schema_path': 'properties.data_format.enum',
                'validator': 'enum',
                'validator_value': ['wiff'],
                'validation_error': "True is not one of ['wiff']"
                },
                {'index': 0, # error in first row
                'validation_result': 'FAIL',
                'invalid_key': 'data_type', # error in column called data_type
                'schema_path': 'properties.data_type.enum',
                'validator': 'enum',
                'validator_value': ['MS', 'MS/MS'],
                'validation_error': "'1' is not one of ['MS', 'MS/MS']"
                }
            ]
        },
        {
            'index_1': [
                {
                    'index': 1, # error in second row
                    'validation_result': 'FAIL',
                    'invalid_key': 'data_format', # error in column called data_format
                    'schema_path': 'properties.data_format.enum',
                    'validator': 'enum',
                    'validator_value': ['wiff'],
                    'validation_error': "True is not one of ['wiff']"
                }
            ]
        }
    ]
}


     
```

Let's say we want to pull the validation results for a specific entity, at a specific row / index:
- `result_type` can be one of `['ALL', 'FAIL', 'PASS']`
- This will return a list of json objects, each representing a validation result for a specific row in the entity

In [19]:
# All validation results (result_type="ALL") for row 0 of the 'sample' entity
validator.pull_index_of_entity(entity="sample", index_key=0, result_type="ALL")

[{'index': 0,
  'validation_result': 'FAIL',
  'invalid_key': 'freeze_thaw_cycles',
  'schema_path': 'properties.freeze_thaw_cycles.type',
  'validator': 'type',
  'validator_value': 'integer',
  'validation_error': "'10' is not of type 'integer'"},
 {'index': 0,
  'validation_result': 'FAIL',
  'invalid_key': 'sample_provider',
  'schema_path': 'properties.sample_provider.enum',
  'validator': 'enum',
  'validator_value': ['Baker', 'USYD', 'UMELB', 'UQ'],
  'validation_error': "45 is not one of ['Baker', 'USYD', 'UMELB', 'UQ']"},
 {'index': 0,
  'validation_result': 'FAIL',
  'invalid_key': 'sample_storage_method',
  'schema_path': 'properties.sample_storage_method.enum',
  'validator': 'enum',
  'validator_value': ['not stored',
   'ambient temperature',
   'cut slide',
   'fresh',
   'frozen, -70C freezer',
   'frozen, -150C freezer',
   'frozen, liquid nitrogen',
   'frozen, vapor phase',
   'paraffin block',
   'RNAlater, frozen',
   'TRIzol, frozen'],
  'validation_error': "'Auto

You can print which entities were validated by using the `.list_entities` method.

In [23]:
# Names of the entities that were validated
validator.list_entities()

['metabolomics_file',
 'medical_history',
 'metabolomics_assay',
 'sample',
 'subject']

If you want to see the row / index names of an entity, you can use the `.list_index_by_entity` method:

In [24]:
# Row keys ('index_0', 'index_1', ...) available for the 'sample' entity
validator.list_index_by_entity("sample")

['index_0', 'index_1', 'index_2']

You can pull out the validation results for a specific entity with the `.pull_entity` method

In [10]:
# Validation results for the 'sample' entity, grouped by row key
validator.pull_entity("sample")

[{'index_0': [{'index': 0,
    'validation_result': 'FAIL',
    'invalid_key': 'freeze_thaw_cycles',
    'schema_path': 'properties.freeze_thaw_cycles.type',
    'validator': 'type',
    'validator_value': 'integer',
    'validation_error': "'10' is not of type 'integer'"},
   {'index': 0,
    'validation_result': 'FAIL',
    'invalid_key': 'sample_provider',
    'schema_path': 'properties.sample_provider.enum',
    'validator': 'enum',
    'validator_value': ['Baker', 'USYD', 'UMELB', 'UQ'],
    'validation_error': "45 is not one of ['Baker', 'USYD', 'UMELB', 'UQ']"},
   {'index': 0,
    'validation_result': 'FAIL',
    'invalid_key': 'sample_storage_method',
    'schema_path': 'properties.sample_storage_method.enum',
    'validator': 'enum',
    'validator_value': ['not stored',
     'ambient temperature',
     'cut slide',
     'fresh',
     'frozen, -70C freezer',
     'frozen, -150C freezer',
     'frozen, liquid nitrogen',
     'frozen, vapor phase',
     'paraffin block',
     '

In [9]:
# Number of row groups returned for 'sample'.
# NOTE(review): the output shows 2 although list_index_by_entity lists 3 row keys --
# presumably only rows with failures are returned by default; confirm.
len(validator.pull_entity("sample"))

2

You can pull validation results for a specific entity and then a specific index / row with the `pull_index_of_entity` method.

In [3]:
# Validation results for row 0 of 'sample', with result_type left at its default
validator.pull_index_of_entity(entity="sample", index_key=0)

[{'index': 0,
  'validation_result': 'FAIL',
  'invalid_key': 'freeze_thaw_cycles',
  'schema_path': 'properties.freeze_thaw_cycles.type',
  'validator': 'type',
  'validator_value': 'integer',
  'validation_error': "'10' is not of type 'integer'"},
 {'index': 0,
  'validation_result': 'FAIL',
  'invalid_key': 'sample_provider',
  'schema_path': 'properties.sample_provider.enum',
  'validator': 'enum',
  'validator_value': ['Baker', 'USYD', 'UMELB', 'UQ'],
  'validation_error': "45 is not one of ['Baker', 'USYD', 'UMELB', 'UQ']"},
 {'index': 0,
  'validation_result': 'FAIL',
  'invalid_key': 'sample_storage_method',
  'schema_path': 'properties.sample_storage_method.enum',
  'validator': 'enum',
  'validator_value': ['not stored',
   'ambient temperature',
   'cut slide',
   'fresh',
   'frozen, -70C freezer',
   'frozen, -150C freezer',
   'frozen, liquid nitrogen',
   'frozen, vapor phase',
   'paraffin block',
   'RNAlater, frozen',
   'TRIzol, frozen'],
  'validation_error': "'Auto

# Getting validation stats
- The `ValidateStats` class is used to get summary statistics and data frames of the validation results.

First we create a validator object and validate the data with the schema using the `validate_schema` method.

In [1]:
import gen3_data_validator
from gen3_data_validator.logging_config import setup_logging

# Log at INFO level for this section
setup_logging(level="INFO")

# Resolve the test schema and load the failing test data
resolver = gen3_data_validator.ResolveSchema(
    schema_path="../tests/schema/gen3_test_schema.json"
)
resolver.resolve_schema()
data = gen3_data_validator.ParseData(data_folder_path="../tests/data/fail")

# Build the validator and run it so that the results exist before computing stats
validator = gen3_data_validator.Validate(
    data_map=data.data_dict,
    resolved_schema=resolver.schema_resolved,
)
validator.validate_schema()

{'metabolomics_file': [{'index_0': [{'index': 0,
     'validation_result': 'FAIL',
     'invalid_key': 'data_format',
     'schema_path': 'properties.data_format.enum',
     'validator': 'enum',
     'validator_value': ['wiff'],
     'validation_error': "True is not one of ['wiff']"},
    {'index': 0,
     'validation_result': 'FAIL',
     'invalid_key': 'data_type',
     'schema_path': 'properties.data_type.enum',
     'validator': 'enum',
     'validator_value': ['MS', 'MS/MS'],
     'validation_error': "'1' is not one of ['MS', 'MS/MS']"}]},
  {'index_1': [{'index': 1,
     'validation_result': 'PASS',
     'invalid_key': None,
     'schema_path': None,
     'validator': None,
     'validator_value': None,
     'validation_error': None}]},
  {'index_2': [{'index': 2,
     'validation_result': 'PASS',
     'invalid_key': None,
     'schema_path': None,
     'validator': None,
     'validator_value': None,
     'validation_error': None}]},
  {'index_3': [{'index': 3,
     'validation_

We then pass the validator instance to the `ValidateStats` class to get the summary statistics and data frames of the validation results which are stored in the `validation_result` attribute of the validator instance.

In [None]:
# Wrap the already-run validator so summary statistics can be computed from its results
validate_stats = gen3_data_validator.ValidateStats(validator)

To get a high level summary we can call the `.summary_stats` method on the `ValidateStats` instance.

In [None]:
# Prints the total error count and returns a per-entity summary table
validate_stats.summary_stats()

Total validation errors: 15


Unnamed: 0,entity,number_of_rows_with_errors,number_of_errors_per_entity
0,metabolomics_file,1,2
1,medical_history,2,6
2,metabolomics_assay,2,2
3,sample,2,5
4,subject,0,0


There are several other methods in the `ValidateStats` class that provide detailed metrics about your validation results:

- `n_rows_with_errors(entity)`: Returns the number of rows (entries) with at least one validation error for a given entity.
- `n_errors_per_entry(entity, index_key)`: Returns the number of validation errors for a specific row (by index) within an entity.
- `count_results_by_entity(entity, result_type="FAIL")`: Counts the number of validation results of a specific type (e.g., "FAIL", "PASS", or "ALL") for an entity.
- `count_results_by_index(entity, index_key, result_type="FAIL")`: Counts the number of validation results of a specific type for a specific row (by index) within an entity.
- `total_validation_errors()`: Returns the total number of validation errors across all entities.
These methods allow you to drill down into the validation results and generate custom summaries or reports as needed.

In [14]:

# Usage examples for ValidateStats methods

entity = "sample"
index_key = 0  # row index within the entity

# Rows of the entity that have at least one validation error
rows_with_errors = validate_stats.n_rows_with_errors(entity)
print(f"Number of rows with errors for entity '{entity}': {rows_with_errors}")

# Errors recorded against a single row
errors_per_entry = validate_stats.n_errors_per_entry(entity, index_key)
print(f"Number of errors for entity '{entity}' at index {index_key}: {errors_per_entry}")

# Result counts for the whole entity, by result type
fail_count = validate_stats.count_results_by_entity(entity, result_type="FAIL")
print(f"Total number of FAIL results for entity '{entity}': {fail_count}")

pass_count = validate_stats.count_results_by_entity(entity, result_type="PASS")
print(f"Total number of PASS results for entity '{entity}': {pass_count}")

all_count = validate_stats.count_results_by_entity(entity, result_type="ALL")
print(f"Total number of validation results for entity '{entity}': {all_count}")

# Result count for a single row
fail_count_index = validate_stats.count_results_by_index(entity, index_key, result_type="FAIL")
print(f"Number of FAIL results for entity '{entity}' at index {index_key}: {fail_count_index}")

# Errors across all entities
total_errors = validate_stats.total_validation_errors()
print(f"Total number of validation errors: {total_errors}")

# End the cell with the DataFrame itself so the notebook renders it as a rich
# table rather than the plain-text form produced by print()
summary_df = validate_stats.summary_stats()
print("Summary statistics DataFrame:")
summary_df


Number of rows with errors for entity 'sample': 2
Number of errors for entity 'sample' at index 0: 3
Total number of FAIL results for entity 'sample': 5
Total number of PASS results for entity 'sample': 1
Total number of validation results for entity 'sample': 6
Number of FAIL results for entity 'sample' at index 0: 3
Total validation errors: 15
Total number of validation errors: 15
Total validation errors: 15
Summary statistics DataFrame:
               entity  number_of_rows_with_errors  number_of_errors_per_entity
0   metabolomics_file                           1                            2
1     medical_history                           2                            6
2  metabolomics_assay                           2                            2
3              sample                           2                            5
4             subject                           0                            0


# Creating validation summary data
- We can also pass the validator instance to the `ValidateSummary` class to get a flattened summary of the validation results.

Creating ValidateSummary instance

In [7]:
import gen3_data_validator
from gen3_data_validator.logging_config import setup_logging

# Log at INFO level for this section
setup_logging(level="INFO")

# Resolve the test schema and load the failing test data
resolver = gen3_data_validator.ResolveSchema(
    schema_path="../tests/schema/gen3_test_schema.json"
)
resolver.resolve_schema()
data = gen3_data_validator.ParseData(data_folder_path="../tests/data/fail")

validator = gen3_data_validator.Validate(
    data_map=data.data_dict,
    resolved_schema=resolver.schema_resolved,
)
# Validation must have been run (via .validate_schema()) before summarising
validator.validate_schema()

# Hand the validated instance to ValidateSummary
Summary = gen3_data_validator.ValidateSummary(validator)


This returns the validation results as a flattened list of dictionaries.

In [8]:
# Flatten the nested validation results into a list of per-result dictionaries
Summary.flatten_validation_results()

[{'row': '0',
  'entity': 'metabolomics_file',
  'guid': 'f97de42d-2dc2-4fe5-b05f-2088de6a7190',
  'index': 0,
  'validation_result': 'FAIL',
  'invalid_key': 'data_format',
  'schema_path': 'properties.data_format.enum',
  'validator': 'enum',
  'validator_value': ['wiff'],
  'validation_error': "True is not one of ['wiff']"},
 {'row': '0',
  'entity': 'metabolomics_file',
  'guid': 'c60278bb-6682-4752-b06f-0c733150bbe9',
  'index': 0,
  'validation_result': 'FAIL',
  'invalid_key': 'data_type',
  'schema_path': 'properties.data_type.enum',
  'validator': 'enum',
  'validator_value': ['MS', 'MS/MS'],
  'validation_error': "'1' is not one of ['MS', 'MS/MS']"},
 {'row': '0',
  'entity': 'medical_history',
  'guid': '03f6f4b5-3cae-4ae0-9e45-c09f06d19b84',
  'index': 0,
  'validation_result': 'FAIL',
  'invalid_key': 'cvd_family_history',
  'schema_path': 'properties.cvd_family_history.enum',
  'validator': 'enum',
  'validator_value': ['yes', 'no'],
  'validation_error': "90 is not one o

This returns the validation results in a flattened pandas dataframe.

In [10]:
# The flattened validation results as a pandas DataFrame
Summary.flattened_results_to_pd()

Unnamed: 0,row,entity,guid,index,validation_result,invalid_key,schema_path,validator,validator_value,validation_error
0,0,medical_history,03f6f4b5-3cae-4ae0-9e45-c09f06d19b84,0,FAIL,cvd_family_history,properties.cvd_family_history.enum,enum,"[yes, no]","90 is not one of ['yes', 'no']"
1,0,medical_history,30d29586-2cae-4e6a-a561-a4438d0af89d,0,FAIL,cvd_reported,properties.cvd_reported.enum,enum,"[yes, no]","'NO' is not one of ['yes', 'no']"
2,0,medical_history,35c4d033-6b31-4791-b3ef-8b808ed35847,0,FAIL,date_death,properties.date_death.pattern,pattern,^\d{4}-\d{2}-\d{2},'85834-24-35' does not match '^\\d{4}-\\d{2}-\...
3,0,medical_history,9ccf97fe-4eea-4459-a3d5-da988878d505,0,FAIL,diabetes_type,properties.diabetes_type.enum,enum,"[type-1, type-2]","'type-4' is not one of ['type-1', 'type-2']"
4,0,medical_history,e53b0bd2-74c0-4b2c-8ef8-9b03e9aa6c81,0,FAIL,heart_rate,properties.heart_rate.type,type,integer,'80' is not of type 'integer'
5,1,medical_history,5a417704-3f4a-47fc-aece-d42d16127bdd,1,FAIL,heart_rate,properties.heart_rate.type,type,integer,'12' is not of type 'integer'
6,0,metabolomics_assay,f5b51773-98ab-4f69-98e5-594c16f6e369,0,FAIL,baseline_timepoint,properties.baseline_timepoint.type,type,boolean,'nothing' is not of type 'boolean'
7,1,metabolomics_assay,37d6e86c-77ac-4d54-b207-b136bd17bfe8,1,FAIL,assay_description,properties.assay_description.type,type,string,1 is not of type 'string'
8,0,metabolomics_file,f97de42d-2dc2-4fe5-b05f-2088de6a7190,0,FAIL,data_format,properties.data_format.enum,enum,[wiff],True is not one of ['wiff']
9,0,metabolomics_file,c60278bb-6682-4752-b06f-0c733150bbe9,0,FAIL,data_type,properties.data_type.enum,enum,"[MS, MS/MS]","'1' is not one of ['MS', 'MS/MS']"


Finally, you can also create an aggregated summary of the flattened validation results with:

In [11]:
# Aggregated view: how often each validation error message occurs per entity
Summary.collapse_flatten_results_to_pd()

Unnamed: 0,entity,count,validation_error
0,medical_history,1,'12' is not of type 'integer'
1,medical_history,1,'80' is not of type 'integer'
2,medical_history,1,'85834-24-35' does not match '^\\d{4}-\\d{2}-\...
3,medical_history,1,"'NO' is not one of ['yes', 'no']"
4,medical_history,1,"'type-4' is not one of ['type-1', 'type-2']"
5,medical_history,1,"90 is not one of ['yes', 'no']"
6,metabolomics_assay,1,'nothing' is not of type 'boolean'
7,metabolomics_assay,1,1 is not of type 'string'
8,metabolomics_file,1,"'1' is not one of ['MS', 'MS/MS']"
9,metabolomics_file,1,True is not one of ['wiff']
