# Resolve Schema

**input**
```
+schema_path: str
```

**methods**
```
+read_json(schema_path: str): dict
+split_json(schema: dict): list
+resolver(entity1: dict, entity2: dict): dict
+resolve_defs(terms: dict, defs: dict): dict
+node_order(schema: dict): list
+resolve_nodes(nodeList: list, splitJsonList: list): list
+recombine_nodes(resolvedList: list): dict
```

In [1]:
import gen3_data_validator
from gen3_data_validator.logging_config import setup_logging
setup_logging()

In [None]:
# pulling manifest
!aws s3 cp s3://ausdiab-data-receive-bucket/data/2025-01-31_AusDiab_ACDC_Data_Transfer/manifest.xlsx ../data/restricted/ausdiab_lipid_manifest.xlsx

In [None]:
# pulling schema
!aws s3 cp s3://gen3schema-cad-staging-biocommons.org.au/cad.json ../schema/gen3_schema.json

## Reading in xlsx data and writing to json
- The xlsx data comes from the xlsx manifest file created from the `acdc_submission_template`.

In [6]:
# ResolverClass = gen3_data_validator.ResolveSchema(schema_path = "../schema/gen3_test_schema.json")
# Parse the xlsx metadata template and write the parsed xlsx_data_dict to
# JSON files under output_dir.
# NOTE(review): both paths below are absolute and specific to one machine
# (/Users/harrijh/...); other cells in this notebook use relative ../data
# paths, so adjust these before running elsewhere.
# NOTE(review): skip_rows=1 presumably skips a header row in each sheet — confirm.
xlsxData = gen3_data_validator.ParseXlsxMetadata(xlsx_path = "/Users/harrijh/projects/gen3-data-validator/data/lipid_metadata_example.xlsx", skip_rows=1)
xlsxData.parse_metadata_template()
xlsxData.write_dict_to_json(xlsx_data_dict=xlsxData.xlsx_data_dict, output_dir="/Users/harrijh/projects/gen3-data-validator/data/restricted/lipid_metadata_example")

## Testing Linkage

In [8]:
Data = gen3_data_validator.ParseData(data_folder_path = "/Users/harrijh/projects/gen3-data-validator/data/restricted/lipid_metadata_example")

In [12]:
Data.file_path_list

['/Users/harrijh/projects/gen3-data-validator/data/restricted/lipid_metadata_example/lipidomics_assay.json',
 '/Users/harrijh/projects/gen3-data-validator/data/restricted/lipid_metadata_example/lipidomics_mapping_file.json',
 '/Users/harrijh/projects/gen3-data-validator/data/restricted/lipid_metadata_example/sample.json',
 '/Users/harrijh/projects/gen3-data-validator/data/restricted/lipid_metadata_example/subject.json',
 '/Users/harrijh/projects/gen3-data-validator/data/restricted/lipid_metadata_example/lipidomics_file.json']

In [2]:
# Parse the JSON data folder, load and resolve the schema, then build the
# linkage tester from the resolver and the parsed data.
# NOTE(review): the recorded output for this cell is a NameError because
# gen3_data_validator had not been imported in the running kernel; run the
# `import gen3_data_validator` cell before this one.
Data = gen3_data_validator.ParseData(data_folder_path = "/Users/harrijh/projects/gen3-data-validator/data/restricted/lipid_metadata_example")
Resolver = gen3_data_validator.ResolveSchema(schema_path = "../schema/gen3_schema.json")
Resolver.resolve_schema()
Linkage = gen3_data_validator.TestLinkage(schema_resolver = Resolver, data_parser = Data)

NameError: name 'gen3_data_validator' is not defined

In [3]:
Resolver = gen3_data_validator.ResolveSchema(schema_path = "../schema/gen3_schema.json")

In [6]:
Resolver.nodes

['demographic.yaml',
 'project.yaml',
 'serum_marker_assay.yaml',
 'alignment_workflow.yaml',
 'imaging_file.yaml',
 'lipidomics_assay.yaml',
 'metabolomics_file.yaml',
 'acknowledgement.yaml',
 'medical_history.yaml',
 '_definitions.yaml',
 '_settings.yaml',
 'blood_pressure_test.yaml',
 'genomics_assay.yaml',
 'variant_file.yaml',
 'program.yaml',
 'serum_marker_file.yaml',
 'proteomics_assay.yaml',
 'sample.yaml',
 'unaligned_reads_file.yaml',
 '_terms.yaml',
 'aligned_reads_index_file.yaml',
 'variant_workflow.yaml',
 'proteomics_file.yaml',
 'exposure.yaml',
 'metabolomics_assay.yaml',
 'lipidomics_mapping_file.yaml',
 'lipidomics_file.yaml',
 'aligned_reads_file.yaml',
 'lab_result.yaml',
 'medication.yaml',
 'publication.yaml',
 'subject.yaml',
 'core_metadata_collection.yaml']

You can also bypass the data attribute of the `TestLinkage` class and pass in your own `data_map` and `config_map`.

In [7]:
# # Using the Linkage class which has the resolved schema with custom data and config
# config_map = {
#     "samples": {"primary_key": "sample_id", "foreign_key": "subject_id"},
#     "files": {"primary_key": "file_id", "foreign_key": "sample_id"},
#     "subjects": {"primary_key": "subject_id", "foreign_key": "project_id"},
#     "project": {"primary_key": "project_id", "foreign_key": None}
# }

# data_map = {
#     "samples": [
#         {"sample_id": "sample_1", "subject_id": "subject_9"},
#         {"sample_id": "sample_2", "subject_id": "subject_3"},  # Invalid FK
#         {"sample_id": "sample_3", "subject_id": "subject_4"}, # Invalid FK
#         {"sample_id": "sample_4", "subject_id": "subject_5"} # Invalid FK
#     ],
#     "files": [
#         {"file_id": "file_1", "sample_id": "sample_1"},
#         {"file_id": "file_2", "sample_id": "sample_27"}  # Invalid FK
#     ],
#     "subjects": [
#         {"subject_id": "subject_1", "project_id": "project_1"},  
#         {"subject_id": "subject_2", "project_id": "project_2"}, # Missing project 2
#     ],
#     "project": [
#         {"project_id": "project_1"}
#     ]
# }

# Linkage.validate_links(data_map, config_map)

# validation prototype


## Creating the validation class

In [2]:
import gen3_data_validator

# Load the schema file and resolve it; the resolved form is read back from
# resolver.schema_resolved below.
resolver = gen3_data_validator.ResolveSchema(schema_path = "../schema/gen3_schema.json")
resolver.resolve_schema()
# Parse the data folder; data.data_dict is the data handed to the validator.
data = gen3_data_validator.ParseData(data_folder_path = "../data/restricted/ausdiab_lipid_metadata/")
# Build the validator from the parsed data and the resolved schema.
validator = gen3_data_validator.Validate(data_map=data.data_dict, resolved_schema=resolver.schema_resolved)

In [3]:
validator.validate_schema()

{'lipidomics_assay': [{'index_0': [{'index': 0,
     'validation_result': 'FAIL',
     'invalid_key': 'instrument_type',
     'schema_path': 'properties.instrument_type.enum',
     'validator': 'enum',
     'validator_value': ['LC-MS',
      'MS/MS',
      'Multidimensional MS',
      'Ion Mobility MS',
      'MALDI MS',
      'GC-MS',
      'High Mass Accuracy MS'],
     'validation_error': "'Agilent QQQ LC-MS' is not one of ['LC-MS', 'MS/MS', 'Multidimensional MS', 'Ion Mobility MS', 'MALDI MS', 'GC-MS', 'High Mass Accuracy MS']"}]},
  {'index_1': [{'index': 1,
     'validation_result': 'FAIL',
     'invalid_key': 'instrument_type',
     'schema_path': 'properties.instrument_type.enum',
     'validator': 'enum',
     'validator_value': ['LC-MS',
      'MS/MS',
      'Multidimensional MS',
      'Ion Mobility MS',
      'MALDI MS',
      'GC-MS',
      'High Mass Accuracy MS'],
     'validation_error': "'Agilent QQQ LC-MS' is not one of ['LC-MS', 'MS/MS', 'Multidimensional MS', 'Ion M

In [4]:
validator.make_keymap()

{'lipidomics_assay': ['index_0',
  'index_1',
  'index_2',
  'index_3',
  'index_4',
  'index_5',
  'index_6',
  'index_7',
  'index_8',
  'index_9',
  'index_10',
  'index_11',
  'index_12',
  'index_13',
  'index_14',
  'index_15',
  'index_16',
  'index_17',
  'index_18',
  'index_19',
  'index_20',
  'index_21',
  'index_22',
  'index_23',
  'index_24',
  'index_25',
  'index_26',
  'index_27',
  'index_28',
  'index_29',
  'index_30',
  'index_31',
  'index_32',
  'index_33',
  'index_34',
  'index_35',
  'index_36',
  'index_37',
  'index_38',
  'index_39',
  'index_40',
  'index_41',
  'index_42',
  'index_43',
  'index_44',
  'index_45',
  'index_46',
  'index_47',
  'index_48',
  'index_49',
  'index_50',
  'index_51',
  'index_52',
  'index_53',
  'index_54',
  'index_55',
  'index_56',
  'index_57',
  'index_58',
  'index_59',
  'index_60',
  'index_61',
  'index_62',
  'index_63',
  'index_64',
  'index_65',
  'index_66',
  'index_67',
  'index_68',
  'index_69',
  'index_7

In [8]:
data.data_dict

{'lipidomics_assay': [{'assay_id': 'AD01_012#01-004-990910001',
   'assay_description': 'Targeted mass spec lipidome',
   'instrument_type': 'Agilent QQQ LC-MS',
   'type': 'lipidomics_assay',
   'key_fk': 'sample-ausdiab-0000101',
   'key_pk': 'lipidomics-assay-ausdiab-01-004-990910001',
   'samples': {'submitter_id': 'sample-ausdiab-0000101'},
   'submitter_id': 'lipidomics-assay-ausdiab-01-004-990910001',
   'lipidomics_assays': 'lipidomics-assay-ausdiab-01-004-990910001'},
  {'assay_id': 'AD01_013#01-005-990910002',
   'assay_description': 'Targeted mass spec lipidome',
   'instrument_type': 'Agilent QQQ LC-MS',
   'type': 'lipidomics_assay',
   'key_fk': 'sample-ausdiab-0000201',
   'key_pk': 'lipidomics-assay-ausdiab-01-005-990910002',
   'samples': {'submitter_id': 'sample-ausdiab-0000201'},
   'submitter_id': 'lipidomics-assay-ausdiab-01-005-990910002',
   'lipidomics_assays': 'lipidomics-assay-ausdiab-01-005-990910002'},
  {'assay_id': 'AD01_014#01-006-990910003',
   'assay_de

### Getting nested validation results
- Returns a nested dictionary organised by entity (data node), then by row index, and finally the validation result objects for that row.

In [None]:
validation_dict = validator.validation_result
validation_dict

In [None]:
validator.list_entities()

In [None]:
validator.list_index_by_entity("lipidomics_assay")

You can pull out the validation results for a specific entity with:

In [None]:
validator.pull_entity("lipidomics_assay")

You can pull the validation results for a specific entity and then for a specific index (row) with:

In [None]:
validator.pull_index_of_entity("lipidomics_assay", "index_1")

# Getting validation stats

In [None]:
validate_stats = gen3_data_validator.ValidateStats(validator)
stats_df = validate_stats.summary_stats()
stats_df

# Creating validation summary data

In [None]:
Summary = gen3_data_validator.ValidateSummary(validator) 
flattened_results_dict = Summary.flatten_validation_results()
flattened_results_dict

### Converting flattened dict to pandas

In [None]:
flatten_summary_pd = Summary.flattened_results_to_pd()
flatten_summary_pd

### Collapsing flattened dict to pandas
- This collapsed data frame summarises common validation errors

In [None]:
collapse_df = Summary.collapse_flatten_results_to_pd()
collapse_df

# Writing validation results to folder

In [None]:
import os
output_dir = "../data/restricted/ausdiab_lipid_metadata/validation/"
os.makedirs(output_dir, exist_ok=True)


def write_dict_to_json(input_dict, output_dir, filename: str):
    """Serialise ``input_dict`` to ``<output_dir>/<filename>.json``.

    Args:
        input_dict: JSON-serialisable object to write.
        output_dir: Existing directory to write into; a trailing path
            separator is accepted.
        filename: Name of the output file without the ``.json`` extension.

    Raises:
        TypeError: If ``input_dict`` contains values ``json.dump`` cannot
            serialise.
        OSError: If the output file cannot be opened for writing.
    """
    # Imported locally so the helper is self-contained: the notebook cell
    # that defines it only imports ``os``, which left ``json`` undefined.
    import json
    import os

    # os.path.join avoids the doubled separator the f-string path produced
    # when output_dir already ends with "/".
    output_path = os.path.join(output_dir, f"{filename}.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(input_dict, f)
    print(f"JSON files written to {output_dir}")

# Persist the nested and the flattened validation results as JSON files in
# output_dir. Both dicts are created in earlier cells, so those cells must
# have been run first.
write_dict_to_json(validation_dict, output_dir, "validation_dict")
write_dict_to_json(flattened_results_dict, output_dir, "flattened_results_dict")

# Write the pandas DataFrames (stats, flattened summary, collapsed summary)
# to CSV files in output_dir.
# NOTE(review): output_dir already ends with "/", so these f-string paths
# contain a doubled separator ("validation//stats_df.csv").
stats_df.to_csv(f"{output_dir}/stats_df.csv")
flatten_summary_pd.to_csv(f"{output_dir}/flatten_summary_pd.csv")
collapse_df.to_csv(f"{output_dir}/collapse_df.csv")


In [None]:
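# Example validation-results fixture for writing tests: maps an entity name
# ('sample') to a list holding one inner list of validation-result dicts per row.
# NOTE(review): the validate_schema() output shown earlier in this notebook nests
# each row under an 'index_N' key rather than a bare list — confirm which shape
# the tests should use.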

sample_validation_results = {
    'sample': [
        [
            {
                'index': 0,
                'validation_result': 'FAIL',
                'invalid_key': 'freeze_thaw_cycles',
                'schema_path': 'properties.freeze_thaw_cycles.type',
                'validator': 'type',
                'validator_value': 'integer',
                'validation_error': "'10' is not of type 'integer'"
            },
            {
                'index': 0,
                'validation_result': 'FAIL',
                'invalid_key': 'sample_provider',
                'schema_path': 'properties.sample_provider.enum',
                'validator': 'enum',
                'validator_value': ['Baker', 'USYD', 'UMELB', 'UQ'],
                'validation_error': "45 is not one of ['Baker', 'USYD', 'UMELB', 'UQ']"
            },
            {
                'index': 0,
                'validation_result': 'FAIL',
                'invalid_key': 'sample_storage_method',
                'schema_path': 'properties.sample_storage_method.enum',
                'validator': 'enum',
                'validator_value': [
                    'not stored',
                    'ambient temperature',
                    'cut slide',
                    'fresh',
                    'frozen, -70C freezer',
                    'frozen, -150C freezer',
                    'frozen, liquid nitrogen',
                    'frozen, vapor phase',
                    'paraffin block',
                    'RNAlater, frozen',
                    'TRIzol, frozen'
                ],
                'validation_error': "'Autoclave' is not one of ['not stored', 'ambient temperature', 'cut slide', 'fresh', 'frozen, -70C freezer', 'frozen, -150C freezer', 'frozen, liquid nitrogen', 'frozen, vapor phase', 'paraffin block', 'RNAlater, frozen', 'TRIzol, frozen']"
            }
        ],
        [
            {
                'index': 1,
                'validation_result': 'FAIL',
                'invalid_key': 'freeze_thaw_cycles',
                'schema_path': 'properties.freeze_thaw_cycles.type',
                'validator': 'type',
                'validator_value': 'integer',
                'validation_error': "'76' is not of type 'integer'"
            },
            {
                'index': 1,
                'validation_result': 'FAIL',
                'invalid_key': 'sample_storage_method',
                'schema_path': 'properties.sample_storage_method.enum',
                'validator': 'enum',
                'validator_value': [
                    'not stored',
                    'ambient temperature',
                    'cut slide',
                    'fresh',
                    'frozen, -70C freezer',
                    'frozen, -150C freezer',
                    'frozen, liquid nitrogen',
                    'frozen, vapor phase',
                    'paraffin block',
                    'RNAlater, frozen',
                    'TRIzol, frozen'
                ],
                'validation_error': "'In the Pantry' is not one of ['not stored', 'ambient temperature', 'cut slide', 'fresh', 'frozen, -70C freezer', 'frozen, -150C freezer', 'frozen, liquid nitrogen', 'frozen, vapor phase', 'paraffin block', 'RNAlater, frozen', 'TRIzol, frozen']"
            }
        ],
        [
            {
                'index': 2,
                'validation_result': 'PASS',
                'invalid_key': None,
                'schema_path': None,
                'validator': None,
                'validator_value': None,
                'validation_error': None
            }
        ],
        [
            {
                'index': 3,
                'validation_result': 'PASS',
                'invalid_key': None,
                'schema_path': None,
                'validator': None,
                'validator_value': None,
                'validation_error': None
            }
        ]
    ]
}


