# Synthetic Data Validation and Transformation

In [1]:
import gen3schemadev

In [2]:
# Projects whose synthetic data is handled by this notebook
project_list = ["tmp_project"]

# Location of the data import order file for the raw synthetic data
data_import_order_path = (
    "../output/synthetic_data/raw_umccr/tmp_project/DataImportOrder.txt"
)

In [None]:
# Read the node names from the data import order file (one per line) and drop
# the nodes that are excluded from this workflow.
exclude_nodes = ['acknowledgement', 'publication', 'program', 'project']
with open(data_import_order_path, 'r', encoding='utf-8') as f:
    raw_lines = f.read().splitlines()
# Strip surrounding whitespace and skip blank lines so that stray empty lines
# in the file do not end up as empty node names.
data_import_nodes = [
    node
    for node in map(str.strip, raw_lines)
    if node and node not in exclude_nodes
]
# Bare expression so the notebook displays the resulting list
data_import_nodes

#### Copying metadata to transformed folder

In [None]:
# Source and destination passed to gen3schemadev.copy_directory.
# NOTE(review): the data import order is read from `raw_umccr` elsewhere in
# this notebook, while this copies from `raw_gen3schemadev_updated_fn` —
# confirm this is the intended source directory.
raw_metadata_dir = '../output/synthetic_data/raw_gen3schemadev_updated_fn/'
transformed_metadata_dir = '../output/synthetic_data/transformed/'
gen3schemadev.copy_directory(raw_metadata_dir, transformed_metadata_dir)

## Adding in Enums and date time values
This code block does the following:

1. Reads the ETL mappings from the YAML file in `../config/etl_mappings.yaml`
    - The etl mappings look like this:
    ```yaml
    enum_mappings:
      - filename: medical_history.json
        key: hypertension_measurement_type
    time_mappings:
      - filename: core_metadata_collection.json
        key: date
    ```
    - The enum mappings say that the file `medical_history.json` has a key `hypertension_measurement_type` whose values should be replaced with the enum values defined in the `enum_def.csv` file.
    - The time mappings say that the filename `core_metadata_collection.json` has a key `date` that should be updated with a date/time value.

2. Iterates over each project in the project list
    - For each project, it updates the enum and date/time values in the JSON files. 

*Note: Without doing this step, the synthetic data generator adds in null enum values and incorrect date/time values.*

In [None]:
import yaml

# Root folder of the transformed synthetic data; each project is addressed
# as a sub-folder of this path below.
transformed_path = '../output/synthetic_data/transformed'

# CSV files handed to SchemaEnums (enum and property definitions)
enum_file_path = '../output/input_google_sheets/csv/enum_def.csv'
prop_file_path = '../output/input_google_sheets/csv/prop_def.csv'

# Lookup object used below to pull the enum values for a (key, file) pair
enum_lookup = gen3schemadev.SchemaEnums(
    enum_file_path=enum_file_path,
    prop_file_path=prop_file_path,
)

# Parse the ETL mappings configuration
with open('../config/etl_mappings.yaml', 'r') as file:
    etl_mappings = yaml.safe_load(file)

# Each mapping entry is read as a dict with 'filename' and 'key' entries
enum_key_mapping = etl_mappings['enum_mappings']
time_key_mapping = etl_mappings['time_mappings']

for project in project_list:
    # Folder containing this project's JSON files
    project_dir = f"{transformed_path}/{project}"

    # Pass 1: enum mappings -- look up the enum values for each
    # (key, file) pair and hand them to update_json_key_values.
    for mapping in enum_key_mapping:
        json_filename, key = mapping['filename'], mapping['key']
        enums = enum_lookup.pull_enums(key, json_filename)
        gen3schemadev.update_json_key_values(
            base_path=project_dir,
            json_filename=json_filename,
            key=key,
            enums=enums,
            write_inplace=True,
        )

    # Pass 2: time mappings -- same call, but requesting a date/time
    # insertion instead of passing enum values.
    for mapping in time_key_mapping:
        json_filename, key = mapping['filename'], mapping['key']
        gen3schemadev.update_json_key_values(
            base_path=project_dir,
            json_filename=json_filename,
            key=key,
            insert_date_time=True,
            write_inplace=True,
        )

#### Copying data import order

In [None]:
# Base folder of all synthetic data outputs
synth_dir = '../output/synthetic_data'

# Source (raw) and destination (transformed) folders passed, together with the
# project list, to gen3schemadev.copy_data_import_order.
import_order_src = f"{synth_dir}/raw_umccr"
import_order_dst = f"{synth_dir}/transformed"
gen3schemadev.copy_data_import_order(project_list, import_order_src, import_order_dst)

## Running Validation on Synthetic Data
- This validates the synthetic data after the transformation step.

In [None]:
import acdctools
# Settings for the validator: the transformed data folder, the projects to
# check, the nodes to leave out, and the resolved schema file.
validator_settings = {
    "data_dir": "../output/synthetic_data/transformed",
    "project_name_list": project_list,
    "exclude_nodes": ['acknowledgement', 'publication', 'program', 'project'],
    "resolved_schema_path": "../output/schema/json/schema_dev_resolved.json",
}
quick = acdctools.QuickValidateSynth(**validator_settings)
quick.quick_validate()

In [None]:
# Example: fetch the entry stored in quick.errors under one key.
# The key looks like a (project name, node name) pair -- TODO confirm
# against the QuickValidateSynth documentation.
project_name, node_name = 'tmp_project', 'demographic'
key = (project_name, node_name)
quick.errors.get(key)