# Custom Transformations for Synthetic Data
This notebook is used to develop transformations to ensure the data follows the schema.

In [None]:
# Move the kernel's working directory up one level from the current one
# (the original note says this is the folder holding the notebook) - set as needed!
# NOTE(review): presumably this is so relative paths such as 'synthetic_data/' resolve
# from the parent folder - confirm against your checkout layout.
# The path is relative, so re-running this cell moves up one more level each time;
# run it only once per kernel session.
%cd ../

## Functions
- Prototype functions here. Alternatively, you can delete them from this notebook and move them to a script.


In [62]:

# function to randomly generate date time
import random
import os
from datetime import datetime, timedelta, timezone


def gen_random_date_time_tz(num_dates: int, start_date="2008-01-01", end_date="2023-03-16"):
    """
    Generates a list of random, timezone-aware ISO 8601 datetime strings.

    Each value is a random instant (whole-second resolution) between start_date
    and end_date, inclusive. Roughly half of the values are labelled as UTC and
    the rest get a random fixed offset between -12:00 and +14:00 in 15-minute
    steps. The offset is attached as a label only; the wall-clock time is not
    shifted.

    Parameters:
    - num_dates (int): Number of random dates to generate.
    - start_date (str): The start date in ISO 8601 format (YYYY-MM-DD).
    - end_date (str): The end date in ISO 8601 format (YYYY-MM-DD).

    Returns:
    - list: ISO 8601 strings such as '2015-06-01T12:34:56+05:30'. UTC values are
      rendered by datetime.isoformat() as '+00:00', not as 'Z'.

    Raises:
    - ValueError: If end_date is earlier than start_date, or either date
      cannot be parsed by datetime.fromisoformat().
    """
    start = datetime.fromisoformat(start_date)
    end = datetime.fromisoformat(end_date)

    # Loop-invariant: size of the sampling window in whole seconds.
    span_seconds = int((end - start).total_seconds())
    if span_seconds < 0:
        raise ValueError("end_date must not be earlier than start_date")

    # Offsets are drawn as a count of quarter hours so the minutes component can
    # never push the total outside the -12:00..+14:00 range (drawing hours and
    # minutes separately could yield e.g. +14:45).
    min_quarter_hours = -12 * 4
    max_quarter_hours = 14 * 4

    random_dates = []
    for _ in range(num_dates):
        random_date = start + timedelta(seconds=random.randint(0, span_seconds))
        # Randomly choose between a non-UTC fixed offset and plain UTC.
        if random.choice([True, False]):
            offset_minutes = 15 * random.randint(min_quarter_hours, max_quarter_hours)
            tz_info = timezone(timedelta(minutes=offset_minutes))
        else:
            tz_info = timezone.utc
        random_dates.append(random_date.replace(tzinfo=tz_info).isoformat())

    return random_dates


def update_json_key_values(base_path: str, json_filename: str, key: str, replacement: list, write_inplace: bool = False):
    """
    Replaces the value stored under `key` in every record of a JSON file.

    The file must contain a top-level JSON array of objects. Record i receives
    replacement[i]; surplus replacement values are ignored. If there are fewer
    replacement values than records, a message is printed and nothing is written.

    Parameters:
    - base_path (str): The base directory path where the JSON file is located.
    - json_filename (str): The name of the JSON file to be updated.
    - key (str): The key whose value is overwritten (or added) in every record.
    - replacement (list): The new values, one per record, in record order.
    - write_inplace (bool): If True, the result is written into base_path,
      overwriting the source file. If False (default), it is written to an
      'output' directory inside base_path and the source file is left untouched.

    Returns:
    - None

    Raises:
    - TypeError: If the JSON document is not a top-level array.
    - ValueError: If write_inplace is neither truthy nor equal to False (e.g. None).
    """
    import json  # kept local: json is not among the file-level imports

    # Construct the full path to the JSON file
    full_path = os.path.join(base_path, json_filename)

    # Reading the JSON file (JSON text is UTF-8 by specification)
    with open(full_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if not isinstance(data, list):
        raise TypeError("expected a top-level JSON array of objects in " + str(full_path))

    # Every record needs a replacement value; bail out without writing otherwise.
    if len(data) > len(replacement):
        print('not enough values in replacement list')
        return None

    # zip() stops at the shorter sequence, so surplus replacement values are
    # ignored without re-slicing the list on every iteration.
    print('editing ' + str(key))
    for entry, new_value in zip(data, replacement):
        entry[key] = new_value

    # Constructing the path for the output file
    if write_inplace:
        output_dir = base_path
    elif write_inplace == False:  # noqa: E712 - keeps accepting 0 while rejecting None
        # Join onto base_path itself: os.path.dirname() would drop the last path
        # component when base_path has no trailing slash.
        output_dir = os.path.join(base_path, 'output')
    else:
        raise ValueError("write_inplace must be True or False")

    if output_dir:  # an empty string means the current directory; nothing to create
        os.makedirs(output_dir, exist_ok=True)  # Ensuring the output directory exists
    output_file = os.path.join(output_dir, os.path.basename(json_filename))

    # Writing the updated data to the chosen output location
    print('writing edited json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)



def gen_random_enums(enums: list, n: int):
    """
    Draws n values from enums, independently and with replacement.

    Parameters:
    - enums (list): The allowed values to pick from.
    - n (int): Number of values to draw.

    Returns:
    - list: n items, each selected with random.choice(enums).
    """
    picks = []
    remaining = n
    # One random.choice call per drawn value, in order.
    while remaining > 0:
        picks.append(random.choice(enums))
        remaining -= 1
    return picks

    


# Running transformations
- `update_json_key_values` is a general-purpose function that replaces the values stored under a given key in a JSON file with values from a replacement list.
- The `gen_random_*` functions generate the random values that are passed to `update_json_key_values` as the replacement list.


### Example code:
```python
update_json_key_values(base_path = 'synthetic_data/', 
                       json_filename = 'serum_marker_file.json', 
                       key = 'data_category',  
                       replacement = gen_random_enums(['mass spec raw', 'mass spec analysed', 'summarised results'], 11000),
                       write_inplace = True)

update_json_key_values(base_path = 'synthetic_data/', 
                       json_filename = 'core_metadata_collection.json', 
                       key = 'date',  
                       replacement = gen_random_date_time_tz(20000),
                       write_inplace = False)
```