# 

# Generating synthetic data for ACDC
- This Jupyter notebook runs through the setup and execution steps for generating synthetic data for the ACDC project

In [3]:
!git clone https://github.com/AustralianBioCommons/gen3schemadev.git
!cd ./gen3schemadev
!python3 -m venv .venv
!source .venv/bin/activate
!pip3 install -r requirements.txt

LICENSE                        populate_metadata_service.py
Readme.md                      requirements.txt
[1m[36mconfigs[m[m/                       [1m[36mschema[m[m/
datas_submittor.py             [1m[36mschema_out[m[m/
delete_files.py                [1m[36mscripts[m[m/
delete_projects.py             setup.py
[1m[36mfile_type_templates[m[m/           sheet2yaml-CLI.py
fix_links.py                   sheet2yaml.py
[1m[36mgen3schemadev[m[m/                 transform_ausdiab.py
gen_acdc_synthetic_data.ipynb  [1m[36musers[m[m/
plausible_data_gen.py          users.yaml


In [74]:
# Pull the data schema from Google Sheets.
# NOTE(review): presumably this writes the YAML files into schema_out/, which a
# later cell copies into umccr-dictionary — confirm against sheet2yaml.py.
!python3 sheet2yaml.py


In [72]:
# Run if  umccr-dict does not exist
!cd "$(pwd)/../" && git clone https://github.com/AustralianBioCommons/umccr-dictionary.git


Cloning into 'umccr-dictionary'...
remote: Enumerating objects: 830, done.[K
remote: Counting objects: 100% (130/130), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 830 (delta 106), reused 107 (delta 95), pack-reused 700[K
Receiving objects: 100% (830/830), 1005.23 KiB | 6.66 MiB/s, done.
Resolving deltas: 100% (471/471), done.


In [75]:
# Copy the generated schema files from schema_out/ into the acdc dictionary
# inside the umccr-dictionary checkout, then list the result as a sanity check.
# `mkdir -p` creates missing parent directories and is a no-op when the
# directory already exists, so the cell can be re-run.
!mkdir -p ../umccr-dictionary/dictionary/acdc/gdcdictionary/schemas
!cp schema_out/* ../umccr-dictionary/dictionary/acdc/gdcdictionary/schemas/
!ls -lsha ../umccr-dictionary/dictionary/acdc/gdcdictionary/schemas/

total 424
  0 drwxr-xr-x@ 30 harrijh  staff   960B Apr  5 14:24 [1m[36m.[m[m
  0 drwxr-xr-x@  3 harrijh  staff    96B Apr  5 14:22 [1m[36m..[m[m
 16 -rw-r--r--@  1 harrijh  staff   4.0K Apr  5 14:24 _definitions.yaml
  8 -rw-r--r--@  1 harrijh  staff    25B Apr  5 14:24 _settings.yaml
144 -rw-r--r--@  1 harrijh  staff    72K Apr  5 14:24 _terms.yaml
  8 -rw-r--r--@  1 harrijh  staff   1.2K Apr  5 14:24 acknowledgement.yaml
  8 -rw-r--r--@  1 harrijh  staff   2.3K Apr  5 14:24 aligned_reads_file.yaml
  8 -rw-r--r--@  1 harrijh  staff   1.6K Apr  5 14:24 aligned_reads_index_file.yaml
  8 -rw-r--r--@  1 harrijh  staff   1.5K Apr  5 14:24 blood_pressure_test.yaml
 16 -rw-r--r--@  1 harrijh  staff   4.8K Apr  5 14:24 core_metadata_collection.yaml
  8 -rw-r--r--@  1 harrijh  staff   2.1K Apr  5 14:24 demographic.yaml
  8 -rw-r--r--@  1 harrijh  staff   1.2K Apr  5 14:24 exposure.yaml
  8 -rw-r--r--@  1 harrijh  staff   1.4K Apr  5 14:24 genomics_assay.yaml
  8 -rw-r--r--@  1 harrijh 

In [76]:
# Compiling dictionary YAMLs to JSON
# Each `!` line runs in its own subshell, which is why every command repeats
# the `cd ../umccr-dictionary`.
# NOTE(review): judging by the recorded output, these targets pull the service
# images and then restart and list the container stack — confirm against the
# umccr-dictionary Makefile.
!cd ../umccr-dictionary && make pull
!cd ../umccr-dictionary && make down
!cd ../umccr-dictionary && make up
!cd ../umccr-dictionary && make ps


Using .env-sample
[1A[1B[0G[?25l[+] Pulling 0/0
 [33m⠋[0m dmutils Pulling                                                         [34m0.1s [0m
 [33m⠋[0m ddsim Pulling                                                           [34m0.1s [0m
 [33m⠋[0m ddvis Pulling                                                           [34m0.1s [0m
 [33m⠋[0m g3po Pulling                                                            [34m0.1s [0m
 [33m⠋[0m postgres Pulling                                                        [34m0.1s [0m
 [33m⠋[0m ddimporter Pulling                                                      [34m0.1s [0m
[?25h[1A[1A[1A[1A[1A[1A[1A[0G[?25l[+] Pulling 0/6
 [33m⠙[0m dmutils Pulling                                                         [34m0.2s [0m
 [33m⠙[0m ddsim Pulling                                                           [34m0.2s [0m
 [33m⠙[0m ddvis Pulling                                                           [34m0.2s [

In [78]:

# Compile the acdc dictionary; the recorded run reports writing the result to
# /schema/acdc.json.
!cd ../umccr-dictionary && make compile program=acdc


Using .env-sample
Writing schema into /schema/acdc.json...


In [79]:
# Validate the compiled acdc data dictionary (umccr-dictionary `validate` target).
!cd ../umccr-dictionary && make validate program=acdc

Using .env-sample
Validating Data Dictionary: acdc
[2024-04-05 03:35:51,550][data-simulator][   INFO] Data simulator initialization...
[2024-04-05 03:35:51,551][data-simulator][   INFO] Loading dictionary from url http://ddvis/schema/acdc.json
[2024-04-05 03:35:51,605][data-simulator][   INFO] Initializing graph...
[2024-04-05 03:35:51,605][data-simulator][   INFO] Validating...
[2024-04-05 03:35:51,606][data-simulator][   INFO] Done!


In [80]:
# Visualising data dictionary
import webbrowser

# The `open` shell command only exists on macOS; `webbrowser.open` uses the
# platform's default browser everywhere. The trailing `;` hides the boolean
# return value from the cell output.
webbrowser.open("http://localhost:8080/#schema/acdc.json");

### Prototyping functions to version the schema
- Versioned files are named `<name>_vMAJOR.MINOR.PATCH_YYYYMMDD.json`, e.g. `acdc_v1.0.1_20240405.json`
- When versioning:
    - MAJOR = completely new schema, no backwards compatibility
    - MINOR = same schema, different submitter_ids, different data values
    - PATCH = same schema, same submitter_ids, different data values
- A schema's MAJOR version should match the MAJOR version of the data it is used with



In [90]:
# Copy the compiled JSON schema from the umccr-dictionary checkout back into
# this repository under schema/json/acdc/.
# `mkdir -p` makes the target directory if needed and does not fail on re-runs.
!mkdir -p schema/json/acdc/
!cp ../umccr-dictionary/schema/acdc.json schema/json/acdc/

In [91]:
# function to rename schema with version
import datetime
import os

def version_schema(file_path, major, minor, patch, dir=False):
    """
    Renames a file or directory in place, adding a semantic version and date stamp.

    The new name is ``<name>_v<major>.<minor>.<patch>_<YYYYMMDD>`` and stays in
    the same parent directory. For files the original extension is kept after
    the stamp (``acdc.json`` -> ``acdc_v1.0.1_20240405.json``).

    If the target name already exists the user is asked to confirm the
    overwrite; any answer other than ``y`` aborts without renaming.

    Args:
        file_path (str): Path to the file or directory to rename.
        major (int): Major version number.
        minor (int): Minor version number.
        patch (int): Patch version number.
        dir (bool): When truthy, ``file_path`` is treated as a directory: its
            whole name is kept (dots included) and no extension is appended.

    Returns:
        None

    Raises:
        OSError: If the rename fails, e.g. ``file_path`` does not exist or the
            target is an existing non-empty directory.
    """
    # Get the current date in YYYYMMDD format
    current_date = datetime.datetime.now().strftime("%Y%m%d")

    # Drop trailing separators so "path/to/dir/" still splits into
    # ("path/to", "dir") rather than ("path/to/dir", "").
    source_path = os.fspath(file_path)
    separators = os.sep + (os.altsep or "")
    source_path = source_path.rstrip(separators) or source_path
    dir_name, file_name = os.path.split(source_path)

    # Construct the new name with version and timestamp
    version_suffix = f"_v{major}.{minor}.{patch}_{current_date}"
    if dir:
        # Directories have no extension: keep the full name, dots included.
        new_file_name = f"{file_name}{version_suffix}"
    else:
        file_base, file_extension = os.path.splitext(file_name)
        new_file_name = f"{file_base}{version_suffix}{file_extension}"

    # Construct the full path for the new file
    new_file_path = os.path.join(dir_name, new_file_name)

    # Ask before clobbering an existing target
    if os.path.exists(new_file_path):
        print(f"File {new_file_path} already exists.")
        user_input = input(f"Are you sure you want to overwrite {new_file_path}? (y/n): ")
        if user_input.strip().lower() != "y":
            print("Renaming aborted.")
            return

    # os.replace overwrites an existing file on every platform, whereas
    # os.rename raises FileExistsError on Windows.
    os.replace(source_path, new_file_path)

    print(f"File renamed to: {new_file_name}")


In [92]:
# Stamp the compiled schema with its semantic version and today's date.
file_path = os.path.join(os.getcwd(), "schema/json/acdc/acdc.json")
version_schema(file_path, major=1, minor=1, patch=1)
    

File /Users/harrijh/Library/CloudStorage/GoogleDrive-joshua@biocommons.org.au/My Drive/projects/gen3schemadev/schema/json/acdc/acdc_v1.0.1_20240405.json already exists. Not renaming.
File renamed to: acdc_v1.0.1_20240405.json


In [59]:
# cleaning temp files
!rm -R schema_out

# Generating synthetic data 
- Uses the [AustralianBioCommons fork of umccr-dictionary](https://github.com/AustralianBioCommons/umccr-dictionary.git)
- Then uses gen3SchemaDev to make plausible data values

In [None]:
# TODO: clear any previously simulated data before re-running the simulation (not yet implemented)


In [85]:
# Generating synthetic data using umccr
!cd ../umccr-dictionary && make simulate program=acdc project=AusDiab max_samples=11000
!cd ../umccr-dictionary && make simulate program=acdc project=FIELD max_samples=10000
!cd ../umccr-dictionary && make simulate program=acdc project=BioHEART-CT max_samples=5000

Using .env-sample
Validating Data Dictionary: acdc
[2024-04-05 04:11:14,622][data-simulator][   INFO] Data simulator initialization...
[2024-04-05 04:11:14,623][data-simulator][   INFO] Loading dictionary from url http://ddvis/schema/acdc.json
[2024-04-05 04:11:14,670][data-simulator][   INFO] Initializing graph...
[2024-04-05 04:11:14,671][data-simulator][   INFO] Validating...
[2024-04-05 04:11:14,672][data-simulator][   INFO] Done!
Simulating Data Dictionary: acdc
[2024-04-05 04:11:15,641][data-simulator][   INFO] Data simulator initialization...
[2024-04-05 04:11:15,642][data-simulator][   INFO] Loading dictionary from url http://ddvis/schema/acdc.json
[2024-04-05 04:11:15,688][data-simulator][   INFO] Initializing graph...
[2024-04-05 04:11:15,690][data-simulator][   INFO] Generating data...
[2024-04-05 04:11:15,691][data-simulator simulate][   INFO] Simulating data for node project
[2024-04-05 04:11:29,827][data-simulator simulate][   INFO] Simulating data for node subject
[2024-

In [94]:
# Copying and versioning simulated data
import shutil
from os import makedirs

# Call the name this cell actually imports; the previous `os.makedirs(...)`
# only worked because `os` happened to be imported by an earlier cell.
# exist_ok=True makes the cell safe to re-run.
makedirs('./synthetic_data', exist_ok=True)


In [106]:
# Mirror the simulator's acdc output into this repository's synthetic_data/.
src_dir = '../umccr-dictionary/data/acdc'
dst_dir = './synthetic_data/acdc'
# dirs_exist_ok=True lets the copy run over an existing destination on re-runs.
shutil.copytree(src=src_dir, dst=dst_dir, dirs_exist_ok=True)


'./synthetic_data/acdc'

In [107]:
# Run plausible data generation over the simulated AusDiab JSONs, using the
# Google Sheet passed via --gurl (the recorded run reports reading distribution
# values from it and writing the results to ./edited_jsons).
# `#` and `=` in the URL are backslash-escaped for the shell.
!python3 plausible_data_gen.py --path synthetic_data/acdc/AusDiab --gurl https://docs.google.com/spreadsheets/d/1AX9HLzIV6wtkVylLkwOr3kdKDaZf4ukeYACTJ7lYngk/edit\#gid\=1400179124

2024-04-05 15:58:15 [INFO] Parsing simulated jsons from synthetic_data/acdc/AusDiab
2024-04-05 15:58:15 [INFO] Parsing distribution values from sheet/csv
2024-04-05 15:58:17 [INFO] Writing metadata jsons to file...
2024-04-05 15:58:20 [INFO] Metadata jsons written to: ./edited_jsons


In [108]:
# Stamp the edited-data directory with its semantic version and today's date.
file_path = os.path.join(os.getcwd(), "edited_jsons/acdc")
version_schema(file_path, major=1, minor=1, patch=2, dir=True)


File renamed to: acdc_v1.0.2_20240405


In [111]:
# Move the versioned data directory into synthetic_data/
import glob
import os
import subprocess

dst_dir = './synthetic_data/'

# The directory name embeds the version numbers and the date on which the
# versioning cell ran, so a hard-coded name breaks as soon as either changes.
# Look the directory up instead and take the most recently modified match.
versioned_dirs = sorted(glob.glob('edited_jsons/acdc_v*'), key=os.path.getmtime)
if not versioned_dirs:
    raise FileNotFoundError(
        "No directory matching 'edited_jsons/acdc_v*' found; run the versioning cell first."
    )
src_dir = versioned_dirs[-1]

# check=True raises CalledProcessError if `mv` fails.
subprocess.run(["mv", src_dir, dst_dir], check=True)


CompletedProcess(args=['mv', 'edited_jsons/acdc_v1.0.2_20240405', './synthetic_data/'], returncode=0)

## Next steps
1. Apply formatting fixes