# 

# Generating synthetic data for ACDC
- This Jupyter notebook walks through the setup and execution steps for generating synthetic data for the ACDC project.

In [3]:
!git clone https://github.com/AustralianBioCommons/gen3schemadev.git
!cd ./gen3schemadev
!python3 -m venv .venv
!source .venv/bin/activate
!pip3 install -r requirements.txt

LICENSE                        populate_metadata_service.py
Readme.md                      requirements.txt
[1m[36mconfigs[m[m/                       [1m[36mschema[m[m/
datas_submittor.py             [1m[36mschema_out[m[m/
delete_files.py                [1m[36mscripts[m[m/
delete_projects.py             setup.py
[1m[36mfile_type_templates[m[m/           sheet2yaml-CLI.py
fix_links.py                   sheet2yaml.py
[1m[36mgen3schemadev[m[m/                 transform_ausdiab.py
gen_acdc_synthetic_data.ipynb  [1m[36musers[m[m/
plausible_data_gen.py          users.yaml


In [46]:
# Pulling data schema from google sheets
# Runs the repository's sheet2yaml.py script with its default arguments.
# NOTE(review): the YAML output is presumably written to ./schema_out, since a
# later cell copies from that directory - confirm against sheet2yaml.py.
!python3 sheet2yaml.py


In [18]:
# Run if umccr-dictionary does not exist
import os

# Expected location of the umccr-dictionary checkout: a sibling of the current
# working directory. Held in a Python variable because a shell assignment made
# on a `!` line disappears as soon as that line's subshell exits.
path_umccr_dict = os.path.join(os.getcwd(), "..", "umccr-dictionary")

# Uncomment to clone. The target is the new directory itself (quoted, because
# the path may contain spaces), not the already-populated parent directory.
# !git clone https://github.com/umccr/umccr-dictionary.git "{path_umccr_dict}"
print(path_umccr_dict)


/Users/harrijh/Library/CloudStorage/GoogleDrive-joshua@biocommons.org.au/My Drive/projects/gen3schemadev/../umccr-dictionary


In [47]:
# Copying the contents of schema_out into the umccr-dictionary checkout
# Create the acdc schema directory in the sibling umccr-dictionary folder
# (-p: also creates missing parents and is a no-op if it already exists).
!mkdir -p ../umccr-dictionary/dictionary/acdc/gdcdictionary/schemas
# Copy (not move) the files directly under schema_out/; schema_out itself is
# left in place.
!cp schema_out/* ../umccr-dictionary/dictionary/acdc/gdcdictionary/schemas/
# List the destination so the copied files and their timestamps can be checked.
!ls -lsha ../umccr-dictionary/dictionary/acdc/gdcdictionary/schemas/

total 424
  0 drwxr-xr-x@ 30 harrijh  staff   960B Mar 27 14:33 [1m[36m.[m[m
  0 drwxr-xr-x@  3 harrijh  staff    96B Mar 27 14:33 [1m[36m..[m[m
 16 -rw-r--r--@  1 harrijh  staff   4.0K Mar 28 16:12 _definitions.yaml
  8 -rw-r--r--@  1 harrijh  staff    25B Mar 28 16:12 _settings.yaml
144 -rw-r--r--@  1 harrijh  staff    72K Mar 28 16:12 _terms.yaml
  8 -rw-r--r--@  1 harrijh  staff   1.2K Mar 28 16:12 acknowledgement.yaml
  8 -rw-r--r--@  1 harrijh  staff   2.3K Mar 28 16:12 aligned_reads_file.yaml
  8 -rw-r--r--@  1 harrijh  staff   1.6K Mar 28 16:12 aligned_reads_index_file.yaml
  8 -rw-r--r--@  1 harrijh  staff   1.5K Mar 28 16:12 blood_pressure_test.yaml
 16 -rw-r--r--@  1 harrijh  staff   4.8K Mar 28 16:12 core_metadata_collection.yaml
  8 -rw-r--r--@  1 harrijh  staff   2.1K Mar 28 16:12 demographic.yaml
  8 -rw-r--r--@  1 harrijh  staff   1.2K Mar 28 16:12 exposure.yaml
  8 -rw-r--r--@  1 harrijh  staff   1.4K Mar 28 16:12 genomics_assay.yaml
  8 -rw-r--r--@  1 harrijh 

In [48]:
# Compiling dictionary YAMLs to JSON
# Each line repeats `cd ../umccr-dictionary &&` on purpose: every `!` command
# runs in a fresh subshell, so a directory change does not carry over.
# NOTE(review): pull / up / ps appear to wrap the project's container tooling
# (the captured output shows service images being pulled) - confirm in the
# umccr-dictionary Makefile.
!cd ../umccr-dictionary && make pull
!cd ../umccr-dictionary && make up
!cd ../umccr-dictionary && make ps
# Compile the dictionary for the acdc program.
# NOTE(review): presumably writes ../umccr-dictionary/schema/acdc.json, which a
# later cell copies - confirm.
!cd ../umccr-dictionary && make compile program=acdc


Using .env-sample


[1A[1B[0G[?25l[+] Pulling 0/0
 [33m⠋[0m ddvis Pulling                                                           [34m0.1s [0m
 [33m⠋[0m dmutils Pulling                                                         [34m0.1s [0m
 [33m⠋[0m ddimporter Pulling                                                      [34m0.1s [0m
 [33m⠋[0m g3po Pulling                                                            [34m0.1s [0m
 [33m⠋[0m ddsim Pulling                                                           [34m0.1s [0m
 [33m⠋[0m postgres Pulling                                                        [34m0.1s [0m
[?25h[1A[1A[1A[1A[1A[1A[1A[0G[?25l[+] Pulling 0/6
 [33m⠙[0m ddvis Pulling                                                           [34m0.2s [0m
 [33m⠙[0m dmutils Pulling                                                         [34m0.2s [0m
 [33m⠙[0m ddimporter Pulling                                                      [34m0.2s [0m
 [33m⠙[0m g3p

In [49]:
# Running Validation
# Invokes the `validate` make target for the acdc program from inside the
# umccr-dictionary checkout (the cd is scoped to this one line).
# NOTE(review): assumes the services started by `make up` are still running -
# confirm.
!cd ../umccr-dictionary && make validate program=acdc

Using .env-sample
Validating Data Dictionary: acdc
[2024-03-28 05:13:32,935][data-simulator][   INFO] Data simulator initialization...
[2024-03-28 05:13:32,936][data-simulator][   INFO] Loading dictionary from url http://ddvis/schema/acdc.json
[2024-03-28 05:13:32,992][data-simulator][   INFO] Initializing graph...
[2024-03-28 05:13:32,993][data-simulator][   INFO] Validating...
[2024-03-28 05:13:32,994][data-simulator][   INFO] Done!


In [63]:
# Visualising data dictionary
# Opens the acdc schema page served on localhost:8080 with the system's
# default handler.
# NOTE(review): `open` is the macOS launcher; other platforms need their own
# equivalent. Assumes the viewer from the earlier `make up` step is listening
# on port 8080 - confirm.
!open http://localhost:8080/#schema/acdc.json

### Prototyping functions to version the schema
- Versioned files are named `<name>_vMAJOR.MINOR.PATCH_YYYYMMDD.json`, e.g. `acdc.json` becomes `acdc_v1.0.0_20240328.json`.

In [55]:
# Pulling json schema back into gen3schemadev
# Create the local destination folder (no-op if it already exists).
!mkdir -p schema/json/acdc/
# Copy acdc.json from the umccr-dictionary checkout; an existing
# schema/json/acdc/acdc.json is overwritten.
!cp ../umccr-dictionary/schema/acdc.json schema/json/acdc/

In [54]:
# function to rename schema with version
import datetime
import os

def version_schema(file_path, major, minor, patch, overwrite=None):
    """
    Renames a schema file in place, adding a semantic version and a date stamp.

    ``<dir>/<name><ext>`` becomes
    ``<dir>/<name>_v<major>.<minor>.<patch>_<YYYYMMDD><ext>``, where the date is
    the local date at the time of the call.

    Args:
        file_path (str): Path to the schema file.
        major (int): Major version number.
        minor (int): Minor version number.
        patch (int): Patch version number.
        overwrite (bool | None): What to do when the versioned file already
            exists. ``None`` (default) asks interactively via ``input()``;
            ``True`` replaces it without asking; ``False`` leaves both files
            untouched. Pass an explicit value for unattended runs
            (e.g. Restart & Run All).

    Returns:
        None

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Fail before prompting so the user is never asked about a rename that
    # cannot happen.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Schema file not found: {file_path}")

    # Get the current date in YYYYMMDD format
    current_date = datetime.datetime.now().strftime("%Y%m%d")

    # Split into directory, base name and extension
    dir_name, file_name = os.path.split(file_path)
    file_base, file_extension = os.path.splitext(file_name)

    # Construct the new file name and its full path
    new_file_name = f"{file_base}_v{major}.{minor}.{patch}_{current_date}{file_extension}"
    new_file_path = os.path.join(dir_name, new_file_name)

    if os.path.exists(new_file_path):
        # Only report the collision here; whether the rename goes ahead is
        # decided below.
        print(f"File {new_file_path} already exists.")
        if overwrite is None:
            user_input = input(f"Are you sure you want to overwrite {new_file_path}? (y/n): ")
            overwrite = user_input.strip().lower() == "y"
        if not overwrite:
            print("Renaming aborted.")
            return

    # os.replace overwrites an existing destination on every platform, whereas
    # os.rename raises on Windows when the destination exists.
    os.replace(file_path, new_file_path)

    print(f"File renamed to: {new_file_name}")


In [58]:
# Stamp the compiled ACDC schema in the working directory as version 1.0.0
file_path = os.path.join(os.getcwd(), "schema/json/acdc/acdc.json")
version_schema(file_path, major=1, minor=0, patch=0)
    

File /Users/harrijh/Library/CloudStorage/GoogleDrive-joshua@biocommons.org.au/My Drive/projects/gen3schemadev/schema/json/acdc/acdc_v1.0.0_20240328.json already exists. Not renaming.
File renamed to: acdc_v1.0.0_20240328.json


In [59]:
# cleaning temp files
# Recursively delete the schema_out directory; its contents were copied into
# umccr-dictionary in an earlier cell. Without -f, rm reports an error if the
# directory is already gone (e.g. when this cell is re-run).
!rm -R schema_out

In [61]:
!python3 setup.py install

running install
running bdist_egg
running egg_info
creating gen3schemadev.egg-info
writing gen3schemadev.egg-info/PKG-INFO
writing dependency_links to gen3schemadev.egg-info/dependency_links.txt
writing requirements to gen3schemadev.egg-info/requires.txt
writing top-level names to gen3schemadev.egg-info/top_level.txt
writing manifest file 'gen3schemadev.egg-info/SOURCES.txt'
reading manifest file 'gen3schemadev.egg-info/SOURCES.txt'
adding license file 'LICENSE'
writing manifest file 'gen3schemadev.egg-info/SOURCES.txt'
installing library code to build/bdist.macosx-10.9-universal2/egg
running install_lib
running build_py
creating build
creating build/lib
creating build/lib/gen3schemadev
copying gen3schemadev/gen3object.py -> build/lib/gen3schemadev
copying gen3schemadev/__init__.py -> build/lib/gen3schemadev
copying gen3schemadev/gen3properties.py -> build/lib/gen3schemadev
copying gen3schemadev/schemabundle.py -> build/lib/gen3schemadev
creating build/bdist.macosx-10.9-universal2
crea

In [62]:
import gen3schemadev
gen3schemadev.