This notebook is intended to mimic the actions the TSPS service would perform.  

User and control workspace creation is outside the scope of this notebook; both workspaces will need to be created elsewhere (through the UI is probably the easiest).  For the most realistic scenario, the user and control workspaces should be created by different accounts, with the control workspace account given writer access to the user workspace.

1. copy file from user workspace to control workspace
2. update control workspace WDS with new row containing a unique id and location of the copied file
3. create method in CBAS (currently a hello world WDL) - note: this should be done as part of control workspace creation, but since that happens outside of this notebook we still need to make sure the method we're running is consistent
4. launch submission of newly created method using the copied file as input
5. copy file from control workspace to user workspace
6. update user workspace WDS with new row containing same unique id and location of the copied file


## Setup

In [1]:
import json
import os
import requests
import csv
from pprint import pprint

In [2]:
# Base URLs of the services for the selected environment.
# None of them ends with a slash: every helper below builds its request URI as
# f'{base_url}/api/...', so a trailing slash here would produce '//api/...'.
env = "prod"
WSM_URL = f'https://workspace.dsde-{env}.broadinstitute.org'
TSPS_URL = f'https://tsps.dsde-{env}.broadinstitute.org'
ORCH_URL = f'https://firecloud-orchestration.dsde-{env}.broadinstitute.org'
LEONARDO_URL = f'https://leonardo.dsde-{env}.broadinstitute.org'


In [3]:
def get_access_token(verbose=False):
    """Get access token for pet managed identity in Azure.

    Runs `az login --identity` (with its output suppressed unless `verbose`
    is true) and then reads the token from `az account get-access-token`.

    NOTE: the body uses IPython `!` shell magics, so it only works inside a
    Jupyter/IPython kernel, and it needs the `az` and `jq` commands.

    Args:
        verbose: when true, let `az login` print its normal output.

    Returns:
        The first line of the `jq .accessToken` output with every
        double-quote character removed.
    """
    if verbose:
        !az login --identity --allow-no-subscriptions
    else:
        # same login, but ask the CLI not to print anything
        !az login --identity --allow-no-subscriptions --output none
    # `!cmd` assigned to a name captures the command's output lines as a list
    cli_token = !az account get-access-token | jq .accessToken

    # the captured token still carries double quotes; strip them
    return cli_token[0].replace('"', '')

def get_headers(verb='GET', verbose=False):
    """Build the HTTP headers for an authenticated request.

    Args:
        verb: HTTP verb the headers are for; only the exact string 'POST'
            adds a JSON `Content-Type` header.
        verbose: forwarded to `get_access_token`.

    Returns:
        A dict with `Authorization` (bearer token) and `accept` entries,
        plus `Content-Type: application/json` for 'POST'.
    """
    bearer = 'Bearer ' + get_access_token(verbose)
    request_headers = {'Authorization': bearer, 'accept': '*/*'}

    if verb != 'POST':
        return request_headers

    request_headers['Content-Type'] = 'application/json'
    return request_headers

In [14]:
# Build the request headers once and share them with every helper below.
# Each get_headers() call performs its own `az login` + token fetch, so these
# are computed a single time here; re-run this cell to refresh them as needed
# (e.g. if requests start failing with auth errors - presumably an expired
# token, TODO confirm token lifetime).
HEADERS_GET = get_headers('GET')
HEADERS_POST = get_headers('POST')

### workspace functions
def get_workspace_id(ws_project, ws_name, orch_url=ORCH_URL, verbose=False):
    """Look up a workspace's ID through the orchestration API.

    Args:
        ws_project: project segment of the workspace path.
        ws_name: workspace name segment of the workspace path.
        orch_url: base URL of the orchestration service.
        verbose: currently unused; kept so all helpers share a signature.

    Returns:
        The `workspaceId` field of the response's `workspace` object.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
    """
    uri = f'{orch_url}/api/workspaces/{ws_project}/{ws_name}'

    response = requests.get(uri, headers=HEADERS_GET)
    # Surface the real HTTP failure (wrong name, no access, expired token)
    # instead of an opaque KeyError from the lookup below.
    response.raise_for_status()

    return response.json()['workspace']['workspaceId']

def get_workspace_sc_resource_id(ws_id, wsm_url=WSM_URL, verbose=False):
    """Find the shared-access storage container resource of a workspace.

    Lists the workspace's AZURE_STORAGE_CONTAINER resources (offset 0,
    limit 10) and picks those whose access scope is 'SHARED_ACCESS'.

    Args:
        ws_id: workspace ID.
        wsm_url: base URL of the workspace manager service.
        verbose: currently unused.

    Returns:
        The `resourceId` of the last matching resource in the listing, or
        None when no listed resource matches.
    """
    listing_uri = f'{wsm_url}/api/workspaces/v1/{ws_id}/resources?offset=0&limit=10&resource=AZURE_STORAGE_CONTAINER'
    listing = requests.get(listing_uri, headers=HEADERS_GET).json()

    shared_ids = [
        entry['metadata']['resourceId']
        for entry in listing['resources']
        if entry['metadata']['controlledResourceMetadata']['accessScope'] == 'SHARED_ACCESS'
    ]

    # When several containers match, the last one listed wins.
    return shared_ids[-1] if shared_ids else None


### file functions
def get_sas_token_for_blob(blob_name,
                           ws_id,
                           ws_sc_id,
                           permissions='r',
                           wsm_url=WSM_URL,
                           verbose=False):
    """Request a SAS URL for one blob in a workspace storage container.

    Args:
        blob_name: name (path) of the blob inside the container.
        ws_id: workspace ID.
        ws_sc_id: resource ID of the workspace's storage container.
        permissions: value sent as `sasPermissions` (e.g. 'r' or 'w').
        wsm_url: base URL of the workspace manager service.
        verbose: currently unused.

    Returns:
        The `url` field of the service's JSON response.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
    """
    uri = (f'{wsm_url}/api/workspaces/v1/{ws_id}/resources/controlled/azure/'
           f'storageContainer/{ws_sc_id}/getSasToken')
    # Pass the query via `params` so requests URL-encodes the values; a blob
    # name containing '&', '#' or '+' would otherwise corrupt the query string.
    query = {'sasPermissions': permissions, 'sasBlobName': blob_name}

    # The POST body is empty, so the headers without a JSON Content-Type are used.
    response = requests.post(uri, headers=HEADERS_GET, params=query, data='')
    # Fail with the HTTP error rather than a KeyError on the missing 'url' key.
    response.raise_for_status()

    return response.json()['url']

# leonardo functions
def get_apps_for_workspace_id(workspace_id,
                              leonardo_url=LEONARDO_URL,
                              verbose=False):
    """List the non-deleted apps of a workspace.

    Args:
        workspace_id: workspace ID.
        leonardo_url: base URL of the Leonardo service.
        verbose: currently unused.

    Returns:
        The decoded JSON body of the apps listing (the HTTP status is not
        checked).
    """
    listing_uri = f'{leonardo_url}/api/apps/v2/{workspace_id}?includeDeleted=false'
    return requests.get(listing_uri, headers=HEADERS_GET).json()

def get_app_for_workspace_id(workspace_id,
                             app_name,
                             leonardo_url=LEONARDO_URL,
                             verbose=False):
    """Fetch a single named app of a workspace.

    The previous implementation ignored `app_name` and issued the same
    request as `get_apps_for_workspace_id`; the app name is now part of the
    request path so only that app is returned.

    Args:
        workspace_id: workspace ID.
        app_name: name of the app to fetch.
        leonardo_url: base URL of the Leonardo service.
        verbose: currently unused.

    Returns:
        The decoded JSON body of the response (the HTTP status is not
        checked).
    """
    uri = f'{leonardo_url}/api/apps/v2/{workspace_id}/{app_name}'
    response = requests.get(uri, headers=HEADERS_GET)

    return response.json()

# wds functions
def get_types(instance_id,
              wds_url,
              api_version="0.2",
              verbose=False):
    """GET the `types` endpoint of a WDS instance.

    Args:
        instance_id: WDS instance ID used in the request path.
        wds_url: base URL of the WDS app.
        api_version: API version placed in the path as `v{api_version}`.
        verbose: currently unused.

    Returns:
        The decoded JSON body of the response (the HTTP status is not
        checked).
    """
    endpoint = f'{wds_url}/{instance_id}/types/v{api_version}'
    return requests.get(endpoint, headers=HEADERS_GET).json()
              
def query_records_for_type(instance_id,
                           type_name,
                           wds_url,
                           api_version="0.2",
                           offset=0,
                           limit=100,
                           sort="asc",
                           verbose=False):
    """POST a search request for the records of one WDS type.

    Args:
        instance_id: WDS instance ID used in the request path.
        type_name: record type (table) to search.
        wds_url: base URL of the WDS app.
        api_version: API version placed in the path as `v{api_version}`.
        offset: `offset` value of the search body.
        limit: `limit` value of the search body.
        sort: `sort` value of the search body.
        verbose: currently unused.

    Returns:
        The decoded JSON body of the response (the HTTP status is not
        checked).
    """
    search_request = {'offset': offset, 'limit': limit, 'sort': sort}
    endpoint = f'{wds_url}/{instance_id}/search/v{api_version}/{type_name}'

    reply = requests.post(endpoint,
                          headers=HEADERS_POST,
                          data=json.dumps(search_request))
    return reply.json()

def upload_records_for_type(instance_id,
                            type_name,
                            wds_url,
                            tsv_file_path,
                            api_version="0.2",
                            verbose=False):
    """Upload a TSV file of records for one WDS type.

    Args:
        instance_id: WDS instance ID used in the request path.
        type_name: record type (table) the rows are uploaded to.
        wds_url: base URL of the WDS app.
        tsv_file_path: path of the local TSV file to upload.
        api_version: API version placed in the path as `v{api_version}`.
        verbose: currently unused.

    Returns:
        The decoded JSON body of the response (the HTTP status is not
        checked).
    """
    uri = f'{wds_url}/{instance_id}/tsv/v{api_version}/{type_name}'

    # Open the file in a context manager so the handle is closed once the
    # request has been sent (it used to be left open).
    with open(tsv_file_path, "rb") as tsv_file:
        # HEADERS_GET carries no Content-Type, which lets requests set the
        # multipart one itself for the `files=` upload.
        response = requests.post(uri, headers=HEADERS_GET, files={"records": tsv_file})

    return response.json()

# cbas functions
def create_method_with_github_link(cbas_url,
                                   method_name,
                                   github_url,
                                   method_version,
                                   method_source="GitHub",
                                   verbose=False):
    """POST a new method definition to CBAS.

    Args:
        cbas_url: base URL of the CBAS app.
        method_name: sent as `method_name`.
        github_url: sent as `method_url`.
        method_version: sent as `method_version`.
        method_source: sent as `method_source`.
        verbose: currently unused.

    Returns:
        The decoded JSON body of the response (the HTTP status is not
        checked).
    """
    method_spec = {
        'method_name': method_name,
        'method_source': method_source,
        'method_version': method_version,
        'method_url': github_url,
    }
    endpoint = f'{cbas_url}/api/batch/v1/methods'

    reply = requests.post(endpoint,
                          headers=HEADERS_POST,
                          data=json.dumps(method_spec))
    return reply.json()

def get_methods(cbas_url,
                show_versions=True,
                verbose=False):
    """GET the methods known to a CBAS app.

    Args:
        cbas_url: base URL of the CBAS app.
        show_versions: formatted into the `show_versions` query parameter.
        verbose: currently unused.

    Returns:
        The decoded JSON body of the response (the HTTP status is not
        checked).
    """
    endpoint = f'{cbas_url}/api/batch/v1/methods?show_versions={show_versions}'
    return requests.get(endpoint, headers=HEADERS_GET).json()

def get_method_info(cbas_url,
                    method_id,
                    verbose=False):
    """GET the CBAS methods endpoint filtered by `method_id`.

    Args:
        cbas_url: base URL of the CBAS app.
        method_id: value of the `method_id` query parameter.
        verbose: currently unused.

    Returns:
        The decoded JSON body of the response (the HTTP status is not
        checked).
    """
    endpoint = f'{cbas_url}/api/batch/v1/methods?method_id={method_id}'
    return requests.get(endpoint, headers=HEADERS_GET).json()

def get_run_sets_for_method(cbas_url,
                            method_id,
                            page_size=1,
                            verbose=False):
    """GET the run sets of one CBAS method.

    Args:
        cbas_url: base URL of the CBAS app.
        method_id: value of the `method_id` query parameter.
        page_size: value of the `page_size` query parameter. It used to be
            ignored in favour of a hard-coded 1; the default keeps that
            behaviour.
        verbose: currently unused.

    Returns:
        The decoded JSON body of the response (the HTTP status is not
        checked).
    """
    uri = f'{cbas_url}/api/batch/v1/run_sets?method_id={method_id}&page_size={page_size}'
    response = requests.get(uri, headers=HEADERS_GET)

    return response.json()

def submit_run_set(cbas_url,
                   post_body,
                   verbose=False):
    """POST a run set submission to CBAS.

    Args:
        cbas_url: base URL of the CBAS app.
        post_body: request body, sent as-is (callers pass a JSON string).
        verbose: currently unused.

    Returns:
        The decoded JSON body of the response (the HTTP status is not
        checked).
    """
    endpoint = f'{cbas_url}/api/batch/v1/run_sets'
    return requests.post(endpoint, headers=HEADERS_POST, data=post_body).json()

### grab user/control workspace related information - needs to be filled out

In [5]:
# ---- control workspace: IDs, then the proxy URLs of its CBAS and WDS apps ----
ctrl_ws_project = 'dsp-azure-general'
ctrl_ws_name = 'js-imputation-pipeline-testingg'

ctrl_ws_id = get_workspace_id(ctrl_ws_project, ctrl_ws_name)
ctrl_ws_sc_id = get_workspace_sc_resource_id(ctrl_ws_id)

print(f'control workspace ID:                  {ctrl_ws_id}')
print(f'control storage container resource ID: {ctrl_ws_sc_id}')

# Both URIs stay '' when the workspace has no app of the matching type.
ctrl_cbas_uri = ''
ctrl_wds_uri = ''
get_apps_response = get_apps_for_workspace_id(ctrl_ws_id)
for app in get_apps_response:
    # CBAS is reached through the proxy URLs of the CROMWELL app
    if app['appType'] == 'CROMWELL':
        ctrl_cbas_uri = app['proxyUrls']['cbas']
    if app['appType'] == 'WDS':
        ctrl_wds_uri = app['proxyUrls']['wds']

print(f'control cbas uri ID:                  {ctrl_cbas_uri}')
print(f'control wds uri:                      {ctrl_wds_uri}')

# ---- user workspace: IDs, then the proxy URL of its WDS app ----
user_ws_project = 'azure-featured-workspaces'
user_ws_name = 'Imputation User Workspace'

user_ws_id = get_workspace_id(user_ws_project, user_ws_name)
user_ws_sc_id = get_workspace_sc_resource_id(user_ws_id)

print(f'user workspace ID:                  {user_ws_id}')
print(f'user storage container resource ID: {user_ws_sc_id}')

user_wds_uri = ''
get_apps_response = get_apps_for_workspace_id(user_ws_id)
for app in get_apps_response:
    if app['appType'] == 'WDS':
        user_wds_uri = app['proxyUrls']['wds']

print(f'user wds uri:                      {user_wds_uri}')


control workspace ID:                  7a6ab368-c165-48ae-8b9a-a2cf1138ef1c
control storage container resource ID: bd889533-6a32-4374-8dc4-74c9a697d6d5
control cbas uri ID:                  https://lzf42bf0dc63c251179adc6ee67ef00d66ba53808001a58d33.servicebus.windows.net/terra-app-7e72dad7-3ec0-4ce9-a767-aeee668c1b45-7a6ab368-c165-48ae-8b9a-a2cf1138ef1c/cbas
control wds uri:                      https://lzf42bf0dc63c251179adc6ee67ef00d66ba53808001a58d33.servicebus.windows.net/wds-7a6ab368-c165-48ae-8b9a-a2cf1138ef1c-7a6ab368-c165-48ae-8b9a-a2cf1138ef1c/
user workspace ID:                  a5ec64f3-69e4-4646-8fb6-db4f882a2dd6
user storage container resource ID: ebe08011-551d-4cd8-97eb-07970e90eade
user wds uri:                      https://lz73d55620961cdabc5847f29f7780900840a75d86a8043662.servicebus.windows.net/wds-a5ec64f3-69e4-4646-8fb6-db4f882a2dd6-a5ec64f3-69e4-4646-8fb6-db4f882a2dd6/


## Write to Control WDS inputs "passed in" by user and copy to control workspace

For this example we will just be copying one file from the user workspace to the control workspace and noting its path in WDS

In [6]:
# Copy one file from the user workspace to the control workspace, then record
# the copy's location as a row in the control workspace's WDS.

# you need to upload this file to the "workspace files" (storage container) of the user workspace
user_file_to_copy = 'user_input_file.txt'  # change this as needed

# path to where you want to copy the file in the control workspace, no need to change
ctrl_file_destination = 'user_input_file_copy.txt'

# get a SAS URL for the source file in the user workspace (default 'r' permission)
user_file_sas = get_sas_token_for_blob(user_file_to_copy, user_ws_id, user_ws_sc_id)

# create a target destination SAS token (write permission) in the control workspace
# NOTE: according to documentation (https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-blobs-copy#guidelines)
# if the copy is between tenants (which we do want to support), we can't use Azure Active Directory (Azure AD) authentication
# and must instead use SAS tokens.
ctrl_file_sas = get_sas_token_for_blob(ctrl_file_destination, 
                                       permissions='w',
                                       ws_id=ctrl_ws_id,
                                       ws_sc_id=ctrl_ws_sc_id)

# azcopy needs the sas urls to be in quotes: the values are substituted into a
# shell command line below, and the URLs carry a query string
source_file_sas_with_quotes = f"'{user_file_sas}'"
dest_file_sas_with_quotes = f"'{ctrl_file_sas}'"

# do the copy (IPython shell magic; $name substitutes the Python variable)
!azcopy copy $source_file_sas_with_quotes $dest_file_sas_with_quotes

# path to where you want to save wds tsv for the control workspace, no real need to change
tsv_file_path = 'ctrl_wds.tsv'

# data you want to add to control workspace wds, change as needed
# the file URI is the destination SAS URL with its query string (everything from '?') dropped
ctrl_file_uri = ctrl_file_sas.split("?")[0] #TODO is there a way to get this uri from an api call directly?
wds_table_name = "test_imputation_input"
# 'input_file_location' is the record attribute the run set submission looks up for its input
wds_table_headers = ['submission_id', 'input_file_location']
# row id; reused by the later cells as the record id and as the id written to the user WDS
wds_row_id = '123456'

# write a two-line TSV: the header row, then the single data row
with open(tsv_file_path, 'w', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
    writer.writerow(wds_table_headers)
    writer.writerow([wds_row_id, ctrl_file_uri])
    
# upload the TSV to the control WDS; as the last expression its response is shown as the cell output
upload_records_for_type(ctrl_ws_id, wds_table_name, ctrl_wds_uri, tsv_file_path)

INFO: Scanning...
INFO: Failed to create one or more destination container(s). Your transfers may still succeed if the container already exists.
INFO: Any empty folders will not be processed, because source and/or destination doesn't have full folder support

Job a9baddd6-25b1-2942-5321-964395eda833 has started
Log file is located at: /home/jupyter/.azcopy/a9baddd6-25b1-2942-5321-964395eda833.log

INFO: azcopy: A newer version 10.20.0 is available to download

INFO: Could not read destination length. If the destination is write-only, use --check-length=false on the command line.
100.0 %, 1 Done, 0 Failed, 0 Pending, 0 Skipped, 1 Total, 


Job a9baddd6-25b1-2942-5321-964395eda833 summary
Elapsed Time (Minutes): 0.0333
Number of File Transfers: 1
Number of Folder Property Transfers: 0
Total Number of Transfers: 1
Number of Transfers Completed: 1
Number of Transfers Failed: 0
Number of Transfers Skipped: 0
TotalBytesTransferred: 0
Final Job Status: Completed



{'recordsModified': 1, 'message': 'Updated test_imputation_input'}

## create method from github link and run a workflow

In [16]:
# Register the hello-world WDL as a CBAS method from its GitHub URL, then
# submit a run set for it using the WDS row written by the previous cell.
method_name = 'tsps_notebook_hello_world_test_v1'
# The URL previously started with a stray space, which made it an invalid URL.
github_url = 'https://github.com/broadinstitute/warp/blob/js_try_imputation_azure/pipelines/broad/arrays/imputation/hello_world.wdl'
method_version = "1.0"

# The response is not inspected here; the method is looked up by name below.
# NOTE(review): on a re-run the method presumably already exists - confirm how
# CBAS answers in that case.
create_method_with_github_link(ctrl_cbas_uri, method_name, github_url, method_version)

# for the newly created method, get its method_id and method_version_id used for downstream functions
method_id = ""
method_version_id = ""
get_methods_response = get_methods(ctrl_cbas_uri)
for method in get_methods_response['methods']:
    if method['name'] == method_name:
        method_id = method['method_id']
        method_version_id = method['method_versions'][0]['method_version_id']

# Stop here rather than query run sets and submit with an empty method id.
if not method_id:
    raise ValueError(f'no CBAS method named {method_name!r} found at {ctrl_cbas_uri}')

print(f'method id: {method_id}')
print(f'method version id: {method_version_id}')

# in order to get the workflow_input_description and workflow_output_description that is used
# when generating a submission you can call the run_sets endpoint with your method id
run_set_response = get_run_sets_for_method(ctrl_cbas_uri, method_id)
run_set = run_set_response['run_sets'][0]
workflow_input_definition = run_set['input_definition']
workflow_output_definition = run_set['output_definition']

# these can be very large so commenting them out by default

#print(f'input definition: {workflow_input_definition}')
#print(f'output definition: {workflow_output_definition}')


# these values needed for a submission
run_set_name = "what to do"
run_set_description = "any description you want from notebook"

# Submission body for this hello world wdl. You would want to generate it from
# the workflow input/output definitions fetched above, but as a PoC it is easier
# to use a known template and fill in the values that are unique to this run.
run_set_submission_post_template = {
    "run_set_name": run_set_name,
    "run_set_description": run_set_description,
    "method_version_id": method_version_id,
    "workflow_input_definitions": [
        {
            "input_name": "HelloWorld.input_file",
            "input_type": {"type": "primitive", "primitive_type": "File"},
            # the input is read from this attribute of the WDS record
            "source": {"type": "record_lookup", "record_attribute": "input_file_location"},
        }
    ],
    "workflow_output_definitions": [
        {
            "output_name": "HelloWorld.output_file",
            "output_type": {"type": "primitive", "primitive_type": "File"},
            "destination": {"type": "record_update", "record_attribute": "output_file"},
        }
    ],
    # wds_table_name / wds_row_id come from the cell that wrote the control WDS row
    "wds_records": {"record_type": wds_table_name, "record_ids": [wds_row_id]},
}

# create your submission
print(f'submission response: {submit_run_set(ctrl_cbas_uri, json.dumps(run_set_submission_post_template))}')

method id: 4ad3bd70-486f-4d6a-be75-3f05af2881ce
method version id: a1d09d99-332f-4046-b973-29e2db45b896
{'run_set_name': 'what to do', 'run_set_description': 'any description you want from notebook', 'method_version_id': 'a1d09d99-332f-4046-b973-29e2db45b896', 'workflow_input_definitions': [{'input_name': 'HelloWorld.input_file', 'input_type': {'type': 'primitive', 'primitive_type': 'File'}, 'source': {'type': 'record_lookup', 'record_attribute': 'input_file_location'}}], 'workflow_output_definitions': [{'output_name': 'HelloWorld.output_file', 'output_type': {'type': 'primitive', 'primitive_type': 'File'}, 'destination': {'type': 'record_update', 'record_attribute': 'output_file'}}], 'wds_records': {'record_type': 'test_imputation_input', 'record_ids': ['123456']}}
submission response: {'run_set_id': '253ec0dd-9834-4506-9ca7-05b904d617c6', 'runs': [{'run_id': 'df23e4f2-31a6-4dea-8abf-04be4caae8c9', 'state': 'UNKNOWN', 'errors': 'null'}], 'state': 'RUNNING'}


## Copy outputs to user workspace and update user WDS with copied output location

In [18]:
# Copy one file from the control workspace to the user workspace, then record
# the copy's location as a row in the user workspace's WDS.

# you need to upload this file to the "workspace files" (storage container) of the control workspace
ctrl_file_to_copy = 'ctrl_output_file.txt'  # change this as needed

# where you want to copy this file in the users workspace, no need to change
user_file_destination = 'blahblah/ctrl_output_file_copy.txt'

# get a SAS URL for the source file in the control workspace (default 'r' permission)
ctrl_file_sas = get_sas_token_for_blob(ctrl_file_to_copy, ctrl_ws_id, ctrl_ws_sc_id)

# create a target destination SAS token (write permission) in the user workspace
# NOTE: according to documentation (https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-blobs-copy#guidelines)
# if the copy is between tenants (which we do want to support), we can't use Azure Active Directory (Azure AD) authentication
# and must instead use SAS tokens.
user_file_sas = get_sas_token_for_blob(user_file_destination, 
                                       permissions='w',
                                       ws_id=user_ws_id,
                                       ws_sc_id=user_ws_sc_id)

# azcopy needs the sas urls to be in quotes: the values are substituted into a
# shell command line below, and the URLs carry a query string
source_file_sas_with_quotes = f"'{ctrl_file_sas}'"
dest_file_sas_with_quotes = f"'{user_file_sas}'"

# do the copy (IPython shell magic; $name substitutes the Python variable)
!azcopy copy $source_file_sas_with_quotes $dest_file_sas_with_quotes

# add location of copied file to user WDS along with the "unique identifier"
tsv_file_path = 'user_wds.tsv'
# NOTE(review): despite its name, ctrl_file_uri now holds the URI of the copy in
# the USER workspace (the user SAS URL with its query string dropped)
ctrl_file_uri = user_file_sas.split("?")[0]
# this rebinds wds_table_name, which the submission cell also reads - re-run
# the earlier cell before submitting again
wds_table_name = "test_imputation_output"
wds_table_headers = ['submission_id', 'output_file_location']

# write a two-line TSV: the header row, then one row keyed by the wds_row_id set in the earlier cell
with open(tsv_file_path, 'w', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
    writer.writerow(wds_table_headers)
    writer.writerow([wds_row_id, ctrl_file_uri])
    
# upload the TSV to the user WDS; as the last expression its response is shown as the cell output
upload_records_for_type(user_ws_id, wds_table_name, user_wds_uri, tsv_file_path)

INFO: Scanning...
INFO: Failed to create one or more destination container(s). Your transfers may still succeed if the container already exists.
INFO: Any empty folders will not be processed, because source and/or destination doesn't have full folder support

Job fa5ebd5d-37c7-a846-697e-aa5c608713a3 has started
Log file is located at: /home/jupyter/.azcopy/fa5ebd5d-37c7-a846-697e-aa5c608713a3.log

INFO: azcopy: A newer version 10.20.0 is available to download

INFO: Could not read destination length. If the destination is write-only, use --check-length=false on the command line.
100.0 %, 1 Done, 0 Failed, 0 Pending, 0 Skipped, 1 Total, 


Job fa5ebd5d-37c7-a846-697e-aa5c608713a3 summary
Elapsed Time (Minutes): 0.0335
Number of File Transfers: 1
Number of Folder Property Transfers: 0
Total Number of Transfers: 1
Number of Transfers Completed: 1
Number of Transfers Failed: 0
Number of Transfers Skipped: 0
TotalBytesTransferred: 0
Final Job Status: Completed



{'recordsModified': 1, 'message': 'Updated test_imputation_output'}