In [3]:
from core.utils import Tibanna
from core import ff_utils

#format for input json in hic-partII
def make_input_file_json(obj_ids, arg_name, tibanna, bucket=None):
    '''
    Build one entry of the 'input_files' list of a workflow input json.

    obj_ids can be either a string or a list of file identifiers; each one
    is looked up with ff_utils.get_metadata and must have an 'upload_key'
    and a 'uuid' in its metadata.

    arg_name is stored as the 'workflow_argument_name' of the entry.

    tibanna supplies the connection keys (tibanna.ff_keys) and the s3
    helper (tibanna.s3) used to verify that each file exists.

    bucket is the S3 bucket the files are expected in.  When omitted it
    falls back to tibanna.s3.outfile_bucket, so three-argument calls keep
    working.

    Returns a dict of the form
    {
      "bucket_name": "%s",
      "object_key": "%s",
      "uuid" : "%s",
      "workflow_argument_name": "%s"
    }
    where 'object_key' and 'uuid' are plain values when exactly one file
    was resolved and lists otherwise.

    Returns None as soon as one of the files is not found on S3, so callers
    can detect the failure with a simple truth test.
    '''
    if bucket is None:
        bucket = tibanna.s3.outfile_bucket

    ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
    if not isinstance(obj_ids, list):
        obj_ids = [obj_ids]

    object_key_list = []
    uuid_list = []
    for obj_id in obj_ids:
        metadata = ff_utils.get_metadata(obj_id, connection=ff)
        upload_key = metadata['upload_key']

        # just make sure the file is on s3, otherwise bail
        print("looking for upload key %s, on bucket %s" % (upload_key, bucket))
        if not tibanna.s3.does_key_exist(upload_key, bucket=bucket):
            # A partial input list would silently run the workflow with
            # missing inputs, so give up on the whole entry instead.
            return None

        # upload keys look like "<uuid>/<file name>"; keep the file name part
        object_key_list.append(upload_key.split('/')[1])
        uuid_list.append(metadata['uuid'])

    # a single file is reported as a scalar rather than a one-element list
    if len(uuid_list) == 1:
        uuid_list = uuid_list[0]
    if len(object_key_list) == 1:
        object_key_list = object_key_list[0]

    data = {'bucket_name': bucket,
            'object_key': object_key_list,
            'uuid': uuid_list,
            'workflow_argument_name': arg_name
            }
    return data

def make_hic2_json(input_files, env, output_bucket, accession):
    '''
    Assemble the workflow input json for the "pairsam-parse-sort" app.

    input_files   -- list placed under 'input_files' as-is
    env           -- environment name, stored under '_tibanna' -> 'env'
    output_bucket -- bucket name stored under 'output_bucket'
    accession     -- stored as the '_tibanna' -> 'run_id' of the run

    Returns the complete input json as a dict.  Everything other than the
    four arguments is a fixed value.
    '''
    # Fixed run configuration.
    # NOTE(review): 'password' is a hardcoded literal here -- confirm whether
    # it should come from configuration instead.
    run_config = {
        "ebs_type": "io1",
        "json_bucket": "4dn-aws-pipeline-run-json",
        "ebs_iops": 500,
        "shutdown_min": 30,
        "s3_access_arn": "arn:aws:iam::643366669028:instance-profile/S3_access",
        "ami_id": "ami-7ff26968",
        "copy_to_s3": True,
        "script_url": "https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/",
        "launch_instance": True,
        "password": "hahaha",
        "log_bucket": "tibanna-output",
    }

    # Per-run bookkeeping
    tibanna_settings = {
        "env": env,
        "run_type": "pairsam-parse-sort",
        "run_id": accession,
    }

    payload = {}
    payload['input_files'] = input_files
    payload['output_bucket'] = output_bucket
    payload['workflow_uuid'] = "65586d4b-1e3b-4b31-891e-11f48c816545"
    payload["app_name"] = "pairsam-parse-sort"
    payload["parameters"] = {"nThreads": 1}
    payload["config"] = run_config
    payload["_tibanna"] = tibanna_settings
    return payload
    

In [15]:
from invoke import run
import time
from datetime import datetime

all_sets = [
            'dcic:Selvaraj_gm12878_hic',
            'dekker-lab:ExperimentSet_U54_U54-ESC4DN-FA-DpnII-2017524'
            ]

# Replicate set to inspect; change the index to look at another set.
my_rep_set = all_sets[0]
print(my_rep_set)

# Workflow runs whose display title starts with this prefix count as part I.
wf_partI = "bwa-mem"
env = 'fourfront-webprod'
tibanna = Tibanna(env=env)

ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
rep_set_resp = ff_utils.get_metadata(my_rep_set, connection=ff)
rep_resp = rep_set_resp['experiments_in_set']
set_acc = rep_set_resp['accession']

# NOTE(review): this list is never filled in this cell -- confirm whether a
# later cell expects the bam files found below to be collected here.
bams = []
for exp in rep_resp:
    exp_resp = ff_utils.get_metadata(exp, connection=ff)
    exp_files = exp_resp['files']
    for fastq_file in exp_files:
        file_resp = ff_utils.get_metadata(fastq_file, connection=ff, frame='embedded')
        # Some checks before running
        # check if status is deleted
        if file_resp['status'] == 'deleted':
            print("delete file %s" % file_resp['accession'])
            continue
        # if no uploaded file in the file item report and skip
        if not file_resp.get('filename'):
            print("%s does not have a file" % file_resp['accession'])
            continue
        # check if file is in s3
        head_info = tibanna.s3.does_key_exist(file_resp['upload_key'], tibanna.s3.raw_file_bucket)
        if not head_info:
            print("%s does not have a file in S3" % file_resp['accession'])
            continue

        # skip pair no 2
        if file_resp.get('paired_end') == '2':
            continue

        # The first related file is taken to be the mate of this fastq.
        # Report and skip (like the checks above) when there is none,
        # instead of failing with a KeyError/IndexError.
        related_files = file_resp.get('related_files') or []
        if not related_files:
            print("%s does not have a paired file" % file_resp['accession'])
            continue
        paired_file = related_files[0]['file']['accession']
        print("%s %s" % (file_resp['accession'], paired_file))

        # Find the part I workflow run (if any) that used this file as input.
        # NOTE(review): when several runs match, the last one listed wins --
        # confirm that this is the run whose status should be reported.
        workflows = file_resp.get('workflow_run_inputs')
        partI_wfr = {}
        partI_status = 'did not run'
        if workflows:
            for a_wfr in workflows:
                wfr_resp = ff_utils.get_metadata(a_wfr['uuid'], connection=ff)
                wfr_name = wfr_resp['display_title']
                if wfr_name.startswith(wf_partI):
                    partI_wfr = wfr_resp
                    partI_status = wfr_resp['run_status']

        # Only a completed run can have produced a bam; tolerate a run that
        # lists no output files or no bam among them.
        bam_values = []
        if partI_status == 'complete':
            bam_outputs = partI_wfr.get('output_files') or []
            bam_values = [i['value'] for i in bam_outputs if i.get('format') == 'bam']

        if bam_values:
            bam_file = bam_values[0]
            bam_resp = ff_utils.get_metadata(bam_file, connection=ff)
            print('bam file is %s' % bam_resp['accession'])
        else:
            print('no bam file for paired fastq files %s %s' % (file_resp['accession'], paired_file))
        # blank line between pairs
        print("")


            


dcic:Selvaraj_gm12878_hic
4DNFI5VLWJVD 4DNFI9RZ5M46
bam file is 4DNFIKL2V1JD

4DNFISFTHGMO 4DNFIRHB1DY6
no bam file for paired fastq files 4DNFISFTHGMO 4DNFIRHB1DY6

4DNFIFLPPFEF 4DNFIEOF23ON
no bam file for paired fastq files 4DNFIFLPPFEF 4DNFIEOF23ON

4DNFIF682T66 4DNFIX3JHRA6
bam file is 4DNFIMCX3ZTZ

4DNFIBFNQAPD 4DNFI3B5A5F7
bam file is 4DNFIYY9N5TP

4DNFIPST6TJR 4DNFIBA9UA7A
no bam file for paired fastq files 4DNFIPST6TJR 4DNFIBA9UA7A

4DNFIWTWWMFV 4DNFIDJTCT3M
no bam file for paired fastq files 4DNFIWTWWMFV 4DNFIDJTCT3M

4DNFIBW7YMZK 4DNFIJT9H12M
no bam file for paired fastq files 4DNFIBW7YMZK 4DNFIJT9H12M



In [36]:
from core.utils import Tibanna
from core.utils import run_workflow
import time



# (fastq1, fastq2) accession pairs to run part I on
paired_files = [('4DNFIWTWWMFV','4DNFIDJTCT3M')]

env = 'fourfront-webprod'
tibanna = Tibanna(env=env)
# bucket the workflow output is written to
outfiles = tibanna.s3.outfile_bucket
# Bucket the input files are looked up in.  It is passed explicitly to
# make_input_file_json rather than overwriting tibanna.s3.outfile_bucket,
# so the tibanna object is left untouched.
input_bucket = 'elasticbeanstalk-fourfront-webprod-files'

# TODO: need a function to determine this given fastq1
index = make_input_file_json('4DNFIZQZ39L9', 'bwa_index', tibanna, input_bucket)

for pair in paired_files:
    fastq1 = make_input_file_json(pair[0], 'fastq1', tibanna, input_bucket)
    fastq2 = make_input_file_json(pair[1], 'fastq2', tibanna, input_bucket)

    input_files = [fastq1, fastq2, index]
    # An entry is usable only if it was built and actually carries an object
    # key; a bare truth test on the dict would always pass.
    if all(entry and entry['object_key'] for entry in input_files):
        name = fastq1['object_key'].split('.')[0] + "-" + fastq2['object_key'].split('.')[0]
        # NOTE(review): make_hic1_json is not defined in the cells shown here;
        # it has to be defined in an earlier cell before this one is run.
        input_json = make_hic1_json(input_files, env, outfiles, name)
        res = run_workflow(input_json)
    else:
        print("some files not found on s3.  Investigate this list %s" % input_files)
    # pause so that runs are started one at a time, under manual control
    raw_input("Press Enter to continue...")

print('Done')


looking for upload key 1f53df95-4cf3-41cc-971d-81bb16c486dd/4DNFIZQZ39L9.bwaIndex.tgz, on bucket elasticbeanstalk-fourfront-webprod-files
looking for upload key e893d235-708b-4a33-bd29-86103c310718/4DNFIWTWWMFV.fastq.gz, on bucket elasticbeanstalk-fourfront-webprod-files
looking for upload key 3ddabeaa-c752-4097-a519-3f53caafdd6b/4DNFIDJTCT3M.fastq.gz, on bucket elasticbeanstalk-fourfront-webprod-files
about to start run bwa-mem_4DNFIWTWWMFV-4DNFIDJTCT3Mab007193-c9a5-4e17-a569-b4b216479b84
response from aws was: 
 {u'startDate': datetime.datetime(2017, 9, 13, 16, 41, 42, 931000, tzinfo=tzlocal()), 'ResponseMetadata': {'RetryAttempts': 0, 'HTTPStatusCode': 200, 'RequestId': 'f3795846-98c3-11e7-b873-eda4ad7905af', 'HTTPHeaders': {'x-amzn-requestid': 'f3795846-98c3-11e7-b873-eda4ad7905af', 'content-length': '196', 'content-type': 'application/x-amz-json-1.0'}}, u'executionArn': u'arn:aws:states:us-east-1:643366669028:execution:run_awsem_workflow_with_ponies:bwa-mem_4DNFIWTWWMFV-4DNFIDJTCT