In [1]:
from core.utils import Tibanna
from core import ff_utils

#format for input json in hic-partII
def make_input_file_json(obj_ids, arg_name, tibanna, bucket):
    '''
    Build one "input_files" entry of a workflow input json.

    obj_ids can be either a string or a list.
    arg_name is stored under "workflow_argument_name".
    tibanna must expose ff_keys (used to open the metadata connection) and
    s3.does_key_exist(key, bucket=...).
    bucket is the bucket every upload key is looked up in.

    On success the returned dict looks like
    {
      "bucket_name": "%s",
      "object_key": "%s",
      "uuid" : "%s",
      "workflow_argument_name": "%s"
    }
    where "object_key" and "uuid" are plain values when exactly one file was
    requested and parallel lists otherwise.

    Returns None when any requested file is not on s3 (or when no ids were
    given at all), so that callers can test the result for truthiness before
    launching a run.
    '''
    ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
    if not isinstance(obj_ids, list):
        obj_ids = [obj_ids]

    object_key_list = []
    uuid_list = []
    missing_keys = []
    for obj_id in obj_ids:
        metadata = ff_utils.get_metadata(obj_id, connection=ff)
        upload_key = metadata['upload_key']

        # just make sure the file is on s3, otherwise bail
        print("looking for upload key %s, on bucket %s" %
              (upload_key,
               bucket))
        if tibanna.s3.does_key_exist(upload_key, bucket=bucket):
            # upload keys are split on '/' and the second piece is kept
            object_key_list.append(upload_key.split('/')[1])
            uuid_list.append(metadata['uuid'])
        else:
            missing_keys.append(upload_key)

    if missing_keys:
        print("upload key(s) not found on bucket %s: %s" %
              (bucket, ', '.join(missing_keys)))
        return None
    if not uuid_list:
        # nothing was requested, so there is nothing to feed a workflow
        return None

    # a single file is reported as a scalar rather than a one-element list
    if len(uuid_list) == 1:
        uuid_list = uuid_list[0]
        object_key_list = object_key_list[0]

    return {'bucket_name': bucket,
            'object_key': object_key_list,
            'uuid': uuid_list,
            'workflow_argument_name': arg_name
            }


def make_hic2_json(input_files, env, output_bucket, accession):
    '''
    Assemble the run json for the "pairsam-parse-sort" workflow.

    input_files is stored as-is under "input_files".
    env is stored under "_tibanna" -> "env".
    output_bucket is stored under "output_bucket".
    accession is stored under "_tibanna" -> "run_id".

    Everything else (workflow uuid, app name, thread count and the whole
    "config" section) is fixed in this function.
    '''
    # workflow parameters
    parameters = {
        "nThreads": 16
    }

    # fixed run settings stored under "config"
    # NOTE(review): "password" is a hard-coded literal here - confirm whether
    # it should come from configuration instead.
    config = {
        "ebs_type": "io1",
        "json_bucket": "4dn-aws-pipeline-run-json",
        "ebs_iops": 500,
        "shutdown_min": 30,
        "s3_access_arn": "arn:aws:iam::643366669028:instance-profile/S3_access",
        "ami_id": "ami-cfb14bb5",
        "copy_to_s3": True,
        "script_url": "https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/",
        "launch_instance": True,
        "password": "hahaha",
        "log_bucket": "tibanna-output",
        "key_name": "4dn-encode"
    }

    # run bookkeeping stored under "_tibanna"
    tibanna_section = {
        "env": env,
        "run_type": "pairsam-parse-sort",
        "run_id": accession
    }

    return {
        'input_files': input_files,
        'output_bucket': output_bucket,
        'workflow_uuid': "65586d4b-1e3b-4b31-891e-11f48c816545",
        "app_name": "pairsam-parse-sort",
        "parameters": parameters,
        "config": config,
        "_tibanna": tibanna_section
    }


def get_wfr_out(emb_file, wfr_name, file_format, connection=None):
    '''
    Look up the output file of a workflow run that used emb_file as input.

    emb_file is a file metadata dict; its 'workflow_run_inputs' list is
    walked and every entry is fetched by 'uuid'.
    wfr_name is matched against the start of each run's 'display_title';
    when several runs match, the last one in the list wins.
    file_format selects the entry of the run's 'output_files' by 'format'.
    connection is the metadata connection to use; when omitted, the
    notebook-level name ff is used.

    Returns the 'value' of the first output with the requested format, or one
    of the marker strings (all starting with "no"):
      "no workflow in file" - emb_file lists no workflow runs
      "no completed run"    - no matching run, or its status is not 'complete'
      "no file found"       - the run is complete but has no usable output of
                              that format
    '''
    workflows = emb_file.get('workflow_run_inputs')
    if not workflows:
        return "no workflow in file"

    if connection is None:
        # fall back to the module-level connection created elsewhere in the
        # notebook; pass connection explicitly to avoid that dependency
        connection = ff

    wfr = {}
    run_status = 'did not run'
    for a_wfr in workflows:
        wfr_resp = ff_utils.get_metadata(a_wfr['uuid'], connection=connection)
        if wfr_resp['display_title'].startswith(wfr_name):
            wfr = wfr_resp
            run_status = wfr_resp['run_status']

    if run_status != 'complete':
        return "no completed run"

    # a complete run may still lack outputs or lack the requested format;
    # report that with the marker string instead of raising
    outputs = wfr.get('output_files') or []
    file_id = next((out.get('value') for out in outputs
                    if out.get('format') == file_format), None)
    if file_id:
        return file_id
    return "no file found"

In [9]:
from invoke import run
import time
from datetime import datetime


def form_hyp(id):
    '''
    Return a spreadsheet HYPERLINK formula for id: the link target is
    https://data.4dnucleome.org/<id> and the displayed text is id itself.
    '''
    template = '=HYPERLINK("https://data.4dnucleome.org/{target}","{target}")'
    return template.format(target=id)


# Cell purpose: for one replicate set, walk every experiment and every file in
# it, and collect the bam produced by the "bwa-mem" workflow run for each
# fastq pair. Results are accumulated in `bams` as (set accession, bam
# accession) tuples and a tab-separated summary line is printed per bam.

# Candidate replicate sets; the index below selects the one to process.
all_sets = [
            'dcic:Selvaraj_gm12878_hic',
            'dekker-lab:ExperimentSet_U54_U54-ESC4DN-FA-DpnII-2017524',
            'dekker-lab:ExperimentSet_U54_HFFc6-FA-DpnII'
            ]
   
my_rep_set = all_sets[2]
print my_rep_set

# NOTE(review): wf_partI is assigned here but not read again in this cell;
# the get_wfr_out call further down passes the literal "bwa-mem" instead.
wf_partI = "bwa-mem"
env = 'fourfront-webprod'
tibanna = Tibanna(env=env)

# Open the metadata connection and fetch the replicate set.
# `ff` is also read as a module-level name inside get_wfr_out.
ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
rep_set_resp = ff_utils.get_metadata(my_rep_set, connection=ff)
rep_resp = rep_set_resp['experiments_in_set']
set_acc = rep_set_resp['accession']

bams = []      # (set accession, bam accession) for every bam found
f_pairs = 0    # number of files that passed all the checks below
for exp in rep_resp:
    # print 'Experiment', exp
    exp_resp = ff_utils.get_metadata(exp, connection=ff)
    exp_files = exp_resp['files']
    for fastq_file in exp_files:
        file_resp = ff_utils.get_metadata(fastq_file, connection=ff, frame='embedded')  
        # Some checks before running
        # skip files whose status is 'deleted'
        if file_resp['status'] == 'deleted':
            print "delete file", file_resp['accession']
            continue
        # if the file item has no 'filename', report it and skip
        if not file_resp.get('filename'):
            print file_resp['accession'], "does not have a file"
            continue
        # check that the upload key exists in the raw file bucket on s3
        head_info = tibanna.s3.does_key_exist(file_resp['upload_key'], tibanna.s3.raw_file_bucket)
        if not head_info:
            print file_resp['accession'], "does not have a file in S3"
            continue
        
        # skip files marked paired_end '2' so each pair is handled once
        if file_resp.get('paired_end')=='2':
            continue
        
        f_pairs += 1
        # accession of the first related file
        # NOTE(review): presumably the mate of this fastq - confirm; raises if
        # the file has no 'related_files' entries.
        paired_file = file_resp['related_files'][0]['file']['accession']
        #print file_resp['accession'], paired_file,  
        
        # get_wfr_out returns either a file id or a marker string that starts
        # with "no" ("no workflow in file", "no file found", "no completed run")
        bam_file = get_wfr_out(file_resp, "bwa-mem", 'bam')
        if bam_file.startswith('no') or not bam_file:
            # no usable bam: print the marker together with both accessions
            print bam_file, file_resp['accession'], paired_file
        else:
            bam_resp = ff_utils.get_metadata(bam_file, connection=ff, frame='embedded')

            # file_size divided by 1024**3, rounded to two decimals
            # (GiB if file_size is in bytes - TODO confirm)
            f_s = round(bam_resp['file_size']/(1024*1024*1024.0),2)
            # tab-separated line: hyperlink formula, uuid, size
            print form_hyp(bam_resp["accession"])+'\t'+bam_resp["uuid"]+"\t"+str(f_s)
            # NOTE(review): prints the fixed string "same" after every bam
            # line; the purpose is not visible from this cell - confirm.
            print "same"
                  
            #print 'bam file is', bam_resp['accession']
            bams.append((set_acc,bam_resp['accession']))
         
        
print f_pairs, "fastq file pairs in the set"
print len(bams), "bam files"


dekker-lab:ExperimentSet_U54_HFFc6-FA-DpnII
=HYPERLINK("https://data.4dnucleome.org/4DNFIYZ4KUS8","4DNFIYZ4KUS8")	02bc8d2e-0b8d-4512-a3ef-400e026fb6cb	36.23
same
=HYPERLINK("https://data.4dnucleome.org/4DNFITYX1R3C","4DNFITYX1R3C")	3d1f22fb-2d94-43ad-86f3-3bd77adb9da4	36.08
same
=HYPERLINK("https://data.4dnucleome.org/4DNFIKJVRXX1","4DNFIKJVRXX1")	95edc4b0-122b-4ab1-99b3-ebbfa2e67466	36.31
same
=HYPERLINK("https://data.4dnucleome.org/4DNFITHNIOV9","4DNFITHNIOV9")	31edfbd5-be64-40f7-b736-a6b6b7ff67eb	36.65
same
=HYPERLINK("https://data.4dnucleome.org/4DNFI5KMW3O6","4DNFI5KMW3O6")	79329c2a-5d18-449e-886d-e0bb42ed6f8e	36.57
same
=HYPERLINK("https://data.4dnucleome.org/4DNFIK2HBZTW","4DNFIK2HBZTW")	fd044b16-1448-4870-b346-d54f22ca6503	36.14
same
=HYPERLINK("https://data.4dnucleome.org/4DNFIEQHXTA3","4DNFIEQHXTA3")	12c0687f-27d1-4100-8b5d-3691994deccc	35.58
same
=HYPERLINK("https://data.4dnucleome.org/4DNFISBYCCSM","4DNFISBYCCSM")	b70bd7bb-a52c-4260-b0ff-a7280d1bbb36	36.82
same
=HYPERLINK("

In [10]:
from core.utils import Tibanna
from core.utils import run_workflow
import time



bam_files = bams
print len(bam_files)


env = 'fourfront-webprod'
tibanna = Tibanna(env=env)

output_file_bucket = tibanna.s3.outfile_bucket
raw_file_bucket = tibanna.s3.raw_file_bucket

# todo need a function to determin this given fastq1
chr_size = make_input_file_json('4DNFI823LSII', 'chromsize', tibanna,raw_file_bucket)
for bam_file in bam_files:
    
    bam1 = make_input_file_json(bam_file[1], 'bam', tibanna,output_file_bucket)
    input_files = [bam1, chr_size]
    if all(input_files):
        name = bam_file[0]+"_"+bam_file[1]
        input_json = make_hic2_json(input_files, env, output_file_bucket, name)
        print input_json
        res = run_workflow(input_json)
    else:
        print("some files not found on s3.  Investigate this list %s" % input_files)
    time.sleep(5)
    #a = raw_input("Press Enter to continue...")

print('Done')


18
looking for upload key 4a6d10ee-2edb-4402-a98f-0edb1d58f5e9/4DNFI823LSII.chrom.sizes, on bucket elasticbeanstalk-fourfront-webprod-files
looking for upload key 02bc8d2e-0b8d-4512-a3ef-400e026fb6cb/4DNFIYZ4KUS8.bam, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
{'app_name': 'pairsam-parse-sort', 'parameters': {'nThreads': 16}, '_tibanna': {'run_type': 'pairsam-parse-sort', 'env': 'fourfront-webprod', 'run_id': u'4DNES2R6PUEK_4DNFIYZ4KUS8'}, 'output_bucket': 'elasticbeanstalk-fourfront-webprod-wfoutput', 'config': {'ebs_type': 'io1', 'ebs_iops': 500, 'shutdown_min': 30, 's3_access_arn': 'arn:aws:iam::643366669028:instance-profile/S3_access', 'launch_instance': True, 'password': 'hahaha', 'log_bucket': 'tibanna-output', 'ami_id': 'ami-cfb14bb5', 'json_bucket': '4dn-aws-pipeline-run-json', 'copy_to_s3': True, 'script_url': 'https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/'}, 'workflow_uuid': '65586d4b-1e3b-4b31-891e-11f48c816545', 'input_files': [{'workflow_argume

looking for upload key 79329c2a-5d18-449e-886d-e0bb42ed6f8e/4DNFI5KMW3O6.bam, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
{'app_name': 'pairsam-parse-sort', 'parameters': {'nThreads': 16}, '_tibanna': {'run_type': 'pairsam-parse-sort', 'env': 'fourfront-webprod', 'run_id': u'4DNES2R6PUEK_4DNFI5KMW3O6'}, 'output_bucket': 'elasticbeanstalk-fourfront-webprod-wfoutput', 'config': {'ebs_type': 'io1', 'ebs_iops': 500, 'shutdown_min': 30, 's3_access_arn': 'arn:aws:iam::643366669028:instance-profile/S3_access', 'launch_instance': True, 'password': 'hahaha', 'log_bucket': 'tibanna-output', 'ami_id': 'ami-cfb14bb5', 'json_bucket': '4dn-aws-pipeline-run-json', 'copy_to_s3': True, 'script_url': 'https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/'}, 'workflow_uuid': '65586d4b-1e3b-4b31-891e-11f48c816545', 'input_files': [{'workflow_argument_name': 'bam', 'bucket_name': 'elasticbeanstalk-fourfront-webprod-wfoutput', 'uuid': u'79329c2a-5d18-449e-886d-e0bb42ed6f8e', 'object_key'

about to start run pairsam-parse-sort_4DNES2R6PUEK_4DNFIE11EYDD
response from aws was: 
 {u'startDate': datetime.datetime(2017, 9, 15, 22, 15, 26, 11000, tzinfo=tzlocal()), 'ResponseMetadata': {'RetryAttempts': 0, 'HTTPStatusCode': 200, 'RequestId': 'e6fd747e-9a84-11e7-9a3a-c5cf032eab9f', 'HTTPHeaders': {'x-amzn-requestid': 'e6fd747e-9a84-11e7-9a3a-c5cf032eab9f', 'content-length': '171', 'content-type': 'application/x-amz-json-1.0'}}, u'executionArn': u'arn:aws:states:us-east-1:643366669028:execution:run_awsem_workflow_with_ponies:pairsam-parse-sort_4DNES2R6PUEK_4DNFIE11EYDD'}
url to view status:
https://console.aws.amazon.com/states/home?region=us-east-1#/executions/details/arn:aws:states:us-east-1:643366669028:execution:run_awsem_workflow_with_ponies:pairsam-parse-sort_4DNES2R6PUEK_4DNFIE11EYDD
looking for upload key 5ae5edb2-8917-445a-b93f-46936a1478a8/4DNFI3F894Y3.bam, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
{'app_name': 'pairsam-parse-sort', 'parameters': {'nThreads'

looking for upload key 919533f3-7cdb-4a9e-918c-62b0756193b4/4DNFIXEGD83E.bam, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
{'app_name': 'pairsam-parse-sort', 'parameters': {'nThreads': 16}, '_tibanna': {'run_type': 'pairsam-parse-sort', 'env': 'fourfront-webprod', 'run_id': u'4DNES2R6PUEK_4DNFIXEGD83E'}, 'output_bucket': 'elasticbeanstalk-fourfront-webprod-wfoutput', 'config': {'ebs_type': 'io1', 'ebs_iops': 500, 'shutdown_min': 30, 's3_access_arn': 'arn:aws:iam::643366669028:instance-profile/S3_access', 'launch_instance': True, 'password': 'hahaha', 'log_bucket': 'tibanna-output', 'ami_id': 'ami-cfb14bb5', 'json_bucket': '4dn-aws-pipeline-run-json', 'copy_to_s3': True, 'script_url': 'https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/'}, 'workflow_uuid': '65586d4b-1e3b-4b31-891e-11f48c816545', 'input_files': [{'workflow_argument_name': 'bam', 'bucket_name': 'elasticbeanstalk-fourfront-webprod-wfoutput', 'uuid': u'919533f3-7cdb-4a9e-918c-62b0756193b4', 'object_key'

about to start run pairsam-parse-sort_4DNES2R6PUEK_4DNFIKHV8KVY
response from aws was: 
 {u'startDate': datetime.datetime(2017, 9, 15, 22, 16, 23, 596000, tzinfo=tzlocal()), 'ResponseMetadata': {'RetryAttempts': 0, 'HTTPStatusCode': 200, 'RequestId': '094f7702-9a85-11e7-bd6a-5d2d28bed637', 'HTTPHeaders': {'x-amzn-requestid': '094f7702-9a85-11e7-bd6a-5d2d28bed637', 'content-length': '171', 'content-type': 'application/x-amz-json-1.0'}}, u'executionArn': u'arn:aws:states:us-east-1:643366669028:execution:run_awsem_workflow_with_ponies:pairsam-parse-sort_4DNES2R6PUEK_4DNFIKHV8KVY'}
url to view status:
https://console.aws.amazon.com/states/home?region=us-east-1#/executions/details/arn:aws:states:us-east-1:643366669028:execution:run_awsem_workflow_with_ponies:pairsam-parse-sort_4DNES2R6PUEK_4DNFIKHV8KVY
Done
