In [1]:
from core.utils import Tibanna
from core import ff_utils

#format for input json in hic-partII
def make_input_file_json(obj_ids, arg_name, tibanna, bucket):
    '''
    Build one entry of the workflow 'input_files' list.

    obj_ids can be either a string or a list.

    On success the returned dict has the form
    {
      "bucket_name": "%s",
      "object_key": "%s",
      "uuid" : "%s",
      "workflow_argument_name": "%s"
    }
    "object_key" and "uuid" are plain values when exactly one file was
    looked up and lists otherwise.

    Returns None as soon as one of the requested files has no upload key on
    `bucket`, so that callers can test the result for truthiness instead of
    submitting a run with missing inputs.
    '''
    ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
    if not isinstance(obj_ids, list):
        obj_ids = [obj_ids]
    object_keys = []
    uuids = []
    for obj_id in obj_ids:
        metadata = ff_utils.get_metadata(obj_id, connection=ff)
        upload_key = metadata['upload_key']

        # just make sure the file is on s3, otherwise bail
        print("looking for upload key %s, on bucket %s" %
              (upload_key,
               bucket))
        if not tibanna.s3.does_key_exist(upload_key, bucket=bucket):
            return None
        # the object key is the second '/'-separated component of the upload key
        object_keys.append(upload_key.split('/')[1])
        uuids.append(metadata['uuid'])

    # a single file is reported as a scalar rather than a one-element list
    if len(uuids) == 1:
        uuids = uuids[0]
    if len(object_keys) == 1:
        object_keys = object_keys[0]
    return {'bucket_name': bucket,
            'object_key': object_keys,
            'uuid': uuids,
            'workflow_argument_name': arg_name
            }


def make_hic2_json(input_files, env, output_bucket, accession):
    '''
    Assemble the run json for the "pairsam-parse-sort" workflow.

    input_files   -- list placed under 'input_files' unchanged
    env           -- stored as '_tibanna.env'
    output_bucket -- stored as 'output_bucket'
    accession     -- stored as '_tibanna.run_id'

    Everything else (workflow uuid, app name, thread count and the
    instance/bucket configuration) is fixed inside this function.
    '''
    workflow_parameters = {
        "nThreads": 16
    }

    # NOTE(review): "password" is hardcoded here -- confirm it is only a placeholder.
    run_config = {
        "ebs_type": "io1",
        "json_bucket": "4dn-aws-pipeline-run-json",
        "ebs_iops": 500,
        "shutdown_min": 30,
        "s3_access_arn": "arn:aws:iam::643366669028:instance-profile/S3_access",
        "ami_id": "ami-7ff26968",
        "copy_to_s3": True,
        "script_url": "https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/",
        "launch_instance": True,
        "password": "hahaha",
        "log_bucket": "tibanna-output"
    }

    tibanna_settings = {
        "env": env,
        "run_type": "pairsam-parse-sort",
        "run_id": accession
    }

    return {
        'input_files': input_files,
        'output_bucket': output_bucket,
        'workflow_uuid': "65586d4b-1e3b-4b31-891e-11f48c816545",
        "app_name": "pairsam-parse-sort",
        "parameters": workflow_parameters,
        "config": run_config,
        "_tibanna": tibanna_settings
    }


def get_wfr_out(emb_file, wfr_name, file_format):
    '''
    Find the output file of a given format produced by a workflow run on a file.

    emb_file    -- file metadata dict; its 'workflow_run_inputs' entries are
                   looked up by their 'uuid'
    wfr_name    -- prefix the workflow run's 'display_title' must start with
    file_format -- 'format' of the wanted entry in the run's 'output_files'

    Returns the 'value' of the first output file with that format, or one of
    the sentinel strings "no workflow in file", "no completed run" or
    "no file found". All sentinels start with 'no', which callers in this
    notebook rely on.

    Note: the connection is not a parameter. This function reads a global
    named `ff`, which must have been created before it is called.
    '''
    workflows = emb_file.get('workflow_run_inputs')
    if not workflows:
        return "no workflow in file"

    wfr = {}
    run_status = 'did not run'
    for a_wfr in workflows:
        wfr_resp = ff_utils.get_metadata(a_wfr['uuid'], connection=ff)
        if wfr_resp['display_title'].startswith(wfr_name):
            # when several runs match, the last one in the list wins
            wfr = wfr_resp
            run_status = wfr_resp['run_status']

    if run_status != 'complete':
        return "no completed run"

    # a completed run may carry no outputs, or none of the requested format;
    # report that with the sentinel instead of raising
    outputs = wfr.get('output_files') or []
    matches = [out.get('value') for out in outputs
               if out.get('format') == file_format]
    if matches and matches[0]:
        return matches[0]
    return "no file found"

In [2]:
from invoke import run
import time
from datetime import datetime


def form_hyp(id):
    '''
    Return a =HYPERLINK(...) formula string for `id`.

    The link target is https://data.4dnucleome.org/<id> and the displayed
    text is <id> itself.
    '''
    link_template = '=HYPERLINK("https://data.4dnucleome.org/{0}","{0}")'
    return link_template.format(id)


# Replicate sets this notebook can be pointed at.
all_sets = [
            'dcic:Selvaraj_gm12878_hic',
            'dekker-lab:ExperimentSet_U54_U54-ESC4DN-FA-DpnII-2017524',
            'dekker-lab:ExperimentSet_U54_HFFc6-FA-DpnII'
            ]

# the set processed by this run
my_rep_set = all_sets[2]
print(my_rep_set)

# display_title prefix of the workflow run whose bam output is collected
wf_partI = "bwa-mem"
env = 'fourfront-webprod'
tibanna = Tibanna(env=env)

# `ff` has to stay a global: get_wfr_out reads it from the global namespace
ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
rep_set_resp = ff_utils.get_metadata(my_rep_set, connection=ff)
rep_resp = rep_set_resp['experiments_in_set']
set_acc = rep_set_resp['accession']

# (set accession, bam accession) tuples; consumed by the next cell
bams = []
f_pairs = 0
for exp in rep_resp:
    # print 'Experiment', exp
    exp_resp = ff_utils.get_metadata(exp, connection=ff)
    exp_files = exp_resp['files']
    for fastq_file in exp_files:
        file_resp = ff_utils.get_metadata(fastq_file, connection=ff, frame='embedded')
        # Some checks before running
        # check if status is deleted
        if file_resp['status'] == 'deleted':
            print("delete file %s" % file_resp['accession'])
            continue
        # if no uploaded file in the file item report and skip
        if not file_resp.get('filename'):
            print("%s does not have a file" % file_resp['accession'])
            continue
        # check if file is in s3
        head_info = tibanna.s3.does_key_exist(file_resp['upload_key'], tibanna.s3.raw_file_bucket)
        if not head_info:
            print("%s does not have a file in S3" % file_resp['accession'])
            continue

        # skip pair no 2
        if file_resp.get('paired_end') == '2':
            continue

        f_pairs += 1
        paired_file = file_resp['related_files'][0]['file']['accession']
        pair_label = "%s %s" % (file_resp['accession'], paired_file)
        bam_file = get_wfr_out(file_resp, wf_partI, 'bam')
        # test for an empty result first so a falsy value is never dereferenced;
        # get_wfr_out reports failures as strings starting with 'no'
        if not bam_file or bam_file.startswith('no'):
            print("%s %s %s" % (pair_label, bam_file, pair_label))
        else:
            bam_resp = ff_utils.get_metadata(bam_file, connection=ff)

#             f_s = round(bam_resp['file_size']/(1024*1024*1024.0),2)
#             print form_hyp(bam_resp["accession"])+'\t'+bam_resp["uuid"]+"\t"+str(f_s)
#             print "same"

            print("%s bam file is %s" % (pair_label, bam_resp['accession']))
            bams.append((set_acc, bam_resp['accession']))


print("%s fastq file pairs in the set" % f_pairs)
print("%s bam files" % len(bams))


dekker-lab:ExperimentSet_U54_HFFc6-FA-DpnII
4DNFIWDY2U4S 4DNFIS8I1Y5L bam file is 4DNFIYZ4KUS8
4DNFI65A88XH 4DNFIY67X18K bam file is 4DNFITYX1R3C
4DNFIONTKGCC 4DNFISXNGXNA bam file is 4DNFIKJVRXX1
4DNFIAAJYJ1Y 4DNFICOIG8A1 bam file is 4DNFITHNIOV9
4DNFINCPY93G 4DNFI6BFAKQI bam file is 4DNFI5KMW3O6
4DNFIF5LBBWQ 4DNFI9S272R7 bam file is 4DNFIK2HBZTW
4DNFID9WXC6L 4DNFIWIGZ5EW bam file is 4DNFIEQHXTA3
4DNFIPC1UAH2 4DNFIG2BLIPZ bam file is 4DNFISBYCCSM
4DNFIFXD526I 4DNFI77ORE8G bam file is 4DNFIE11EYDD
4DNFIPL4UKY5 4DNFI2EB5HBJ no completed run 4DNFIPL4UKY5 4DNFI2EB5HBJ
4DNFIKHJTBXN 4DNFIYCHCAAV no completed run 4DNFIKHJTBXN 4DNFIYCHCAAV
4DNFI6BHUZA6 4DNFIDKRWIPN bam file is 4DNFIMV78UPS
4DNFIAYGJL8A 4DNFI7GUU391 bam file is 4DNFIFVEGRR7
4DNFIW2NPXX9 4DNFI65W1NWV bam file is 4DNFIXEGD83E
4DNFISVE7HB2 4DNFIBPSVSZ6 no completed run 4DNFISVE7HB2 4DNFIBPSVSZ6
4DNFIQ4VTRCE 4DNFIX3X22EQ bam file is 4DNFI2U98V9B
4DNFIMXEJLHF 4DNFIFDD2YM2 bam file is 4DNFIMNMNO3U
4DNFIWZUG4AN 4DNFI65C492U bam file 

In [32]:
from core.utils import Tibanna
from core.utils import run_workflow
import time



# (set accession, bam accession) tuples collected by the previous cell
bam_files = bams
print(len(bam_files))


env = 'fourfront-webprod'
tibanna = Tibanna(env=env)

output_file_bucket = tibanna.s3.outfile_bucket
raw_file_bucket = tibanna.s3.raw_file_bucket

# todo: need a function to determine this given fastq1
chr_size = make_input_file_json('4DNFI823LSII', 'chromsize', tibanna, raw_file_bucket)
for bam_file in bam_files:

    bam1 = make_input_file_json(bam_file[1], 'bam', tibanna, output_file_bucket)
    input_files = [bam1, chr_size]
    # A non-empty dict is always truthy, so all(input_files) alone cannot tell
    # whether the file was found. Also require that an object key and a uuid
    # were actually resolved.
    files_ready = all(entry and entry.get('object_key') and entry.get('uuid')
                      for entry in input_files)
    if files_ready:
        name = bam_file[0] + "_" + bam_file[1]
        input_json = make_hic2_json(input_files, env, output_file_bucket, name)
        print(input_json)
        res = run_workflow(input_json)
    else:
        print("some files not found on s3.  Investigate this list %s" % input_files)
    # pause between iterations
    time.sleep(5)
    #a = raw_input("Press Enter to continue...")

print('Done')


18
looking for upload key 4a6d10ee-2edb-4402-a98f-0edb1d58f5e9/4DNFI823LSII.chrom.sizes, on bucket elasticbeanstalk-fourfront-webprod-files
looking for upload key 7df85cbb-d417-4166-a488-da8b5f6d79e3/4DNFI68ZESEE.bam, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
{'app_name': 'pairsam-parse-sort', 'parameters': {'nThreads': 16}, '_tibanna': {'run_type': 'pairsam-parse-sort', 'env': 'fourfront-webprod', 'run_id': u'4DNES2M5JIGV_4DNFI68ZESEE'}, 'output_bucket': 'elasticbeanstalk-fourfront-webprod-wfoutput', 'config': {'ebs_type': 'io1', 'ebs_iops': 500, 'shutdown_min': 30, 's3_access_arn': 'arn:aws:iam::643366669028:instance-profile/S3_access', 'launch_instance': True, 'password': 'hahaha', 'log_bucket': 'tibanna-output', 'ami_id': 'ami-7ff26968', 'json_bucket': '4dn-aws-pipeline-run-json', 'copy_to_s3': True, 'script_url': 'https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/'}, 'workflow_uuid': '65586d4b-1e3b-4b31-891e-11f48c816545', 'input_files': [{'workflow_argume

looking for upload key 7cbf180a-811b-40c3-b16f-646214f964bb/4DNFI7X462GJ.bam, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
{'app_name': 'pairsam-parse-sort', 'parameters': {'nThreads': 16}, '_tibanna': {'run_type': 'pairsam-parse-sort', 'env': 'fourfront-webprod', 'run_id': u'4DNES2M5JIGV_4DNFI7X462GJ'}, 'output_bucket': 'elasticbeanstalk-fourfront-webprod-wfoutput', 'config': {'ebs_type': 'io1', 'ebs_iops': 500, 'shutdown_min': 30, 's3_access_arn': 'arn:aws:iam::643366669028:instance-profile/S3_access', 'launch_instance': True, 'password': 'hahaha', 'log_bucket': 'tibanna-output', 'ami_id': 'ami-7ff26968', 'json_bucket': '4dn-aws-pipeline-run-json', 'copy_to_s3': True, 'script_url': 'https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/'}, 'workflow_uuid': '65586d4b-1e3b-4b31-891e-11f48c816545', 'input_files': [{'workflow_argument_name': 'bam', 'bucket_name': 'elasticbeanstalk-fourfront-webprod-wfoutput', 'uuid': u'7cbf180a-811b-40c3-b16f-646214f964bb', 'object_key'

looking for upload key 40b80196-2d26-4b2a-8101-cec2ecd68606/4DNFITR82M7H.bam, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
{'app_name': 'pairsam-parse-sort', 'parameters': {'nThreads': 16}, '_tibanna': {'run_type': 'pairsam-parse-sort', 'env': 'fourfront-webprod', 'run_id': u'4DNES2M5JIGV_4DNFITR82M7H'}, 'output_bucket': 'elasticbeanstalk-fourfront-webprod-wfoutput', 'config': {'ebs_type': 'io1', 'ebs_iops': 500, 'shutdown_min': 30, 's3_access_arn': 'arn:aws:iam::643366669028:instance-profile/S3_access', 'launch_instance': True, 'password': 'hahaha', 'log_bucket': 'tibanna-output', 'ami_id': 'ami-7ff26968', 'json_bucket': '4dn-aws-pipeline-run-json', 'copy_to_s3': True, 'script_url': 'https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/'}, 'workflow_uuid': '65586d4b-1e3b-4b31-891e-11f48c816545', 'input_files': [{'workflow_argument_name': 'bam', 'bucket_name': 'elasticbeanstalk-fourfront-webprod-wfoutput', 'uuid': u'40b80196-2d26-4b2a-8101-cec2ecd68606', 'object_key'

looking for upload key 68fbef1b-ea4e-4447-bda4-1f5a75583f12/4DNFIJM4WRKL.bam, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
{'app_name': 'pairsam-parse-sort', 'parameters': {'nThreads': 16}, '_tibanna': {'run_type': 'pairsam-parse-sort', 'env': 'fourfront-webprod', 'run_id': u'4DNES2M5JIGV_4DNFIJM4WRKL'}, 'output_bucket': 'elasticbeanstalk-fourfront-webprod-wfoutput', 'config': {'ebs_type': 'io1', 'ebs_iops': 500, 'shutdown_min': 30, 's3_access_arn': 'arn:aws:iam::643366669028:instance-profile/S3_access', 'launch_instance': True, 'password': 'hahaha', 'log_bucket': 'tibanna-output', 'ami_id': 'ami-7ff26968', 'json_bucket': '4dn-aws-pipeline-run-json', 'copy_to_s3': True, 'script_url': 'https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/'}, 'workflow_uuid': '65586d4b-1e3b-4b31-891e-11f48c816545', 'input_files': [{'workflow_argument_name': 'bam', 'bucket_name': 'elasticbeanstalk-fourfront-webprod-wfoutput', 'uuid': u'68fbef1b-ea4e-4447-bda4-1f5a75583f12', 'object_key'

looking for upload key 4791fb38-a66d-46c8-9e46-7bc3febd36c9/4DNFIXIIB4QZ.bam, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
{'app_name': 'pairsam-parse-sort', 'parameters': {'nThreads': 16}, '_tibanna': {'run_type': 'pairsam-parse-sort', 'env': 'fourfront-webprod', 'run_id': u'4DNES2M5JIGV_4DNFIXIIB4QZ'}, 'output_bucket': 'elasticbeanstalk-fourfront-webprod-wfoutput', 'config': {'ebs_type': 'io1', 'ebs_iops': 500, 'shutdown_min': 30, 's3_access_arn': 'arn:aws:iam::643366669028:instance-profile/S3_access', 'launch_instance': True, 'password': 'hahaha', 'log_bucket': 'tibanna-output', 'ami_id': 'ami-7ff26968', 'json_bucket': '4dn-aws-pipeline-run-json', 'copy_to_s3': True, 'script_url': 'https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/'}, 'workflow_uuid': '65586d4b-1e3b-4b31-891e-11f48c816545', 'input_files': [{'workflow_argument_name': 'bam', 'bucket_name': 'elasticbeanstalk-fourfront-webprod-wfoutput', 'uuid': u'4791fb38-a66d-46c8-9e46-7bc3febd36c9', 'object_key'