In [1]:
from core.utils import Tibanna
from core import ff_utils

#format for input json in hic-partII
def make_input_file_json(obj_ids, arg_name, tibanna, bucket):
    '''
    Build one entry of the 'input_files' list of a workflow input json.

    obj_ids can be either a string or a list.  Every id is looked up with
    ff_utils.get_metadata and its upload key is checked on s3 in `bucket`.

    Returns a dict of the form
    {
      "bucket_name": "%s",
      "object_key": "%s",
      "uuid" : "%s",
      "workflow_argument_name": "%s"
    }
    where object_key and uuid are plain values when exactly one file was
    collected and lists otherwise.

    Returns None as soon as one of the files is not found on s3, so that
    callers can test the result for truthiness (e.g. with all()).
    '''
    ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
    if not isinstance(obj_ids, list):
        obj_ids = [obj_ids]
    object_key_list = []
    uuid_list = []
    for obj_id in obj_ids:
        metadata = ff_utils.get_metadata(obj_id, connection=ff)
        upload_key = metadata['upload_key']

        # just make sure the file is on s3, otherwise bail
        print("looking for upload key %s, on bucket %s" %
              (upload_key,
               bucket))
        if not tibanna.s3.does_key_exist(upload_key, bucket=bucket):
            # A dict with empty/partial lists would still be truthy and would
            # slip through the caller's check, so signal the failure with None.
            print("upload key %s not found on bucket %s" % (upload_key, bucket))
            return None
        # the object key is the second '/'-separated component of the upload key
        object_key_list.append(upload_key.split('/')[1])
        uuid_list.append(metadata['uuid'])

    # a single file is passed as a plain value rather than a one-element list
    if len(uuid_list) == 1:
        uuid_list = uuid_list[0]
    if len(object_key_list) == 1:
        object_key_list = object_key_list[0]
    data = {'bucket_name': bucket,
            'object_key': object_key_list,
            'uuid': uuid_list,
            'workflow_argument_name': arg_name
            }
    return data

def make_hic5_json(input_files, env, output_bucket, accession):
    '''
    Assemble the input json for an "addfragtopairs" workflow run.

    input_files and output_bucket are stored as given; env and accession
    go into the "_tibanna" section (accession is used as the run_id).
    Workflow uuid, app name and the "config" section are fixed values.
    '''
    # NOTE(review): instance settings, bucket names and the password are
    # hard-coded here -- confirm that is intended before sharing the notebook.
    run_config = {
        "ebs_iops": 500,
        "EBS_optimized": True,
        "instance_type": "c4.8xlarge",
        "ebs_type": "io1",
        "s3_access_arn": "arn:aws:iam::643366669028:instance-profile/S3_access",
        "ami_id": "ami-cfb14bb5",
        "json_bucket": "4dn-aws-pipeline-run-json",
        "password": "whateverpswd",
        "shutdown_min": 30,
        "copy_to_s3": True,
        "launch_instance": True,
        "log_bucket": "tibanna-output",
        "script_url": "https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/",
        "key_name": "4dn-encode",
    }
    tibanna_section = {
        "env": env,
        "run_type": "addfragtopairs",
        "run_id": accession,
    }

    input_json = {}
    input_json['input_files'] = input_files
    input_json['output_bucket'] = output_bucket
    input_json['workflow_uuid'] = "ef125750-8df2-418e-a1ee-402285f9dd93"
    input_json["app_name"] = "addfragtopairs"
    input_json["parameters"] = {}
    input_json["config"] = run_config
    input_json["_tibanna"] = tibanna_section
    return input_json



def get_wfr_out(emb_file, wfr_name, file_format):
    '''
    Find the output of a workflow run that took emb_file as input.

    emb_file is a file metadata dict; its 'workflow_run_inputs' entries are
    fetched one by one with ff_utils.get_metadata (using the module-level
    connection `ff`, which must exist before this is called).  A run matches
    when its display_title starts with wfr_name; if several match, the last
    one in the list is used.

    Returns the 'value' of the first output file whose 'format' equals
    file_format, or one of these marker strings (all start with 'no'):
      "no workflow in file" - emb_file lists no workflow runs
      "no completed run"    - no matching run with run_status 'complete'
      "no file found"       - the run has no (non-empty) output of that format
    '''
    workflows = emb_file.get('workflow_run_inputs')
    if not workflows:
        return "no workflow in file"

    wfr = {}
    run_status = 'did not run'
    for a_wfr in workflows:
        wfr_resp = ff_utils.get_metadata(a_wfr['uuid'], connection=ff)
        if wfr_resp['display_title'].startswith(wfr_name):
            wfr = wfr_resp
            run_status = wfr_resp['run_status']

    if run_status != 'complete':
        print(wfr_name)
        return "no completed run"

    # Guard against a missing/None output list and against no output having
    # the requested format; indexing [0] on an empty list raised IndexError
    # before "no file found" could ever be returned.
    outputs = wfr.get('output_files') or []
    matches = [i.get('value') for i in outputs if i.get('format') == file_format]
    if matches and matches[0]:
        return matches[0]
    return "no file found"

In [2]:
from invoke import run
import time
from datetime import datetime


def form_hyp(id):
    '''Return a spreadsheet HYPERLINK formula that links the given id to data.4dnucleome.org.'''
    # the id is used both as the last part of the url and as the link text
    template = '=HYPERLINK("https://data.4dnucleome.org/{0}","{0}")'
    return template.format(id)


# Replicate sets this notebook has been used on; one of them is picked below.
all_sets = [
    'dcic:Selvaraj_gm12878_hic',
    'dekker-lab:ExperimentSet_U54_U54-ESC4DN-FA-DpnII-2017524',
    'dekker-lab:ExperimentSet_U54_HFFc6-FA-DpnII',
]

# The set to process in this run (change the index to switch sets).
my_rep_set = all_sets[2]
print(my_rep_set)

wf_partI = "bwa-mem"
env = 'fourfront-webprod'
tibanna = Tibanna(env=env)

# Filled by the experiment loop with the digestion enzyme of every experiment.
enzymes = []

# Connection shared by the metadata lookups (get_wfr_out reads it as a global).
ff = ff_utils.fdn_connection(key=tibanna.ff_keys)

# Metadata of the chosen replicate set: its experiments and its accession.
rep_set_resp = ff_utils.get_metadata(my_rep_set, connection=ff)
rep_resp, set_acc = rep_set_resp['experiments_in_set'], rep_set_resp['accession']

# (experiment accession, filtered pairs file accession), one tuple per experiment
exps_pairsems = []
# set to False as soon as one pipeline step has no usable output
all_fine = True

# number of files that passed the checks below and are not paired_end '2'
f_pairs = 0
for exp in rep_resp:
    exp_resp = ff_utils.get_metadata(exp, connection=ff)
    enzyme = exp_resp['digestion_enzyme']
    enzymes.append(enzyme)
    exp_files = exp_resp['files']
    exp_acc = exp_resp['accession']
    for fastq_file in exp_files:
        file_resp = ff_utils.get_metadata(fastq_file, connection=ff, frame='embedded')

        # --- checks on the file itself; any failure moves on to the next file ---
        # deleted file
        if file_resp['status'] == 'deleted':
            print("%s %s" % ("delete file", file_resp['accession']))
            continue
        # file item without an uploaded file
        if not file_resp.get('filename'):
            print("%s %s" % (file_resp['accession'], "does not have a file"))
            continue
        # file not present in s3
        head_info = tibanna.s3.does_key_exist(file_resp['upload_key'], tibanna.s3.raw_file_bucket)
        if not head_info:
            print("%s %s" % (file_resp['accession'], "does not have a file in S3"))
            continue
        # skip pair no 2
        if file_resp.get('paired_end') == '2':
            continue

        f_pairs += 1
        paired_file = file_resp['related_files'][0]['file']['accession']

        # --- follow the chain of workflow outputs; get_wfr_out returns a
        # --- string starting with 'no' when a step has no usable output
        # part I
        bam_file = get_wfr_out(file_resp, "bwa-mem", 'bam')
        if bam_file.startswith('no') or not bam_file:
            print(bam_file)
            all_fine = False
            continue
        bam_resp = ff_utils.get_metadata(bam_file, connection=ff, frame='embedded')

        # part II
        pairsem_file = get_wfr_out(bam_resp, "pairsam-parse-sort", 'pairsam')
        if pairsem_file.startswith('no') or not pairsem_file:
            print(pairsem_file)
            all_fine = False
            continue
        pairsem_resp = ff_utils.get_metadata(pairsem_file, connection=ff, frame='embedded')

        # part IIB
        pairsem_all_file = get_wfr_out(pairsem_resp, "pairsam-merge", 'pairsam')
        if pairsem_all_file.startswith('no') or not pairsem_all_file:
            print(pairsem_all_file)
            all_fine = False
            continue
        pairsem_all_resp = ff_utils.get_metadata(pairsem_all_file, connection=ff, frame='embedded')

        # part III
        pairsem_md = get_wfr_out(pairsem_all_resp, "pairsam-markasdup", 'pairsam')
        if pairsem_md.startswith('no') or not pairsem_md:
            print(pairsem_md)
            all_fine = False
            continue
        pairsem_md_resp = ff_utils.get_metadata(pairsem_md, connection=ff, frame='embedded')

        # part IV
        pairsem_ft = get_wfr_out(pairsem_md_resp, "pairsam-filter", 'pairs')
        if pairsem_ft.startswith('no') or not pairsem_ft:
            print(pairsem_ft)
            all_fine = False
            continue
        pairsem_ft_resp = ff_utils.get_metadata(pairsem_ft, connection=ff, frame='embedded')

        # report the filtered pairs file: hyperlink, uuid and size (file_size / 1024^3)
        f_s = round(pairsem_ft_resp['file_size']/(1024*1024*1024.0),2)
        print(form_hyp(pairsem_ft_resp["accession"])+'\t'+pairsem_ft_resp["uuid"]+"\t"+str(f_s))

        # the first file that made it through the whole chain is recorded for
        # this experiment; the remaining files of the experiment are not visited
        exps_pairsems.append((exp_acc,pairsem_ft_resp["accession"]))
        break


# Choose the right NZ reference file
re_ref_file = ''
# enzyme name -> restriction site reference file
choice = {'HindIII': '4DNFI823MBKE', 'MboI': '4DNFI823L812', 'DpnII':'/files-reference/4DNFIBNAPW30/'}

# Check if all experiments use the same enzyme.
# Each problem is raised explicitly: previously the mixed/empty case printed an
# error and then died on the final print with a NameError (nz_name unbound),
# which hid the real cause.
unique_enzymes = set(enzymes)
if not unique_enzymes:
    raise ValueError("ERROR No enzymes found in Experiment Set")
if len(unique_enzymes) != 1:
    raise ValueError("ERROR Mixed Enzyme Content in Experiment Set: %s"
                     % ", ".join(str(e) for e in unique_enzymes))

# the enzyme name is the third '/'-separated component of the enzyme id
# (presumably ids look like '/enzymes/DpnII/' -- TODO confirm)
nz_name = enzymes[0].split('/')[2]
if nz_name not in choice:
    raise ValueError("ERROR No restriction reference file known for enzyme %s" % nz_name)
re_ref_file = choice[nz_name]
print('using {} ({}) as the enzyme'.format(nz_name, re_ref_file))



dekker-lab:ExperimentSet_U54_HFFc6-FA-DpnII
=HYPERLINK("https://data.4dnucleome.org/4DNFIVZ5YV9Z","4DNFIVZ5YV9Z")	92bf900e-bb94-4e61-8c50-876de97c49ea	25.68
=HYPERLINK("https://data.4dnucleome.org/4DNFIUJZ4X9G","4DNFIUJZ4X9G")	5610464c-21c9-4a0b-8910-ff5cee141c2d	27.05
using DpnII (/files-reference/4DNFIBNAPW30/) as the enzyme


In [3]:
from core.utils import Tibanna
from core.utils import run_workflow
import time


# (experiment accession, filtered pairs file accession) tuples from the previous cell
pairsem_files = exps_pairsems

env = 'fourfront-webprod'
tibanna = Tibanna(env=env)

output_file_bucket = tibanna.s3.outfile_bucket
raw_file_bucket = tibanna.s3.raw_file_bucket

# input entry for the restriction site reference file chosen in the previous cell
restrict = make_input_file_json(re_ref_file, 'restriction_file', tibanna, raw_file_bucket)

# todo need a function to determine this given fastq1
for exp_pairsem_files in pairsem_files:
    pairsam1 = make_input_file_json(exp_pairsem_files[1], 'input_pairs', tibanna, output_file_bucket)
    input_files = [pairsam1, restrict]
    # An input entry is a non-empty dict even when its file was not found on
    # s3 (object_key is then an empty list), so plain all(input_files) could
    # never fail.  Check the collected object_key as well.
    if all(f and f.get('object_key') for f in input_files):
        name = exp_pairsem_files[0]
        input_json = make_hic5_json(input_files, env, output_file_bucket, name)
        print(input_json)
        res = run_workflow(input_json)
    else:
        print("some files not found on s3.  Investigate this list %s" % input_files)
    # pause between experiments
    time.sleep(5)
    #a = raw_input("Press Enter to continue...")

print('Done')


looking for upload key 84db9821-3b82-4c6a-bf4f-5e0b3f43036e/4DNFIBNAPW30.txt, on bucket elasticbeanstalk-fourfront-webprod-files
looking for upload key 92bf900e-bb94-4e61-8c50-876de97c49ea/4DNFIVZ5YV9Z.pairs.gz, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
{'app_name': 'addfragtopairs', 'parameters': {}, '_tibanna': {'run_type': 'addfragtopairs', 'env': 'fourfront-webprod', 'run_id': u'4DNEX7POCO84'}, 'output_bucket': 'elasticbeanstalk-fourfront-webprod-wfoutput', 'config': {'ebs_type': 'io1', 'key_name': '4dn-encode', 'EBS_optimized': True, 'ebs_iops': 500, 'shutdown_min': 30, 's3_access_arn': 'arn:aws:iam::643366669028:instance-profile/S3_access', 'launch_instance': True, 'password': 'whateverpswd', 'log_bucket': 'tibanna-output', 'ami_id': 'ami-cfb14bb5', 'json_bucket': '4dn-aws-pipeline-run-json', 'instance_type': 'c4.8xlarge', 'copy_to_s3': True, 'script_url': 'https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/'}, 'workflow_uuid': 'ef125750-8df2-418e-a1ee-402