In [None]:
from core.utils import Tibanna
from core import ff_utils

#format for input json in hic-partII
def make_input_file_json(obj_ids, arg_name, tibanna, bucket):
    '''
    Build one workflow input-file entry for the given file item(s).

    obj_ids can be either a string or a list.
    Each id is resolved through ff_utils.get_metadata and its upload key is
    checked on `bucket` via tibanna.s3.does_key_exist.

    On success returns a dict of the form
    {
      "bucket_name": "%s",
      "object_key": "%s",
      "uuid" : "%s",
      "workflow_argument_name": "%s"
    }
    where object_key / uuid are plain values when exactly one file was
    resolved and lists otherwise.

    Returns None when any of the requested files is missing from the bucket,
    so callers can test the result for truthiness before launching a run.
    '''
    ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
    if not isinstance(obj_ids, list):
        obj_ids = [obj_ids]
    object_key_list = []
    uuid_list = []
    missing_keys = []
    for obj_id in obj_ids:
        metadata = ff_utils.get_metadata(obj_id, connection=ff)
        upload_key = metadata['upload_key']

        # just make sure the file is on s3, otherwise bail
        print("looking for upload key %s, on bucket %s" %
              (upload_key,
               bucket))
        if tibanna.s3.does_key_exist(upload_key, bucket=bucket):
            # the object key is the second path component of the upload key
            object_key_list.append(upload_key.split('/')[1])
            uuid_list.append(metadata['uuid'])
        else:
            missing_keys.append(upload_key)

    if missing_keys:
        # Previously a missing file was skipped silently, producing an entry
        # with empty object_key/uuid that still looked valid to the caller.
        print("upload key(s) not found on bucket %s: %s" %
              (bucket, ", ".join(missing_keys)))
        return None

    # a single file is reported as a scalar rather than a one-element list
    if len(uuid_list) == 1:
        uuid_list = uuid_list[0]
    if len(object_key_list) == 1:
        object_key_list = object_key_list[0]
    data = {'bucket_name': bucket,
            'object_key': object_key_list,
            'uuid': uuid_list,
            'workflow_argument_name': arg_name
            }
    return data

def make_hic4_json(input_files, env, output_bucket, accession):
    '''
    Assemble the input json for a "pairsam-filter" workflow run.

    input_files   -- list of input-file entries, stored as given
    env           -- environment name, recorded under "_tibanna"
    output_bucket -- bucket name stored as "output_bucket"
    accession     -- used as the "_tibanna" run_id

    Returns a new dict; the workflow uuid and the instance config are fixed.
    '''
    # Fixed launch settings for the run.
    # NOTE(review): "password" looks like a placeholder value - confirm it is
    # not a real credential before sharing this notebook.
    run_config = {
        "ebs_iops": 20000,
        "instance_type": "m4.16xlarge",
        "ebs_type": "io1",
        "s3_access_arn": "arn:aws:iam::643366669028:instance-profile/S3_access",
        "ami_id": "ami-cfb14bb5",
        "json_bucket": "4dn-aws-pipeline-run-json",
        "password": "whateverpswd",
        "shutdown_min": "30",
        "copy_to_s3": True,
        "launch_instance": True,
        "log_bucket": "tibanna-output",
        "script_url": "https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/",
        "key_name": "4dn-encode",
    }

    tibanna_settings = {
        "env": env,
        "run_type": "pairsam-filter",
        "run_id": accession,
    }

    input_json = {}
    input_json['input_files'] = input_files
    input_json['output_bucket'] = output_bucket
    input_json['workflow_uuid'] = "3758e00c-2035-43c6-b783-bb92afe57c99"
    input_json["app_name"] = "pairsam-filter"
    input_json["parameters"] = {}
    input_json["config"] = run_config
    input_json["_tibanna"] = tibanna_settings
    return input_json



def get_wfr_out(emb_file, wfr_name, file_format, connection=None):
    '''
    Find the output of a given workflow run that took `emb_file` as input.

    emb_file    -- file metadata dict; its 'workflow_run_inputs' entries are
                   looked up one by one through ff_utils.get_metadata
    wfr_name    -- prefix matched against each run's 'display_title'
    file_format -- 'format' of the wanted entry in the run's 'output_files'
    connection  -- optional connection passed to ff_utils.get_metadata;
                   when omitted the notebook-level `ff` is used

    Returns the 'value' of the first output with the requested format, or
    one of the marker strings "no workflow in file", "no completed run",
    "no file found" (all start with "no", which callers test for).
    '''
    if connection is None:
        # keep the original behaviour of relying on the notebook global
        connection = ff

    workflows = emb_file.get('workflow_run_inputs')
    if not workflows:
        return "no workflow in file"

    wfr = {}
    run_status = 'did not run'
    for a_wfr in workflows:
        wfr_resp = ff_utils.get_metadata(a_wfr['uuid'], connection=connection)
        if wfr_resp['display_title'].startswith(wfr_name):
            # when several runs match, the last one listed wins
            wfr = wfr_resp
            run_status = wfr_resp['run_status']

    if run_status != 'complete':
        return "no completed run"

    # 'output_files' may be absent, and entries may lack 'format'/'value';
    # treat all of those as "no file found" instead of raising.
    outputs = wfr.get('output_files') or []
    matching_values = [out.get('value') for out in outputs
                       if out.get('format') == file_format]
    if matching_values and matching_values[0]:
        return matching_values[0]
    return "no file found"

In [None]:
from invoke import run
import time
from datetime import datetime


def form_hyp(id):
    """Return a spreadsheet HYPERLINK formula pointing at the data portal.

    `id` is used both as the path after https://data.4dnucleome.org/ and as
    the text shown for the link.
    """
    link_template = '=HYPERLINK("https://data.4dnucleome.org/{0}","{0}")'
    return link_template.format(id)


# Replicate sets this notebook has been used with; one is selected below.
all_sets = [
            'dcic:Selvaraj_gm12878_hic',
            'dekker-lab:ExperimentSet_U54_U54-ESC4DN-FA-DpnII-2017524',
            'dekker-lab:ExperimentSet_U54_HFFc6-FA-DpnII'
            ]

my_rep_set = all_sets[2]
print(my_rep_set)

wf_partI = "bwa-mem"
env = 'fourfront-webprod'
tibanna = Tibanna(env=env)

# Workflow stages that must have completed, in order, before the filter step
# can run: (workflow-run title prefix, format of the output file to follow).
pipeline_stages = [
    (wf_partI, 'bam'),                  # part I
    ("pairsam-parse-sort", 'pairsam'),  # part II
    ("pairsam-merge", 'pairsam'),       # part IIB
    ("pairsam-markasdup", 'pairsam'),   # part III
]

ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
rep_set_resp = ff_utils.get_metadata(my_rep_set, connection=ff)
rep_resp = rep_set_resp['experiments_in_set']
set_acc = rep_set_resp['accession']

# (experiment accession, markasdup pairsam accession) for each experiment
# whose whole stage chain completed; consumed by the next cell.
exps_pairsems = []
all_fine = True

f_pairs = 0
for exp in rep_resp:
    exp_resp = ff_utils.get_metadata(exp, connection=ff)
    exp_files = exp_resp['files']
    exp_acc = exp_resp['accession']
    for fastq_file in exp_files:
        file_resp = ff_utils.get_metadata(fastq_file, connection=ff, frame='embedded')
        # Some checks before running
        # check if status is deleted
        if file_resp['status'] == 'deleted':
            print("delete file %s" % file_resp['accession'])
            continue
        # if no uploaded file in the file item report and skip
        if not file_resp.get('filename'):
            print("%s does not have a file" % file_resp['accession'])
            continue
        # check if file is in s3
        if not tibanna.s3.does_key_exist(file_resp['upload_key'], tibanna.s3.raw_file_bucket):
            print("%s does not have a file in S3" % file_resp['accession'])
            continue

        # skip pair no 2
        if file_resp.get('paired_end') == '2':
            continue
        f_pairs += 1

        # Walk the stage chain: each stage's output file becomes the input
        # whose workflow runs are inspected for the next stage.
        stage_resp = file_resp
        chain_complete = True
        for stage_name, stage_format in pipeline_stages:
            stage_out = get_wfr_out(stage_resp, stage_name, stage_format)
            # get_wfr_out reports problems as strings starting with "no";
            # test for a falsy result first so it can never raise here.
            if not stage_out or stage_out.startswith('no'):
                print(stage_out)
                all_fine = False
                chain_complete = False
                break
            stage_resp = ff_utils.get_metadata(stage_out, connection=ff, frame='embedded')
        if not chain_complete:
            continue

        pairsem_md_resp = stage_resp
        # file size in GiB, two decimals
        f_s = round(pairsem_md_resp['file_size']/(1024*1024*1024.0), 2)
        print(form_hyp(pairsem_md_resp["accession"])+'\t'+pairsem_md_resp["uuid"]+"\t"+str(f_s))
        exps_pairsems.append((exp_acc, pairsem_md_resp["accession"]))
        # Only the first fully processed pair per experiment is recorded
        # (presumably because the merged pairsam is shared by the whole
        # experiment - TODO confirm).
        break

print("%s fastq file pairs in the set" % f_pairs)
print(exps_pairsems)


In [None]:
from core.utils import Tibanna
from core.utils import run_workflow
import time

# (experiment accession, pairsam accession) tuples collected by the previous cell
pairsem_files = exps_pairsems

env = 'fourfront-webprod'
tibanna = Tibanna(env=env)

output_file_bucket = tibanna.s3.outfile_bucket
raw_file_bucket = tibanna.s3.raw_file_bucket

chrsizes = make_input_file_json('4DNFI823LSII', 'chromsize', tibanna, raw_file_bucket)
# todo need a function to determine this given fastq1
for exp_pairsem_files in pairsem_files:
    pairsam1 = make_input_file_json(exp_pairsem_files[1], 'input_pairsam', tibanna, output_file_bucket)
    input_files = [pairsam1, chrsizes]
    # A bare all(input_files) is always true for non-empty dicts, so check
    # that every entry exists and actually resolved to an object key.
    if all(entry and entry.get('object_key') for entry in input_files):
        name = exp_pairsem_files[0]
        input_json = make_hic4_json(input_files, env, output_file_bucket, name)
        print(input_json)
        res = run_workflow(input_json)
    else:
        print("some files not found on s3.  Investigate this list %s" % input_files)
    # pause between experiments
    time.sleep(5)
    #a = raw_input("Press Enter to continue...")

print('Done')
