In [1]:
from core.utils import Tibanna
from core import ff_utils

#format for input json in hic-partII
def make_input_file_json(obj_ids, arg_name, tibanna, bucket):
    '''
    Build the input-file entry of a workflow input json for one argument.

    obj_ids can be either a string or a list. Every item is resolved with
    ff_utils.get_metadata and its 'upload_key' is checked on `bucket` with
    tibanna.s3.does_key_exist.

    On success the return value has the form
    {
      "bucket_name": "%s",
      "object_key": "%s",
      "uuid" : "%s",
      "workflow_argument_name": "%s"
    }
    where 'object_key' and 'uuid' are single values when exactly one file was
    requested, and lists (in the order of obj_ids) otherwise.

    Returns None when obj_ids is an empty list or when any requested file is
    not on the bucket, so that callers can test the result for truthiness
    (e.g. all(input_files)) instead of starting a run on partial inputs.
    '''
    if not isinstance(obj_ids, list):
        obj_ids = [obj_ids]
    if not obj_ids:
        # nothing was requested, so there is nothing to run on
        return None

    ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
    object_key_list = []
    uuid_list = []
    missing_keys = []
    for obj_id in obj_ids:
        metadata = ff_utils.get_metadata(obj_id, connection=ff)
        upload_key = metadata['upload_key']

        # just make sure the file is on s3, otherwise bail
        print("looking for upload key %s, on bucket %s" % (upload_key, bucket))
        if tibanna.s3.does_key_exist(upload_key, bucket=bucket):
            # the object key is the second '/'-separated component of the
            # upload key (the file name in keys shaped like "<uuid>/<file name>")
            object_key_list.append(upload_key.split('/')[1])
            uuid_list.append(metadata['uuid'])
        else:
            missing_keys.append(upload_key)

    if missing_keys:
        # report every missing key at once, then bail out
        print("upload key(s) not found on bucket %s: %s" %
              (bucket, ', '.join(missing_keys)))
        return None

    # a single file is passed as a plain value rather than a one-item list
    if len(uuid_list) == 1:
        uuid_list = uuid_list[0]
    if len(object_key_list) == 1:
        object_key_list = object_key_list[0]
    data = {'bucket_name': bucket,
            'object_key': object_key_list,
            'uuid': uuid_list,
            'workflow_argument_name': arg_name
            }
    return data

def make_hic2b_json(input_files, env, output_bucket, accession):
    '''
    Assemble the input json for a "pairsam-merge" workflow run.

    input_files   -- value stored under 'input_files' (passed through as is)
    env           -- value stored under '_tibanna' -> 'env'
    output_bucket -- value stored under 'output_bucket'
    accession     -- value stored under '_tibanna' -> 'run_id'

    Everything else (workflow uuid, app name, parameters and the whole
    'config' section) is fixed by this function. Returns the assembled dict.
    '''
    # NOTE(review): every value in this section is hard-coded in the notebook,
    # including 'password'; consider supplying it from outside the source.
    config = {
        "ebs_type": "io1",
        "json_bucket": "4dn-aws-pipeline-run-json",
        "EBS_optimized": True,
        "ebs_iops": 5000,
        "shutdown_min": 30,
        "instance_type": "m4.16xlarge",
        "s3_access_arn": "arn:aws:iam::643366669028:instance-profile/S3_access",
        "ami_id": "ami-cfb14bb5",
        "copy_to_s3": True,
        "script_url": "https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/",
        "launch_instance": True,
        "password": "hahaha",
        "log_bucket": "tibanna-output",
        "key_name": "4dn-encode",
    }
    tibanna_settings = {
        "env": env,
        "run_type": "pairsam-merge",
        "run_id": accession,
    }
    return {
        'input_files': input_files,
        'output_bucket': output_bucket,
        'workflow_uuid': "af8908bf-fdcb-40be-8bca-f1a49226bd20",
        "app_name": "pairsam-merge",
        "parameters": {"nThreads": 1},
        "config": config,
        "_tibanna": tibanna_settings,
    }



def get_wfr_out(emb_file, wfr_name, file_format, connection=None):
    '''
    Find the output file of a given format produced by a workflow run that
    took emb_file as input.

    emb_file    -- file metadata dict; its 'workflow_run_inputs' entries are
                   dicts carrying the 'uuid' of a workflow run
    wfr_name    -- prefix the run's 'display_title' has to start with
    file_format -- 'format' of the wanted entry in the run's 'output_files'
    connection  -- connection handed to ff_utils.get_metadata; when omitted the
                   notebook-level `ff` connection is used

    Returns the 'value' of the first output with the requested format, or one
    of the sentinel strings "no workflow in file", "no completed run" and
    "no file found" (all of them start with "no").
    '''
    if connection is None:
        # keep the historical behaviour of relying on the global connection
        connection = ff

    workflows = emb_file.get('workflow_run_inputs')
    if not workflows:
        return "no workflow in file"

    wfr = {}
    run_status = 'did not run'
    for a_wfr in workflows:
        wfr_resp = ff_utils.get_metadata(a_wfr['uuid'], connection=connection)
        if wfr_resp['display_title'].startswith(wfr_name):
            # when several runs match, the last one in the list wins
            wfr = wfr_resp
            run_status = wfr_resp['run_status']

    if run_status != 'complete':
        return "no completed run"

    # a run without 'output_files', or without an output of the requested
    # format, is reported with the sentinel instead of raising
    outputs = wfr.get('output_files') or []
    file_id = next((out.get('value') for out in outputs
                    if out.get('format') == file_format), None)
    if file_id:
        return file_id
    return "no file found"

In [3]:
from invoke import run
import time
from datetime import datetime


def form_hyp(id):
    '''
    Return a spreadsheet HYPERLINK formula whose target is
    https://data.4dnucleome.org/<id> and whose visible label is <id>.
    '''
    template = '=HYPERLINK("https://data.4dnucleome.org/{0}","{0}")'
    return template.format(id)


# Replicate sets this notebook has been used with.
all_sets = ['dcic:Selvaraj_gm12878_hic',
            'dekker-lab:ExperimentSet_U54_U54-ESC4DN-FA-DpnII-2017524',
            'dekker-lab:ExperimentSet_U54_HFFc6-FA-DpnII']

# The set processed by this run; echoed so it shows up in the cell output.
my_rep_set = all_sets[2]
print(my_rep_set)

# Title prefix of the part I workflow run, plus the environment to work on.
wf_partI = "bwa-mem"
env = 'fourfront-webprod'
tibanna = Tibanna(env=env)

# Fetch the replicate set's metadata and keep its experiments and accession.
ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
rep_set_resp = ff_utils.get_metadata(my_rep_set, connection=ff)
rep_resp = rep_set_resp['experiments_in_set']
set_acc = rep_set_resp['accession']

# For every experiment of the replicate set, follow each first-of-pair fastq
# file through part I ("bwa-mem" -> bam) and part II ("pairsam-parse-sort" ->
# pairsam) and collect the accessions of the resulting pairsam files.
# exps_pairsems ends up as a list of [experiment accession, [pairsam accessions]]
# holding only the experiments for which every counted fastq pair has a pairsam.
exps_pairsems = []
f_pairs = 0
for exp in rep_resp:
    exp_resp = ff_utils.get_metadata(exp, connection=ff)
    exp_files = exp_resp['files']
    exp_acc = exp_resp['accession']

    exp_pairsems = []
    all_fine = True

    for fastq_file in exp_files:
        file_resp = ff_utils.get_metadata(fastq_file, connection=ff, frame='embedded')
        # Some checks before running
        # skip files whose status is deleted
        if file_resp['status'] == 'deleted':
            print("delete file %s" % file_resp['accession'])
            continue
        # if no uploaded file in the file item report and skip
        if not file_resp.get('filename'):
            print("%s does not have a file" % file_resp['accession'])
            continue
        # check if file is in s3
        head_info = tibanna.s3.does_key_exist(file_resp['upload_key'], tibanna.s3.raw_file_bucket)
        if not head_info:
            print("%s does not have a file in S3" % file_resp['accession'])
            continue

        # skip pair no 2; each pair is followed through its first file only
        if file_resp.get('paired_end') == '2':
            continue
        f_pairs += 1

        # Check for part I; get_wfr_out reports problems as strings starting
        # with "no", anything else is the id of the bam file
        bam_file = get_wfr_out(file_resp, wf_partI, 'bam')
        if not bam_file or bam_file.startswith('no'):
            print(bam_file)
            all_fine = False
            continue
        bam_resp = ff_utils.get_metadata(bam_file, connection=ff, frame='embedded')

        # Check for part II
        pairsem_file = get_wfr_out(bam_resp, "pairsam-parse-sort", 'pairsam')
        if not pairsem_file or pairsem_file.startswith('no'):
            print(pairsem_file)
            all_fine = False
            continue
        pairsem_resp = ff_utils.get_metadata(pairsem_file, connection=ff)
        exp_pairsems.append(pairsem_resp['accession'])

        # report link, uuid and size (file_size divided by 1024**3) as one
        # tab-separated row
        f_s = round(pairsem_resp['file_size'] / (1024 * 1024 * 1024.0), 2)
        print(form_hyp(pairsem_resp["accession"]) + '\t' + pairsem_resp["uuid"] + "\t" + str(f_s))
        print("same")

    if all_fine and exp_pairsems:
        exps_pairsems.append([exp_acc, exp_pairsems])
        print("%s has complete pairsem" % exp_acc)
        print('------------')
    elif all_fine:
        # every file was skipped above, so there is nothing to merge
        print("%s has no usable fastq files" % exp_acc)
        print('------------')
    else:
        print("%s has missing pairsem" % exp_acc)
        print('------------')

print("%s fastq file pairs in the set" % f_pairs)



 dekker-lab:ExperimentSet_U54_HFFc6-FA-DpnII
=HYPERLINK("https://data.4dnucleome.org/4DNFIRTAVPBQ","4DNFIRTAVPBQ")	f2e463b5-6d81-41e3-a506-4777a75f3001	37.05
same
=HYPERLINK("https://data.4dnucleome.org/4DNFINARWX11","4DNFINARWX11")	48568d74-b775-4b41-ab4e-6f06a8e4a3e6	35.23
same
=HYPERLINK("https://data.4dnucleome.org/4DNFIN43BIDG","4DNFIN43BIDG")	f82c0083-1680-4712-97a0-8bc80c592a16	36.02
same
=HYPERLINK("https://data.4dnucleome.org/4DNFI8QXIKYI","4DNFI8QXIKYI")	a3364d1c-dd37-423d-9397-305bbec7e74a	34.76
same
=HYPERLINK("https://data.4dnucleome.org/4DNFIV6SSSGF","4DNFIV6SSSGF")	9d59f12f-d3c3-4c3b-9e75-76494b9635c2	36.1
same
=HYPERLINK("https://data.4dnucleome.org/4DNFI65956HF","4DNFI65956HF")	685e52ac-548f-4fbe-ad22-8ddc47620432	35.66
same
=HYPERLINK("https://data.4dnucleome.org/4DNFIVXVPPDK","4DNFIVXVPPDK")	a1ac50bf-6050-431d-8a51-04fca0fb5af9	34.61
same
=HYPERLINK("https://data.4dnucleome.org/4DNFIYGIX22T","4DNFIYGIX22T")	6b8c7971-aad5-480f-8254-cb36088161ea	35.1
same
=HYPERLINK("h

In [4]:
from core.utils import Tibanna
from core.utils import run_workflow
import time

# Start one "pairsam-merge" run per collected experiment.
# Each item of pairsem_files is [experiment accession, [pairsam accessions]].
pairsem_files = exps_pairsems

env = 'fourfront-webprod'
tibanna = Tibanna(env=env)

output_file_bucket = tibanna.s3.outfile_bucket
raw_file_bucket = tibanna.s3.raw_file_bucket

# TODO: need a function to determine this given fastq1
for exp_pairsem_files in pairsem_files:
    pairsam1 = make_input_file_json(exp_pairsem_files[1], 'input_pairsams',
                                    tibanna, output_file_bucket)
    input_files = [pairsam1]
    if not all(input_files):
        # a falsy entry means the input could not be assembled; skip the run
        print("some files not found on s3.  Investigate this list %s" % input_files)
    else:
        # the experiment accession doubles as the run id
        name = exp_pairsem_files[0]
        input_json = make_hic2b_json(input_files, env, output_file_bucket, name)
        print(input_json)
        res = run_workflow(input_json)
    # pause between experiments, whether or not a run was started
    time.sleep(5)
    #a = raw_input("Press Enter to continue...")

print('Done')


looking for upload key f2e463b5-6d81-41e3-a506-4777a75f3001/4DNFIRTAVPBQ.sam.pairs.gz, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
looking for upload key 48568d74-b775-4b41-ab4e-6f06a8e4a3e6/4DNFINARWX11.sam.pairs.gz, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
looking for upload key f82c0083-1680-4712-97a0-8bc80c592a16/4DNFIN43BIDG.sam.pairs.gz, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
looking for upload key a3364d1c-dd37-423d-9397-305bbec7e74a/4DNFI8QXIKYI.sam.pairs.gz, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
looking for upload key 9d59f12f-d3c3-4c3b-9e75-76494b9635c2/4DNFIV6SSSGF.sam.pairs.gz, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
looking for upload key 685e52ac-548f-4fbe-ad22-8ddc47620432/4DNFI65956HF.sam.pairs.gz, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
looking for upload key a1ac50bf-6050-431d-8a51-04fca0fb5af9/4DNFIVXVPPDK.sam.pairs.gz, on bucket elasticbeanstalk-fourfront-webprod-wfoutput
looking for u