In [84]:
from core.utils import Tibanna
from core import ff_utils

# Name of the Fourfront environment handed to Tibanna below.
env = 'fourfront-webprod'
# Tibanna helper for that environment; its ff_keys and s3 attributes are read below.
tibanna = Tibanna(env=env)
# Connection object passed as `connection=ff` to the ff_utils calls in this notebook.
ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
# Bucket names: extract_file_info picks between these two per file type,
# and run_json uses out_bucket as the 'output_bucket' of a run.
raw_bucket = tibanna.s3.raw_file_bucket
out_bucket = tibanna.s3.outfile_bucket
# When True, find_pairs skips fastq files whose 'instrument' is 'Illumina MiSeq'.
exclude_miseq = True

def _file_location(file_id):
    """Return (display_title, uuid, bucket) for one file item.

    The bucket is out_bucket when 'FileProcessed' is among the item's
    '@type' entries and raw_bucket otherwise.
    """
    metadata = ff_utils.get_metadata(file_id, connection=ff)
    if 'FileProcessed' in metadata['@type']:
        bucket = out_bucket
    else:  # covers cases of FileFastq, FileReference, FileMicroscopy
        bucket = raw_bucket
    return metadata['display_title'], metadata['uuid'], bucket


def extract_file_info(obj_id, arg_name):
    """Create the formatted input-file dictionary for one workflow argument.

    Args:
        obj_id: a single file identifier, or a list of them, in whatever
            form ff_utils.get_metadata accepts.
        arg_name: value stored under "workflow_argument_name".

    Returns:
        A dict with the keys "workflow_argument_name", "object_key", "uuid"
        and "bucket_name". For a list input, "object_key" and "uuid" are
        lists in input order. Returns None, after printing a message, when
        the listed files do not resolve to exactly one bucket (this includes
        an empty list).
    """
    template = {"workflow_argument_name": arg_name}

    if isinstance(obj_id, list):
        located = [_file_location(one_id) for one_id in obj_id]
        bucket_names = set(loc[2] for loc in located)
        # Explicit check instead of assert/bare-except: asserts vanish under
        # `python -O`, and a bare except would hide unrelated errors.
        if len(bucket_names) != 1:
            # Single pre-formatted argument so the line prints the same text
            # under both the print statement and the print function.
            print('Files from different buckets %s' % (obj_id,))
            return None
        template['object_key'] = [loc[0] for loc in located]
        template['uuid'] = [loc[1] for loc in located]
        template['bucket_name'] = located[0][2]
        return template

    # obj_id is a single identifier
    title, file_uuid, bucket = _file_location(obj_id)
    template['object_key'] = title
    template['uuid'] = file_uuid
    template['bucket_name'] = bucket
    return template
    

def run_json(input_files, env, parameters, wf_uuid, wf_name, tag, run_name):
    """Assemble the trigger json that tibanna consumes for one workflow run.

    The caller supplies the input files, workflow identity (uuid and name),
    workflow parameters, tag and run name; the instance configuration is
    fixed here and the output bucket is the module-level out_bucket.

    Returns:
        The trigger payload as a dict.
    """
    # Fixed launch configuration sent with every run.
    launch_config = {
        "ebs_type": "io1",
        "json_bucket": "4dn-aws-pipeline-run-json",
        "ebs_iops": 500,
        "shutdown_min": 30,
        "s3_access_arn": "arn:aws:iam::643366669028:instance-profile/S3_access",
        "ami_id": "ami-cfb14bb5",
        "copy_to_s3": True,
        "launch_instance": True,
        "password": "dragonfly",
        "log_bucket": "tibanna-output",
        "script_url": "https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/",
        "key_name": "4dn-encode",
    }
    # Run bookkeeping: environment, run type (the workflow name) and run id.
    run_info = {"env": env, "run_type": wf_name, "run_id": run_name}

    return {
        'input_files': input_files,
        'output_bucket': out_bucket,
        'workflow_uuid': wf_uuid,
        "app_name": wf_name,
        "parameters": parameters,
        "config": launch_config,
        "_tibanna": run_info,
        "tag": tag,
    }


def find_pairs(my_rep_set):
    """Find the usable fastq pairs of every experiment in a replicate set.

    Args:
        my_rep_set: replicate set item (dict) providing 'experiments_in_set'
            and 'accession'.

    Returns:
        A dict mapping each experiment accession to a list of (f1, f2)
        tuples, where f1 is the '@id' of a fastq file and f2 is the file it
        is related to as 'paired with'. Files are left out, with a printed
        note, when they are deleted, have no 'filename', are missing from
        the raw file bucket, or have no pair; MiSeq files are left out when
        the module-level exclude_miseq flag is set. If the first experiment's
        biosample does not resolve to exactly one organism, a message is
        printed and the report built so far is returned.
    """
    report = {}
    organisms = []
    set_acc = my_rep_set['accession']
    for exp in my_rep_set['experiments_in_set']:
        exp_resp = ff_utils.get_metadata(exp, connection=ff)
        exp_acc = exp_resp['accession']
        report[exp_acc] = []
        # organisms are only resolved once, from the first experiment reached
        if not organisms:
            biosample = ff_utils.get_metadata(exp_resp['biosample'], connection=ff, frame='embedded')
            organisms = list(set([bs['individual']['organism']['display_title']
                                  for bs in biosample['biosource']]))
            if len(organisms) != 1:
                print('multiple organisms in set %s' % (set_acc,))
                break
        # an experiment without a 'files' field simply has no pairs to report
        for fastq_file in exp_resp.get('files') or []:
            file_resp = ff_utils.get_metadata(fastq_file, connection=ff)
            # skip pair no 2; pairs are recorded from the first mate only
            if file_resp.get('paired_end') == '2':
                continue
            # exclude miseq
            if exclude_miseq and file_resp.get('instrument') == 'Illumina MiSeq':
                print('skipping miseq files %s' % (exp,))
                continue

            # Some checks before running
            # check if status is deleted
            if file_resp['status'] == 'deleted':
                print('deleted file %s in %s' % (file_resp['accession'], set_acc))
                continue
            # if no uploaded file in the file item, report and skip
            if not file_resp.get('filename'):
                print('%s does not have a file' % (file_resp['accession'],))
                continue
            # check if file is in s3
            head_info = tibanna.s3.does_key_exist(file_resp['upload_key'], tibanna.s3.raw_file_bucket)
            if not head_info:
                print('%s does not have a file in S3' % (file_resp['accession'],))
                continue
            # check that file has a pair
            f1 = file_resp['@id']
            f2 = ''
            # 'related_files' may be absent; treat that as "no relations"
            # instead of failing on iteration over None
            for relation in file_resp.get('related_files') or []:
                if relation['relationship_type'] == 'paired with':
                    f2 = relation['file']
            if not f2:
                print('%s does not have a pair' % (f1,))
                continue
            report[exp_acc].append((f1, f2))
    return report


def get_wfr_out(file_id, wfr_name, file_format):
    """Look up an output of the workflow run that used file_id as an input.

    Args:
        file_id: identifier of the input file, as accepted by
            ff_utils.get_metadata.
        wfr_name: prefix that the workflow run's 'display_title' must start
            with (e.g. 'bwa-mem 0.2.5').
        file_format: 'format' value of the wanted output file.

    Returns:
        The 'value' of the first output with the requested format when the
        selected run has run_status 'complete'. Otherwise one of the
        sentinel strings "no workflow in file", "no completed run" or
        "no file found". Callers detect failure with startswith('no').
    """
    emb_file = ff_utils.get_metadata(file_id, connection=ff, frame='embedded')
    workflows = emb_file.get('workflow_run_inputs')
    if not workflows:
        return "no workflow in file"

    wfr = {}
    run_status = 'did not run'
    for a_wfr in workflows:
        wfr_resp = ff_utils.get_metadata(a_wfr['uuid'], connection=ff)
        if wfr_resp['display_title'].startswith(wfr_name):
            # no break: when several runs match, the last one listed decides
            wfr = wfr_resp
            run_status = wfr_resp['run_status']

    if run_status != 'complete':
        return "no completed run"

    # A completed run may still lack the requested format (or lack
    # 'output_files' altogether); report that through the sentinel instead
    # of raising IndexError/TypeError.
    outputs = wfr.get('output_files') or []
    matches = [out.get('value') for out in outputs if out.get('format') == file_format]
    if matches and matches[0]:
        return matches[0]
    return "no file found"

def add_processed_files(item_id, list_pc, ff):
    """Attach processed files to an experiment or set and align their status.

    Args:
        item_id: identifier of the experiment or set to patch.
        list_pc: list of processed file identifiers.
        ff: connection passed through to ff_utils.

    When the item's status is 'released' or 'released to project', every
    file in list_pc is first patched to that same status. The item itself
    is then patched with 'processed_files' set to list_pc.
    """
    current_status = ff_utils.get_metadata(item_id, connection=ff)['status']

    # bring files to same status as experiments and sets
    if current_status in ('released', 'released to project'):
        for processed_file in list_pc:
            ff_utils.patch_metadata({"status": current_status}, obj_id=processed_file, connection=ff)

    # patch the exp or set
    ff_utils.patch_metadata({'processed_files': list_pc}, obj_id=item_id, connection=ff)
    return

In [99]:
import time
from datetime import datetime

# For a given experiment set and some parameters (such as instrument),
# print the set of files and their part A Hi-C workflow status.
# If any runs are still running, report the number of running cases.
# If there are file pairs that don't have a corresponding part A run, report them separately.

# Workflow definitions, one entry per step, ordered by 'seq'.
# Each entry records the level the step is keyed on ('pair', 'experiment'
# or 'set'), the workflow name and uuid, its parameters, and an initially
# empty 'inputs' list.
wf_dict = [
    {
        'seq': 1,
        'level': 'pair',
        'wf_name': 'bwa-mem',
        'wf_uuid': '3feedadc-50f9-4bb4-919b-09a8b731d0cc',
        'parameters': {"nThreads": 16},
        'inputs': [],
    },
    {
        'seq': 2,
        'level': 'experiment',
        'wf_name': 'hi-c-processing-bam',
        'wf_uuid': '023bfb3e-9a8b-42b9-a9d4-216079526f68',
        'parameters': {"nthreads_merge": 16, "nthreads_parse_sort": 16},
        'inputs': [],
    },
    {
        'seq': 3,
        'level': 'set',
        'wf_name': 'hi-c-processing-pairs',
        'wf_uuid': 'c9e0e6f7-b0ed-4a42-9466-cadc2dd84df0',
        'parameters': {"nthreads": 1, "maxmem": "32g"},
        'inputs': [],
    },
]

# Walk every Hi-C replicate set and report how far it got through the three
# pipeline parts (bwa-mem -> hi-c-processing-bam -> hi-c-processing-pairs).
# Experiments that finished part 2 get their bam/pairs attached as processed
# files; sets that finished part 3 get flagged as completed and receive the
# merged pairs, hic, mcool and normvector files.

# url for hic exps
exp_types = ['in%20situ%20Hi-C', 'dilution%20Hi-C']
set_url = ('/search/?'
           + '&'.join(['experiments_in_set.experiment_type=' + i for i in exp_types])
           + '&type=ExperimentSetReplicate')
run_sets = ff_utils.get_metadata(set_url, connection=ff)['@graph']

# test_set = '4DNESJIYRA44'
# run_sets = [ff_utils.get_metadata(test_set , connection=ff)]
counter = 0
completed = 0
completed_acc = []
# Every print below passes one pre-formatted string so the output is the
# same under the print statement and the print function.
print(len(run_sets))
for a_set in run_sets:
    part3 = 'done'
    set_pairs = []

    counter += 1
    print('')
    print('%s %s' % (counter, a_set['accession']))
    fastqpairs = find_pairs(a_set)
    # cycle through the experiments
    for exp in fastqpairs:
        # Check Part 1 and see if all are okay: both mates of a pair must
        # point at the same bam from a bwa-mem run
        exp_bams = []
        part1 = 'done'
        part2 = 'done'
        for pair in fastqpairs[exp]:
            bam1 = get_wfr_out(pair[0], 'bwa-mem 0.2.5', 'bam')
            bam2 = get_wfr_out(pair[1], 'bwa-mem 0.2.5', 'bam')
            if bam1.startswith('no') or not bam1 or bam1 != bam2:
                part1 = 'not ready'
                #########
                # check status of part1 run, and start the run if needed
                #########
            else:
                exp_bams.append(bam1)
        # stop progress to part2
        # (compare by value: `is not` on a string literal tests identity and
        # only works by accident of interning)
        if part1 != 'done':
            print('%s has missing Part1 runs' % (exp,))
            part3 = 'not ready'
            continue
        print('%s part1 complete' % (exp,))

        # check if part 2 is run already, if not start the run
        exp_com_bam = []
        exp_pairs = []
        for bam in exp_bams:
            com_bam = get_wfr_out(bam, 'hi-c-processing-bam 0.2.5', 'bam')
            pairs = get_wfr_out(bam, 'hi-c-processing-bam 0.2.5', 'pairs')
            if pairs.startswith('no') or not pairs:
                part2 = 'not ready'
                #########
                # check status of part2 run, and start the run if needed
                #########
            else:
                exp_com_bam.append(com_bam)
                exp_pairs.append(pairs)

        # make sure all bams went through the same wfr and produced the same file
        if part2 != 'done' or len(set(exp_com_bam)) != 1 or len(set(exp_pairs)) != 1:
            print('%s Part2 did not complete' % (exp,))
            part3 = 'not ready'
            continue

        # add bam and pairs to exp proc file
        add_processed_files(exp, [exp_com_bam[0], exp_pairs[0]], ff)

        print('%s part2 complete' % (exp,))
        set_pairs.append(exp_pairs[0])

    if part3 != 'done':
        print('Part3 not ready')
        continue

    merged_pairs = []
    for set_pair in set_pairs:
        merged_pair = get_wfr_out(set_pair, 'hi-c-processing-pairs 0.2.5', 'pairs')
        hic = get_wfr_out(set_pair, 'hi-c-processing-pairs 0.2.5', 'hic')
        mcool = get_wfr_out(set_pair, 'hi-c-processing-pairs 0.2.5', 'mcool')
        normvec = get_wfr_out(set_pair, 'hi-c-processing-pairs 0.2.5', 'normvector_juicerformat')

        if merged_pair.startswith('no') or not merged_pair:
            part3 = 'part3 did not complete'
            #########
            # check status of part3 run, and start the run if needed
            #########
        else:
            merged_pairs.append(merged_pair)

    if part3 != 'done' or len(set(merged_pairs)) != 1:
        print('%s is missing Part3' % (a_set['accession'],))
        part3 = 'not ready'
        continue

    # merged_pair/hic/mcool/normvec hold the values from the last set_pair;
    # refuse to patch the set if any of them is a "no ..." sentinel from
    # get_wfr_out rather than a file id
    set_outputs = [merged_pair, hic, mcool, normvec]
    if any(not out or out.startswith('no') for out in set_outputs):
        print('%s is missing Part3' % (a_set['accession'],))
        part3 = 'not ready'
        continue

    #####
    # add completed flag to the set
    ff_utils.patch_metadata({"completed_processes": ["HiC_Pipeline_0.2.5"]},
                            obj_id=a_set['accession'], connection=ff)
    # add processed files to set
    add_processed_files(a_set['accession'], set_outputs, ff)
    completed += 1
    completed_acc.append(a_set['accession'])
    print('%s part3 complete' % (a_set['accession'],))
print(completed)
print(completed_acc)

89

1 4DNES18BMU79
4DNEXVJSLZI5 has missing Part1 runs
4DNEXH98PC96 has missing Part1 runs
Part3 not ready

2 4DNESH4UTRNL
4DNEX4KRGMAQ has missing Part1 runs
4DNEXOHPSJTN has missing Part1 runs
Part3 not ready

3 4DNESNYBDSLY
4DNEXDSNPZOU has missing Part1 runs
4DNEXH1YN2XB has missing Part1 runs
Part3 not ready

4 4DNES54YB6TQ
4DNEXMM7MN7V has missing Part1 runs
4DNEXBSGTVWJ has missing Part1 runs
Part3 not ready

5 4DNESRE7AK5U
4DNEXCDN8ONW has missing Part1 runs
Part3 not ready

6 4DNES425UDGS
4DNEXKO3TRJT has missing Part1 runs
4DNEXTDMNUOO has missing Part1 runs
Part3 not ready

7 4DNESEPDL6KY
4DNEX5ICY4Q2 has missing Part1 runs
4DNEXVCPZOH3 has missing Part1 runs
Part3 not ready

8 4DNES2R6PUEK
4DNEXRAEERUF part1 complete
4DNEXRAEERUF part2 complete
4DNEX7POCO84 part1 complete
4DNEX7POCO84 part2 complete
4DNES2R6PUEK is missing Part3

9 4DNESRJ8KV4Q
4DNEXZEUZJH7 part1 complete
4DNEXZEUZJH7 part2 complete
4DNEXCX13ZF9 part1 complete
4DNEXCX13ZF9 part2 complete
4DNEXSUMVBKJ part1 

skipping miseq files /experiments-hi-c/4DNEXYNM1II5/
skipping miseq files /experiments-hi-c/4DNEXT3VJDNU/
skipping miseq files /experiments-hi-c/4DNEXWGWFK1T/
4DNEXNMDXSP1 part1 complete
4DNEXNMDXSP1 Part2 did not complete
4DNEXT3VJDNU part1 complete
4DNEXT3VJDNU Part2 did not complete
4DNEXWGWFK1T part1 complete
4DNEXWGWFK1T Part2 did not complete
4DNEXYNM1II5 part1 complete
4DNEXYNM1II5 Part2 did not complete
Part3 not ready

61 4DNESZ2PVZWR
skipping miseq files /experiments-hi-c/4DNEXMDNZ7T7/
skipping miseq files /experiments-hi-c/4DNEX8J4TXKA/
4DNEX8J4TXKA part1 complete
4DNEX8J4TXKA Part2 did not complete
4DNEXMDNZ7T7 part1 complete
4DNEXMDNZ7T7 Part2 did not complete
Part3 not ready

62 4DNESSCS4D46
skipping miseq files /experiments-hi-c/4DNEXECG9AQA/
4DNEXECG9AQA part1 complete
4DNEXECG9AQA Part2 did not complete
Part3 not ready

63 4DNESBJ1KYYH
skipping miseq files /experiments-hi-c/4DNEX681LMAK/
skipping miseq files /experiments-hi-c/4DNEXY1C3Y6L/
skipping miseq files /experim

4DNEX4F3JMWD part1 complete
4DNEX4F3JMWD part2 complete
4DNEXND9HATI part1 complete
4DNEXND9HATI Part2 did not complete
4DNEXDMYMEL2 part1 complete
4DNEXDMYMEL2 Part2 did not complete
4DNEX2XV7VGV part1 complete
4DNEX2XV7VGV Part2 did not complete
4DNEX9G4FVK5 part1 complete
4DNEX9G4FVK5 Part2 did not complete
4DNEX7ZLCUVL part1 complete
4DNEX7ZLCUVL Part2 did not complete
4DNEXYJQL5PX part1 complete
4DNEXYJQL5PX Part2 did not complete
4DNEX41ZSSQR part1 complete
4DNEX41ZSSQR Part2 did not complete
4DNEXV8RLWON part1 complete
4DNEXV8RLWON Part2 did not complete
Part3 not ready

83 4DNESNZZR2VD
skipping miseq files /experiments-hi-c/4DNEX5SKVSJX/
4DNEXVHF97YT part1 complete
4DNEXVHF97YT part2 complete
4DNEX17F64JD has missing Part1 runs
4DNEX5SKVSJX part1 complete
4DNEX5SKVSJX Part2 did not complete
4DNEXJ78M1F7 part1 complete
4DNEXJ78M1F7 part2 complete
Part3 not ready

84 4DNESPXW8XHY
4DNEXRNJMGNI part1 complete
4DNEXRNJMGNI part2 complete
4DNEXMHA9FL9 part1 complete
4DNEXMHA9FL9 part

4DNEXS8V14UB part1 complete
4DNEXS8V14UB part2 complete
4DNEXZKBU7YD part1 complete
4DNEXZKBU7YD Part2 did not complete
4DNEX9ROG1T9 part1 complete
4DNEX9ROG1T9 Part2 did not complete
4DNEXMJINPYT part1 complete
4DNEXMJINPYT Part2 did not complete
4DNEX1I1E292 part1 complete
4DNEX1I1E292 part2 complete
4DNEX5D543VH part1 complete
4DNEX5D543VH Part2 did not complete
4DNEXGICX1EU part1 complete
4DNEXGICX1EU Part2 did not complete
4DNEX6JBYI4K part1 complete
4DNEX6JBYI4K Part2 did not complete
4DNEXEJM8STO part1 complete
4DNEXEJM8STO part2 complete
Part3 not ready

89 4DNESCIHJOXA
skipping miseq files /experiments-hi-c/4DNEXKSMJTC7/
4DNEX2G1IW8D part1 complete
4DNEX2G1IW8D part2 complete
4DNEXGPAWPI8 part1 complete
4DNEXGPAWPI8 part2 complete
4DNEXKSMJTC7 part1 complete
4DNEXKSMJTC7 Part2 did not complete
Part3 not ready
39
[u'4DNESRJ8KV4Q', u'4DNES78Y8Y5K', u'4DNESB6MNCFE', u'4DNES8J78WV2', u'4DNESAPF27TG', u'4DNES9L4AK6Q', u'4DNES2M5JIGV', u'4DNESLLTENG9', u'4DNES98CI6GV', u'4DNESC5J3EI

In [100]:
# Show the number of completed sets and their accessions again.
print(completed)
print(completed_acc)

39
[u'4DNESRJ8KV4Q', u'4DNES78Y8Y5K', u'4DNESB6MNCFE', u'4DNES8J78WV2', u'4DNESAPF27TG', u'4DNES9L4AK6Q', u'4DNES2M5JIGV', u'4DNESLLTENG9', u'4DNES98CI6GV', u'4DNESC5J3EIX', u'4DNES21NPLZU', u'4DNESYTIFTEE', u'4DNESIG4ELE4', u'4DNESNHN919R', u'4DNES8ZUV5CQ', u'4DNESCCP4KTY', u'4DNEST9AVULS', u'4DNES7DFQZLI', u'4DNESFBT9P4O', u'4DNESE3ICNE1', u'4DNES4GSP9S4', u'4DNESTAPSPUC', u'4DNESI2UKI7P', u'4DNESJ1VX52C', u'4DNESHGL976U', u'4DNESYUYFD6H', u'4DNESUTEOMGQ', u'4DNESOSE2FYZ', u'4DNESEW5JLUC', u'4DNESHFBC56P', u'4DNESDEK4IH8', u'4DNESI7DEJTM', u'4DNESUB35TII', u'4DNESIE5R9HS', u'4DNESSM1H92K', u'4DNES1ZEJNRU', u'4DNES4269GKX', u'4DNESPXW8XHY', u'4DNESLLA3R1V']


In [5]:
from core.utils import Tibanna
from core.utils import run_workflow
import time

# Trigger one workflow run per human fastq pair in pairs_ready_to_run.
# make_input_file_json, make_hic1_json and pairs_ready_to_run are defined
# outside this cell.
paired_files = pairs_ready_to_run

env = 'fourfront-webprod'
tibanna = Tibanna(env=env)
# keep the original outfile bucket for the run json, then repoint the s3
# helper at the raw files bucket for the lookups below
outfiles = tibanna.s3.outfile_bucket
tibanna.s3.outfile_bucket = 'elasticbeanstalk-fourfront-webprod-files'
index_h = make_input_file_json('4DNFIZQZ39L9', 'bwa_index', tibanna)
index_m = make_input_file_json('4DNFI823LSI8', 'bwa_index', tibanna)

for set_name, organisms, f1, f2 in paired_files:

    # find the correct index: only human sets are submitted for now
    # (mouse would use index_m, but that path is switched off)
    if organisms != ['human']:
        continue
    index = index_h

    fastq1 = make_input_file_json(f1, 'fastq1', tibanna)
    fastq2 = make_input_file_json(f2, 'fastq2', tibanna)

    input_files = [fastq1, fastq2, index]
    if not all(input_files):
        print("some files not found on s3.  Investigate this list %s" % input_files)
    else:
        # run name: the two object keys up to their first '.', joined by '-'
        first_stem = fastq1['object_key'].split('.')[0]
        second_stem = fastq2['object_key'].split('.')[0]
        name = first_stem + "-" + second_stem
        input_json = make_hic1_json(input_files, env, outfiles, name)
        res = run_workflow(input_json)
    # pause between pairs
    time.sleep(2)

print('Done')


looking for upload key 1f53df95-4cf3-41cc-971d-81bb16c486dd/4DNFIZQZ39L9.bwaIndex.tgz, on bucket elasticbeanstalk-fourfront-webprod-files
looking for upload key 4a6d10ee-2edb-4402-a98f-0edb1d58f5e1/4DNFI823LSI8.bwaIndex.tgz, on bucket elasticbeanstalk-fourfront-webprod-files
looking for upload key 0cfb16e8-b902-4977-a498-587d36497687/4DNFIG22ZQ7Y.fastq.gz, on bucket elasticbeanstalk-fourfront-webprod-files
looking for upload key 37d96da6-1daf-4bd0-87fa-50ec8df1cf3f/4DNFIMM81AZ3.fastq.gz, on bucket elasticbeanstalk-fourfront-webprod-files
about to start run bwa-mem_4DNFIG22ZQ7Y-4DNFIMM81AZ3
response from aws was: 
 {u'startDate': datetime.datetime(2018, 3, 19, 20, 27, 10, 288000, tzinfo=tzlocal()), 'ResponseMetadata': {'RetryAttempts': 0, 'HTTPStatusCode': 200, 'RequestId': '6da9735e-2bd5-11e8-91f9-51cdf96108a6', 'HTTPHeaders': {'x-amzn-requestid': '6da9735e-2bd5-11e8-91f9-51cdf96108a6', 'content-length': '142', 'content-type': 'application/x-amz-json-1.0'}}, u'executionArn': u'arn:aws: