In [5]:
from core.utils import Tibanna
from core import ff_utils
from core.utils import run_workflow
from datetime import datetime
from core.wfr import *

# Environment name used to build the Tibanna helper; also passed to
# run_missing_wfr() when workflow runs are started further down.
env = 'fourfront-webprod'
tibanna = Tibanna(env=env)
# Point the default key at the public data portal before opening the connection.
tibanna.ff_keys['default']['server'] = 'https://data.4dnucleome.org'
ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
# NOTE(review): identical to the assignment two lines up; looks redundant unless
# fdn_connection() modifies the key dict -- confirm before removing.
tibanna.ff_keys['default']['server'] = 'https://data.4dnucleome.org'
# Passed straight through to find_pairs(); presumably excludes MiSeq runs -- TODO confirm.
exclude_miseq = True

In [7]:
import time

# For a given experiment set and some parameters, such as the instrument,
# print the set of files and their part A Hi-C workflow status.
# If any runs are still running, report the number of running cases.
# If there are file pairs that don't have a corresponding part A run, report them separately.

# Workflow definitions, indexed in the order the pipeline stages use them:
#   [0] alignment of a fastq pair
#   [1] bam processing for one experiment
#   [2] pairs processing for a whole set
# Note: despite the name, this is a list of dicts.
wf_dict = [
    {
        'wf_name': 'bwa-mem',
        'wf_uuid': '3feedadc-50f9-4bb4-919b-09a8b731d0cc',
        'parameters': {'nThreads': 16},
    },
    {
        'wf_name': 'hi-c-processing-bam',
        'wf_uuid': '023bfb3e-9a8b-42b9-a9d4-216079526f68',
        'parameters': {
            'nthreads_merge': 16,
            'nthreads_parse_sort': 16,
        },
    },
    {
        'wf_name': 'hi-c-processing-pairs-nore',
        'wf_uuid': 'c19ee11e-9d5a-454f-af50-600a0cf990b6',
        'parameters': {
            'nthreads': 1,
            'maxmem': '32g',
        },
    },
]

# Search URL for replicate sets of micro-C / DNase Hi-C experiments whose
# status is "released" or "released to project" (values are already URL-encoded).
exp_types = ['micro-C', 'DNase%20Hi-C']
set_url = '/search/?' + '&'.join(
    ['experiments_in_set.experiment_type=' + exp_type for exp_type in exp_types]
    + [
        'type=ExperimentSetReplicate',
        'status=released',
        'status=released%20to%20project',
    ]
)
            
            
# Fetch the candidate sets; the search response carries them under '@graph'.
# NOTE(review): the URL has no explicit limit, so presumably the portal's
# default page size applies -- confirm all sets are returned.
run_sets = ff_utils.get_metadata(set_url, connection=ff)['@graph']

# Switches controlling what the main loop is allowed to change:
add_pc = False   # attach processed files to experiments / sets
add_rel = False  # release files (and, together with add_pc, tag the set as completed)
add_wfr = True   # start workflow runs that are found to be missing

# Progress bookkeeping updated by the main loop.
counter = 0
completed = 0
completed_acc = []

all_sets = len(run_sets)
print('%d total number of sets' % all_sets)
# Keep only the sets that are not yet tagged with this pipeline version.
run_sets = [
    candidate for candidate in run_sets
    if "HiC_Pipeline_0.2.5" not in candidate.get('completed_processes', [])
]
print('%d sets completed' % (all_sets - len(run_sets)))

def _wfr_missing(result):
    """Return True when a get_wfr_out() result should be treated as "no usable output".

    This script treats an empty value, or a string starting with 'no', as missing.
    Emptiness is tested first so that a None result cannot raise on .startswith().
    """
    return not result or result.startswith('no')


# Drive every remaining set through the three pipeline stages:
#   part 1: bwa-mem on each fastq pair           -> one bam per pair
#   part 2: hi-c-processing-bam per experiment   -> combined bam + pairs
#   part 3: hi-c-processing-pairs-nore per set   -> merged pairs, hic, mcool, normvector
# A stage is only evaluated once the previous one is complete; missing runs
# are started when add_wfr is set.
# All status lines are printed as one pre-formatted string so the output is the
# same whether print is a statement (Python 2) or a function.
for a_set in run_sets:
    counter += 1

    print('')
    fastqpairs, organism, enzyme, bwa_ref, chrsize_ref, enz_ref, f_size = find_pairs(a_set, exclude_miseq, ff, tibanna)

    if not bwa_ref or not chrsize_ref:
        print('%s %s %s %s skipping set with not chrsize/bwa index'
              % (counter, a_set['accession'], organism, enzyme))
        continue

    # NOTE(review): 15 is an unexplained threshold on the size reported by
    # find_pairs(); units are not visible here -- confirm.
    if f_size < 15:
        print('%s %s skipping small file size %s' % (counter, a_set['accession'], f_size))
        continue

    print('%s %s' % (counter, a_set['accession']))
    print('%s %s' % (enzyme, organism))
    # part3 stays 'done' only if every experiment in the set clears parts 1 and 2.
    part3 = 'done'
    list_release = []
    set_pairs = []
    # cycle through the experiments
    for exp in fastqpairs:
        if not fastqpairs.get(exp):
            # Fixed: print(exp, '...') printed a tuple repr under the Python 2 print statement.
            print('%s does not have any fastq pairs' % (exp,))
            continue

        # ---- Part 1: every fastq pair needs a finished bwa-mem run ----
        exp_bams = []
        part1 = 'done'
        part2 = 'done'
        for pair in fastqpairs[exp]:
            bam1 = get_wfr_out(pair[0], 'bwa-mem 0.2.5', 'bam', ff)
            bam2 = get_wfr_out(pair[1], 'bwa-mem 0.2.5', 'bam', ff)
            # No usable output, or the two mates do not resolve to the same bam.
            if _wfr_missing(bam1) or bam1 != bam2:
                part1 = 'not ready'
                if add_wfr:
                    # Fixed: this tested `bwa_index`, a name not defined in this
                    # notebook; the index returned by find_pairs() is `bwa_ref`.
                    if not bwa_ref:
                        print('not yet usable %s' % (organism,))
                        continue
                    inp_f = {'fastq1': pair[0], 'fastq2': pair[1], 'bwa_index': bwa_ref}
                    name_tag = pair[0].split('/')[2] + '_' + pair[1].split('/')[2]
                    run_missing_wfr(wf_dict[0], inp_f, name_tag, ff, env, tibanna)
            elif bam1 == 'running':
                part1 = 'still running'
                print('part1 still running')
            # if successful
            else:
                exp_bams.append(bam1)
                list_release.append(bam1)

        # stop progress to part2
        # Fixed: compare by value; `is not 'done'` only worked through string interning.
        if part1 != 'done':
            print('%s has missing Part1 runs' % (exp,))
            part2 = 'not ready'
            part3 = 'not ready'
            continue
        print('%s part1 complete' % (exp,))

        # ---- Part 2: all bams of the experiment go through one hi-c-processing-bam run ----
        exp_com_bam = []
        exp_pairs = []
        for bam in exp_bams:
            com_bam = get_wfr_out(bam, 'hi-c-processing-bam 0.2.5', 'bam', ff)
            pairs = get_wfr_out(bam, 'hi-c-processing-bam 0.2.5', 'pairs', ff)
            if _wfr_missing(pairs):
                part2 = 'not ready'
                part3 = 'not ready'
            elif pairs == 'running':
                part2 = 'still running'
                part3 = 'not ready'
            else:
                exp_com_bam.append(com_bam)
                exp_pairs.append(pairs)

        # if still running, skip to next experiment
        if part2 == 'still running':
            print('part2 still running')
            continue

        # make sure all bams went through the same wfr and produced the same files
        if part2 != 'done' or len(set(exp_com_bam)) != 1 or len(set(exp_pairs)) != 1:
            print('%s Part2 did not complete' % (exp,))
            part3 = 'not ready'

            if add_wfr:
                if not chrsize_ref:
                    print('not yet usable %s' % (organism,))
                    continue
                inp_f = {'input_bams': exp_bams, 'chromsize': chrsize_ref}
                run_missing_wfr(wf_dict[1], inp_f, exp, ff, env, tibanna)
            continue

        # add bam and pairs to exp proc file
        list_release.extend([exp_com_bam[0], exp_pairs[0]])
        if add_pc:
            add_processed_files(exp, [exp_com_bam[0], exp_pairs[0]], ff)

        print('%s part2 complete' % (exp,))
        set_pairs.append(exp_pairs[0])

    if part3 != 'done':
        print('Part3 not ready')
        continue

    if not set_pairs:
        print('no pairs can be produced from this set')
        continue

    # ---- Part 3: all experiment pairs files go through one hi-c-processing-pairs-nore run ----
    merged_pairs = []
    for set_pair in set_pairs:
        merged_pair = get_wfr_out(set_pair, 'hi-c-processing-pairs-nore 0.2.5', 'pairs', ff)
        hic = get_wfr_out(set_pair, 'hi-c-processing-pairs-nore 0.2.5', 'hic', ff)
        mcool = get_wfr_out(set_pair, 'hi-c-processing-pairs-nore 0.2.5', 'mcool', ff)
        normvec = get_wfr_out(set_pair, 'hi-c-processing-pairs-nore 0.2.5', 'normvector_juicerformat', ff)
        if _wfr_missing(merged_pair):
            part3 = 'not ready'
            break
        elif merged_pair == 'running':
            part3 = 'still running'
            break
        else:
            merged_pairs.append(merged_pair)

    # if part3 is still running report it, and skip the rest of the script
    if part3 == 'still running':
        print('part3 %s' % (part3,))
        continue

    if part3 != 'done' or len(set(merged_pairs)) != 1:
        print('%s is missing Part3' % (a_set['accession'],))

        # if it is not run, and add_wfr is true, go for it, then skip the rest of the script
        if add_wfr:
            if not chrsize_ref:
                print('not yet usable %s' % (organism,))
                continue

            inp_f = {'input_pairs': set_pairs, 'chromsizes': chrsize_ref}
            run_missing_wfr(wf_dict[2], inp_f, a_set['accession'], ff, env, tibanna)
        continue

    #####
    # add completed flag to the set
    # NOTE(review): the flag is patched before the processed files are attached
    # and released below; if one of those calls fails the set is already tagged.
    if add_pc and add_rel:
        ff_utils.patch_metadata({"completed_processes": ["HiC_Pipeline_0.2.5"]}, obj_id=a_set['accession'], connection=ff)

    # add processed files to set
    # merged_pair / hic / mcool / normvec hold the values from the last loop
    # iteration; every entry of set_pairs resolved to the same merged pairs file
    # at this point, so presumably they all belong to that single run.
    list_release.extend([merged_pair, hic, mcool, normvec])
    if add_pc:
        add_processed_files(a_set['accession'], [merged_pair, hic, mcool, normvec], ff)

    # release files and wfrs
    if add_rel:
        release_files(a_set['accession'], list(set(list_release)), ff)

    completed += 1
    completed_acc.append(a_set['accession'])
    print('%s part3 complete' % (a_set['accession'],))


# Summary: number of sets that cleared all three parts, and their accessions.
print(completed)
print(completed_acc)

14 total number of sets
0 sets completed

1 4DNESRWDFFF8
DNaseI mouse
4DNEXB8HHA81 part1 complete
4DNEXB8HHA81 Part2 did not complete
about to start run hi-c-processing-bam_4DNEXB8HHA81
response from aws was: 
 {u'startDate': datetime.datetime(2018, 5, 14, 10, 41, 48, 696000, tzinfo=tzlocal()), 'ResponseMetadata': {'RetryAttempts': 0, 'HTTPStatusCode': 200, 'RequestId': 'eeb467a6-5784-11e8-b9df-679104f41329', 'HTTPHeaders': {'x-amzn-requestid': 'eeb467a6-5784-11e8-b9df-679104f41329', 'content-length': '141', 'content-type': 'application/x-amz-json-1.0'}}, u'executionArn': u'arn:aws:states:us-east-1:643366669028:execution:tibanna_pony:hi-c-processing-bam_4DNEXB8HHA81'}
url to view status:
https://console.aws.amazon.com/states/home?region=us-east-1#/executions/details/arn:aws:states:us-east-1:643366669028:execution:tibanna_pony:hi-c-processing-bam_4DNEXB8HHA81
4DNEX8EW839G part1 complete
4DNEX8EW839G Part2 did not complete
about to start run hi-c-processing-bam_4DNEX8EW839G
response from


10 4DNES6G787KF
DNaseI human
4DNEXRGPHY4P part1 complete
4DNEXRGPHY4P part2 complete
4DNEXGZH6KEK part1 complete
4DNEXGZH6KEK part2 complete
4DNES6G787KF is missing Part3
about to start run hi-c-processing-pairs-nore_4DNES6G787KF9779c9b8-533f-4b13-bac7-946c43012403
response from aws was: 
 {u'startDate': datetime.datetime(2018, 5, 14, 10, 48, 42, 910000, tzinfo=tzlocal()), 'ResponseMetadata': {'RetryAttempts': 0, 'HTTPStatusCode': 200, 'RequestId': 'e596eb16-5785-11e8-8f3d-e34eacaaf50c', 'HTTPHeaders': {'x-amzn-requestid': 'e596eb16-5785-11e8-8f3d-e34eacaaf50c', 'content-length': '183', 'content-type': 'application/x-amz-json-1.0'}}, u'executionArn': u'arn:aws:states:us-east-1:643366669028:execution:tibanna_pony:hi-c-processing-pairs-nore_4DNES6G787KF9779c9b8-533f-4b13-bac7-946c43012403'}
url to view status:
https://console.aws.amazon.com/states/home?region=us-east-1#/executions/details/arn:aws:states:us-east-1:643366669028:execution:tibanna_pony:hi-c-processing-pairs-nore_4DNES6G787K