In [1]:
from core.utils import Tibanna
from core import ff_utils
from core.utils import run_workflow

# Name of the environment handed to Tibanna; every lookup below goes through it.
env = 'fourfront-webprod'
tibanna = Tibanna(env=env)
# Metadata connection built from the keys Tibanna exposes; passed as `connection=ff`
# to every ff_utils.get_metadata call in this notebook.
ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
# Bucket names as reported by Tibanna's s3 helper (raw uploads vs. workflow outputs).
raw_bucket = tibanna.s3.raw_file_bucket
out_bucket = tibanna.s3.outfile_bucket
# When True, find_pairs skips files whose 'instrument' is 'Illumina MiSeq'.
exclude_miseq = True


def find_pairs(my_rep_set):
    """Find pairs and make sure they are fine my qc.
    """
    report = {}
    rep_resp = my_rep_set['experiments_in_set']
    enzymes = []
    organisms = []
    for exp in rep_resp:
        exp_resp = ff_utils.get_metadata(exp, connection=ff)
        report[exp_resp['accession']] = []
        if not organisms:
            biosample = ff_utils.get_metadata(exp_resp['biosample'], connection=ff, frame='embedded')      
            organisms = list(set([bs['individual']['organism']['display_title'] for bs in biosample['biosource']]))
            if len(organisms) != 1:
                print 'multiple organisms in set', my_rep_set['accession']
                break
        exp_files = exp_resp['files']
        enzyme = exp_resp.get('digestion_enzyme')
        enzymes.append(enzyme)
        for fastq_file in exp_files:
            file_resp = ff_utils.get_metadata(fastq_file, connection=ff)  
            # skip pair no 2
            if file_resp.get('paired_end')=='2':
                continue 
            # exclude miseq
            if exclude_miseq:
                if file_resp.get('instrument') == 'Illumina MiSeq':
                    print 'skipping miseq files', exp
                    continue
                
            #Some checks before running
            #check if status is deleted
            if file_resp['status'] == 'deleted':
                print 'deleted file', file_resp['accession'], 'in', my_rep_set['accession']
                continue
            #if no uploaded file in the file item report and skip
            if not file_resp.get('filename'):
                print file_resp['accession'], "does not have a file"
                continue
            # check if file is in s3
            head_info = tibanna.s3.does_key_exist(file_resp['upload_key'], tibanna.s3.raw_file_bucket)
            if not head_info:
                print file_resp['accession'], "does not have a file in S3"
                continue
            # check that file has a pair
            f1 = file_resp['@id']
            f2 = ''
            relations = file_resp.get('related_files')
            for relation in relations:
                if relation['relationship_type'] == 'paired with':
                    f2 = relation['file']
            if not f2:
                print f1, 'does not have a pair'
                continue
            report[exp_resp['accession']].append((f1, f2))
            
    # get the organism
    if len(list(set(organisms))) == 1:
        organism = organisms[0]
    else:
        organism = None
        print 'problematic organism', set(organisms)
        
    # get the enzyme
    if len(list(set(enzymes))) == 1:
        enz = enzymes[0].split('/')[2]
    else:
        enz = None
        print 'problematic enzyme', set(enzymes)
    return report, organism, enz


def get_wfr_out(file_id, wfr_name, file_format):
    """Return the output of a workflow run that used ``file_id`` as input.

    The file's 'workflow_run_inputs' are scanned for runs whose
    'display_title' starts with ``wfr_name``; when several match, the last one
    listed wins. If that run is 'complete', the 'value' of its first output
    with the requested format is returned.

    Args:
        file_id: identifier of the input file, as accepted by
            ``ff_utils.get_metadata``.
        wfr_name: prefix of the workflow run title, e.g. 'bwa-mem 0.2.5'.
        file_format: 'format' of the wanted output file, e.g. 'bam'.

    Returns:
        The 'value' of the matching output file, or one of the strings
        "no workflow in file", "no completed run", "no file found" when the
        output cannot be resolved.
    """
    emb_file = ff_utils.get_metadata(file_id, connection=ff, frame='embedded')
    workflows = emb_file.get('workflow_run_inputs')
    if not workflows:
        return "no workflow in file"

    wfr = {}
    run_status = 'did not run'
    for a_wfr in workflows:
        wfr_resp = ff_utils.get_metadata(a_wfr['uuid'], connection=ff)
        if wfr_resp['display_title'].startswith(wfr_name):
            wfr = wfr_resp
            run_status = wfr_resp['run_status']

    if run_status != 'complete':
        return "no completed run"

    # A completed run may lack 'output_files', or may have no output of the
    # requested format; both cases must end in "no file found" rather than a
    # TypeError / IndexError.
    outputs = wfr.get('output_files') or []
    matches = [out.get('value') for out in outputs if out.get('format') == file_format]
    if matches and matches[0]:
        return matches[0]
    return "no file found"


In [4]:
import time
from datetime import datetime

# for a given experiment set and some parameters like instrument,
# print the set of files and their part A Hi-C workflow status
# if there are any that are still running, report the number of running cases
# if there are file pairs that don't have a corresponding part A, report them separately

# One entry per pipeline step: workflow name, workflow uuid and the
# parameters recorded for it. Despite the name this is a list of dicts.
wf_dict = [
    {
        'wf_name': 'bwa-mem',
        'wf_uuid': '3feedadc-50f9-4bb4-919b-09a8b731d0cc',
        'parameters': {"nThreads": 16},
    },
    {
        'wf_name': 'hi-c-processing-bam',
        'wf_uuid': '023bfb3e-9a8b-42b9-a9d4-216079526f68',
        'parameters': {
            "nthreads_merge": 16,
            "nthreads_parse_sort": 16,
        },
    },
    {
        'wf_name': 'hi-c-processing-pairs',
        'wf_uuid': 'c9e0e6f7-b0ed-4a42-9466-cadc2dd84df0',
        'parameters': {
            "nthreads": 1,
            "maxmem": "32g",
        },
    },
]

# Search URL for replicate sets, filtered on the experiment types below.
# The type names are stored already percent-encoded so they can be dropped
# straight into the query string.
exp_types = ['in%20situ%20Hi-C', 'dilution%20Hi-C']
set_url = '/search/?{}&type=ExperimentSetReplicate'.format(
    '&'.join('experiments_in_set.experiment_type=' + exp_type for exp_type in exp_types)
)
# The search response keeps its hits under '@graph'.
run_sets = ff_utils.get_metadata(set_url, connection=ff)['@graph']

add_pc = False
add_rel = False
add_wfr = False

#test_set = '4DNES2R6PUEK'
#test_set = '4DNESZ2PVZWR'
#run_sets = [ff_utils.get_metadata(test_set , connection=ff)]
counter = 0
completed = 0
completed_acc = []
print len(run_sets)
for a_set in run_sets: 
    counter += 1
     
    if a_set.get('completed_processes') == ["HiC_Pipeline_0.2.5"]:
        print counter, a_set['accession'], 'complete'    
    else:
        continue
    fastqpairs, organism, enzyme = find_pairs(a_set)    
    for exp in fastqpairs.keys():
        if not fastqpairs.get(exp):
            print(exp, 'does not have any fastq pairs')
            continue
        # Check Part 1 and See if all are okay
        for pair in fastqpairs[exp]:
            #p1
            bam1 = get_wfr_out(pair[0], 'bwa-mem 0.2.5', 'bam')
            #p2
            com_bam = get_wfr_out(bam1, 'hi-c-processing-bam 0.2.5', 'bam')
            pairs = get_wfr_out(bam1, 'hi-c-processing-bam 0.2.5', 'pairs')
            #p3
            mcool = get_wfr_out(pairs, 'hi-c-processing-pairs 0.2.5', 'mcool')
            # merged_pair = get_wfr_out(pairs, 'hi-c-processing-pairs 0.2.5', 'pairs')
            # hic = get_wfr_out(pairs, 'hi-c-processing-pairs 0.2.5', 'hic')
            # normvec = get_wfr_out(pairs, 'hi-c-processing-pairs 0.2.5', 'normvector_juicerformat')
            print(mcool)
            file_resp = ff_utils.get_metadata(mcool, connection=ff)
            head_info = tibanna.s3.does_key_exist(file_resp['upload_key'], out_bucket)
            if not head_info:
                print file_resp['accession'], "does not have a file in S3"
            else:
                print 'fine'
            break
        break



89
9 4DNESRJ8KV4Q complete
/files-processed/4DNFI7JNCNFB/
fine
11 4DNES78Y8Y5K complete
/files-processed/4DNFISWCOKN2/
fine
12 4DNESB6MNCFE complete
/files-processed/4DNFIQJQY7PW/
fine
14 4DNES8J78WV2 complete
/files-processed/4DNFIPNC6K5B/
fine
15 4DNESAPF27TG complete
/files-processed/4DNFIK6HMOII/
fine
23 4DNES9L4AK6Q complete
/files-processed/4DNFIMDOXUT8/
fine
24 4DNES2M5JIGV complete
/files-processed/4DNFI6HDY7WZ/
fine
26 4DNESLLTENG9 complete
/files-processed/4DNFI3UNF9VB/
fine
28 4DNES98CI6GV complete
/files-processed/4DNFIJYULXT7/
fine
29 4DNESC5J3EIX complete
/files-processed/4DNFI2T49EAG/
fine
30 4DNES21NPLZU complete
/files-processed/4DNFIH77QBT5/
fine
31 4DNESYTIFTEE complete
/files-processed/4DNFIQI8SFNE/
fine
32 4DNESIG4ELE4 complete
/files-processed/4DNFIIH3SM5N/
fine
34 4DNESNHN919R complete
/files-processed/4DNFILC1IPOE/
fine
35 4DNES8ZUV5CQ complete
/files-processed/4DNFI5K8L94P/
fine
36 4DNESCCP4KTY complete
/files-processed/4DNFIFD991IA/
fine
37 4DNEST9AVULS comple