In [None]:
from tasks import run_md5
from tasks import run_fastqc
from invoke import run
import time
from datetime import datetime
from core.utils import Tibanna
from core import ff_utils

# For a given experiment set and some parameters, such as instrument,
# print the set of files and their part A Hi-C workflow status.
# If any of them are running, report the number of running cases.
# If there are file pairs that don't have a corresponding part A, report them separately.


# Name prefixes used to recognise md5 and fastqc workflow runs by their title.
wf_md5 = "md5"
wf_fastqc = "fastqc-0-11-4-1/1"

# Environment name; also passed to the invoke commands built in summarize_file.
env = 'fourfront-webdev'
# Tibanna helper for that environment; supplies ff_keys here and the s3 client used in later cells.
tibanna = Tibanna(env=env)
# Asked once up front: which missing steps may be launched (md5 / qc / all / none).
run_md_qc = raw_input("Do you wanna run md5 and/or fastqc if missing? (md5/qc/all/none)")
# Connection handed to every ff_utils.get_metadata call below.
ff = ff_utils.fdn_connection(key=tibanna.ff_keys)

def get_wfr_report(wfr_data):
    """Condense a workflow run item into the fields used for reporting.

    Args:
        wfr_data: workflow run item (dict) carrying 'uuid', 'run_status',
            'title' and 'date_created'.

    Returns:
        dict with keys 'wfr_uuid', 'wfr_status', 'wfr_name' (the title text
        before ' run '), 'wfr_date' (naive datetime parsed from
        'date_created') and 'run_time' (hours elapsed since 'wfr_date').

    Raises:
        KeyError: if one of the required keys is missing.
        ValueError: if 'date_created' does not match the expected format.
    """
    # The format only accepts a literal '+00:00' offset, so the parsed value is UTC.
    wfr_time = datetime.strptime(wfr_data['date_created'],
                                 '%Y-%m-%dT%H:%M:%S.%f+00:00')
    # Measure against UTC "now"; local time would skew the result by the UTC offset.
    run_hours = (datetime.utcnow() - wfr_time).total_seconds() / 3600
    return {'wfr_uuid': wfr_data['uuid'],
            'wfr_status': wfr_data['run_status'],
            'wfr_name': wfr_data['title'].split(' run ')[0],
            'wfr_date': wfr_time,
            'run_time': run_hours}

def summarize_file(fastq_file_id):
    file_resp = ff_utils.get_metadata(fastq_file_id, connection=ff, frame = 'embedded')
    qc = False
    file_id = file_resp['accession']
    sequencer = file_resp.get('instrument')
    relations = file_resp.get('related_files')
    status = file_resp.get('status')
    workflows = file_resp.get('workflow_run_inputs')
    print workflows
    first_alias = file_resp.get('aliases',[None])[0]
    pair_no = file_resp.get('paired_end')
    # get related file
    paired_file = ''
    for relation in relations:
        if relation['relationship_type'] == 'paired with':
            paired_file = relation['file']['accession']
    # is there a qc?
    if file_resp.get('quality_metric'):
        qc = True
    # Check workflows for qc fastqc workflow partA
    last_part_A = ''
    last_part_A_status = 'did_not_run'
    md5_status = 'did_not_run'
    fastqc_status = 'did_not_run'
    
    # simplfy workflow info and reorder
    wfr_report = []
    if workflows:
        for wfr_resp in workflows:
            wfr_report.append(get_wfr_report(wfr_resp))
        wfr_report = sorted(wfr_report, key=lambda k: (k['wfr_date'], k['wfr_name']))   
    if wfr_report:
        for report in wfr_report:
            if report['wfr_name'].startswith(wf_md5):
                md5_status = wfr_resp.get('run_status')     
            elif wfr_name.startswith(wf_fastqc):
                fastqc_status = wfr_resp.get('run_status')     
             
    # Check for md5 and fastqc, and if not complete, run or report it. 
    # check if md5 step is completed properly

    if status != "uploaded" or md5_status != 'complete':
        # if not, shall we run it?
        if run_md_qc in ['md5', 'all']:
            print 'md5 running for', file_resp['accession']
            code_md5= "invoke run_md5 " + env + " " + file_resp['accession'] + " " + file_resp['uuid']
            run(code_md5)
            print ''
            time.sleep(10)
        # user does not want it to be run, so just report
        else:
            print 'md5 run missing for', file_resp['accession']
    # check fastqc if md5 is fine
    else:
        if not qc or fastqc_status != 'complete':
            # if not, shall we run it?
            if run_md_qc in ['qc', 'all']:
                print 'fastqc running for', file_resp['accession']
                code_qc= "invoke run_fastqc " + env + " " + file_resp['accession'] + " " + file_resp['uuid']
                run(code_qc)
                print ''    
                time.sleep(10)
            # user does not want it to be run, so just report
            else:
                print 'fastqc run missing for', file_resp['accession'], fastqc_status
                print 


In [None]:
delet = 0
no_f = 0
no_s3 = 0
miseq = 0
all_files = ff_utils.get_metadata('files-fastq', connection=ff)['@graph']

for fastq_file in all_files:
    if fastq_file['status'] in ['deleted']:
        delet += 1
        continue
    #if no uploaded file in the file item report and skip
    if not fastq_file.get('filename'):
        print fastq_file['accession'], "does not have a file"
        no_f += 1
        continue
    # check if file is in s3
    if not tibanna.s3.does_key_exist(fastq_file['upload_key'], tibanna.s3.raw_file_bucket):
        print fastq_file['accession'], "does not have a file in S3"
        no_s3 += 1
        continue 
    # skip miseq for fastqc
#     if fastq_file.get('instrument') == "Illumina MiSeq":
#         miseq += 1
#         continue
    file_info = summarize_file(fastq_file['uuid'])
    break
    
print str(delet), 'deleted files skipped'
print str(no_f), 'files with no uploads skipped'
print str(no_s3), 'files with missing s3 file skipped'
print str(miseq), 'files skipped because they are miseq'

In [None]:
all_sets = ['dciclab:rao_rep07',
'dciclab:rao_rep02',
'dciclab:rao_rep12',
'dciclab:rao_rep13',
'dcic:Selvaraj_gm12878_hic',
'dcic:Jin_imr90_hic']

for my_rep_set in all_setsXXXX:
    print my_rep_set
    rep_resp = ff_utils.get_metadata(my_rep_set, connection=ff)['experiments_in_set']   
    for exp in rep_resp:
        exp_resp = ff_utils.get_metadata(exp, connection=ff)
        exp_files = exp_resp['files']
        for fastq_file in exp_files:
            file_resp = ff_utils.get_metadata(fastq_file, connection=ff, frame = 'embedded')  
            #if no uploaded file in the file item report and skip
            if not file_resp.get('filename'):
                print file_resp['accession'], "does not have a file"
                continue
            # check if file is in s3
            if not tibanna.s3.does_key_exist(file_resp['upload_key'], tibanna.s3.raw_file_bucket):
                print file_resp['accession'], "does not have a file in S3"
                continue
            file_info = summarize_file(file_resp)
    print
