In [3]:
from core.utils import Tibanna
from core import ff_utils

#format for input json in hic-partA
def make_input_file_json(obj_id, arg_name, tibanna):
    '''
    {
      "bucket_name": "%s",
      "object_key": "%s",
      "uuid" : "%s",
      "workflow_argument_name": "%s"
    }
    '''
    ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
    metadata = ff_utils.get_metadata(obj_id, connection=ff)
    data = {}
    
    # just make sure the file is on s3, otherwise bail
    print("looking for upload key %s, on bucket %s" % 
          (metadata['upload_key'],
           tibanna.s3.outfile_bucket))
    if tibanna.s3.does_key_exist(metadata['upload_key']):
        data = {'bucket_name' : tibanna.s3.outfile_bucket,
                'object_key' : metadata['upload_key'].split('/')[1],
                'uuid' : metadata['uuid'],
                'workflow_argument_name': arg_name
                }
    return data
    

def make_hic1_json(input_files, env, output_bucket, accession):
    input_json = {'input_files': input_files,
                  'output_bucket': output_bucket,
                  'workflow_uuid': "3feedadc-50f9-4bb4-919b-09a8b731d0cc",
                  "app_name": "bwa-mem",
                  "parameters": {
                      "nThreads": 16
                      },
                  "config": {
                        "ebs_type": "io1",
                        "json_bucket": "4dn-aws-pipeline-run-json",
                        "ebs_iops": 500,
                        "shutdown_min": 30,
                        "s3_access_arn": "arn:aws:iam::643366669028:instance-profile/S3_access",
                        "ami_id": "ami-cfb14bb5",
                        "copy_to_s3": True,
                        "launch_instance": True,
                        "password": "dragonfly",
                        "log_bucket": "tibanna-output",
                        "script_url": "https://raw.githubusercontent.com/4dn-dcic/tibanna/master/awsf/",
                        "key_name": ""
                    },
                  "_tibanna": {"env": env, "run_type": "bwa-mem",
                               "run_id": accession},
                  "tag": "0.2.5"
                  }
    return input_json
    

In [6]:
from tasks import run_md5
from tasks import run_fastqc
from invoke import run
import time
from datetime import datetime

# for a given experiment set and some parameters like instrument
# print set of files and their partA hic workflow status
# if there are one that are running report the number of running cases
# if there are file pairs that don't have a corresponding part A, report them separately



exclude_miseq = True

wf_md5 = "md5"
wf_fastqc = "fastqc-0-11-4-1"
wf_partA = "bwa-mem 0.2.5"

env = 'fourfront-webprod'
tibanna = Tibanna(env=env)

run_md_qc = raw_input("Do you wanna run md5 and/or fastqc if missing? (md5/qc/all/none)")

# status for completion
# there are two flavors of complete signals, before it was output_file_transfer_finished, not it is complete.
# old completed wf runs have former one.
status_done = ['complete', 'output_file_transfer_finished']
status_good = ['uploaded', 'released', 'released to project']
ff = ff_utils.fdn_connection(key=tibanna.ff_keys)

################
##ADD TO WORKFLOW
# wfr_time = datetime.strptime(wfr_data['date_created'],'%Y-%m-%dT%H:%M:%S.%f+00:00')
# run_hours = int((datetime.now()-wfr_time).total_seconds()/3600)
################

def summarize_file(file_resp):
    qc = False
    file_id = file_resp['accession']
    sequencer = file_resp.get('instrument')
    relations = file_resp.get('related_files')
    status = file_resp.get('status')
    workflows = file_resp.get('workflow_run_inputs')
    first_alias = file_resp.get('aliases',[None])[0]
    pair_no = file_resp.get('paired_end')
    # get related file
    paired_file = ''
    for relation in relations:
        if relation['relationship_type'] == 'paired with':
            paired_file = relation['file']['accession']
    
    # is there a qc?
    if file_resp.get('quality_metric'):
        qc = True
        
    # Check workflows for qc fastqc workflow partA
    last_part_A = ''
    last_part_A_status = 'did_not_run'
    md5_status = 'did_not_run'
    fastqc_status = 'did_not_run'
    # Assumes workflow_runs come in time ordered list, and grabs the last ones for each wf run
    if workflows:
        for a_wfr in workflows:
            wfr_resp = ff_utils.get_metadata(a_wfr['uuid'], connection=ff)  
            wfr_name = wfr_resp['display_title']
            if wfr_name.startswith(wf_md5):
                md5_status = wfr_resp.get('run_status')     
            elif wfr_name.startswith(wf_fastqc):
                fastqc_status = wfr_resp.get('run_status')     
            elif wfr_name.startswith(wf_partA):
                last_part_A=wfr_resp['uuid']
                last_part_A_status = wfr_resp.get('run_status')  
                
    # Check for md5 and fastqc, and if not complete, run or report it. 
    # if exclude miseq is on, do this only if sequencer is not miseq
    if not exclude_miseq or sequencer != "Illumina MiSeq":
        # check if md5 step is completed properly
        
        if status not in status_good or md5_status not in status_done:
            # if not, shall we run it?
            if run_md_qc in ['md5', 'all']:
                print 'md5 running for', file_resp['accession']
                code_md5= "invoke run_md5 " + env + " " + file_resp['accession'] + " " + file_resp['uuid']
                run(code_md5)
                print ''
                time.sleep(10)
            # user does not want it to be run, so just report
            else:
                print 'md5 run missing for', file_resp['accession']
        # check fastqc if md5 is fine
        else:

            if not qc or fastqc_status not in status_done:
                # if not, shall we run it?
                if run_md_qc in ['qc', 'all']:
                    print 'fastqc running for', file_resp['accession']
                    code_qc= "invoke run_fastqc " + env + " " + file_resp['accession'] + " " + file_resp['uuid']
                    run(code_qc)
                    print ''    
                    time.sleep(10)
                # user does not want it to be run, so just report
                else:
                    print 'fastqc run missing for', file_resp['accession'], fastqc_status
                    print 
   
    # return a small report
    return {'file': file_id,
            'alias': first_alias,
            'sequencer': sequencer,
            'pair_no': pair_no,
            'paired_file': paired_file,
            'file_status': status,
            'qc': qc,
            'md5_status': md5_status,
            'fastqc_status': fastqc_status,
            'last_part_A': last_part_A,
            'last_part_A_status': last_part_A_status
           }


def get_file_list(my_rep_set):

    rep_resp = ff_utils.get_metadata(my_rep_set, connection=ff)['experiments_in_set']
    report = []
    enzymes = []
    organisms = []
    for exp in rep_resp:
        exp_resp = ff_utils.get_metadata(exp, connection=ff)

        if not organisms:
            biosample = ff_utils.get_metadata(exp_resp['biosample'], connection=ff, frame='embedded')      
            organisms = list(set([bs['individual']['organism']['display_title'] for bs in  biosample['biosource']]))
            print organisms

        exp_files = exp_resp['files']
        enzyme = exp_resp.get('digestion_enzyme')
        enzymes.append(enzyme)
        for fastq_file in exp_files:
            file_resp = ff_utils.get_metadata(fastq_file, connection=ff, frame='embedded')  
            #Some checks before running
            #check if status is deleted
            if file_resp['status'] == 'deleted':
                continue
            #if no uploaded file in the file item report and skip
            if not file_resp.get('filename'):
                print file_resp['accession'], "does not have a file"
                continue
            # check if file is in s3
            head_info = tibanna.s3.does_key_exist(file_resp['upload_key'], tibanna.s3.raw_file_bucket)
            if not head_info:
                print file_resp['accession'], "does not have a file in S3"
                continue

            # skip pair no 2
            if file_resp.get('paired_end')=='2':
                continue 

            # if the file size is more than 42 GB do not run
            file_size = head_info['ContentLength']
            if file_size > 45097156608:
                print file_resp['aliases'], 'larger than 42GB, skipping, please use another script'
                continue
            filesize = round(head_info['ContentLength']/1073741824.0,1)

            file_info = summarize_file(file_resp)

            # check for miseq
            if exclude_miseq:
                if file_info['sequencer'] == 'Illumina MiSeq':
                    continue
            paired_file = file_info['paired_file']
            pair_file_resp = ff_utils.get_metadata(paired_file, connection=ff, frame='embedded')
            pair_file_info = summarize_file(pair_file_resp)
            print fastq_file,paired_file, filesize

            # check consistency of paired file info
            # status differences gives error but there are multiple statuses that indicate complete
            # TODO fix it
            pairs_inconsistent = ""
            check_items = [ i for i in file_info.keys() if i not in ['file', 'paired_file', 'pair_no', 'alias']]
            for check_item in check_items:
                try:
                    assert file_info[check_item] == pair_file_info[check_item]
                except AssertionError:
                    print check_item, "not the same between pair", fastq_file, 'and', paired_file
                    pairs_inconsistent += check_item + ', '
            wf_check = ''
            # check if md5 and qc are okay
            for info in [file_info, pair_file_info]:
                if (info['md5_status'] in status_done and info['file_status'] in status_good and
                    info['fastqc_status'] in status_done and info['qc'] == True):
                    wf_check += '+'
            rep = {"consistency": pairs_inconsistent,  
                   "file1": file_info['alias'],
                   "file2": pair_file_info['alias'],
                   "const_check": pairs_inconsistent,
                   "wf_check": wf_check, 
                   "partA_wf": file_info['last_part_A'], 
                   "partA_status": file_info['last_part_A_status']
                   }
            # status differences gives error but there are multiple statuses that indicate complete
            # TODO fix it
            report.append(rep)
            if rep.get('const_check'):
                print rep['const_check']
    
    return my_rep_set, organisms, report


Do you wanna run md5 and/or fastqc if missing? (md5/qc/all/none)


In [7]:
all_reports = []
rest = ['4DNESCQ7ZD21',
'4DNESUZ3Y5GY',
'4DNESIV168N6',
'4DNESR9S8R38',
'4DNESO1IVQSC',
'4DNESNPHX8LY',
'4DNESOOCOBBA']
for a_set in rest:
    rep = get_file_list(a_set)
    if rep:
        all_reports.append(rep)
print 'done'

[u'human']
fastqc run missing for 4DNFIFF7K2ZC did_not_run

fastqc run missing for 4DNFIEPLTDNG did_not_run

/files-fastq/4DNFIFF7K2ZC/ 4DNFIEPLTDNG 1.2
fastqc run missing for 4DNFIWB5O95D did_not_run

fastqc run missing for 4DNFIJ95LUGK did_not_run

/files-fastq/4DNFIWB5O95D/ 4DNFIJ95LUGK 24.6
fastqc run missing for 4DNFIOOZOF4F did_not_run

fastqc run missing for 4DNFIJ4JO7H2 did_not_run

/files-fastq/4DNFIOOZOF4F/ 4DNFIJ4JO7H2 0.8
fastqc run missing for 4DNFIB734M8W did_not_run

fastqc run missing for 4DNFIPG8BLCU did_not_run

/files-fastq/4DNFIB734M8W/ 4DNFIPG8BLCU 26.0
[u'human']


TypeError: 'NoneType' object is not iterable

In [4]:


pairs_completed = []
pairs_running = []
pairs_did_not_run = []
pairs_ready_to_run = []
pairs_qcmd_problem = []

all_num= 0
for name, org, report in all_reports: 
    all_num += len(report)
    
    # 1 completed pairs
    pairs_completed.extend([[name,org,i] for i in report if i['partA_status']=='complete'])
    # 2 running pairs
    pairs_running.extend([[name, org, i] for i in report if i['partA_status'] not in ['complete','did_not_run']])
    # 3 no run pairs
    pairs_did_not_run.extend([[name, org, i] for i in report if i['partA_status']=='did_not_run'])
                             
                            
# 3a no run pairs with fine qc md5
pairs_ready_to_run = [[i[0], i[1], i[2]['file1'], i[2]['file2']] for i in pairs_did_not_run if i[2]['wf_check'] == '++']
# 3b no run pairs with problematic qc md5
pairs_qcmd_problem = [[i[0], i[1], i[2]['file1'], i[2]['file2']] for i in pairs_did_not_run if i[2]['wf_check'] != '++']

# # 2 running pairs to run again
# rerun_running_pairs = [(i['file1'], i['file2']) for i in report if i['partA_status'] not in ['complete','did_not_run']]

# rerun_started_pairs = [(i['file1'], i['file2']) for i in report if i['partA_status']=='started']

print "{}/{} pairs completed partA".format(all_num, len(pairs_completed))
print "{}/{} pairs still running partA".format(all_num, len(pairs_running))

print '1) ready to run (pairs_ready_to_run)'
print len(pairs_ready_to_run)
for n,o,a,b in pairs_ready_to_run:
    print n,o,a,b
print ""

print '2) problematics ones (pairs_qcmd_problem)'
for n,o,a,b in pairs_qcmd_problem:
    print n,o,a,b
print ""


print "DONE"

30/0 pairs completed partA
30/0 pairs still running partA
1) ready to run (pairs_ready_to_run)
30
dekker:h1_microc_repset [u'human'] dekker-lab:SeqencingFile_U54-Micro-C_U54-H1ESC4DN-FA-DSG-MNase-R1-T1_U54-H1ESC4DN-FA-DSG-MNase-R1-T1_S7_L007_R1_001 dekker-lab:SeqencingFile_U54-Micro-C_U54-H1ESC4DN-FA-DSG-MNase-R1-T1_U54-H1ESC4DN-FA-DSG-MNase-R1-T1_S7_L007_R2_001
dekker:h1_microc_repset [u'human'] dekker-lab:SeqencingFile_U54-Micro-C_U54-H1ESC4DN-FA-DSG-MNase-R1-T1_U54-H1ESC4DN-FA-DSG-MNase-R1-T1_S22_L005_R1_001 dekker-lab:SeqencingFile_U54-Micro-C_U54-H1ESC4DN-FA-DSG-MNase-R1-T1_U54-H1ESC4DN-FA-DSG-MNase-R1-T1_S22_L005_R2_001
dekker:h1_microc_repset [u'human'] dekker-lab:SeqencingFile_U54-Micro-C_U54-H1ESC4DN-FA-DSG-MNase-R1-T1_U54-H1ESC4DN-FA-DSG-MNase-R1-T1_S33_L008_R1_001 dekker-lab:SeqencingFile_U54-Micro-C_U54-H1ESC4DN-FA-DSG-MNase-R1-T1_U54-H1ESC4DN-FA-DSG-MNase-R1-T1_S33_L008_R2_001
dekker:h1_microc_repset [u'human'] dekker-lab:SeqencingFile_U54-Micro-C_U54-H1ESC4DN-FA-DSG-MNase

In [5]:
from core.utils import Tibanna
from core.utils import run_workflow
import time

paired_files = pairs_ready_to_run

env = 'fourfront-webprod'
tibanna = Tibanna(env=env)
outfiles = tibanna.s3.outfile_bucket
tibanna.s3.outfile_bucket = 'elasticbeanstalk-fourfront-webprod-files'
index_h = make_input_file_json('4DNFIZQZ39L9', 'bwa_index', tibanna)
index_m = make_input_file_json('4DNFI823LSI8', 'bwa_index', tibanna)

for set_name, organisms, f1,f2 in paired_files: 

    # find the correct index
    if organisms == ['human']:
        index = index_h
    elif organisms == ['mouse']:
        #index = index_m
        continue
    else:
        continue

    fastq1 = make_input_file_json(f1, 'fastq1', tibanna)
    fastq2 = make_input_file_json(f2, 'fastq2', tibanna)

    input_files = [fastq1, fastq2, index]
    if all(input_files):
        name = fastq1['object_key'].split('.')[0] + "-" + fastq2['object_key'].split('.')[0]
        input_json = make_hic1_json(input_files, env, outfiles, name)
        # print input_json
        res = run_workflow(input_json)
    else:
        print("some files not found on s3.  Investigate this list %s" % input_files)
    time.sleep(2)
    #a = raw_input("Press Enter to continue...")

print('Done')


looking for upload key 1f53df95-4cf3-41cc-971d-81bb16c486dd/4DNFIZQZ39L9.bwaIndex.tgz, on bucket elasticbeanstalk-fourfront-webprod-files
looking for upload key 4a6d10ee-2edb-4402-a98f-0edb1d58f5e1/4DNFI823LSI8.bwaIndex.tgz, on bucket elasticbeanstalk-fourfront-webprod-files
looking for upload key 9bf9729c-2ff3-4a3b-a33c-0528326fe65b/4DNFITHCIIBX.fastq.gz, on bucket elasticbeanstalk-fourfront-webprod-files
looking for upload key 08aec878-b59a-4f26-b07a-819dd29e252f/4DNFID2ZCDGF.fastq.gz, on bucket elasticbeanstalk-fourfront-webprod-files
about to start run bwa-mem_4DNFITHCIIBX-4DNFID2ZCDGF
response from aws was: 
 {u'startDate': datetime.datetime(2018, 1, 17, 12, 24, 23, 648000, tzinfo=tzlocal()), 'ResponseMetadata': {'RetryAttempts': 0, 'HTTPStatusCode': 200, 'RequestId': '42c42746-fbab-11e7-a82d-8186db427310', 'HTTPHeaders': {'x-amzn-requestid': '42c42746-fbab-11e7-a82d-8186db427310', 'content-length': '148', 'content-type': 'application/x-amz-json-1.0'}}, u'executionArn': u'arn:aws:

looking for upload key 84b4aa07-d088-4d7a-b14d-3ff9bea5d039/4DNFIUO8C231.fastq.gz, on bucket elasticbeanstalk-fourfront-webprod-files
looking for upload key c14f911b-0a0e-4c7b-b66c-63670af3da1e/4DNFI4J6LQYN.fastq.gz, on bucket elasticbeanstalk-fourfront-webprod-files
about to start run bwa-mem_4DNFIUO8C231-4DNFI4J6LQYN
response from aws was: 
 {u'startDate': datetime.datetime(2018, 1, 17, 12, 25, 31, 969000, tzinfo=tzlocal()), 'ResponseMetadata': {'RetryAttempts': 0, 'HTTPStatusCode': 200, 'RequestId': '6b7d90c7-fbab-11e7-b40b-1bdeecf79fe7', 'HTTPHeaders': {'x-amzn-requestid': '6b7d90c7-fbab-11e7-b40b-1bdeecf79fe7', 'content-length': '148', 'content-type': 'application/x-amz-json-1.0'}}, u'executionArn': u'arn:aws:states:us-east-1:643366669028:execution:run_awsem_new_pony:bwa-mem_4DNFIUO8C231-4DNFI4J6LQYN'}
url to view status:
https://console.aws.amazon.com/states/home?region=us-east-1#/executions/details/arn:aws:states:us-east-1:643366669028:execution:run_awsem_new_pony:bwa-mem_4DNFIU

looking for upload key 099f0a82-314a-4546-89f8-c0d0a1856a6a/4DNFI6XT2LSJ.fastq.gz, on bucket elasticbeanstalk-fourfront-webprod-files
about to start run bwa-mem_4DNFIZHLTFWN-4DNFI6XT2LSJ
response from aws was: 
 {u'startDate': datetime.datetime(2018, 1, 17, 12, 26, 50, 813000, tzinfo=tzlocal()), 'ResponseMetadata': {'RetryAttempts': 0, 'HTTPStatusCode': 200, 'RequestId': '9a7c0d89-fbab-11e7-992e-3b39bcb66436', 'HTTPHeaders': {'x-amzn-requestid': '9a7c0d89-fbab-11e7-992e-3b39bcb66436', 'content-length': '148', 'content-type': 'application/x-amz-json-1.0'}}, u'executionArn': u'arn:aws:states:us-east-1:643366669028:execution:run_awsem_new_pony:bwa-mem_4DNFIZHLTFWN-4DNFI6XT2LSJ'}
url to view status:
https://console.aws.amazon.com/states/home?region=us-east-1#/executions/details/arn:aws:states:us-east-1:643366669028:execution:run_awsem_new_pony:bwa-mem_4DNFIZHLTFWN-4DNFI6XT2LSJ
looking for upload key b1483003-83f3-4087-a9e0-7525088becf4/4DNFIAQPZGR9.fastq.gz, on bucket elasticbeanstalk-four

response from aws was: 
 {u'startDate': datetime.datetime(2018, 1, 17, 12, 28, 13, 467000, tzinfo=tzlocal()), 'ResponseMetadata': {'RetryAttempts': 0, 'HTTPStatusCode': 200, 'RequestId': 'cbbf9867-fbab-11e7-9506-4fec2ef251b8', 'HTTPHeaders': {'x-amzn-requestid': 'cbbf9867-fbab-11e7-9506-4fec2ef251b8', 'content-length': '148', 'content-type': 'application/x-amz-json-1.0'}}, u'executionArn': u'arn:aws:states:us-east-1:643366669028:execution:run_awsem_new_pony:bwa-mem_4DNFIQWDFUI7-4DNFIVWCHMEG'}
url to view status:
https://console.aws.amazon.com/states/home?region=us-east-1#/executions/details/arn:aws:states:us-east-1:643366669028:execution:run_awsem_new_pony:bwa-mem_4DNFIQWDFUI7-4DNFIVWCHMEG
looking for upload key 6e30e84b-4f4f-40c8-9b8e-8bc86b12c370/4DNFIB2BJTO3.fastq.gz, on bucket elasticbeanstalk-fourfront-webprod-files
looking for upload key 39919d6c-acaa-4611-962c-4a2ae2baa920/4DNFIJ9JKURX.fastq.gz, on bucket elasticbeanstalk-fourfront-webprod-files
about to start run bwa-mem_4DNFIB