In [None]:
from core.utils import Tibanna
from core import ff_utils

#format for input json in hic-partA
'''
{
  "input_files": [
    {
      "bucket_name": "elasticbeanstalk-fourfront-webprod-files",
      "object_key": "4DNFIFBK258N.fastq.gz",
      "uuid" : "06aa0af1-2ccf-4dfe-aa14-209b1bd2754d",
      "workflow_argument_name": "fastq1"
    },
    {
      "bucket_name": "elasticbeanstalk-fourfront-webprod-files",
      "object_key": "4DNFINZVD2W3.fastq.gz",
      "uuid": "52646398-29d5-4200-b3b5-059ff5c40b82",
      "workflow_argument_name": "fastq2"
    },
    {
      "bucket_name": "elasticbeanstalk-fourfront-webprod-files",
      "object_key": "4DNFIZQZ39L9.bwaIndex.tgz",
      "uuid": "1f53df95-4cf3-41cc-971d-81bb16c486dd",
      "workflow_argument_name": "bwa_index"
    },
    {
      "bucket_name": "elasticbeanstalk-fourfront-webprod-files",
      "object_key": "4DNFI823LSII.chrom.sizes",
      "uuid": "4a6d10ee-2edb-4402-a98f-0edb1d58f5e9",
      "workflow_argument_name": "chrsizes"
    },
    {
      "bucket_name": "elasticbeanstalk-fourfront-webprod-files",
      "object_key": "4DNFI823L888.fasta.gz",
      "uuid": "4a6d10ee-2edb-4402-a98f-0edb1d58ddd2",
      "workflow_argument_name": "reference_fasta"
    },
    {
      "bucket_name": "elasticbeanstalk-fourfront-webprod-files",
      "object_key": "4DNFI823L812.txt",
      "uuid": "4a6d10ee-2edb-4402-a98f-0edb1d582084",
      "workflow_argument_name": "restriction_file"
    }
  ],
  "workflow_uuid": "02d636b9-d82d-4da9-950c-2ca994a0943e",
  "app_name": "hi-c-processing-parta/9",
  "parameters": {
    "nThreads": 8
  },
  "output_bucket": "elasticbeanstalk-fourfront-webprod-wfoutput-files",
  "_tibanna": {"env": "fourfront-webprod"}
}
'''


def make_input_file_json(obj_id, arg_name, tibanna):
    '''
    Build the input-file entry for one workflow argument.

    Fetches the metadata of ``obj_id`` through ff_utils and, when the item's
    upload key is found on S3, returns a dict shaped like:

    {
      "bucket_name": "%s",
      "object_key": "%s",
      "uuid" : "%s",
      "workflow_argument_name": "%s"
    }

    Args:
        obj_id: identifier handed to ff_utils.get_metadata.
        arg_name: stored as 'workflow_argument_name' in the result.
        tibanna: object providing ``ff_keys`` and an ``s3`` helper with
            ``outfile_bucket`` and ``does_key_exist``.

    Returns:
        The dict described above, or an empty dict when the item has no
        upload key or the key does not exist on S3. Callers can therefore
        validate a list of results with all().
    '''
    ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
    metadata = ff_utils.get_metadata(obj_id, connection=ff)

    # An item without an upload key cannot be on S3; bail with the same
    # empty result as a missing S3 object instead of raising KeyError.
    upload_key = metadata.get('upload_key')
    if not upload_key:
        print("no upload key found for %s" % obj_id)
        return {}

    # just make sure the file is on s3, otherwise bail
    print("looking for upload key %s, on bucket %s" %
          (upload_key,
           tibanna.s3.outfile_bucket))
    if not tibanna.s3.does_key_exist(upload_key):
        return {}

    return {'bucket_name': tibanna.s3.outfile_bucket,
            # second '/'-separated component of the upload key
            # NOTE(review): presumably keys look like '<uuid>/<file name>' - confirm
            'object_key': upload_key.split('/')[1],
            'uuid': metadata['uuid'],
            'workflow_argument_name': arg_name
            }
    
        
def make_hica_json(input_files, env, output_bucket, accession,
                   workflow_uuid="a9caf6f3-49e5-4c33-afab-9ec90d65faf3",
                   app_name="hi-c-processing-parta-juicer/5",
                   nsplit=100):
    '''
    Assemble the input json for a hic partA workflow run.

    Args:
        input_files: list of input-file dicts (see make_input_file_json).
        env: environment name stored under '_tibanna' -> 'env'.
        output_bucket: bucket name stored under 'output_bucket'.
        accession: stored as the '_tibanna' -> 'run_id' of the run.
        workflow_uuid: uuid of the workflow to run; defaults to the value
            this notebook has always used.
        app_name: app name of the workflow; defaults to the value this
            notebook has always used.
        nsplit: value of the 'nsplit' workflow parameter (default 100).

    Returns:
        dict with the keys 'input_files', 'output_bucket', 'workflow_uuid',
        'app_name', 'parameters' and '_tibanna'.
    '''
    input_json = {'input_files': input_files,
                  'output_bucket': output_bucket,
                  'workflow_uuid': workflow_uuid,
                  "app_name": app_name,
                  "parameters": {
                      "nsplit": nsplit
                      },
                  "_tibanna": {"env": env, "run_type": "hic-parta",
                               "run_id": accession}
                  }
    return input_json
    

In [None]:
from tasks import run_md5
from tasks import run_fastqc
from invoke import run
import time
from datetime import datetime

# For a given experiment set and some parameters (like the instrument):
# - print the set of files and their partA hic workflow status
# - if some runs are still running, report the number of running cases
# - if there are file pairs that don't have a corresponding partA run, report them separately

# replicate sets this notebook can be pointed at; my_rep_set below picks one
all_sets = ['ren:HG00732_repset','dciclab:rao_rep07',
'dciclab:rao_rep02',
'dciclab:rao_rep12',
'dciclab:rao_rep13',
'dcic:Selvaraj_gm12878_hic',
'dcic:Jin_imr90_hic']

    
my_rep_set = all_sets[0]
# when True, files whose instrument is 'Illumina MiSeq' are left out
exclude_miseq = True
# prefixes matched (with startswith) against workflow run display titles
wf_md5 = "md5"
wf_fastqc = "fastqc-0-11-4-1/1"
wf_partA = "hi-c-processing-parta-juicer/5"
env = 'fourfront-webdev'
tibanna = Tibanna(env=env)
# raw_input (Python 2) waits for the user's answer; the answer is compared
# against 'md5', 'qc' and 'all' when deciding whether to launch missing runs
run_md_qc = raw_input("Do you wanna run md5 and/or fastqc if missing? (md5/qc/all/none)")

# status for completion
# There are two flavors of the completion signal: it used to be
# output_file_transfer_finished, now it is complete.
# Old completed wf runs have the former one.
status_done = ['complete', 'output_file_transfer_finished']

# connection reused by every metadata lookup in this cell
ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
# the experiments of the selected replicate set
rep_resp = ff_utils.get_metadata(my_rep_set, connection=ff)['experiments_in_set']

################
##ADD TO WORKFLOW
# wfr_time = datetime.strptime(wfr_data['date_created'],'%Y-%m-%dT%H:%M:%S.%f+00:00')
# run_hours = int((datetime.now()-wfr_time).total_seconds()/3600)
################

def summarize_file(file_resp):
    qc = False
    file_id = file_resp['accession']
    sequencer = file_resp.get('instrument')
    relations = file_resp.get('related_files')
    status = file_resp.get('status')
    workflows = file_resp.get('workflow_run_inputs')
    first_alias = file_resp.get('aliases',[None])[0]
    pair_no = file_resp.get('paired_end')
    # get related file
    paired_file = ''
    for relation in relations:
        if relation['relationship_type'] == 'paired with':
            paired_file = relation['file']
    
    # is there a qc?
    if file_resp.get('quality_metric'):
        qc = True
    # Check workflows for qc fastqc workflow partA
    last_part_A = ''
    last_part_A_status = 'did_not_run'
    md5_status = 'did_not_run'
    fastqc_status = 'did_not_run'
    # Assumes workflow_runs come in time ordered list, and grabs the last ones for each wf run
    if workflows:
        for wfr in workflows:
            wfr_resp = ff_utils.get_metadata(wfr, connection=ff)
            wfr_name = wfr_resp['display_title']
            if wfr_name.startswith(wf_md5):
                md5_status = wfr_resp.get('run_status')     
            elif wfr_name.startswith(wf_fastqc):
                fastqc_status = wfr_resp.get('run_status')     
            elif wfr_name.startswith(wf_partA):
                last_part_A=wfr
                last_part_A_status = wfr_resp.get('run_status')  
                
    # Check for md5 and fastqc, and if not complete, run or report it. 
    # if exclude miseq is on, do this only if sequencer is not miseq
    if not exclude_miseq or sequencer != "Illumina MiSeq":
        # check if md5 step is completed properly
        
        if status != "uploaded" or md5_status not in status_done:
            # if not, shall we run it?
            if run_md_qc in ['md5', 'all']:
                print 'md5 running for', file_resp['accession']
                code_md5= "invoke run_md5 " + env + " " + file_resp['accession'] + " " + file_resp['uuid']
                run(code_md5)
                print ''
                time.sleep(10)
            # user does not want it to be run, so just report
            else:
                print 'md5 run missing for', file_resp['accession']
        # check fastqc if md5 is fine
        else:
            if not qc or fastqc_status not in status_done:
                # if not, shall we run it?
                if run_md_qc in ['qc', 'all']:
                    print 'fastqc running for', file_resp['accession']
                    code_qc= "invoke run_fastqc " + env + " " + file_resp['accession'] + " " + file_resp['uuid']
                    run(code_qc)
                    print ''    
                    time.sleep(10)
                # user does not want it to be run, so just report
                else:
                    print 'fastqc run missing for', file_resp['accession'], fastqc_status
                    print 
                    
#             if fastqc_status == "did_not_run":
#                     print 'fastqc running for', file_resp['accession']
#                     code_qc= "invoke run_fastqc " + env + " " + file_resp['accession'] + " " + file_resp['uuid']
#                     run(code_qc)
#                     print ''    
#                     time.sleep(10)
           
     
    # return a small report
    return {'file': file_id,
            'alias': first_alias,
            'sequencer': sequencer,
            'pair_no': pair_no,
            'paired_file': paired_file,
            'file_status': status,
            'qc': qc,
            'md5_status': md5_status,
            'fastqc_status': fastqc_status,
            'last_part_A': last_part_A,
            'last_part_A_status': last_part_A_status
           }


report = []
enzymes = []
for exp in rep_resp:
    # print 'Experiment', exp
    exp_resp = ff_utils.get_metadata(exp, connection=ff)
    exp_files = exp_resp['files']
    enzyme = exp_resp['digestion_enzyme']
    enzymes.append(enzyme)
    for fastq_file in exp_files[:]:
        file_resp = ff_utils.get_metadata(fastq_file, connection=ff)
        
        #if no uploaded file in the file item report and skip
        if not file_resp.get('filename'):
            print file_resp['accession'], "does not have a file"
            continue
        # check if file is in s3
        if not tibanna.s3.does_key_exist(file_resp['upload_key'], tibanna.s3.raw_file_bucket):
            print file_resp['accession'], "does not have a file in S3"
            continue
        # skip pair no 2
        if file_resp.get('paired_end')=='2':
            continue

        file_info = summarize_file(file_resp)

        # check for miseq
        if exclude_miseq:
            if file_info['sequencer'] == 'Illumina MiSeq':
                continue
        paired_file = file_info['paired_file']
        pair_file_resp = ff_utils.get_metadata(paired_file, connection=ff)
        pair_file_info = summarize_file(pair_file_resp)

        # check consistency of paired file info
        # status differences gives error but there are multiple statuses that indicate complete
        # TODO fix it
        pairs_inconsistent = ""
        check_items = [ i for i in file_info.keys() if i not in ['file', 'paired_file', 'pair_no', 'alias']]
        for check_item in check_items:
            try:
                assert file_info[check_item] == pair_file_info[check_item]
            except AssertionError:
                print check_item, "not the same between pair", fastq_file, 'and', paired_file
                pairs_inconsistent += check_item + ', '
        wf_check = ''
        # check if md5 and qc are okay
        for info in [file_info, pair_file_info]:
            if (info['md5_status'] in status_done and info['file_status'] == 'uploaded' and
                info['fastqc_status'] in status_done and info['qc'] == True):
                wf_check += '+'
        rep = {"consistency": pairs_inconsistent,  
               "file1": file_info['alias'],
               "file2": pair_file_info['alias'],
               "const_check": pairs_inconsistent,
               "wf_check": wf_check, 
               "partA_wf": file_info['last_part_A'], 
               "partA_status": file_info['last_part_A_status']
               }
        # status differences gives error but there are multiple statuses that indicate complete
        # TODO fix it
        report.append(rep)
        if rep.get('const_check'):
            print rep['const_check']

    
# TODO need to add failed ones
# 1 completed pairs
pairs_completed = [i for i in report if i['partA_status']=='complete']
# 2 running pairs
pairs_running = [i for i in report if i['partA_status'] not in ['complete','did_not_run']]
# 3 no run pairs
pairs_did_not_run = [i for i in report if i['partA_status']=='did_not_run']
# 3a no run pairs with fine qc md5
pairs_ready_to_run = [(i['file1'], i['file2']) for i in pairs_did_not_run if i['wf_check'] == '++']
# 3b no run pairs with problematic qc md5
pairs_qcmd_problem = [(i['file1'], i['file2']) for i in pairs_did_not_run if i['wf_check'] != '++']

# 2 running pairs to run again
rerun_running_pairs = [(i['file1'], i['file2']) for i in report if i['partA_status'] not in ['complete','did_not_run']]



print "{}/{} pairs completed partA".format(len(report), len(pairs_completed))
print "{}/{} pairs still running partA".format(len(report), len(pairs_running))
print ",".join(i['partA_status'] for i in pairs_running)
print ''
print '1) ready to run (pairs_ready_to_run)'
for a,b in pairs_ready_to_run:
    print a,b
print ""

print '2) problematics ones (pairs_qcmd_problem)'
for a,b in pairs_qcmd_problem:
    print a,b
print ""

print '3) running ones (rerun_running_pairs)'
for a,b in rerun_running_pairs:
    print a,b
print ""

# Choose the right NZ reference file
re_ref_file = ''
choice = {'HindIII': '4DNFI823L811', 'MboI': '4DNFI823L812'}
# Check if all experiments use the same enzyme
if len(list(set(enzymes))) != 1:
    print "ERROR Mixed Enzyme Content in Experiment Set"
else:
    nz_name = enzymes[0].split('/')[2]
    re_ref_file = choice[nz_name]
print 'using {} ({}) as the enzyme'.format(nz_name, re_ref_file)
print "DONE"


In [None]:
from core.utils import run_workflow
import time

# hic-partA, paired files
# List of (fastq1, fastq2) identifiers to submit; empty by default, so the
# loop below does nothing until one of the lists named underneath is assigned.
paired_files = []
#pairs_qcmd_problem
#pairs_ready_to_run
#rerun_running_pairs


# testportal
env = 'fourfront-webdev'
tibanna = Tibanna(env=env)
# keep the original outfile bucket: it is passed on as the output bucket of the run
outfiles = tibanna.s3.outfile_bucket
# override the bucket before the inputs are built; make_input_file_json
# reads tibanna.s3.outfile_bucket for the 'bucket_name' of every input file
tibanna.s3.outfile_bucket = 'elasticbeanstalk-fourfront-webdev-files'

# todo need a function to determine this given fastq1
# reference inputs shared by every pair
index = make_input_file_json('4DNFIZQZ39L9', 'bwa_index', tibanna)
chrsizes = make_input_file_json('4DNFI823LSII', 'chrsizes', tibanna)
ref = make_input_file_json('4DNFI823L888', 'reference_fasta', tibanna)
# re_ref_file is set by the report cell (restriction file of the set's enzyme)
restrict = make_input_file_json(re_ref_file, 'restriction_file', tibanna)

for pair in paired_files:
    fastq1 = make_input_file_json(pair[0], 'fastq1', tibanna)
    fastq2 = make_input_file_json(pair[1], 'fastq2', tibanna)
    
     
    input_files = [fastq1, fastq2, index, chrsizes, ref, restrict]
    # make_input_file_json returns an empty dict for files not found on s3,
    # so all() is False as soon as one input is missing
    if all(input_files):
        # run id: the two object keys up to their first '.', joined by '-'
        name = fastq1['object_key'].split('.')[0] + "-" + fastq2['object_key'].split('.')[0]
        input_json = make_hica_json(input_files, env, outfiles, name)
        res = run_workflow(input_json)
    else:
        print("some files not found on s3.  Investigate this list %s" % input_files)
    # pause 30 seconds before handling the next pair
    time.sleep(30)
print 'Done'

In [None]:
tibanna = {
    "env": "fourfront-webprod",
    "settings": {
      "url": "https://console.aws.amazon.com/states/home?region=us-east-1#/executions/details/arn:aws:states:us-east-1:643366669028:execution:run_sbg_workflow_3:generic_d98ee6a1-b8ea-4648-a79c-9576e94f555e",
      "run_type": "generic",
      "run_name": "generic_d98ee6a1-b8ea-4648-a79c-9576e94f555e",
      "env": "fourfront-webprod",
      "run_id": "d98ee6a1-b8ea-4648-a79c-9576e94f555e"
    }
workflow = 
sbg = sbg_utils.create_sbg_workflow(token=tibanna.sbg_keys,