In [3]:
from core.utils import Tibanna
from core import ff_utils
from datetime import datetime


# Name of the environment this notebook works against
env = 'fourfront-webdev'
# Tibanna helper for that environment; the code below uses its `ff_keys` and `s3` attributes
tibanna = Tibanna(env=env)
# Connection object passed to the ff_utils.get_metadata calls in the cells below
ff = ff_utils.fdn_connection(key=tibanna.ff_keys)


def summarize_file(file_resp):
    """Summarize a file item's pairing and its most recent Hi-C part A run.

    Args:
        file_resp: file metadata dict.  Must contain 'accession'; the keys
            'aliases', 'related_files' and 'workflow_run_inputs' are optional
            and may be missing, None or empty.

    Returns:
        dict with the keys
        'file'               -- the file's accession
        'alias'              -- first alias, or None when there is none
        'paired_file'        -- accession of the 'paired with' file, '' if none
        'last_part_A'        -- uuid of the latest part A run, '' if none
        'last_part_A_status' -- that run's status, 'did_not_run' if none
    """
    file_id = file_resp['accession']
    wf_partA = ["hi-c-processing-parta-juicer/25", "hi-c-processing-parta-juicer/26"]

    # `or` also covers keys that are present but hold None or an empty list,
    # which would otherwise break the loop / the [0] lookup below.
    relations = file_resp.get('related_files') or []
    workflows = file_resp.get('workflow_run_inputs') or []
    first_alias = (file_resp.get('aliases') or [None])[0]

    # get related file; when several relations match, the last one is kept
    paired_file = ''
    for relation in relations:
        if relation['relationship_type'] == 'paired with':
            paired_file = relation['file']['accession']

    # Order the runs by creation date (then name) so that the last part A
    # match found in the loop below is the most recent one.
    wfr_report = sorted((get_wfr_report(wfr_resp) for wfr_resp in workflows),
                        key=lambda k: (k['wfr_date'], k['wfr_name']))

    last_part_A = ''
    last_part_A_status = 'did_not_run'
    for report in wfr_report:
        if report['wfr_name'] in wf_partA:
            last_part_A = report.get('wfr_uuid')
            last_part_A_status = report.get('wfr_status')

    # return a small report
    return {'file': file_id,
            'alias': first_alias,
            'paired_file': paired_file,
            'last_part_A': last_part_A,
            'last_part_A_status': last_part_A_status
            }


def get_wfr_report(wfr_data):
    """Build a small report for one workflow run item.

    Args:
        wfr_data: dict with the keys 'uuid', 'run_status', 'title' and
            'date_created'.  'date_created' must be a timestamp with a
            literal +00:00 offset, e.g. 2017-07-19T11:21:00.123456+00:00;
            the fractional seconds are optional.

    Returns:
        dict with
        'wfr_uuid'   -- the run's uuid
        'wfr_status' -- the run's run_status
        'wfr_name'   -- the part of the title before ' run '
        'wfr_date'   -- creation time as a naive datetime (UTC)
        'run_time'   -- hours elapsed between creation and now

    Raises:
        ValueError: if 'date_created' matches neither accepted layout.
    """
    created = wfr_data['date_created']
    try:
        wfr_time = datetime.strptime(created, '%Y-%m-%dT%H:%M:%S.%f+00:00')
    except ValueError:
        # Same layout without fractional seconds.
        wfr_time = datetime.strptime(created, '%Y-%m-%dT%H:%M:%S+00:00')
    # The parsed timestamp is UTC, so it has to be compared with the UTC
    # clock; the local clock would skew the result by the machine's offset.
    run_hours = (datetime.utcnow() - wfr_time).total_seconds() / 3600
    return {'wfr_uuid': wfr_data['uuid'],
            'wfr_status': wfr_data['run_status'],
            'wfr_name': wfr_data['title'].split(' run ')[0],
            'wfr_date': wfr_time,
            'run_time': run_hours}


def make_hicb_json(input_files, env, output_bucket, accession, ncores, rev_no):
    """Assemble the run input for the Hi-C part B workflow.

    The workflow uuid and the binsize / min_res parameters are fixed values;
    everything else is taken from the arguments.  `rev_no` is appended to the
    app name and `accession` is passed on as the tibanna run id.

    Returns the assembled dict.
    """
    app_name = "hi-c-processing-partb/" + str(rev_no)

    workflow_parameters = {"ncores": ncores}
    workflow_parameters["binsize"] = 5000
    workflow_parameters["min_res"] = 5000

    tibanna_settings = {"env": env}
    tibanna_settings["run_type"] = "hic_part_b"
    tibanna_settings["run_id"] = accession

    input_json = {}
    input_json['input_files'] = input_files
    input_json['output_bucket'] = output_bucket
    input_json['workflow_uuid'] = "b9829418-49e5-4c33-afab-9ec90d659999"
    input_json["app_name"] = app_name
    input_json["parameters"] = workflow_parameters
    input_json["_tibanna"] = tibanna_settings
    return input_json


def make_input_file_json(obj_ids, arg_name, tibanna, bucket):
    '''Build the workflow input entry for one file or for a list of files.

    obj_ids can be either a string or a list.  The result has the form
    {"bucket_name": "%s", "object_key": "%s", "uuid" : "%s", "workflow_argument_name": "%s"}
    where "object_key" and "uuid" are lists when obj_ids is a list and plain
    values otherwise.

    Every file is looked up on `bucket` first.  If any of them is missing an
    empty dict is returned, so callers can test the result for truthiness
    before launching a workflow with incomplete inputs.
    '''
    input_is_array = isinstance(obj_ids, list)
    if not input_is_array:
        obj_ids = [obj_ids]
    ff = ff_utils.fdn_connection(key=tibanna.ff_keys)

    object_key_list = []
    uuid_list = []
    missing_keys = []
    for obj_id in obj_ids:
        metadata = ff_utils.get_metadata(obj_id, connection=ff)
        upload_key = metadata['upload_key']
        # just make sure the file is on s3, otherwise bail
        print("looking for upload key %s, on bucket %s" %
              (upload_key,
               bucket))
        if tibanna.s3.does_key_exist(upload_key, bucket=bucket):
            # the object key is the part of the upload key after the first '/'
            object_key_list.append(upload_key.split('/')[1])
            uuid_list.append(metadata['uuid'])
        else:
            missing_keys.append(upload_key)

    if missing_keys:
        # Dropping the missing files silently would hand the workflow a
        # truncated input list, so report them and return nothing instead.
        print("missing on bucket %s: %s" % (bucket, ", ".join(missing_keys)))
        return {}

    if not input_is_array:
        uuid_list = uuid_list[0]
        object_key_list = object_key_list[0]
    data = {'bucket_name': bucket,
            'object_key': object_key_list,
            'uuid': uuid_list,
            'workflow_argument_name': arg_name
            }
    return data


In [4]:
# All experiment sets known to this notebook; only `run_sets` is processed below.
all_sets = [
    'dciclab:rao_rep02',
    'dciclab:rao_rep07',
    'dciclab:rao_rep12',
    'dciclab:rao_rep13',
    'dcic:Selvaraj_gm12878_hic',
    'dcic:Jin_imr90_hic'
]

run_sets = ['dciclab:rao_rep02',
            'dciclab:rao_rep07']

# File statuses for which a file item is ignored altogether
skipped_statuses = ['deleted', 'uploading', 'upload failed']
# Divisor used to report object sizes in GB (2**30 bytes)
bytes_per_gb = 1073741824.0

# Result of this cell: one [set name, list of pairs files, total size in GB]
# entry per experiment set whose fastq pairs all have a usable part A run.
all_pairs = []
for a_set in run_sets:
    print('report on %s' % a_set)
    rep_resp = ff_utils.get_metadata(a_set, connection=ff)['experiments_in_set']
    total = 0
    set_pairs = []
    set_pairs_fine = True
    for exp in rep_resp:
        exp_resp = ff_utils.get_metadata(exp, connection=ff)
        exp_files = exp_resp['files']
        for fastq_file in exp_files:
            file_resp = ff_utils.get_metadata(fastq_file, connection=ff, frame='embedded')
            # skip unfortunate status
            if file_resp['status'] in skipped_statuses:
                continue
            # if no uploaded file in the file item report and skip
            if not file_resp.get('filename'):
                print('%s does not have a file' % file_resp['accession'])
                continue
            # check if file is in s3
            if not tibanna.s3.does_key_exist(file_resp['upload_key'], tibanna.s3.raw_file_bucket):
                print('%s does not have a file in S3' % file_resp['accession'])
                continue
            # skip miseq
            if file_resp.get('instrument') == "Illumina MiSeq":
                continue
            # skip pair no 2; each pair is handled once, through its first file
            if file_resp.get('paired_end') == '2':
                continue

            # get report
            file_info = summarize_file(file_resp)
            # get report on paired file
            paired_file = file_info['paired_file']
            if not paired_file:
                # Without a 'paired with' relation there is nothing to look
                # up; asking for metadata of '' would not fetch a file item.
                set_pairs_fine = False
                print('%s does not have a paired file' % file_info['file'])
                print("this experiment set will not be part of all_pairs list")
                continue
            pair_file_resp = ff_utils.get_metadata(paired_file, connection=ff, frame='embedded')
            pair_file_info = summarize_file(pair_file_resp)

            # part A is fine when both fastq files report the same completed run
            partAfine = (file_info['last_part_A_status'] == 'complete' and
                         pair_file_info['last_part_A_status'] == 'complete' and
                         file_info['last_part_A'] == pair_file_info['last_part_A'])

            # TODO: add a check whether part B already ran for this set, so it
            # is not run twice.

            if not partAfine:
                set_pairs_fine = False
                print('%s %s has problems with partA' % (file_info['file'], pair_file_info['file']))
                print("this experiment set will not be part of all_pairs list")
                continue

            # part A is fine: collect its pairs output
            partA_data = ff_utils.get_metadata(file_info['last_part_A'], connection=ff)
            outputs = partA_data.get('output_files') or []
            pairs_outputs = [i['value'] for i in outputs if i.get('format') == 'pairs']
            if not pairs_outputs:
                set_pairs_fine = False
                print('%s does not have a pairs output, skipping this set' % file_info['last_part_A'])
                continue
            pair_file = pairs_outputs[0]
            pair_resp = ff_utils.get_metadata(pair_file, connection=ff, frame='embedded')

            # check if file is in s3; the returned value is also read below
            # for the object's ContentLength
            head_info = tibanna.s3.does_key_exist(pair_resp['upload_key'], tibanna.s3.outfile_bucket)
            if not head_info:
                set_pairs_fine = False
                print('%s does not have a file in S3, skipping this set' % pair_resp['accession'])
                continue
            file_size = round(head_info['ContentLength'] / bytes_per_gb, 1)
            total += file_size
            set_pairs.append(pair_file)

    if set_pairs_fine:
        all_pairs.append([a_set, set_pairs, total])
    print(str(total) + "GB total file size")
    print('')

print("no of pairs sets")
print(len(all_pairs))
print("use 'all_pairs' to pass the list of pairs files of each set")

report on dciclab:rao_rep02
90.7GB total file size

report on dciclab:rao_rep07
96.2GB total file size

no of pairs sets
2
use 'all_pairs' to pass the list of pairs files of each set


In [5]:
from core.utils import Tibanna
from core.utils import run_workflow
from core import ff_utils
import time

# testportal
env = 'fourfront-webdev'
tibanna = Tibanna(env=env)

output_file_bucket = tibanna.s3.outfile_bucket
raw_file_bucket = tibanna.s3.raw_file_bucket


# Small fixed inputs kept for manual test runs; not used by the loop below.
test_pairs = [
    ['small_set_1.2gb_1.2gb', ['/files-processed/4DNFIO9EV5ME/', '/files-processed/4DNFIPZZNRT5/']],
    ['medium_set_2.3gb_2.4gb', ['/files-processed/4DNFI0U1SFJJ/', '/files-processed/4DNFI1C97MZ6/']],
    ['large_set_3.6gb_3.6gb', ['/files-processed/4DNFIXV3ACPK/', '/files-processed/4DNFI6NDND1Y/']]
]

# `all_pairs` comes from the previous cell: [set name, pairs files, total size]
for set_name, pair_list, size in all_pairs:
    # sets with a total size above 25 use app revision 37, the others 34
    revision = 34
    if size > 25:
        revision = 37
    print('%s rev %s' % (set_name, revision))

    chrsizes = make_input_file_json('4DNFI823LSII', 'chrsizes', tibanna, raw_file_bucket)
    pair_files = make_input_file_json(pair_list, 'input_pairs', tibanna, output_file_bucket)
    # changed from 32 to 8 19.07.17 11.21 am - because we had cool multiprocess error
    ncores = 8

    input_files = [chrsizes, pair_files]
    # A plain truthiness test is not enough: an entry can be a non-empty dict
    # and still lack files.  Require a chrsizes uuid and one uuid per
    # requested pairs file before submitting anything.
    found_pairs = (pair_files or {}).get('uuid') or []
    inputs_complete = (bool((chrsizes or {}).get('uuid')) and
                       len(pair_list) > 0 and
                       len(found_pairs) == len(pair_list))
    if inputs_complete:
        name = set_name.replace(":", "_")
        input_json = make_hicb_json(input_files, env, output_file_bucket, name, ncores, revision)
        res = run_workflow(input_json)
    else:
        print("some files not found on s3.  Investigate this list %s" % input_files)
        print('')
    print('---------------------------')
    # pause before handling the next set
    time.sleep(10)

print('Done')

dciclab:rao_rep02 rev 37
looking for upload key 4a6d10ee-2edb-4402-a98f-0edb1d58f5e9/4DNFI823LSII.chrom.sizes, on bucket elasticbeanstalk-fourfront-webdev-files
looking for upload key a724d25a-01a2-47f5-8dd5-25c661946dd3/4DNFISKR3C53.pairs.gz, on bucket elasticbeanstalk-fourfront-webdev-wfoutput
looking for upload key 7efa6b5a-a2d0-4e3b-b465-f295b231b592/4DNFI906HTJW.pairs.gz, on bucket elasticbeanstalk-fourfront-webdev-wfoutput
looking for upload key a7bcf6d6-559a-4379-bdd3-6ec2669cb8a6/4DNFI022C7SS.pairs.gz, on bucket elasticbeanstalk-fourfront-webdev-wfoutput
looking for upload key 6a175ab2-9dc9-4c47-bd0b-dbed9941edff/4DNFIK7BFT18.pairs.gz, on bucket elasticbeanstalk-fourfront-webdev-wfoutput
looking for upload key 3f84f9e5-68a1-4e9f-8fc5-ab42db44806c/4DNFIEN5Y1JE.pairs.gz, on bucket elasticbeanstalk-fourfront-webdev-wfoutput
looking for upload key 06bd6439-230b-4838-aee4-b1d2206eef7c/4DNFIPDA3140.pairs.gz, on bucket elasticbeanstalk-fourfront-webdev-wfoutput
looking for upload key 

looking for upload key d166961c-4d41-454a-bbc0-3e3b8b9fc336/4DNFIAI5WWUK.pairs.gz, on bucket elasticbeanstalk-fourfront-webdev-wfoutput
looking for upload key 365f7a45-f22c-432f-ad47-af43996fdf70/4DNFIY4ADY12.pairs.gz, on bucket elasticbeanstalk-fourfront-webdev-wfoutput
looking for upload key dbf20163-a27c-4627-95ee-03fd0947dcb5/4DNFIXDYZNGA.pairs.gz, on bucket elasticbeanstalk-fourfront-webdev-wfoutput
looking for upload key 3fe787db-03a9-471a-849e-40dffe226e8a/4DNFIX97V4QW.pairs.gz, on bucket elasticbeanstalk-fourfront-webdev-wfoutput
looking for upload key 4be453c4-c1f4-4598-ac0c-d1b73ae176e6/4DNFIMDUVALM.pairs.gz, on bucket elasticbeanstalk-fourfront-webdev-wfoutput
looking for upload key 4876156a-1c06-41e1-a676-6d7dab366553/4DNFIANXLTA3.pairs.gz, on bucket elasticbeanstalk-fourfront-webdev-wfoutput
looking for upload key 39d71669-d3db-4e07-81f9-8911259f4b36/4DNFIV6KEBIF.pairs.gz, on bucket elasticbeanstalk-fourfront-webdev-wfoutput
looking for upload key 29231198-4cfa-432d-83fc-1