In [None]:
from core.utils import Tibanna
from core import ff_utils

# Portal environment that the Tibanna helper and the metadata connection use.
env = 'fourfront-webdev'
# Display-title prefix that summarize_file uses to recognise partA runs.
wf_partA = "hi-c-processing-parta-juicer/"

tibanna = Tibanna(env=env)
# Metadata connection built from the keys exposed by the Tibanna helper.
ff = ff_utils.fdn_connection(key=tibanna.ff_keys)


def summarize_file(file_resp):
    '''
    Build a small report for one file item.

    file_resp is the metadata dict of a file. Only 'accession' is required;
    'related_files', 'workflow_run_inputs' and 'aliases' may be missing or
    empty.

    Returns a dict with the keys
      'file'               : accession of the file
      'alias'              : first alias, or None when there is none
      'paired_file'        : accession of the 'paired with' file, '' if none
      'last_part_A'        : uuid of the last partA workflow run, '' if none
      'last_part_A_status' : run_status of that run, 'did_not_run' if none
    '''
    file_id = file_resp['accession']
    # Normalise missing/None/empty values so the loops and the [0] lookup
    # below cannot fail on files without relations, runs or aliases.
    relations = file_resp.get('related_files') or []
    workflows = file_resp.get('workflow_run_inputs') or []
    aliases = file_resp.get('aliases') or [None]
    first_alias = aliases[0]

    # get related file (the last 'paired with' relation wins)
    paired_file = ''
    for relation in relations:
        if relation['relationship_type'] == 'paired with':
            paired_file = relation['file']['accession']

    # Check workflows workflow partA
    last_part_A = ''
    last_part_A_status = 'did_not_run'
    # Assumes workflow_runs come in time ordered list, and grabs the last
    # one whose display title starts with the partA prefix.
    for wfr_resp in workflows:
        if wfr_resp['display_title'].startswith(wf_partA):
            last_part_A = wfr_resp['uuid']
            last_part_A_status = wfr_resp.get('run_status')

    # return a small report
    return {'file': file_id,
            'alias': first_alias,
            'paired_file': paired_file,
            'last_part_A': last_part_A,
            'last_part_A_status': last_part_A_status
            }

# changed workflow uuid to old again
# it is b9829418-49e5-4c33-afab-9ec90d65faf3
# new one is b9829418-49e5-4c33-afab-9ec90d659999
def make_hicb_json(input_files, env, output_bucket, accession, ncores):
    '''
    Assemble the input json for one hi-c-processing-partb run.

    input_files   : list of input file descriptions, passed through as-is
    env           : environment name stored under "_tibanna"
    output_bucket : bucket name stored as 'output_bucket'
    accession     : value used as the "_tibanna" run_id
    ncores        : value stored as the "ncores" workflow parameter

    The workflow uuid, app name and the remaining parameters are fixed.
    '''
    run_parameters = {
        "ncores": ncores,
        "binsize": 5000,
        "min_res": 5000,
        "normalization_type": "KR",
    }
    tibanna_settings = {
        "env": env,
        "run_type": "hic-partb",
        "run_id": accession,
    }
    return {
        'input_files': input_files,
        'output_bucket': output_bucket,
        'workflow_uuid': "b9829418-49e5-4c33-afab-9ec90d65faf3",
        "app_name": "hi-c-processing-partb/25",
        "parameters": run_parameters,
        "_tibanna": tibanna_settings,
    }


def make_input_file_json(obj_ids, arg_name, tibanna, bucket):
    '''
    Describe one workflow input argument.

    obj_ids can be either a string or a list.  For a string the returned
    'object_key' and 'uuid' are single values, for a list they are lists.
    {
      "bucket_name": "%s",
      "object_key": "%s",
      "uuid" : "%s",
      "workflow_argument_name": "%s"
    }

    Returns None as soon as one of the requested files is not found on
    the bucket, so that callers can test the result for truthiness.

    NOTE: this function is defined a second time further down in this
    file; the later definition is the one that is in effect.
    '''
    ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
    input_is_array = isinstance(obj_ids, list)
    if not input_is_array:
        obj_ids = [obj_ids]

    object_key_list = []
    uuid_list = []

    for obj_id in obj_ids:
        metadata = ff_utils.get_metadata(obj_id, connection=ff)
        upload_key = metadata['upload_key']

        # just make sure the file is on s3, otherwise bail
        print("looking for upload key %s, on bucket %s" %
              (upload_key, bucket))
        if not tibanna.s3.does_key_exist(upload_key, bucket=bucket):
            # Bail out instead of returning a partial list (array input)
            # or failing on an empty list (single input).
            print("upload key %s not found on bucket %s" %
                  (upload_key, bucket))
            return None
        # keep the part of the upload key after the first '/'
        object_key_list.append(upload_key.split('/')[1])
        uuid_list.append(metadata['uuid'])

    if not input_is_array:
        uuid_list = uuid_list[0]
        object_key_list = object_key_list[0]

    data = {'bucket_name': bucket,
            'object_key': object_key_list,
            'uuid': uuid_list,
            'workflow_argument_name': arg_name
            }
    return data



def make_input_file_json(obj_ids, arg_name, tibanna, bucket):
    '''
    Describe one workflow input argument.

    obj_ids can be either a string or a list.  For a string the returned
    'object_key' and 'uuid' are single values, for a list they are lists.
    {
      "bucket_name": "%s",
      "object_key": "%s",
      "uuid" : "%s",
      "workflow_argument_name": "%s"
    }

    Returns None as soon as one of the requested files is not found on
    the bucket, so that callers can test the result for truthiness.
    '''
    ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
    input_is_array = isinstance(obj_ids, list)
    if not input_is_array:
        obj_ids = [obj_ids]

    object_key_list = []
    uuid_list = []

    for obj_id in obj_ids:
        metadata = ff_utils.get_metadata(obj_id, connection=ff)
        upload_key = metadata['upload_key']

        # just make sure the file is on s3, otherwise bail
        print("looking for upload key %s, on bucket %s" %
              (upload_key, bucket))
        if not tibanna.s3.does_key_exist(upload_key, bucket=bucket):
            # Bail out instead of returning a partial list (array input)
            # or failing on an empty list (single input).
            print("upload key %s not found on bucket %s" %
                  (upload_key, bucket))
            return None
        # keep the part of the upload key after the first '/'
        object_key_list.append(upload_key.split('/')[1])
        uuid_list.append(metadata['uuid'])

    if not input_is_array:
        uuid_list = uuid_list[0]
        object_key_list = object_key_list[0]

    data = {'bucket_name': bucket,
            'object_key': object_key_list,
            'uuid': uuid_list,
            'workflow_argument_name': arg_name
            }
    return data


In [None]:
# Aliases of experiment sets, grouped into named lists.  Only `done_sets`
# is iterated by the report loop that follows; the other two lists are not
# referenced elsewhere in the visible code.
mbo_sets = ['dciclab:rao_rep02', 'dciclab:rao_rep12']

all_sets = [
    'dciclab:rao_rep07',
    'dciclab:rao_rep02',
    'dciclab:rao_rep12',
    'dcic:Jin_imr90_hic',
]

done_sets = ['dcic:Selvaraj_gm12878_hic']

all_pairs = []
for a_set in done_sets:
    print 'report on', a_set
    rep_resp = ff_utils.get_metadata(a_set, connection=ff)['experiments_in_set']
    total = 0
    set_pairs =[]
    set_pairs_fine = True  
    for exp in rep_resp:
        # print 'Experiment', exp
        exp_resp = ff_utils.get_metadata(exp, connection=ff)
        exp_files = exp_resp['files']
        for fastq_file in exp_files:
            file_resp = ff_utils.get_metadata(fastq_file, connection=ff, frame='embedded') 
            # skip unfortunate status
            if file_resp['status'] in ['deleted', 'uploading', 'upload failed']:
                continue
            # if no uploaded file in the file item report and skip
            if not file_resp.get('filename'):
                print file_resp['accession'], "does not have a file"
                continue
            # check if file is in s3
            if not tibanna.s3.does_key_exist(file_resp['upload_key'], tibanna.s3.raw_file_bucket):
                print file_resp['accession'], "does not have a file in S3"
                continue 
            # skip miseq
            if file_resp.get('instrument') == "Illumina MiSeq":
                continue
            # skip pair no 2
            if file_resp.get('paired_end')=='2':
                continue 
 
            # get report
            file_info = summarize_file(file_resp)
            # get report on paired file
            paired_file = file_info['paired_file']
            pair_file_resp = ff_utils.get_metadata(paired_file, connection=ff, frame='embedded')
            pair_file_info = summarize_file(pair_file_resp)
            
            partAfine = False
            # check the partA for both paired fastq
            if file_info['last_part_A_status'] == 'complete':
                if pair_file_info['last_part_A_status'] == 'complete':
                    if file_info['last_part_A'] == pair_file_info['last_part_A']:
                        partAfine = True
                        
            ########
            #######
            #  We need to add a check if partV run before, so don't run twice
            #####
            ####
            ###
            ##
            #
            
            hindIII_correction = False
            if not partAfine:
                set_pairs_fine = False
                print file_info['file'], pair_file_info['file'], "has problems with partA"
                print "this experiment set will not be part of all_pairs list"
            else:
                partA_data = ff_utils.get_metadata(file_info['last_part_A'], connection=ff)
                inputs = partA_data.get('input_files')
                # An old version of the hindIII restriction file needed to be corrected
                # If found that as the input file, look at the wfr on pair file and use the 
                # correct one
                nz = [i['value'] for i in inputs if i['workflow_argument_name']=='restriction_file'][0]
                if nz == '/files-reference/4DNFI823L811/':
                    hindIII_correction = True
                outputs = partA_data.get('output_files')
                pair_file = [i['value'] for i in outputs if i['format'] == 'pairs'][0]
                pair_resp = ff_utils.get_metadata(pair_file, connection=ff, frame='embedded')  
                
                # check if file is in s3
                head_info = tibanna.s3.does_key_exist(pair_resp['upload_key'], tibanna.s3.outfile_bucket)
                if not head_info:
                    set_pairs_fine = False
                    print pair_resp['accession'], "does not have a file in S3, skipping this set"
                    continue
                
                ### if there was a correction for partA, find the right pairs file
                if hindIII_correction:
                    pair_wfrs = pair_resp['workflow_run_inputs']
                    if pair_wfrs:
                        for wfr_resp in pair_wfrs:
                            wfr_name = wfr_resp['display_title']   
                            if wfr_name.startswith('hi-c-processing-parta-juicer-patch'):
                                last_part_A_cor=wfr_resp['uuid']
                                last_part_A_cor_status = wfr_resp.get('run_status') 
                    if last_part_A_cor_status != 'complete':
                        set_pairs_fine = False
                        print "There is a problem with partA_correction run for pair", pair_resp['accession']
                        continue
                    else:
                        cor_wfr = ff_utils.get_metadata(last_part_A_cor, connection=ff)
                        cor_outputs = cor_wfr.get('output_files')
                        cor_pair_file = [i['value'] for i in cor_outputs if i['format'] == 'pairs'][0]
                        cor_pair_resp = ff_utils.get_metadata(cor_pair_file, connection=ff)  
                        # check if file is in s3
                        cor_head_info = tibanna.s3.does_key_exist(cor_pair_resp['upload_key'], tibanna.s3.outfile_bucket)
                        if not cor_head_info:
                            set_pairs_fine = False
                            print pair_resp['accession'], "does not have the corrected pair file", cor_pair_resp['accession'] ,"in S3, skipping this set"
                            continue
                        
                        cor_file_size = round(cor_head_info['ContentLength']/1073741824.0,1)
                        total += cor_file_size
                        print cor_pair_file, str(cor_file_size), "(corrected)"
                        set_pairs.append(cor_pair_file)
                else:
                    file_size = round(head_info['ContentLength']/1073741824.0,1)
                    total += file_size
                    print pair_file, str(file_size)
                    set_pairs.append(pair_file)


                
    if set_pairs_fine:
        all_pairs.append([a_set, set_pairs])
    print str(total) + "GB total file size"
    print

print "no of pairs sets"
print len(all_pairs)
print all_pairs
print "use 'all_pairs' to pass the list of pairs files of each set"

In [None]:
from core.utils import Tibanna
from core.utils import run_workflow
from core import ff_utils
import time

# testportal
env = 'fourfront-webdev'
tibanna = Tibanna(env=env)

# Bucket names taken from the Tibanna s3 helper.
raw_file_bucket = tibanna.s3.raw_file_bucket
output_file_bucket = tibanna.s3.outfile_bucket

# Example input in the same [set name, list of pairs files] shape as the
# entries of `all_pairs`.
test_pairs = [
    [
        'small_set_1.2gb_1.2gb',
        ['/files-processed/4DNFIO9EV5ME/', '/files-processed/4DNFIPZZNRT5/'],
    ],
    [
        'medium_set_2.3gb_2.4gb',
        ['/files-processed/4DNFI0U1SFJJ/', '/files-processed/4DNFI1C97MZ6/'],
    ],
    [
        'large_set_3.6gb_3.6gb',
        ['/files-processed/4DNFIXV3ACPK/', '/files-processed/4DNFI6NDND1Y/'],
    ],
]


# Submit one partB workflow run per [set name, pairs files] entry of
# `all_pairs`, waiting 30 seconds after each entry.
for set_name, pair_list in all_pairs:
    print(set_name)
    print(pair_list)
    chrsizes = make_input_file_json('4DNFI823LSII', 'chrsizes', tibanna, raw_file_bucket)
    pair_files = make_input_file_json(pair_list, 'input_pairs', tibanna, output_file_bucket)
    ncores = 8

    # ncore options 8  cores up to 20gb per .fastq.gz (1tb)
    #               36 cores up to 42gb per .fastq.gz (2tb)
    #     Not set            up to 90gb per .fastq.gz (4tb)

    input_files = [chrsizes, pair_files]
    if not all(input_files):
        # at least one of the two input descriptions is falsy
        print("some files not found on s3.  Investigate this list %s" % input_files)
    else:
        # run id: set name with ':' replaced so it contains no colon
        name = 'partB_' + set_name.replace(":", "_")
        input_json = make_hicb_json(input_files, env, output_file_bucket, name, ncores)
        print(input_json)
        res = run_workflow(input_json)
    time.sleep(30)
print('Done')