In [None]:
from dcicutils import ff_utils
from functions.wfr import *
from functions.wfr_settings import *
from functions.notebook_functions import *

# tibanna = Tibanna(env=env)
# Environment name and access key handed to the portal/workflow helpers below.
my_env = 'data'
my_auth = get_key('koray_data')


# Atac runs Luisa 2021 Jan
set_url = '/search/?experiments_in_set.experiment_type.display_title=ATAC-seq&experimentset_type=replicate&lab.display_title=Karen+Adelman%2C+HARVARD&status=pre-release&type=ExperimentSetReplicate'

all_sets = ff_utils.search_metadata(set_url, key=my_auth)

# counter is incremented by the processing loop; completed / completed_acc are
# only initialised here and are not read by any visible cell.
counter, completed, completed_acc = 0, 0, []

# A set still needs processing while the pipeline tag is absent from its
# completed_processes list (a missing field counts as "no tags").
run_sets = []
for candidate_set in all_sets:
    finished_tags = candidate_set.get('completed_processes', [])
    if "ENCODE_ATAC_Pipeline_1.1.1" not in finished_tags:
        run_sets.append(candidate_set)

n_total = len(all_sets)
n_pending = len(run_sets)
print(n_total, 'total number of sets')
print(n_total - n_pending, 'sets completed')
print(n_pending, 'ready for processing')

In [None]:
# Switches for this pass over run_sets.
run_wfr = True      # start any workflow run that is found missing
add_pc = True       # attach completed outputs as preliminary processed files
add_tag = True      # tag the set in completed_processes once step2 is complete
pick_best = False   # with more than 2 TA files, select the best 2 instead of using mergebed

# Tag recorded on a set once step2 has completed for it.
atac_tag = "ENCODE_ATAC_Pipeline_1.1.1"

for a_set in run_sets:
    print()
    print(a_set['accession'], end = " ")
    counter += 1
    paired = ""  # single or paired; overwritten per experiment by get_chip_files

    replicate_exps = sorted(a_set.get('replicate_exps', []),
                            key = lambda x: [x['bio_rep_no'], x['tec_rep_no']])
    # A set without replicates would raise on the [0] lookup below and abort
    # the whole batch, so skip it instead.
    if not replicate_exps:
        print('no replicate experiments, skipping')
        continue

    # get organism from the first replicate
    f_exp = replicate_exps[0]['replicate_exp']['uuid']
    f_exp_resp = ff_utils.get_metadata(f_exp, key = my_auth)
    biosample = f_exp_resp['biosample']
    organism = list(set([bs['individual']['organism']['name'] for bs in biosample['biosource']]))[0]
    print(organism)

    if organism not in ['mouse', 'human']:
        print('orgamism not ready', organism)
        continue

    # Attributions passed to new objects are derived (by get_attribution) from
    # the metadata of the first file of the first experiment.
    first_exp_files = f_exp_resp.get('files')
    if not first_exp_files:
        print('first experiment has no files, skipping')
        continue
    attributions = get_attribution(ff_utils.get_metadata(first_exp_files[0]['uuid'], key = my_auth))

    ta = []
    # check for step1, and start if missing
    step1_status = 'Done'
    for an_exp in replicate_exps:
        # are step 1 files raw or processed
        my_source = ''
        exp_id = an_exp['replicate_exp']['accession']
        exp_resp = ff_utils.get_metadata(exp_id, my_auth)
        run_name = exp_resp['accession']
        exp_files, exp_obj, paired = get_chip_files(exp_resp, my_auth)
        print(run_name, len(exp_files), paired, end = ' status: ')

        # if too many input, merge them in step0
        if len(exp_files) > 2:
            my_source = 'processed'
            # exp_files format [[pair1,pair2], [pair1, pair2]]  uuids
            # exp_obj format  [[pair1,pair2], [pair1, pair2]]  accession.fileformat ('4DNFIKW8IQT2.fastq.gz')
            input_list = []
            if paired == 'paired':
                # one merge per read end: all end-1 files, then all end-2 files
                input_list.append([i[0] for i in exp_files])
                input_list.append([i[1] for i in exp_files])
            elif paired == 'single':
                input_list.append([i[0] for i in exp_files])
            merged_files = []
            step0_status = 'complete'
            for iit, merge_case in enumerate(input_list, start=1):
                all_step0s = []
                for an_input in merge_case:
                    step0_result = get_wfr_out(an_input, 'merge-fastq', my_auth, ['v1'])
                    all_step0s.append((step0_result['status'], step0_result.get('fastq')))
                merge_status = step0_result['status']
                # every input of one merge must report the same run and output
                if len(set(all_step0s)) != 1:
                    print('inconsistent step0 run for input fastq files')
                    # treated like a missing run, so it is repeated if run_wfr
                    merge_status = 'inconsistent run'
                # check if part 0 is run already, if not start the run
                if merge_status == 'complete':
                    merged_files.append(step0_result['fastq'])
                elif merge_status == 'running':
                    step0_status = 'running'
                else:
                    step0_status = 'missing'
                    if run_wfr:
                        # RUN PART 0
                        print('\nstarting step0')
                        inp_f = {'input_fastqs':merge_case}
                        tag = exp_id + '_p' + str(iit)
                        run_missing_wfr(step_settings('merge-fastq', organism, attributions),
                                        inp_f, tag, my_auth, my_env)

            if step0_status != 'complete':
                print('step0', step0_status)
                step1_status = 'not ready'
                continue

            # if completed let's catch up with the variables used in step 1
            # update exp_files and exp_obj so they describe the merged files
            exp_files = [[]]
            exp_obj = [[]]
            for a_merged in merged_files:
                temp_resp = ff_utils.get_metadata(a_merged, my_auth)
                exp_files[0].append(temp_resp['uuid'])
                exp_obj[0].append(temp_resp['display_title'])

        step1_result = get_wfr_out_file(exp_files[0][0], 'encode-atacseq-aln', my_auth, ['1.1.1'])
        print(step1_result['status'])
        if step1_result['status'] == 'complete':
            ta.append(step1_result['atac.first_ta'])
            if add_pc:
                add_preliminary_processed_files(exp_id, [step1_result['atac.first_ta']], my_auth, run_type = 'atac')
        elif step1_result['status'] == 'running':
            step1_status = 'Incomplete'
        else:
            step1_status = 'Incomplete'
            if len(exp_files) > 2:
                print('WARNING More then 2 seq reps in exp')
            if run_wfr:
                print('starting run')
                run_missing_atac1(step_settings('encode-atacseq-aln', organism, attributions),
                                  organism, paired, [exp_files], [exp_obj], my_env, my_auth, run_name,
                                  source=my_source)

    # step2 needs the step1 output of every experiment in the set
    if step1_status != 'Done':
        continue
    # if there are controls, each experiment should have its own single control experiment

    # if there are more than 2 experiments, check the number of biological replicates

    # if there is 1 Biological Replicate
    # -pick best 2 exp

    # if there are 2 Biological replicates
    #  - run mergebed on bioreps with more than 1 technical replicate

    # if there are 3 Biological replicates
    # - if there are 3 total experiments (1 in each biological rep), pick best 2
    # - else, run mergebed on bioreps with more than 1 technical replicate, and pick best 2 biorep

    # if there are 4 or more Biological replicates
    # - run mergebed on bioreps with more than 1 technical replicate, and pick best 2 biorep

    # TA files to hand to step2; None means step2 must not be started from here.
    step2_inputs = ta
    if len(ta) > 2:
        print('more then 2')

        # if hardcoded pick best 2
        if pick_best:
            print('ExperimentSet has bioreps, selecting best 2')
            ta_2 = select_best_2(ta, my_auth)
            print(ta_2)
            step2_inputs = ta_2
            step2_result = get_wfr_out_file(ta_2[0], 'encode-atacseq-postaln', my_auth, ['1.1.1'])
        else:
            # merge bed stuff will come here
            step1_5_result = get_wfr_out_file(ta[0], 'mergebed', my_auth, ['v1'])
            if step1_5_result['status'] != 'complete':
                print('mergebed missing')
                continue
            # prevent step2 running for this cases TEMP
            step2_inputs = None

            mb = step1_5_result['merged_bed']
            step2_result = get_wfr_out_file(mb, 'encode-atacseq-postaln', my_auth, ['1.1.1'])

    else:
        step2_result = get_wfr_out_file(ta[0], 'encode-atacseq-postaln', my_auth, ['1.1.1'])


    if step2_result['status'] == 'complete':
        print('step2 is completed')
        if add_pc:
            add_preliminary_processed_files(a_set['accession'],
                                            [
                                                step2_result['atac.optimal_peak'],
                                                step2_result['atac.conservative_peak'],
                                                step2_result['atac.sig_fc']
                                            ],
                                            my_auth, run_type = 'atac')
        if add_tag:
            # Keep tags that are already on the set; patching a one-element
            # list would replace the whole completed_processes field.
            tags = list(a_set.get('completed_processes', []))
            if atac_tag not in tags:
                tags.append(atac_tag)
            ff_utils.patch_metadata({"completed_processes": tags}, obj_id=a_set['accession'], key=my_auth)

    elif step2_result['status'] == 'running':
        print('step2 is still running')
    else:
        if step2_inputs is None:
            # mergebed case: never call run_missing_atac2 without TA inputs
            print('missing step2, not started automatically for mergebed sets')
        elif run_wfr:
            print('starting run')
            run_missing_atac2(step_settings('encode-atacseq-postaln', organism, attributions),
                              organism, paired, step2_inputs, my_env, my_auth, a_set['accession'])
        else:
            print('missing step2')

In [None]:
# Move files from opc to pc
from dcicutils import ff_utils
from functions.notebook_functions import *
from functions.wfr import *

# Title of the other_processed_files group that is a candidate for the move.
move_title = 'ENCODE ATAC-Seq Pipeline - Preliminary Files'

# set_url = ''
# all_sets = ff_utils.search_metadata(set_url , key=my_auth)
counter = 0
completed = 0
completed_acc = []
for i in all_sets:
    print(i['uuid'])

# Stage 1: sets already tagged as finished by the ATAC pipeline.
ready_sets_1 = []
for tagged_candidate in all_sets:
    if "ENCODE_ATAC_Pipeline_1.1.1" in tagged_candidate.get('completed_processes', []):
        ready_sets_1.append(tagged_candidate)

print(len(ready_sets_1))

# Stage 2: of those, keep sets that carry the preliminary group and whose
# processed_files field is still empty.
ready_sets_2 = []
for a_set in ready_sets_1:
    opf_groups = a_set.get('other_processed_files')
    if not opf_groups:
        continue
    print(a_set['uuid'])
    group_titles = [grp['title'] for grp in opf_groups]
    if move_title not in group_titles:
        continue
    if a_set.get('processed_files'):
        print('WARN' ,a_set['accession'], 'has items in processed files, skipping ')
        continue
    ready_sets_2.append(a_set)
print(len(ready_sets_2), 'items are ready')

In [None]:
# move other processed files to processed files field
# When action is falsy, move_opc_to_pc reports what would move but patches nothing.
action = True
move_title = 'ENCODE ATAC-Seq Pipeline - Preliminary Files'

def move_opc_to_pc(resp, move_title, con_key, status):
    """Move one group of other_processed_files into processed_files.

    The group whose 'title' equals ``move_title`` is removed from the item's
    ``other_processed_files`` and its 'files' become the item's
    ``processed_files``. The portal is only modified when the module-level
    ``action`` flag is truthy.

    Args:
        resp: item metadata dict; reads 'accession', 'uuid' and the optional
            'other_processed_files' / 'processed_files' fields.
        move_title: title of the group to move.
        con_key: passed as the key to ff_utils.patch_metadata and release_files.
        status: passed through to release_files for the moved files.

    Returns:
        True when a group titled ``move_title`` was found (and patched if
        ``action`` is set), False otherwise.

    Raises:
        AssertionError: if more than one group has that title, or the group's
            'type' is not 'preliminary'.
    """
    opc = resp.get('other_processed_files')
    pc = resp.get('processed_files')
    # if processed_files field already has values, exit
    if pc:
        if opc:
            print('There are files in processed_files field, expected empty', resp['accession'])
            return False
        else:
            print('it is possible that move already happened, no opc but pc', resp['accession'])
    # nothing to move without other_processed_files
    if not opc:
        return False

    move_item = [i for i in opc if i.get('title') == move_title]
    if not move_item:
        return False

    print(resp['accession'], 'files will move')
    assert len(move_item) == 1
    assert move_item[0]['type'] == 'preliminary'
    new_pc = move_item[0]['files']
    new_opc = [i for i in opc if i.get('title') != move_title]
    # Time to patch
    patch_data = {}
    add_on = ""
    # if there is something left in opc, patch the remainder (not the original
    # list, which still contains the moved group); if not, delete the field
    if new_opc:
        patch_data['other_processed_files'] = new_opc
    else:
        add_on = 'delete_fields=other_processed_files'
    # patch with processed files
    patch_data['processed_files'] = new_pc
    if action:
        ff_utils.patch_metadata(patch_data, resp['uuid'], key = con_key, add_on = add_on)
        # set the status of the moved files to the requested status
        release_files(resp['uuid'], new_pc, con_key, status = status)
    return True

        
set_w_apf = 0   # experiment sets for which move_opc_to_pc reported a move
exp_w_apf = 0   # experiments for which move_opc_to_pc reported a move
counter = 0

print(len(ready_sets_2), 'experiment sets in scope')
for counter, a_set in enumerate(ready_sets_2, start=1):
    # fetch the set again with frame=raw before handing it to move_opc_to_pc
    set_resp = ff_utils.get_metadata(a_set['uuid'], key=my_auth, add_on='frame=raw')
    print(counter, set_resp['accession'])
    member_exps = set_resp['experiments_in_set']
    if move_opc_to_pc(set_resp, move_title, my_auth, 'released'):
        set_w_apf += 1
        print(set_resp['accession'], 'moved to pc')

    # repeat the same move on every experiment of the set
    for member_exp in member_exps:
        exp_resp = ff_utils.get_metadata(member_exp, key=my_auth, add_on='frame=raw')
        if move_opc_to_pc(exp_resp, move_title, my_auth, 'released'):
            exp_w_apf += 1
            print(exp_resp['accession'], 'moved to pc')
    print()

print(set_w_apf)
print(exp_w_apf)