In [2]:
### This script looks at all files
### 1) Checks deleted files for md5 related fields and clears them
###                         for qc metric, changes the status of the qc metric object to deleted
###                         and clears the qc metric field
###                         for workflows and deletes all of them
### 2) Checks other files for workflows and deletes old workflows
###                                     and deletes problematic ones (status or rev)
### takes around 20 min

from core.utils import Tibanna
from core import ff_utils
from datetime import datetime

# set environment and key/connection
env = 'fourfront-webprod'
tibanna = Tibanna(env=env)

# set the server of the default key to data.4dnucleome.org before opening the connection
tibanna.ff_keys['default']['server'] = 'https://data.4dnucleome.org'
# ff is the connection object handed to every ff_utils call below
ff = ff_utils.fdn_connection(key=tibanna.ff_keys)
# NOTE(review): same assignment as the one made before the connection was created;
# presumably redundant unless fdn_connection modifies the key -- confirm before removing
tibanna.ff_keys['default']['server'] = 'https://data.4dnucleome.org'


def get_wfr_report(wfrs):
    """Build a report for a list of workflow run items.

    For each item the full metadata is fetched by uuid (ff_utils.get_metadata
    over the module-level ``ff`` connection) and the uuid, run status, workflow
    name and revision, creation date, elapsed hours since creation, item status
    and output file values are collected.

    wfrs -- iterable of dicts, each carrying at least a 'uuid' key

    Returns a list of dicts with the keys 'wfr_uuid', 'wfr_status', 'wfr_name',
    'wfr_rev', 'wfr_date', 'run_time', 'status' and 'outputs', sorted by
    creation date and then by workflow name. Runs whose title cannot be read
    are printed as 'ProblematicCase' and left out of the report.
    """
    wfr_report = []
    for wfr_stub in wfrs:
        wfr_data = ff_utils.get_metadata(wfr_stub['uuid'], connection=ff)
        wfr_status = wfr_data['run_status']
        try:
            # the title looks like '<name>[/<rev>] run <...>'; keep the part before ' run '
            wfr_title = wfr_data['title'].split(' run ')[0]
        except (KeyError, AttributeError):
            # missing or non-string title: report it and move on to the next run
            print('ProblematicCase')
            print(wfr_data['uuid'], wfr_data.get('display_title', 'no title'))
            continue
        wfr_time = datetime.strptime(wfr_data['date_created'], '%Y-%m-%dT%H:%M:%S.%f+00:00')
        # hours elapsed since the run was created
        run_hours = (datetime.utcnow() - wfr_time).total_seconds() / 3600
        wfr_name_list = wfr_title.split('/')
        wfr_name = wfr_name_list[0]
        # titles without a '/<rev>' part get revision "0"
        wfr_rev = wfr_name_list[1] if len(wfr_name_list) > 1 else "0"

        # collect the 'value' of every output file entry that has one
        output_uuids = [i['value'] for i in (wfr_data.get('output_files') or []) if i.get('value')]

        wfr_report.append({'wfr_uuid': wfr_data['uuid'],
                           'wfr_status': wfr_status,
                           'wfr_name': wfr_name,
                           'wfr_rev': wfr_rev,
                           'wfr_date': wfr_time,
                           'run_time': run_hours,
                           'status': wfr_data['status'],
                           'outputs': output_uuids})
    return sorted(wfr_report, key=lambda k: (k['wfr_date'], k['wfr_name']))

    
def printTable(myDict, colList=None):
    """Pretty print a list of dictionaries as an aligned text table.

    Based on a recipe by Thierry Husson.

    myDict  -- list of dicts, one per table row
    colList -- keys to show as columns, in order; defaults to the keys of
               the first row (no columns when myDict is empty)

    None values are shown as empty cells; every other value, including falsy
    ones such as 0 or False, is shown through str().
    """
    if not colList:
        colList = list(myDict[0].keys() if myDict else [])
    rows = [colList]  # 1st row = header
    for item in myDict:
        # only None becomes a blank cell, so that 0 / False stay visible
        rows.append([str(item[col]) if item[col] is not None else '' for col in colList])
    # widest cell of each column, header included
    colSize = [max(map(len, col)) for col in zip(*rows)]
    formatStr = ' | '.join(["{{:<{}}}".format(i) for i in colSize])
    rows.insert(1, ['-' * i for i in colSize])  # separating line
    for row in rows:
        print(formatStr.format(*row))
        





In [10]:
# log the start time (UTC); the matching 'finished at' line is printed at the end of the run
print 'started at', datetime.utcnow()
# ask whether items should really be patched; the answer is later compared
# case-insensitively against 'y'/'yes', anything else gives a report-only run
delete_workflows = raw_input("Do you want to delete old workflowruns (if not, only report will be displayed (y/n))")

# accepted workflows, one entry per workflow:
#   [workflow name, accepted revision numbers ('0' if none), acceptable run time (hours)]
# Every name must appear only once: the cleanup loop walks this list entry by
# entry, so a repeated name would make it handle the same runs twice.
workflow_details = [
    ['md5', ['0'], 12],
    ['fastqc-0-11-4-1', ['0', '1'], 12],

    # entries that are currently switched off
    # ['hi-c-processing-parta-juicer', ['25','26'], 150],
    # ['hi-c-processing-partb', ['31', '34', '38'],150],
    # ['hi-c-processing-partc', ['3', '8'], 150],
    # ['add-hic-normvector-to-mcool', ['3'], 150],
    # ['extract-mcool-normvector-for-juicebox', ['100'], 150],
    # ['extract-mcool-normvector-for-juicebox-1', ['1'], 150],
    # ['bwa-mem', ['0'], 150],
    # ['pairsam-parse-sort',['0'],150],
    # ['pairsam-merge',['0'],150],
    # ['pairsam-markasdup',['0'],150],
    # ['pairsam-filter',['0'],150],
    # ['addfragtopairs',['0'],150],
    # ['pairs-patch',['0'],150],
    # ['hi-c-processing-partb set',['0'],150],
    # ['hi-c-processing-partb exp',['0'],150],
    # ['hi-c-processing-partc set',['0'],150],
    # ['hi-c-processing-partc exp',['0'],150],

    ['bwa-mem 0.2.5', ['0'], 50],
    ['pairsqc-single 0.2.5', ['0'], 50],
    ['hi-c-processing-bam 0.2.5', ['0'], 50],
    ['hi-c-processing-pairs 0.2.5', ['0'], 50],
    ['hi-c-processing-pairs-nore 0.2.5', ['0'], 50],
    ['hi-c-processing-pairs-nonorm 0.2.5', ['0'], 50],
]
# just the names, used for membership checks
workflow_names = [i[0] for i in workflow_details]

deleted_wfr_no = 0
files_with_deleted_wfr = 0


run_what = 'Mic'   # Proc or Fastq

file_url = ""
if run_what == 'Proc':
    file_url = '/search/?type=FileProcessed&limit=all&q=date_created%3A%3E%3D2017-09-01'
    files = [i['uuid'] for i in ff_utils.get_metadata(file_url , connection=ff)['@graph']]
    
elif run_what == 'Fastq':
    file_url = '/search/?type=FileFastq&limit=all&q=date_created%3A%3E%3D2016-09-01'
    files = [i['uuid'] for i in ff_utils.get_metadata(file_url , connection=ff)['@graph']]
    # grab these guys, because they accumulate losts of md5 runs
    for a_test in ['4DNFIO67AFHV','4DNFIXH5OV2H', '4DNFI5RQBUKE']:
        files.insert(0,a_test)

elif run_what == 'Mic':
    file_url = '/search/?type=FileMicroscopy&limit=all&q=date_created%3A%3E%3D2016-09-01'
    files = [i for i in ff_utils.get_metadata(file_url , connection=ff)['@graph']]
    

print len(files), 'files in the system'
deleted_wfrs = []
counter = 0
del_md5 = 0
del_qc = 0
deleted_output = 0
for a_file in files:
    counter += 1
    if counter % 100 == 0:
        print counter, files_with_deleted_wfr
    raw_file = a_file
    deleted_wf = False
    wfr_report = []
    wfrs = raw_file.get('workflow_run_inputs')
    
    if not wfrs:
        wfrs_url = '/search/?type=WorkflowRunAwsem&input_files.value.accession='+ a_file['accession']
        wfrs = ff_utils.get_metadata(wfrs_url , connection=ff).get('@graph')
        if len(wfrs)==0:
            print('file has no wfr', a_file['accession'])
    
    if not wfrs:
        continue        
    
    # Delete wfrs if file is deleted
    if raw_file['status'] == 'deleted':
        if delete_workflows.lower() in ['y', 'yes']:
            # clean deleted files of md5 and qc metrics
            for a_field in ['content_md5sum', 'md5sum']:  
                if raw_file.get(a_field):
                    ff_utils.delete_field(raw_file, a_field, connection=ff)
                    del_md5 += 1
            if raw_file.get('quality_metric'):
                qc_uuid = raw_file['quality_metric']['uuid']
                ff_utils.delete_field(raw_file, 'quality_metric', connection=ff)
                # delete quality metrics object
                patch_data = {'status': "deleted"}
                ff_utils.patch_metadata(patch_data, obj_id=qc_uuid ,connection=ff)
                del_qc += 1
        # delete all workflows for deleted files
        if not wfrs:
            continue
        else:
            wfr_report = get_wfr_report(wfrs)
            for wfr_to_del in wfr_report:
                if wfr_to_del['status'] != 'deleted':
                    if wfr_to_del['wfr_name'] not in workflow_names:
                        print('Unlisted Workflow', wfr_to_del['wfr_name'], 'deleted file workflow', wfr_to_del['wfr_uuid'], raw_file['accession'])
                    deleted_wf = True
                    deleted_wfr_no += 1
                    
                    ####################################################
                    ## TEMPORARY PIECE##################################
                    if wfr_to_del['status'] == 'released to project':
                        print('saved from deletion', wfr_to_del['wfr_name'], 'deleted file workflow', wfr_to_del['wfr_uuid'], raw_file['accession'])
                        continue
                    if wfr_to_del['status'] == 'released':
                        print('delete released!!!!!', wfr_to_del['wfr_name'], 'deleted file workflow', wfr_to_del['wfr_uuid'], raw_file['accession'])
                        continue  
                    #####################################################
                    
                    print(wfr_to_del['wfr_name'], 'deleted file workflow', wfr_to_del['wfr_uuid'], raw_file['accession'])
                    if delete_workflows.lower() in ['y', 'yes']:
                        patch_data = {'description': "This workflow run is deleted", 'status': "deleted"}
                        deleted_wfrs.append(wfr_to_del['wfr_uuid'])
                        ff_utils.patch_metadata(patch_data, obj_id=wfr_to_del['wfr_uuid'] ,connection=ff)
                        # delete output files of the deleted workflow run
                        if wfr_to_del['outputs']:
                            for out_file in wfr_to_del['outputs']:
                                deleted_output += 1
                                ff_utils.patch_metadata({'status': "deleted"}, obj_id=out_file ,connection=ff)
       
                
    else:
        # get a report on all workflow_runs
        if not wfrs:
            continue
        else:
            wfr_report = get_wfr_report(wfrs)
            # printTable(wfr_report, ['wfr_name', 'run_time', 'wfr_rev', 'run_time', 'wfr_status'])
            
            # check if any unlisted wfr in report
            my_wfr_names = [i['wfr_name'] for i in wfr_report]
            unlisted = [x for x in my_wfr_names if x not in workflow_names]
            # report the unlisted ones
            #if unlisted:
                #print('Unlisted Workflow', unlisted, 'skipped in', raw_file['accession'])
                    
            for wf_name,accepted_rev,accepted_run_time in workflow_details:
                #for each type of worklow make a list of old ones, and patch status and description
                sub_wfrs = [i for i in wfr_report if i['wfr_name'] == wf_name]
                if sub_wfrs:
                    active_wfr = sub_wfrs[-1]
                    old_wfrs = sub_wfrs [:-1]
                    # check the status of the most recent workflow
                    if active_wfr['wfr_status'] != 'complete':
                        if active_wfr['wfr_status'] in ['running', 'started'] and active_wfr['run_time'] < accepted_run_time:
                            print wf_name,'still running for', a_file
                        else:
                            old_wfrs.append(active_wfr)
                    elif active_wfr['wfr_rev'] not in accepted_rev:
                        old_wfrs.append(active_wfr)
                    if old_wfrs:
                        for wfr_to_del in old_wfrs:
                            if wfr_to_del['status'] != 'deleted':
                                deleted_wf = True
                                deleted_wfr_no += 1 
                                
                                ####################################################
                                ## TEMPORARY PIECE
                                if wfr_to_del['status'] == 'released to project':
                                    print('saved from deletion',wfr_to_del['wfr_name'], 'old style or dub', wfr_to_del['wfr_uuid'], raw_file['accession'])
                                    continue
                                if wfr_to_del['status'] == 'released':
                                    print('delete released????',wfr_to_del['wfr_name'], 'old style or dub', wfr_to_del['wfr_uuid'], raw_file['accession'])
                                    continue
                                ####################################################

                                print(wfr_to_del['wfr_name'], 'old style or dub', wfr_to_del['wfr_uuid'], raw_file['accession'])
                                
                                if delete_workflows.lower() in ['y', 'yes']:
                                    patch_data = {'description': "This workflow run is deleted", 'status': "deleted"}
                                    deleted_wfrs.append(wfr_to_del['wfr_uuid'])
                                    
                                    ff_utils.patch_metadata(patch_data, obj_id=wfr_to_del['wfr_uuid'] ,connection=ff)
                                    # delete output files of the deleted workflow run
                                    if wfr_to_del['outputs']:
                                        for out_file in wfr_to_del['outputs']:
                                            deleted_output += 1
                                            ff_utils.patch_metadata({'status': "deleted"}, obj_id=out_file ,connection=ff)
    if deleted_wf:
        files_with_deleted_wfr += 1

if delete_workflows.lower() in ['y', 'yes']:
    print str(deleted_wfr_no),"workflowruns from", str(files_with_deleted_wfr), "files deleted"
else:
    print str(deleted_wfr_no),"workflowruns from", str(files_with_deleted_wfr), "files need to be deleted"

print len(deleted_wfrs)
print del_md5, 'md5 fields deleted'
print del_qc, 'qc metrics deleted'
print deleted_output, 'deleted output files'
print 'finished at', datetime.utcnow()

started at 2018-05-11 16:19:40.933838
Do you want to delete old workflowruns (if not, only report will be displayed (y/n))y
1414 files in the system
('file has no wfr', u'4DNFIXAGUQFQ')
('file has no wfr', u'4DNFII5APXQO')
(u'md5', 'old style or dub', u'c7913775-2b10-49b7-9b21-19dbe8d4194a', u'4DNFIZG7QF4Q')
(u'md5', 'old style or dub', u'989d84cd-8e39-4897-8679-9b6d96225ea1', u'4DNFIH5N3YET')
(u'md5', 'old style or dub', u'cf2b944d-aef9-4548-968b-d3ce975ff52f', u'4DNFIQSUXNYZ')
(u'md5', 'old style or dub', u'3634dde0-4951-4c4b-942c-5b57eb8e8974', u'4DNFIEL57XRH')
(u'md5', 'old style or dub', u'427ea9a9-344f-43ba-82da-20fabdb2cdc7', u'4DNFIUJUPBT6')
(u'md5', 'old style or dub', u'cc6de668-9c88-4e2c-b3e0-dec8b973eade', u'4DNFIH2OCAZE')
(u'md5', 'old style or dub', u'5820dba6-14a7-4331-a5dc-8eab14a7750b', u'4DNFIWQOGIOF')
(u'md5', 'old style or dub', u'9b889b36-22e5-4137-abe2-0e7d97e4f895', u'4DNFIWJSSUON')
(u'md5', 'old style or dub', u'e44a836d-bfb1-49c9-b94f-b391f3a2e77b', u'4DNFIW5AD

('file has no wfr', u'4DNFIVWMRIUU')
('file has no wfr', u'4DNFIAUQ639K')
('file has no wfr', u'4DNFI8H6S9HI')
('file has no wfr', u'4DNFIK44V47A')
('file has no wfr', u'4DNFIXZLB2CX')
900 49
('file has no wfr', u'4DNFI41PBPYY')
('file has no wfr', u'4DNFIDIDIVS8')
('file has no wfr', u'4DNFIUXJBL6Q')
('file has no wfr', u'4DNFIT7ZZ7EM')
('file has no wfr', u'4DNFIUVZ2GM9')
('file has no wfr', u'4DNFIXPRJ7CY')
('file has no wfr', u'4DNFI22MOEZU')
('file has no wfr', u'4DNFIHEF52EK')
('file has no wfr', u'4DNFI9Y189XI')
('file has no wfr', u'4DNFI74TXZEI')
('file has no wfr', u'4DNFISUOIRPH')
('file has no wfr', u'4DNFIQKFQ5PZ')
('file has no wfr', u'4DNFIU8ESY12')
('file has no wfr', u'4DNFI72M8B4V')
('file has no wfr', u'4DNFIB9JPWCI')
('file has no wfr', u'4DNFIXQO36NR')
('file has no wfr', u'4DNFIKPZT4K5')
('file has no wfr', u'4DNFIGU4FR5U')
('file has no wfr', u'4DNFIQUJVCEL')
('file has no wfr', u'4DNFI4H6VSR6')
('file has no wfr', u'4DNFIN7RGH1P')
('file has no wfr', u'4DNFILKWQ

('file has no wfr', u'4DNFI3VJ9333')
('file has no wfr', u'4DNFIVJZIMGZ')
('file has no wfr', u'4DNFIMZI3IV2')
('file has no wfr', u'4DNFIE2KNOZS')
('file has no wfr', u'4DNFIS7BDP4E')
('file has no wfr', u'4DNFIUYO175L')
('file has no wfr', u'4DNFIF8QP9PK')
('file has no wfr', u'4DNFIAB4YY4J')
('file has no wfr', u'4DNFI7UVVM1O')
('file has no wfr', u'4DNFIGLDFGCV')
('file has no wfr', u'4DNFITI29773')
('file has no wfr', u'4DNFI6VK159S')
('file has no wfr', u'4DNFID5WO9AO')
('file has no wfr', u'4DNFIWGZE1UI')
('file has no wfr', u'4DNFIPI6PN9V')
('file has no wfr', u'4DNFIM6JGCIE')
('file has no wfr', u'4DNFIJQ8J6IU')
('file has no wfr', u'4DNFIN2BX7NK')
('file has no wfr', u'4DNFIGDSFDJM')
('file has no wfr', u'4DNFIXD1GBLS')
('file has no wfr', u'4DNFIPRL3UP7')
('file has no wfr', u'4DNFIR47ZPBE')
('file has no wfr', u'4DNFIYG9PQ6P')
('file has no wfr', u'4DNFI89LTFGK')
('file has no wfr', u'4DNFIKG5QDFY')
('file has no wfr', u'4DNFIOAMX67K')
('file has no wfr', u'4DNFIK3V79YB')
(

('file has no wfr', u'4DNFI43WSBO6')
('file has no wfr', u'4DNFI18K587V')
('file has no wfr', u'4DNFIUHPBSRQ')
('file has no wfr', u'4DNFIBTJ1LPS')
('file has no wfr', u'4DNFIZUMUI7T')
('file has no wfr', u'4DNFIEJT82AA')
('file has no wfr', u'4DNFIZU1R7PJ')
('file has no wfr', u'4DNFIRKC3ZVV')
('file has no wfr', u'4DNFIUS1HTDR')
('file has no wfr', u'4DNFIXAQJSHR')
('file has no wfr', u'4DNFIP5HS1R9')
('file has no wfr', u'4DNFIBM27NOP')
('file has no wfr', u'4DNFI6NGACCJ')
('file has no wfr', u'4DNFI5Q7RA45')
('file has no wfr', u'4DNFII3F4JFS')
('file has no wfr', u'4DNFIIGS4LCW')
('file has no wfr', u'4DNFIA4I367I')
('file has no wfr', u'4DNFIGGW4TQ6')
('file has no wfr', u'4DNFIO5JVZJC')
('file has no wfr', u'4DNFIHS68XBD')
('file has no wfr', u'4DNFIXLHES13')
('file has no wfr', u'4DNFIU9JAHD8')
('file has no wfr', u'4DNFICM14NVM')
('file has no wfr', u'4DNFITNZJ6JM')
('file has no wfr', u'4DNFIPWVVWUO')
('file has no wfr', u'4DNFIJOJSEF4')
('file has no wfr', u'4DNFI1IQRP74')
(