In [1]:
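### This script looks at all processed files ('files-processed' search).
### 1) For files whose status is deleted it:
###      - clears the md5 related fields (content_md5sum, md5sum)
###      - clears the quality_metric field and sets the status of the
###        linked qc metric object to deleted
###      - deletes all of the workflow runs that use the file as input
### 2) For all other files it checks the workflow runs and deletes
###      - old workflow runs (everything but the most recent run of each workflow)
###      - problematic ones (bad run status or unaccepted revision)
### Takes around 20 min.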

from core.utils import Tibanna
from core import ff_utils
from datetime import datetime

# Set the environment and build the key/connection objects.
# `ff` is the connection handed to every ff_utils call further down.
# NOTE(review): the env name suggests this targets production data --
# confirm before answering "y" to the deletion prompt.
env = 'fourfront-webprod'
tibanna = Tibanna(env=env)
ff = ff_utils.fdn_connection(key=tibanna.ff_keys)

def get_wfr_report(wfrs):
    """Build a date-sorted summary of the given workflow run items.

    Every entry of ``wfrs`` is fetched again by uuid through
    ``ff_utils.get_metadata`` (using the module-level ``ff`` connection) and
    reduced to a small dictionary.  The workflow name and revision are taken
    from the run's ``title``: the text before ``' run '`` is split on ``'/'``;
    the first piece is the name and the second, when present, the revision
    (``"0"`` otherwise).

    Runs whose title is missing or is not a string are reported on stdout
    (``ProblematicCase``) and left out of the result.

    Args:
        wfrs: iterable of dicts, each holding at least a ``'uuid'`` key.

    Returns:
        A list of dicts sorted by ``(wfr_date, wfr_name)`` with the keys
        ``wfr_uuid``, ``wfr_status`` (the item's ``run_status``),
        ``wfr_name``, ``wfr_rev``, ``wfr_date`` (datetime parsed from
        ``date_created``), ``run_time`` (hours elapsed since ``wfr_date``),
        ``status`` (the item's ``status``) and ``outputs`` (list of the
        truthy ``value`` entries of ``output_files``).

    Raises:
        KeyError: if a fetched item lacks ``run_status``, ``date_created``,
            ``status`` or ``uuid``.
        ValueError: if ``date_created`` does not match the expected format.
    """
    wfr_report = []
    for wfr_stub in wfrs:
        wfr_data = ff_utils.get_metadata(wfr_stub['uuid'], connection=ff)
        run_status = wfr_data['run_status']

        # Only a missing or non-string title makes a run "problematic";
        # any other error should surface instead of being swallowed.
        try:
            wfr_name_list = wfr_data['title'].split(' run ')[0].split('/')
        except (KeyError, AttributeError):
            print('ProblematicCase')
            print(wfr_data['uuid'], wfr_data.get('display_title', 'no title'))
            continue
        wfr_name = wfr_name_list[0]
        wfr_rev = wfr_name_list[1] if len(wfr_name_list) > 1 else "0"

        # date_created is parsed as a naive datetime and compared against
        # utcnow(); the format string expects a literal "+00:00" suffix.
        wfr_time = datetime.strptime(wfr_data['date_created'],
                                     '%Y-%m-%dT%H:%M:%S.%f+00:00')
        run_hours = (datetime.utcnow() - wfr_time).total_seconds() / 3600

        # Keep only output entries that actually carry a value.
        output_uuids = [out['value']
                        for out in (wfr_data.get('output_files') or [])
                        if out.get('value')]

        wfr_report.append({'wfr_uuid': wfr_data['uuid'],
                           'wfr_status': run_status,
                           'wfr_name': wfr_name,
                           'wfr_rev': wfr_rev,
                           'wfr_date': wfr_time,
                           'run_time': run_hours,
                           'status': wfr_data['status'],
                           'outputs': output_uuids})
    # Oldest run first, so callers can treat the last entry as most recent.
    return sorted(wfr_report, key=lambda k: (k['wfr_date'], k['wfr_name']))

    
def printTable(myDict, colList=None):
    """Pretty print a list of dictionaries as an aligned text table.

    Based on a recipe by Thierry Husson.

    Args:
        myDict: list of dicts, one per table row.  Every dict must contain
            each key listed in ``colList``.
        colList: ordered list of keys to print as columns.  When omitted or
            empty, the keys of the first row are used (no columns at all if
            there are no rows).

    Output (on stdout): a header line, a dashed separator line, then one
    line per row, with columns left-aligned and joined by ``' | '``.
    ``None`` values are shown as empty cells; every other value, including
    ``0`` and ``False``, is shown through ``str()``.
    """
    if not colList:
        colList = list(myDict[0].keys() if myDict else [])
    table = [colList]  # 1st row = header
    for item in myDict:
        # Blank out only None; a plain `value or ''` would also hide 0/False.
        table.append(['' if item[col] is None else str(item[col])
                      for col in colList])
    # Each column is as wide as its longest cell (header included).
    colSize = [max(map(len, col)) for col in zip(*table)]
    formatStr = ' | '.join(["{{:<{}}}".format(i) for i in colSize])
    table.insert(1, ['-' * i for i in colSize])  # separating line
    for line in table:
        print(formatStr.format(*line))
        


In [2]:
# Log the start time, then ask whether to actually patch anything.
# Any answer other than "y"/"yes" (case-insensitive, checked further down)
# leaves the run in report-only mode.
print 'started at', datetime.utcnow()
delete_workflows = raw_input("Do you want to delete old workflowruns (if not, only report will be displayed (y/n))")

# What kind of files should be searched for workflow run inputs; use URL
# compatible naming.
# NOTE(review): this presumably refers to the 'files-processed' search used
# when the file list is built below -- confirm.

# Accepted workflows.
# Each entry is:
#   [workflow name, accepted revision numbers ('0' if none), acceptable run time (hours)]
# The name is compared with the 'wfr_name' produced by get_wfr_report, the
# revisions with 'wfr_rev', and the run time with 'run_time'.
workflow_details = [
                    ['md5', ['0'], 24],
                    ['fastqc-0-11-4-1', ['0', '1'], 24],
                    ['hi-c-processing-parta-juicer', ['25','26'], 150],
                    ['hi-c-processing-partb', ['31', '34', '38'],150],
                    ['hi-c-processing-partc', ['3', '8'], 150],
                    ['add-hic-normvector-to-mcool', ['3'], 150],
                    ['extract-mcool-normvector-for-juicebox', ['100'], 150],
                    ['extract-mcool-normvector-for-juicebox-1', ['1'], 150],
                    ['bwa-mem', ['0'], 150],
                    ['pairsam-parse-sort',['0'],150],
                    ['pairsam-merge',['0'],150],
                    ['pairsam-markasdup',['0'],150],
                    ['pairsam-filter',['0'],150],
                    ['addfragtopairs',['0'],150],
                    ['pairs-patch',['0'],150],
                    ['hi-c-processing-partb set',['0'],150],
                    ['hi-c-processing-partb exp',['0'],150],
                    ['hi-c-processing-partc set',['0'],150],
                    ['hi-c-processing-partc exp',['0'],150],
                    ['bwa-mem 0.2.5', ['0'],150],
                    ['pairsqc-single 0.2.5', ['0'],150],
                    ['hi-c-processing-bam 0.2.5', ['0'],150],  
                    ['hi-c-processing-pairs 0.2.5', ['0'],150],
                   ]
# Just the names, used to spot workflow runs that are not listed above.
workflow_names = [i[0] for i in workflow_details]

# Running totals reported at the end of the run.
deleted_wfr_no = 0          # workflow runs flagged for deletion
files_with_deleted_wfr = 0  # files that had at least one flagged workflow run

# uuids of all items returned by the 'files-processed' search.
files = [i['uuid'] for i in ff_utils.get_metadata('files-processed' , connection=ff)['@graph']]

# Debugging aid: uncomment to run on a fixed subset of files instead.
# #files = [u'311b0cbc-079e-4fd3-bd95-17df5e838a25', u'3f7c6c9b-741f-4a89-947b-d65b7ed77f81', 
#                 u'd7d771ba-a273-440e-9f73-41fb93005307', u'c05e8866-35de-4488-95de-af9fe6bfada6',
#                 u'73e64de4-b773-46f3-a6fa-50e14815a076', u'e2cc7cae-6184-487a-b70c-6c6beb7f1ee5', 
#                 u'cd8385a5-4ab7-467e-8cd2-1fa9c24413d5', u'2db2d085-f1f0-4e78-a0ba-ce7faa60adbb', 
#                 u'3c22e002-d69b-4c78-9a50-4cc85417daa3', u'61dafc84-7991-4542-a3f8-645adee368af', 
#                 u'e3ba43cc-428f-4289-891d-96f698fbf7a5', u'0e449082-98d0-4c84-abb0-b49b661467dc', 
#                 u'5fb57fab-f2a7-4fac-a6bd-fd2ce0e28eb2', u'695212e9-64a7-4d55-a29f-f7895108bd4f']

print len(files), 'files in the system'
deleted_wfrs = []   # uuids of workflow runs actually patched to deleted
counter = 0         # number of files processed so far (progress output)
del_md5 = 0         # md5 related fields removed from deleted files
del_qc = 0          # quality metric objects set to deleted
deleted_output = 0  # output files of deleted workflow runs set to deleted
# Main pass: inspect every file, report the workflow runs that should go and,
# when the user confirmed with y/yes, patch them (and their outputs) to deleted.
# Note: the print(...) calls with several arguments run under Python 2 here,
# so they print a tuple.
for a_file in files:
    counter += 1
    # progress line every 100 files: files done, files with flagged runs
    if counter % 100 == 0:
        print counter, files_with_deleted_wfr
    raw_file = ff_utils.get_metadata(a_file, connection = ff, frame='embedded')
    deleted_wf = False  # set once any run of this file is flagged
    wfr_report = []
    # workflow runs that list this file as an input (None if absent)
    wfrs = raw_file.get('workflow_run_inputs')

    # Delete wfrs if file is deleted
    if raw_file['status'] == 'deleted':
        if delete_workflows.lower() in ['y', 'yes']:
            # clean deleted files of md5 and qc metrics
            for a_field in ['content_md5sum', 'md5sum']:  
                if raw_file.get(a_field):
                    ff_utils.delete_field(raw_file, a_field, connection=ff)
                    del_md5 += 1
            if raw_file.get('quality_metric'):
                # remember the uuid before the field is removed from the file
                qc_uuid = raw_file['quality_metric']['uuid']
                ff_utils.delete_field(raw_file, 'quality_metric', connection=ff)
                # delete quality metrics object
                patch_data = {'status': "deleted"}
                ff_utils.patch_metadata(patch_data, obj_id=qc_uuid ,connection=ff)
                del_qc += 1
        # delete all workflows for deleted files
        if not wfrs:
            # nothing to do; deleted_wf is still False so skipping the
            # bookkeeping at the bottom of the loop loses nothing
            continue
        else:
            wfr_report = get_wfr_report(wfrs)
            for wfr_to_del in wfr_report:
                # runs that are already deleted are ignored
                if wfr_to_del['status'] != 'deleted':
                    # unlisted workflows are reported but still handled below
                    if wfr_to_del['wfr_name'] not in workflow_names:
                        print('Unlisted Workflow', wfr_to_del['wfr_name'], 'deleted file workflow', wfr_to_del['wfr_uuid'], raw_file['accession'])
                    # NOTE(review): the counters are bumped before the
                    # "released" checks below, so runs that are skipped
                    # there are still included in the totals -- confirm
                    # that this is intended.
                    deleted_wf = True
                    deleted_wfr_no += 1

                    ####################################################
                    ## TEMPORARY PIECE: runs with a released status are
                    ## only reported, never patched
                    if wfr_to_del['status'] == 'released to project':
                        print('saved from deletion', wfr_to_del['wfr_name'], 'deleted file workflow', wfr_to_del['wfr_uuid'], raw_file['accession'])
                        continue
                    if wfr_to_del['status'] == 'released':
                        print('delete released!!!!!', wfr_to_del['wfr_name'], 'deleted file workflow', wfr_to_del['wfr_uuid'], raw_file['accession'])
                        continue  
                    #####################################################

                    print(wfr_to_del['wfr_name'], 'deleted file workflow', wfr_to_del['wfr_uuid'], raw_file['accession'])
                    if delete_workflows.lower() in ['y', 'yes']:
                        patch_data = {'description': "This workflow run is deleted", 'status': "deleted"}
                        deleted_wfrs.append(wfr_to_del['wfr_uuid'])
                        ff_utils.patch_metadata(patch_data, obj_id=wfr_to_del['wfr_uuid'] ,connection=ff)
                        # delete output files of the deleted workflow run
                        if wfr_to_del['outputs']:
                            for out_file in wfr_to_del['outputs']:
                                deleted_output += 1
                                ff_utils.patch_metadata({'status': "deleted"}, obj_id=out_file ,connection=ff)


    else:
        # get a report on all workflow_runs
        if not wfrs:
            continue
        else:
            wfr_report = get_wfr_report(wfrs)
            # printTable(wfr_report, ['wfr_name', 'run_time', 'wfr_rev', 'run_time', 'wfr_status'])

            # check if any unlisted wfr in report
            my_wfr_names = [i['wfr_name'] for i in wfr_report]
            unlisted = [x for x in my_wfr_names if x not in workflow_names]
            if unlisted:
                print('Unlisted Workflow', unlisted, 'skipped in', raw_file['accession'])

            for wf_name,accepted_rev,accepted_run_time in workflow_details:
                # for each type of workflow make a list of old ones, and patch status and description
                sub_wfrs = [i for i in wfr_report if i['wfr_name'] == wf_name]
                if sub_wfrs:
                    # the report is sorted by date, so the last entry is the
                    # most recent run; everything before it counts as old
                    active_wfr = sub_wfrs[-1]
                    old_wfrs = sub_wfrs [:-1]
                    # check the status of the most recent workflow
                    if active_wfr['wfr_status'] != 'complete':
                        # a run still within its accepted run time is kept
                        if active_wfr['wfr_status'] in ['running', 'started'] and active_wfr['run_time'] < accepted_run_time:
                            print wf_name,'still running for', a_file
                        else:
                            # any other incomplete state, or a run past its
                            # accepted run time, is treated like an old run
                            old_wfrs.append(active_wfr)
                    elif active_wfr['wfr_rev'] not in accepted_rev:
                        # complete, but produced by a revision that is not accepted
                        old_wfrs.append(active_wfr)
                    if old_wfrs:
                        for wfr_to_del in old_wfrs:
                            if wfr_to_del['status'] != 'deleted':
                                # NOTE(review): as in the deleted-file branch,
                                # the counters are bumped before the
                                # "released" checks below -- confirm.
                                deleted_wf = True
                                deleted_wfr_no += 1 

                                ####################################################
                                ## TEMPORARY PIECE: runs with a released status
                                ## are only reported, never patched
                                if wfr_to_del['status'] == 'released to project':
                                    print('saved from deletion',wfr_to_del['wfr_name'], 'old style or dub', wfr_to_del['wfr_uuid'], raw_file['accession'])
                                    continue
                                if wfr_to_del['status'] == 'released':
                                    print('delete released????',wfr_to_del['wfr_name'], 'old style or dub', wfr_to_del['wfr_uuid'], raw_file['accession'])
                                    continue
                                ####################################################

                                print(wfr_to_del['wfr_name'], 'old style or dub', wfr_to_del['wfr_uuid'], raw_file['accession'])

                                if delete_workflows.lower() in ['y', 'yes']:
                                    patch_data = {'description': "This workflow run is deleted", 'status': "deleted"}
                                    deleted_wfrs.append(wfr_to_del['wfr_uuid'])

                                    ff_utils.patch_metadata(patch_data, obj_id=wfr_to_del['wfr_uuid'] ,connection=ff)
                                    # delete output files of the deleted workflow run
                                    if wfr_to_del['outputs']:
                                        for out_file in wfr_to_del['outputs']:
                                            deleted_output += 1
                                            ff_utils.patch_metadata({'status': "deleted"}, obj_id=out_file ,connection=ff)
    # count the file once, however many of its runs were flagged
    if deleted_wf:
        files_with_deleted_wfr += 1

# Final summary. The wording depends on whether patches were applied
# (y/yes) or this was a report-only run.
if delete_workflows.lower() in ['y', 'yes']:
    print str(deleted_wfr_no),"workflowruns from", str(files_with_deleted_wfr), "files deleted"
else:
    print str(deleted_wfr_no),"workflowruns from", str(files_with_deleted_wfr), "files need to be deleted"

# number of workflow runs actually patched (stays 0 in report-only mode)
print len(deleted_wfrs)
print del_md5, 'md5 fields deleted'
print del_qc, 'qc metrics deleted'
print deleted_output, 'deleted output files'
print 'finished at', datetime.utcnow()

started at 2018-03-02 19:26:57.990295
Do you want to delete old workflowruns (if not, only report will be displayed (y/n))
4942 files in the system
100 0
pairsqc-single 0.2.5 still running for 8b87bcdf-5b0b-4df2-b767-89ff8789120d
pairsqc-single 0.2.5 still running for 41335b9f-6123-4f32-89a1-09c77dadf922
pairsqc-single 0.2.5 still running for d14e2179-e383-4464-9327-5b529183648b
pairsqc-single 0.2.5 still running for 34aa42ca-1d51-4ca2-b9d2-ec3d6f0004e0
200 0
pairsqc-single 0.2.5 still running for be07af4f-4cc2-4a73-9f21-cc5d573bd644
pairsqc-single 0.2.5 still running for c604d8ca-3aa6-4317-8b87-0668e294762b
(u'pairsqc-single 0.2.5', 'old style or dub', u'4b91a98a-91f9-4b32-a51d-56b5165b0e8e', u'4DNFIMUQ1JN9')
pairsqc-single 0.2.5 still running for df20051a-2e42-4a4f-9ca0-08444ba9d57a
pairsqc-single 0.2.5 still running for 48d1fdc7-5ea7-4922-a5c4-32ecd386f055
pairsqc-single 0.2.5 still running for de9d73a3-3f36-401f-b3b3-cfd073ea6608
hi-c-processing-pairs 0.2.5 still running for 481cb5

('delete released????', u'hi-c-processing-pairs 0.2.5', 'old style or dub', u'a72bb11a-ceda-4a04-badd-93d1dedb2ee2', u'4DNFIWSXKIRR')
('delete released????', u'hi-c-processing-pairs 0.2.5', 'old style or dub', u'6fe81d5d-97a3-4204-92bf-294c1ba88fff', u'4DNFI1RTJOBJ')
('delete released????', u'hi-c-processing-pairs 0.2.5', 'old style or dub', u'efc75a16-06b2-4a86-9b3e-a7e6719bb38e', u'4DNFIZZO8TNU')
('delete released????', u'hi-c-processing-pairs 0.2.5', 'old style or dub', u'133de575-6910-4145-9971-a1c78eb8f835', u'4DNFIZZO8TNU')
(u'hi-c-processing-pairs 0.2.5', 'old style or dub', u'2cf0475c-6f35-4d6f-87cc-d35873f814eb', u'4DNFIN1XTP8M')
('delete released????', u'hi-c-processing-pairs 0.2.5', 'old style or dub', u'48a4096d-38eb-4a8e-92c9-f99092235b78', u'4DNFIS5PRTL5')
('delete released????', u'hi-c-processing-pairs 0.2.5', 'old style or dub', u'89d18cac-8274-497e-ae30-778ec10e7329', u'4DNFIS5PRTL5')
('delete released????', u'hi-c-processing-pairs 0.2.5', 'old style or dub', u'efc75a1

(u'hi-c-processing-bam 0.2.5', 'old style or dub', u'75dc6414-4401-4c5c-bc83-9ef008c20c19', u'4DNFIH938QN6')
(u'hi-c-processing-bam 0.2.5', 'old style or dub', u'7f2db0ab-7d7e-4dbf-9aea-9c92ae576caa', u'4DNFIH938QN6')
(u'hi-c-processing-bam 0.2.5', 'old style or dub', u'cbba5715-2f04-460a-aa4a-b35bfb4b02a0', u'4DNFIH938QN6')
(u'hi-c-processing-bam 0.2.5', 'old style or dub', u'17b61078-d14e-451b-abb0-e9a90c3aacb3', u'4DNFIH938QN6')
(u'hi-c-processing-bam 0.2.5', 'old style or dub', u'a9f9825c-8731-438e-bb04-21395c178bf4', u'4DNFIH938QN6')
(u'hi-c-processing-bam 0.2.5', 'old style or dub', u'75dc6414-4401-4c5c-bc83-9ef008c20c19', u'4DNFIGYXTP9P')
(u'hi-c-processing-bam 0.2.5', 'old style or dub', u'7f2db0ab-7d7e-4dbf-9aea-9c92ae576caa', u'4DNFIGYXTP9P')
(u'hi-c-processing-bam 0.2.5', 'old style or dub', u'cbba5715-2f04-460a-aa4a-b35bfb4b02a0', u'4DNFIGYXTP9P')
(u'hi-c-processing-bam 0.2.5', 'old style or dub', u'17b61078-d14e-451b-abb0-e9a90c3aacb3', u'4DNFIGYXTP9P')
(u'hi-c-processing-

(u'hi-c-processing-bam 0.2.5', 'old style or dub', u'0f3821ab-9341-4227-aa15-7892a4c6448a', u'4DNFIH9ENEQH')
(u'hi-c-processing-bam 0.2.5', 'old style or dub', u'0f3821ab-9341-4227-aa15-7892a4c6448a', u'4DNFILVVL953')
(u'hi-c-processing-bam 0.2.5', 'old style or dub', u'0f3821ab-9341-4227-aa15-7892a4c6448a', u'4DNFIRPX427O')
(u'hi-c-processing-bam 0.2.5', 'old style or dub', u'0f3821ab-9341-4227-aa15-7892a4c6448a', u'4DNFIGK3BJXZ')
(u'hi-c-processing-bam 0.2.5', 'old style or dub', u'0f3821ab-9341-4227-aa15-7892a4c6448a', u'4DNFIXABWO7M')
(u'hi-c-processing-bam 0.2.5', 'old style or dub', u'48c8aee3-887b-4694-b116-f8d4380d5133', u'4DNFI5CIE2GL')
hi-c-processing-bam 0.2.5 still running for c2a92d9c-1822-4f32-97ff-72679007acc1
hi-c-processing-bam 0.2.5 still running for 819f33ca-f292-420c-82e9-8dc117743806
hi-c-processing-bam 0.2.5 still running for b2715531-4dff-4d87-8bd1-34ea8b3b26bc
hi-c-processing-bam 0.2.5 still running for d490c532-745b-4000-8e09-d8bfc3728d36
hi-c-processing-bam 0.

('saved from deletion', u'hi-c-processing-partb', 'old style or dub', u'0970eba6-2be9-4a7d-a48a-75fea1742b76', u'4DNFI5H0U2IB')
('saved from deletion', u'hi-c-processing-partb', 'old style or dub', u'0970eba6-2be9-4a7d-a48a-75fea1742b76', u'4DNFIKKZBHI1')
('saved from deletion', u'hi-c-processing-partb', 'old style or dub', u'0970eba6-2be9-4a7d-a48a-75fea1742b76', u'4DNFILJJ6PMI')
('saved from deletion', u'hi-c-processing-partb', 'old style or dub', u'0970eba6-2be9-4a7d-a48a-75fea1742b76', u'4DNFIC6HGO0T')
('saved from deletion', u'hi-c-processing-partb', 'old style or dub', u'0970eba6-2be9-4a7d-a48a-75fea1742b76', u'4DNFIASXKH2N')
('saved from deletion', u'hi-c-processing-partb', 'old style or dub', u'0970eba6-2be9-4a7d-a48a-75fea1742b76', u'4DNFIVU1YKV3')
3800 185
3900 185
4000 185
4100 185
4200 185
4300 185
4400 185
4500 185
4600 185
4700 185
4800 185
4900 185
224 workflowruns from 185 files need to be deleted
0
0 md5 fields deleted
0 qc metrics deleted
0 deleted output files
finish