Commit

Merge 07a6531 into bbd22e5
SooLee committed May 2, 2019
2 parents bbd22e5 + 07a6531 commit 096c095
Showing 3 changed files with 169 additions and 29 deletions.
20 changes: 18 additions & 2 deletions chalicelib/check_setup.json
@@ -313,8 +313,24 @@
}
}
},
"identify_files_without_qc_summary": {
"title": "Files without quality metric summary",
"identify_files_without_qc_summary_pairs": {
"title": "Pairs files without quality metric summary",
"group": "Metadata checks",
"schedule": {
"morning_checks": {
"data": {
"kwargs": {"primary": true},
"dependencies": []
},
"webdev": {
"kwargs": {"primary": true},
"dependencies": []
}
}
}
},
"identify_files_without_qc_summary_bb": {
"title": "Bigbed files without quality metric summary",
"group": "Metadata checks",
"schedule": {
"morning_checks": {
72 changes: 60 additions & 12 deletions chalicelib/checks/helpers/qc_utils.py
@@ -1,38 +1,86 @@
from dcicutils import ff_utils


def calculate_qc_metric_pairsqc(file_uuid, key):
    '''Patch a pairs file object with a quality_metric summary derived from its linked quality metric'''
res = ff_utils.get_metadata(file_uuid, key=key)
qc_uuid = res['quality_metric']['uuid']
quality_metric = ff_utils.get_metadata(qc_uuid, key=key)
qc_summary = []

def percent(numVal):
'''convert to percentage of Total reads'''
return round((numVal / quality_metric['Total reads']) * 100 * 1000) / 1000

    def million(numVal):
        '''format a read count in millions, e.g. 1234567 -> "1.23m"'''
        return str(round(numVal / 10000) / 100) + "m"

def tooltip(numVal):
return "Percent of total reads (=%s)" % million(numVal)
qc_summary.append({"title": "Filtered Reads",
"value": str(quality_metric["Total reads"]),
"numberType": "integer"})
"value": str(quality_metric["Total reads"]),
"numberType": "integer"})
qc_summary.append({"title": "Cis reads (>20kb)",
"value": str(percent(quality_metric["Cis reads (>20kb)"])),
"tooltip": tooltip(quality_metric["Cis reads (>20kb)"]),
"numberType": "percent"})
"value": str(percent(quality_metric["Cis reads (>20kb)"])),
"tooltip": tooltip(quality_metric["Cis reads (>20kb)"]),
"numberType": "percent"})
qc_summary.append({"title": "Short cis reads",
"value": str(percent(quality_metric["Short cis reads (<20kb)"])),
"tooltip": tooltip(quality_metric["Short cis reads (<20kb)"]),
"numberType": "percent"})
"value": str(percent(quality_metric["Short cis reads (<20kb)"])),
"tooltip": tooltip(quality_metric["Short cis reads (<20kb)"]),
"numberType": "percent"})
qc_summary.append({"title": "Trans Reads",
"value": str(percent(quality_metric["Trans reads"])),
"tooltip": tooltip(quality_metric["Trans reads"]),
"numberType": "percent"})
"value": str(percent(quality_metric["Trans reads"])),
"tooltip": tooltip(quality_metric["Trans reads"]),
"numberType": "percent"})
res = ff_utils.patch_metadata({'quality_metric_summary': qc_summary}, file_uuid, key=key)
return res


def parse_formatstr(file_format_str):
if not file_format_str:
return None
return file_format_str.replace('/file-formats/', '').replace('/', '')


def calculate_qc_metric_atacseq_bb(file_uuid, key, patch=False):
    '''Build a quality_metric summary for a peak-call bigbed file from the ATAC-seq/ChIP-seq pipeline; patch the file only when patch=True'''
res = ff_utils.get_metadata(file_uuid, key=key)
if 'quality_metric' not in res:
return
qc_uuid = res['quality_metric']['uuid']
quality_metric = ff_utils.get_metadata(qc_uuid, key=key)
if 'overlap_reproducibility_qc' not in quality_metric:
return
if 'idr_reproducibility_qc' in quality_metric:
qc_method = 'idr'
else:
qc_method = 'overlap'
qc_summary = []

    def million(numVal):
        '''format a read count in millions, e.g. 1234567 -> "1.23m"'''
        return str(round(numVal / 10000) / 100) + "m"

def tooltip(numVal):
return "Percent of total reads (=%s)" % million(numVal)

    def round2(numVal):
        '''round to 2 decimal places'''
        return round(numVal * 100) / 100

opt_set = quality_metric[qc_method + "_reproducibility_qc"]["opt_set"]
qc_summary.append({"title": "Optimal Peaks",
"value": str(quality_metric[qc_method + "_reproducibility_qc"]["N_opt"]),
"numberType": "integer"})
qc_summary.append({"title": "Rescue Ratio",
"tooltip": "Ratio of number of peaks (Nt) relative to peak calling based" +
" on psuedoreplicates (Np) [max(Np,Nt) / min (Np,Nt)]",
"value": str(round2(quality_metric[qc_method + "_reproducibility_qc"]["rescue_ratio"])),
"numberType": "float"})
qc_summary.append({"title": "Self Consistency Ratio",
"tooltip": "Ratio of number of peaks in two replicates [max(N1,N2) / min (N1,N2)]",
"value": str(round2(quality_metric[qc_method + "_reproducibility_qc"]["self_consistency_ratio"])),
"numberType": "float"})
qc_summary.append({"title": "Fraction of Reads in Peaks",
"value": str(round2(quality_metric[qc_method + "_frip_qc"][opt_set]["FRiP"])),
"numberType": "float"})
if patch:
ff_utils.patch_metadata({'quality_metric_summary': qc_summary}, file_uuid, key=key)
return qc_summary
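
For orientation, here is a minimal usage sketch of the two helpers above. It is not part of the commit: the access key dict and the uuids are hypothetical placeholders for a dcicutils-style Fourfront key and real file items.

# Minimal usage sketch (not from the commit); ff_key and the uuids are placeholders.
from chalicelib.checks.helpers import qc_utils

ff_key = {'key': '<key>', 'secret': '<secret>', 'server': 'https://data.4dnucleome.org'}
pairs_uuid = '<uuid-of-a-pairs-file>'
bigbed_uuid = '<uuid-of-a-peak-call-bigbed-file>'

# builds the summary from the file's linked quality metric and patches the file
res = qc_utils.calculate_qc_metric_pairsqc(pairs_uuid, key=ff_key)

# builds the summary for a peak-call bigbed file; patches the item only when patch=True
summary = qc_utils.calculate_qc_metric_atacseq_bb(bigbed_uuid, key=ff_key, patch=True)

Note the design difference: the pairs helper always patches, while the bigbed helper separates computing the summary from patching and returns the computed summary either way.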
106 changes: 91 additions & 15 deletions chalicelib/checks/qc_checks.py
@@ -7,30 +7,80 @@
)
from .helpers import qc_utils
from dcicutils import ff_utils
import requests
import sys
import json
import datetime
import time
import boto3


@check_function(file_type=None, status=None, file_format=None, search_add_on=None)
def identify_files_without_qc_summary(connection, **kwargs):
@check_function(file_type=None, status=None, search_add_on=None)
def identify_files_without_qc_summary_pairs(connection, **kwargs):
fileformat = 'pairs'
    t0 = time.time()  # keep track of start time
time_limit = 270 # 4.5 minutes
check = init_check_res(connection, 'identify_files_without_qc_summary')
check = init_check_res(connection, 'identify_files_without_qc_summary_pairs')
# must set this to be the function name of the action
check.action = 'patch_quality_metric_summary'
check.action = 'patch_quality_metric_summary_pairs'
default_filetype = 'FileProcessed' # skip fastq
default_stati = 'released%20to%20project&status=released&status=uploaded&status=pre-release'
filetype = kwargs.get('file_type') or default_filetype
stati = 'status=' + (kwargs.get('status') or default_stati)
search_query = 'search/?type={}&{}&frame=object'.format(filetype, stati)
ff = kwargs.get('file_format')
if ff is not None:
ff = '&file_format.file_format=' + ff
search_query += ff
search_query += '&file_format.file_format=' + fileformat
addon = kwargs.get('search_add_on')
if addon is not None:
if not addon.startswith('&'):
addon = '&' + addon
search_query += addon
problem_files = []
file_hits = ff_utils.search_metadata(search_query, key=connection.ff_keys, page_limit=200)
for hit in file_hits:
if round(time.time() - t0, 2) > time_limit:
break
if hit.get('quality_metric') and not hit.get('quality_metric_summary', ''):
hit_dict = {
'accession': hit.get('accession'),
'uuid': hit.get('uuid'),
'@type': hit.get('@type'),
'upload_key': hit.get('upload_key'),
'file_format': hit.get('file_format'),
'quality_metric': hit.get('quality_metric')
}
problem_files.append(hit_dict)
check.summary = '{} files with no quality metric summary'.format(len(problem_files))
check.full_output = problem_files
if problem_files:
check.status = 'WARN'
check.summary = 'File metadata found without quality_metric_summary'
status_str = 'pre-release/released/released to project/uploaded'
if kwargs.get('status'):
status_str = kwargs.get('status')
type_str = ''
if kwargs.get('file_type'):
type_str = kwargs.get('file_type') + ' '
ff_str = ''
if kwargs.get('file_format'):
ff_str = kwargs.get('file_format') + ' '
check.description = "{cnt} {type}{ff}files that are {st} don't have quality_metric_summary.".format(
cnt=len(problem_files), type=type_str, st=status_str, ff=ff_str)
check.action_message = "Will attempt to patch quality_metric_summary for %s files." % str(len(problem_files))
check.allow_action = True # allows the action to be run
else:
check.status = 'PASS'
return check


@check_function(file_type=None, status=None, search_add_on=None)
def identify_files_without_qc_summary_bb(connection, **kwargs):
fileformat = 'bigbed'
    t0 = time.time()  # keep track of start time
time_limit = 270 # 4.5 minutes
check = init_check_res(connection, 'identify_files_without_qc_summary_bb')
# must set this to be the function name of the action
check.action = 'patch_quality_metric_summary_bb'
default_filetype = 'FileProcessed' # skip fastq
default_stati = 'released%20to%20project&status=released&status=uploaded&status=pre-release'
filetype = kwargs.get('file_type') or default_filetype
stati = 'status=' + (kwargs.get('status') or default_stati)
search_query = 'search/?type={}&{}&frame=object'.format(filetype, stati)
search_query += '&file_format.file_format=' + fileformat
addon = kwargs.get('search_add_on')
if addon is not None:
if not addon.startswith('&'):
@@ -75,10 +75,10 @@ def identify_files_without_qc_summary(connection, **kwargs):


@action_function()
def patch_quality_metric_summary(connection, **kwargs):
def patch_quality_metric_summary_pairs(connection, **kwargs):
    t0 = time.time()  # keep track of start time
time_limit = 270 # 4.5 minutes
action = init_action_res(connection, 'patch_quality_metric_summary')
action = init_action_res(connection, 'patch_quality_metric_summary_pairs')
action_logs = {'skipping_format': [], 'patch_failure': [], 'patch_success': []}
    # get latest results from identify_files_without_qc_summary_pairs
filesize_check_result = action.get_associated_check_result(kwargs)
@@ -98,3 +98,29 @@ def patch_quality_metric_summary(connection, **kwargs):
action.status = 'DONE'
action.output = action_logs
return action


@action_function()
def patch_quality_metric_summary_bb(connection, **kwargs):
    t0 = time.time()  # keep track of start time
time_limit = 270 # 4.5 minutes
action = init_action_res(connection, 'patch_quality_metric_summary_bb')
action_logs = {'skipping_format': [], 'patch_failure': [], 'patch_success': []}
# get latest results from identify_files_without_qc_summary_bb
filesize_check_result = action.get_associated_check_result(kwargs)
for hit in filesize_check_result.get('full_output', []):
if round(time.time() - t0, 2) > time_limit:
break
if qc_utils.parse_formatstr(hit['file_format']) == 'bigbed':
try:
qc_utils.calculate_qc_metric_atacseq_bb(hit['uuid'], key=connection.ff_keys)
action_logs['patch_success'].append(hit['accession'])
except Exception as e:
acc_and_error = ': '.join([hit['accession'], str(e)])
action_logs['patch_failure'].append(acc_and_error)
else:
acc_and_format = ': '.join([hit['accession'], hit['file_format']])
action_logs['skipping_format'].append(acc_and_format)
action.status = 'DONE'
action.output = action_logs
return action
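
As a closing note on how the pieces connect: each check sets check.action to the name of its paired action function, and the action retrieves that check's most recent result with get_associated_check_result and walks its full_output. The sketch below is not part of the commit; it mirrors the hit_dict records the checks build, and every value is a hypothetical placeholder.

# Hedged illustration (not from the commit): one full_output entry as built by
# the checks above; all values are placeholders, not real items.
example_hit = {
    'accession': '4DNFIEXAMPLE',                  # placeholder accession
    'uuid': '<file-uuid>',                        # placeholder uuid
    '@type': ['FileProcessed', 'File', 'Item'],
    'upload_key': '<file-uuid>/4DNFIEXAMPLE.bb',
    'file_format': '/file-formats/bigbed/',       # parse_formatstr() reduces this to 'bigbed'
    'quality_metric': '<link-to-a-quality-metric-item>'
}

The bigbed action only calls calculate_qc_metric_atacseq_bb when parse_formatstr(hit['file_format']) equals 'bigbed'; anything else is logged under skipping_format.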
