Skip to content

Commit

Permalink
Merge ed7048f into 58c62aa
Browse files Browse the repository at this point in the history
  • Loading branch information
sbreiff committed Jul 20, 2018
2 parents 58c62aa + ed7048f commit 5e0a366
Show file tree
Hide file tree
Showing 3 changed files with 224 additions and 1 deletion.
48 changes: 48 additions & 0 deletions chalicelib/check_setup.json
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,54 @@
}
}
},
"biosource_cell_line_value": {
"title": "Cell line biosources without cell_line metadata",
"group": "Audit checks",
"schedule": {
"morning_checks": {
"all": {
"kwargs": {"primary": true},
"dependencies": []
}
}
}
},
"external_expsets_without_pub": {
"title": "External experiment sets without a publication",
"group": "Audit checks",
"schedule": {
"morning_checks": {
"all": {
"kwargs": {"primary": true},
"dependencies": []
}
}
}
},
"expset_opfsets_unique_titles": {
"title": "Experiment Sets with missing or duplicated titles for sets of other processed files",
"group": "Audit checks",
"schedule": {
"thirty_min_checks": {
"all": {
"kwargs": {"primary": true},
"dependencies": []
}
}
}
},
"expset_opf_unique_files_in_experiments": {
"title": "Other_processed_files collections with non-unique files",
"group": "Audit checks",
"schedule": {
"thirty_min_checks": {
"all": {
"kwargs": {"primary": true},
"dependencies": ["expset_opfsets_unique_titles"]
}
}
}
},
"identify_files_without_filesize": {
"title": "Files without filesize",
"group": "Metadata checks",
Expand Down
175 changes: 175 additions & 0 deletions chalicelib/checks/audit_checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
from __future__ import print_function, unicode_literals
from ..utils import (
check_function,
init_check_res,
action_function,
init_action_res
)
from dcicutils import ff_utils
import requests
import sys
import json
import datetime
import time
import boto3


@check_function()
def biosource_cell_line_value(connection, **kwargs):
    '''
    Check cell line biosources to make sure they have an associated ontology term.

    Any biosource whose biosource_type is one of the cell/cell-line types but
    whose cell_line field is empty is reported in full_output (one dict per
    offending biosource) and brief_output (uuids only).
    '''
    check = init_check_res(connection, 'biosource_cell_line_value')

    cell_line_types = ["primary cell", "primary cell line", "immortalized cell line",
                       "in vitro differentiated cells", "induced pluripotent stem cell line",
                       "stem cell", "stem cell derived cell line"]
    biosources = ff_utils.search_metadata('search/?type=Biosource&frame=object',
                                          ff_env=connection.ff_env, page_limit=200)
    missing = []
    for biosource in biosources:
        # only cell/cell-line biosource types are required to carry cell_line
        # metadata (a missing biosource_type is never in cell_line_types)
        if biosource.get('biosource_type') in cell_line_types:
            if not biosource.get('cell_line'):
                missing.append({'uuid': biosource['uuid'],
                                '@id': biosource['@id'],
                                'biosource_type': biosource.get('biosource_type'),
                                'description': biosource.get('description'),
                                'error': 'Missing cell_line metadata'})
    check.full_output = missing
    check.brief_output = [item['uuid'] for item in missing]
    if missing:
        check.status = 'WARN'
        check.summary = 'Cell line biosources found missing cell_line metadata'
        # set description for consistency with the other audit checks in this file
        check.description = '{} cell line biosources are missing cell_line metadata.'.format(len(missing))
    else:
        check.status = 'PASS'
        check.summary = 'No cell line biosources are missing cell_line metadata'
        check.description = '0 cell line biosources are missing cell_line metadata.'
    return check


@check_function()
def external_expsets_without_pub(connection, **kwargs):
    '''
    Check external experiment sets to see if they are attributed to a publication.

    An experiment set counts as attributed when either publications_of_set or
    produced_in_pub is populated; all others are reported in full_output, with
    their uuids in brief_output.
    '''
    check = init_check_res(connection, 'external_expsets_without_pub')

    ext_sets = ff_utils.search_metadata('search/?award.project=External&type=ExperimentSet&frame=object',
                                        ff_env=connection.ff_env, page_limit=50)
    no_pub = [
        {'uuid': expset['uuid'],
         '@id': expset['@id'],
         'description': expset.get('description'),
         'lab': expset.get('lab'),
         'error': 'Missing attribution to a publication'}
        for expset in ext_sets
        if not (expset.get('publications_of_set') or expset.get('produced_in_pub'))
    ]
    if no_pub:
        check.status = 'WARN'
        check.summary = 'External experiment sets found without associated publication'
        check.description = '{} external experiment sets are missing attribution to a publication.'.format(len(no_pub))
    else:
        check.status = 'PASS'
        check.summary = 'No external experiment sets are missing publication'
        check.description = '0 external experiment sets are missing attribution to a publication.'
    check.full_output = no_pub
    check.brief_output = [entry['uuid'] for entry in no_pub]
    return check


@check_function()
def expset_opfsets_unique_titles(connection, **kwargs):
    '''
    Check experiment sets with other_processed_files to see if each collection
    of other_processed_files has a unique, non-empty title within that
    experiment set.

    full_output holds one dict per offending experiment set with the detailed
    error messages; brief_output groups the uuids by failure category
    ('missing title' / 'duplicate title').
    '''
    check = init_check_res(connection, 'expset_opfsets_unique_titles')

    opf_expsets = ff_utils.search_metadata('search/?type=ExperimentSet&other_processed_files.files.uuid%21=No+value&frame=object',
                                           ff_env=connection.ff_env, page_limit=50)
    errors = []
    for expset in opf_expsets:
        fileset_names = [fileset.get('title') for fileset in expset['other_processed_files']]
        messages = []
        if None in fileset_names or '' in fileset_names:
            messages.append('ExperimentSet {} has an other_processed_files collection with a missing title.'.format(expset['accession']))
        # a set collapses duplicates, so a size mismatch means repeated titles
        if len(set(fileset_names)) != len(fileset_names):
            messages.append('ExperimentSet {} has 2+ other_processed_files collections with duplicated titles.'.format(expset['accession']))
        if messages:
            errors.append({'uuid': expset['uuid'],
                           '@id': expset['@id'],
                           'errors': messages})

    if errors:
        check.status = 'WARN'
        check.summary = 'Experiment Sets found with duplicate/missing titles in other_processed_files'
        check.description = '{} Experiment Sets have other_processed_files collections with missing or duplicate titles.'.format(len(errors))
    else:
        check.status = 'PASS'
        check.summary = 'No issues found with other_processed_files of experiment sets'
        check.description = '0 Experiment Sets have other_processed_files collections with missing or duplicate titles.'
    check.full_output = errors
    # categorize by matching the wording of the error messages built above
    check.brief_output = {'missing title': [item['uuid'] for item in errors if 'missing' in ''.join(item['errors'])],
                          'duplicate title': [item['uuid'] for item in errors if 'duplicated' in ''.join(item['errors'])]}
    return check


@check_function()
def expset_opf_unique_files_in_experiments(connection, **kwargs):
    '''
    Check experiment sets with other_processed_files and look at other_processed_files
    collections in child experiments to make sure that (1) the collections have titles
    and (2) if a title is shared with the parent experiment set, the files contained
    within are unique.

    full_output holds one dict per offending experiment with detailed messages;
    brief_output groups the experiment uuids by failure category.
    '''
    check = init_check_res(connection, 'expset_opf_unique_files_in_experiments')

    # embedded frame (no frame=object) so nested experiment/file objects are present
    opf_expsets = ff_utils.search_metadata('search/?type=ExperimentSet&other_processed_files.files.uuid%21=No+value',
                                           ff_env=connection.ff_env, page_limit=25)
    errors = []
    for expset in opf_expsets:
        # map each titled parent-level collection to its files for overlap checks
        expset_titles = {fileset.get('title'): fileset.get('files') for fileset in expset['other_processed_files'] if fileset.get('title')}
        if not expset.get('experiments_in_set'):
            continue
        for expt in (exp for exp in expset.get('experiments_in_set') if exp.get('other_processed_files')):
            e = []
            for opf_set in expt['other_processed_files']:
                # look for missing names
                if not opf_set.get('title'):
                    e.append('Experiment {} in Experiment Set {} has an other_processed_files set '
                             'missing a title.'.format(expt['accession'], expset['accession']))
                # look for duplicate names
                elif opf_set.get('title') in expset_titles.keys() and opf_set.get('files'):
                    for opf_file in opf_set['files']:
                        # if duplicate names, look for duplicate file names
                        if opf_file in expset_titles[opf_set['title']]:
                            e.append('Experiment {} other_processed_files collection with title `{}` has file {} which '
                                     'is also present in parent ExperimentSet {} other_processed_files collection of the '
                                     'same name.'.format(expt['accession'], opf_set['title'], opf_file['accession'], expset['accession']))
            if e:
                errors.append({'uuid': expt['uuid'],
                               '@id': expt['@id'],
                               'error_details': e})
    if errors:
        check.status = 'WARN'
        check.summary = '{} experiments found with issues in other_processed_files'.format(len(errors))
        # fixed typo: 'non-uniquefilenames' -> 'non-unique filenames'
        check.description = ('{} Experiments found that are either missing titles for sets of other_processed_files,'
                             ' or have non-unique filenames in other_processed_files'.format(len(errors)))
    else:
        check.status = 'PASS'
        check.summary = 'No issues found with other_processed_files of experiments'
        check.description = ('0 Experiments found to be missing titles for sets of other_processed_files,'
                             ' or have non-unique filenames in other_processed_files')
    check.full_output = errors
    # categorize by matching the wording of the messages built above
    check.brief_output = {'missing title': [item['uuid'] for item in errors if 'missing' in ''.join(item['error_details'])],
                          'duplicate title': [item['uuid'] for item in errors if 'also present in parent' in ''.join(item['error_details'])]}
    return check
2 changes: 1 addition & 1 deletion docs/getting_started.md
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ That's it! Now your check will automatically run with all other morning checks.
Now that your check is built and scheduled, it can be run or retrieved using the Foursight API. This is the last topic covered in this file. For more information on configuring the check setup, [go here](./checks.md#check-setup).

## Using the UI
The easiest way to interact with Foursight is through the UI, which allows viewing and running of checks. Here is [production Foursight](https://foursight.4dnucleome.org/api/view/all) and here is [development Foursight](https://m1kj6dypu3.execute-api.us-east-1.amazonaws.com/api/view/all). Checks are presented in groups, as specified in `check_setup.json`. Opening any group by clicking on it presents information on individual checks, which be further examined by clicking on the check title. If you have administrator privileges, you can log into your account and run checks directly from the page. When doing this, you can adjust key word arguments for the check directly on the UI; this allows a high level of flexibility, including the choice to not overwrite the primary record for the check by setting `primary` to something else besides `True`. Please note that running any checks requires either administrator privileges or a special authorization code.
The easiest way to interact with Foursight is through the UI, which allows viewing and running of checks. Here is [production Foursight](https://foursight.4dnucleome.org/api/view/all) and here is [development Foursight](https://kpqxwgx646.execute-api.us-east-1.amazonaws.com/api/view/all). Checks are presented in groups, as specified in `check_setup.json`. Opening any group by clicking on it presents information on individual checks, which can be further examined by clicking on the check title. If you have administrator privileges, you can log into your account and run checks directly from the page. When doing this, you can adjust keyword arguments for the check directly on the UI; this allows a high level of flexibility, including the choice to not overwrite the primary record for the check by setting `primary` to something else besides `True`. Please note that running any checks requires either administrator privileges or a special authorization code.

For any individual check on the /view/ page, you can access the past history of the checks on the /history/ page: `https://foursight.4dnucleome.org/api/history/<environ>/<check>`. This will give a paginated list of past runs for that check or action and display the status and keyword arguments used to run the check. From there, individual results can be viewed in JSON format if you are logged in as admin.

Expand Down

0 comments on commit 5e0a366

Please sign in to comment.