Skip to content

Commit

Permalink
Merge ed7048f into 58c62aa
Browse files Browse the repository at this point in the history
  • Loading branch information
sbreiff committed Jul 20, 2018
2 parents 58c62aa + ed7048f commit 5e0a366
Show file tree
Hide file tree
Showing 3 changed files with 224 additions and 1 deletion.
48 changes: 48 additions & 0 deletions chalicelib/check_setup.json
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,54 @@
}
}
},
"biosource_cell_line_value": {
"title": "Cell line biosources without cell_line metadata",
"group": "Audit checks",
"schedule": {
"morning_checks": {
"all": {
"kwargs": {"primary": true},
"dependencies": []
}
}
}
},
"external_expsets_without_pub": {
"title": "External experiment sets without a publication",
"group": "Audit checks",
"schedule": {
"morning_checks": {
"all": {
"kwargs": {"primary": true},
"dependencies": []
}
}
}
},
"expset_opfsets_unique_titles": {
"title": "Experiment Sets with missing or duplicated titles for sets of other processed files",
"group": "Audit checks",
"schedule": {
"thirty_min_checks": {
"all": {
"kwargs": {"primary": true},
"dependencies": []
}
}
}
},
"expset_opf_unique_files_in_experiments": {
"title": "Other_processed_files collections with non-unique files",
"group": "Audit checks",
"schedule": {
"thirty_min_checks": {
"all": {
"kwargs": {"primary": true},
"dependencies": ["expset_opfsets_unique_titles"]
}
}
}
},
"identify_files_without_filesize": {
"title": "Files without filesize",
"group": "Metadata checks",
Expand Down
175 changes: 175 additions & 0 deletions chalicelib/checks/audit_checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
from __future__ import print_function, unicode_literals
from ..utils import (
check_function,
init_check_res,
action_function,
init_action_res
)
from dcicutils import ff_utils
import requests
import sys
import json
import datetime
import time
import boto3


@check_function()
def biosource_cell_line_value(connection, **kwargs):
    '''
    Check cell line biosources to make sure they have an associated ontology term.

    Any biosource whose biosource_type is one of the cell/cell-line types but
    whose cell_line field is empty is reported in full_output (one dict per
    offending biosource) and brief_output (uuids only).
    '''
    check = init_check_res(connection, 'biosource_cell_line_value')

    cell_line_types = ["primary cell", "primary cell line", "immortalized cell line",
                       "in vitro differentiated cells", "induced pluripotent stem cell line",
                       "stem cell", "stem cell derived cell line"]
    biosources = ff_utils.search_metadata('search/?type=Biosource&frame=object',
                                          ff_env=connection.ff_env, page_limit=200)
    missing = []
    for biosource in biosources:
        # only cell/cell-line biosource types are required to carry cell_line
        # metadata (a missing biosource_type is never in cell_line_types)
        if biosource.get('biosource_type') in cell_line_types:
            if not biosource.get('cell_line'):
                missing.append({'uuid': biosource['uuid'],
                                '@id': biosource['@id'],
                                'biosource_type': biosource.get('biosource_type'),
                                'description': biosource.get('description'),
                                'error': 'Missing cell_line metadata'})
    check.full_output = missing
    check.brief_output = [item['uuid'] for item in missing]
    if missing:
        check.status = 'WARN'
        check.summary = 'Cell line biosources found missing cell_line metadata'
        # set description for consistency with the other audit checks in this file
        check.description = '{} cell line biosources are missing cell_line metadata.'.format(len(missing))
    else:
        check.status = 'PASS'
        check.summary = 'No cell line biosources are missing cell_line metadata'
        check.description = '0 cell line biosources are missing cell_line metadata.'
    return check


@check_function()
def external_expsets_without_pub(connection, **kwargs):
    '''
    Check external experiment sets to see if they are attributed to a publication.

    An experiment set counts as attributed when either publications_of_set or
    produced_in_pub is populated; all others are reported in full_output, with
    their uuids in brief_output.
    '''
    check = init_check_res(connection, 'external_expsets_without_pub')

    ext_sets = ff_utils.search_metadata('search/?award.project=External&type=ExperimentSet&frame=object',
                                        ff_env=connection.ff_env, page_limit=50)
    no_pub = [
        {'uuid': expset['uuid'],
         '@id': expset['@id'],
         'description': expset.get('description'),
         'lab': expset.get('lab'),
         'error': 'Missing attribution to a publication'}
        for expset in ext_sets
        if not (expset.get('publications_of_set') or expset.get('produced_in_pub'))
    ]
    if no_pub:
        check.status = 'WARN'
        check.summary = 'External experiment sets found without associated publication'
        check.description = '{} external experiment sets are missing attribution to a publication.'.format(len(no_pub))
    else:
        check.status = 'PASS'
        check.summary = 'No external experiment sets are missing publication'
        check.description = '0 external experiment sets are missing attribution to a publication.'
    check.full_output = no_pub
    check.brief_output = [entry['uuid'] for entry in no_pub]
    return check


@check_function()
def expset_opfsets_unique_titles(connection, **kwargs):
    '''
    Check experiment sets with other_processed_files to see if each collection
    of other_processed_files has a unique, non-empty title within that
    experiment set.

    full_output holds one dict per offending experiment set with the detailed
    error messages; brief_output groups the uuids by failure category
    ('missing title' / 'duplicate title').
    '''
    check = init_check_res(connection, 'expset_opfsets_unique_titles')

    opf_expsets = ff_utils.search_metadata('search/?type=ExperimentSet&other_processed_files.files.uuid%21=No+value&frame=object',
                                           ff_env=connection.ff_env, page_limit=50)
    errors = []
    for expset in opf_expsets:
        fileset_names = [fileset.get('title') for fileset in expset['other_processed_files']]
        messages = []
        if None in fileset_names or '' in fileset_names:
            messages.append('ExperimentSet {} has an other_processed_files collection with a missing title.'.format(expset['accession']))
        # a set collapses duplicates, so a size mismatch means repeated titles
        if len(set(fileset_names)) != len(fileset_names):
            messages.append('ExperimentSet {} has 2+ other_processed_files collections with duplicated titles.'.format(expset['accession']))
        if messages:
            errors.append({'uuid': expset['uuid'],
                           '@id': expset['@id'],
                           'errors': messages})

    if errors:
        check.status = 'WARN'
        check.summary = 'Experiment Sets found with duplicate/missing titles in other_processed_files'
        check.description = '{} Experiment Sets have other_processed_files collections with missing or duplicate titles.'.format(len(errors))
    else:
        check.status = 'PASS'
        check.summary = 'No issues found with other_processed_files of experiment sets'
        check.description = '0 Experiment Sets have other_processed_files collections with missing or duplicate titles.'
    check.full_output = errors
    # categorize by matching the wording of the error messages built above
    check.brief_output = {'missing title': [item['uuid'] for item in errors if 'missing' in ''.join(item['errors'])],
                          'duplicate title': [item['uuid'] for item in errors if 'duplicated' in ''.join(item['errors'])]}
    return check


@check_function()
def expset_opf_unique_files_in_experiments(connection, **kwargs):
    '''
    Check experiment sets with other_processed_files and look at other_processed_files
    collections in child experiments to make sure that (1) the collections have titles
    and (2) if a title is shared with the parent experiment set, the files contained
    within are unique.

    full_output holds one dict per offending experiment with detailed messages;
    brief_output groups the experiment uuids by failure category.
    '''
    check = init_check_res(connection, 'expset_opf_unique_files_in_experiments')

    # embedded frame (no frame=object) so nested experiment/file objects are present
    opf_expsets = ff_utils.search_metadata('search/?type=ExperimentSet&other_processed_files.files.uuid%21=No+value',
                                           ff_env=connection.ff_env, page_limit=25)
    errors = []
    for expset in opf_expsets:
        # map each titled parent-level collection to its files for overlap checks
        expset_titles = {fileset.get('title'): fileset.get('files') for fileset in expset['other_processed_files'] if fileset.get('title')}
        if not expset.get('experiments_in_set'):
            continue
        for expt in (exp for exp in expset.get('experiments_in_set') if exp.get('other_processed_files')):
            e = []
            for opf_set in expt['other_processed_files']:
                # look for missing names
                if not opf_set.get('title'):
                    e.append('Experiment {} in Experiment Set {} has an other_processed_files set '
                             'missing a title.'.format(expt['accession'], expset['accession']))
                # look for duplicate names
                elif opf_set.get('title') in expset_titles.keys() and opf_set.get('files'):
                    for opf_file in opf_set['files']:
                        # if duplicate names, look for duplicate file names
                        if opf_file in expset_titles[opf_set['title']]:
                            e.append('Experiment {} other_processed_files collection with title `{}` has file {} which '
                                     'is also present in parent ExperimentSet {} other_processed_files collection of the '
                                     'same name.'.format(expt['accession'], opf_set['title'], opf_file['accession'], expset['accession']))
            if e:
                errors.append({'uuid': expt['uuid'],
                               '@id': expt['@id'],
                               'error_details': e})
    if errors:
        check.status = 'WARN'
        check.summary = '{} experiments found with issues in other_processed_files'.format(len(errors))
        # fixed typo: 'non-uniquefilenames' -> 'non-unique filenames'
        check.description = ('{} Experiments found that are either missing titles for sets of other_processed_files,'
                             ' or have non-unique filenames in other_processed_files'.format(len(errors)))
    else:
        check.status = 'PASS'
        check.summary = 'No issues found with other_processed_files of experiments'
        check.description = ('0 Experiments found to be missing titles for sets of other_processed_files,'
                             ' or have non-unique filenames in other_processed_files')
    check.full_output = errors
    # categorize by matching the wording of the messages built above
    check.brief_output = {'missing title': [item['uuid'] for item in errors if 'missing' in ''.join(item['error_details'])],
                          'duplicate title': [item['uuid'] for item in errors if 'also present in parent' in ''.join(item['error_details'])]}
    return check
2 changes: 1 addition & 1 deletion docs/getting_started.md
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ That's it! Now your check will automatically run with all other morning checks.
Now that your check is built and scheduled, it can be run or retrieved using the Foursight API. This is the last topic covered in this file. For more information on configuring the check setup, [go here](./checks.md#check-setup).

## Using the UI
The easiest way to interact with Foursight is through the UI, which allows viewing and running of checks. Here is [production Foursight](https://foursight.4dnucleome.org/api/view/all) and here is [development Foursight](https://m1kj6dypu3.execute-api.us-east-1.amazonaws.com/api/view/all). Checks are presented in groups, as specified in `check_setup.json`. Opening any group by clicking on it presents information on individual checks, which be further examined by clicking on the check title. If you have administrator privileges, you can log into your account and run checks directly from the page. When doing this, you can adjust key word arguments for the check directly on the UI; this allows a high level of flexibility, including the choice to not overwrite the primary record for the check by setting `primary` to something else besides `True`. Please note that running any checks requires either administrator privileges or a special authorization code.
The easiest way to interact with Foursight is through the UI, which allows viewing and running of checks. Here is [production Foursight](https://foursight.4dnucleome.org/api/view/all) and here is [development Foursight](https://kpqxwgx646.execute-api.us-east-1.amazonaws.com/api/view/all). Checks are presented in groups, as specified in `check_setup.json`. Opening any group by clicking on it presents information on individual checks, which can be further examined by clicking on the check title. If you have administrator privileges, you can log into your account and run checks directly from the page. When doing this, you can adjust keyword arguments for the check directly on the UI; this allows a high level of flexibility, including the choice to not overwrite the primary record for the check by setting `primary` to something else besides `True`. Please note that running any checks requires either administrator privileges or a special authorization code.

For any individual check on the /view/ page, you can access the past history of the checks on the /history/ page: `https://foursight.4dnucleome.org/api/history/<environ>/<check>`. This will give a paginated list of past runs for that check or action and display the status and keyword arguments used to run the check. From there, individual results can be viewed in JSON format if you are logged in as admin.

Expand Down

0 comments on commit 5e0a366

Please sign in to comment.