Workflows Base Module (#229)

* close #237 * add workflows.base (code for reading CSV file of projects and running automated workflows on each project) * add workflows.registry (with a single example workflow, DihedralAnalysis) * add new testing data, .csv for workflows base module; add workflows to STATES dictionary * add documentation for workflows registry and base module * update CHANGES
Becksteinlab · Apr 4, 2023 · cafb300 · cafb300
1 parent b7d0b06
commit cafb300
Show file tree

Hide file tree

Showing 9 changed files with 361 additions and 4 deletions.
diff --git a/CHANGES b/CHANGES
@@ -23,10 +23,13 @@ Changes
 
 Enhancements
 
-* new workflows module (PR #217)
+* new workflows registry that contains each EnsembleAnalysis for which
+  a workflows module exists, for use with workflows base module (#229)
+* new workflows base module that provides iterative workflow use for
+  directories that contain multiple projects (#229)
+* new workflows module (#217)
 * new automated dihedral analysis workflow (detect dihedrals with SMARTS,
-  analyze with EnsembleAnalysis, and generate seaborn violinplots)
-  PR #217)
+  analyze with EnsembleAnalysis, and generate seaborn violinplots) (#217)
 
 Fixes
 
@@ -36,7 +39,7 @@ Fixes
 * fix ensemble.EnsembleAnalysis.check_groups_from_common_ensemble (#212)
 
 
-2021-01-03    0.8.0
+2022-01-03    0.8.0
 ALescoulie, orbeckst
 
 Changes

diff --git a/doc/sphinx/source/workflows.txt b/doc/sphinx/source/workflows.txt
@@ -14,4 +14,6 @@ for use with :class:`~mdpow.analysis.dihedral.DihedralAnalysis`.
 .. toctree::
    :maxdepth: 1
 
+   workflows/base
+   workflows/registry
    workflows/dihedrals
diff --git a/doc/sphinx/source/workflows/base.txt b/doc/sphinx/source/workflows/base.txt
@@ -0,0 +1,7 @@
+==============
+Workflows Base
+==============
+
+.. versionadded:: 0.9.0
+
+.. automodule:: mdpow.workflows.base
diff --git a/doc/sphinx/source/workflows/registry.txt b/doc/sphinx/source/workflows/registry.txt
@@ -0,0 +1,7 @@
+==================
+Workflows Registry
+==================
+
+.. versionadded:: 0.9.0
+
+.. automodule:: mdpow.workflows.registry
diff --git a/mdpow/tests/__init__.py b/mdpow/tests/__init__.py
@@ -13,5 +13,6 @@
     "FEP": RESOURCES.join("states", "FEP"),
     "base": RESOURCES.join("states", "base"),
     "md_npt": RESOURCES.join("states", "FEP"),
+    "workflows": RESOURCES.join("states", "workflows"),
 }
 CONFIGURATIONS = RESOURCES.join("test_configurations")
diff --git a/mdpow/tests/test_workflows_base.py b/mdpow/tests/test_workflows_base.py
@@ -0,0 +1,101 @@
+import re
+import os
+import sys
+import yaml
+import pybol
+import pytest
+import pathlib
+import logging
+
+import pandas as pd
+
+from . import RESOURCES
+from . import STATES
+
+import py.path
+
+from ..workflows import base
+
+from pkg_resources import resource_filename
+
+# NOTE: this rebinds RESOURCES (imported from the tests package above) to a
+# pathlib.PurePath pointing at the 'testing_resources' directory.
+RESOURCES = pathlib.PurePath(resource_filename(__name__, 'testing_resources'))
+# pybol manifest file used to assemble the test data directories
+MANIFEST = RESOURCES / 'manifest.yml'
+
+@pytest.fixture(scope='function')
+def molname_workflows_directory(tmp_path):
+    m = pybol.Manifest(str(MANIFEST))
+    m.assemble('workflows', tmp_path)
+    return tmp_path
+
+class TestWorkflowsBase(object):
+    """Tests for :func:`base.project_paths` and :func:`base.automated_project_analysis`."""
+
+    @pytest.fixture(scope='function')
+    def SM_tmp_dir(self, molname_workflows_directory):
+        """Return the temporary directory holding the assembled 'workflows' data."""
+        dirname = molname_workflows_directory
+        return dirname
+
+    @pytest.fixture(scope='function')
+    def csv_input_data(self):
+        """Return the path of the reference project_paths.csv and its content as a DataFrame."""
+        csv_path = STATES['workflows'] / 'project_paths.csv'
+        csv_df = pd.read_csv(csv_path).reset_index(drop=True)
+        return csv_path, csv_df
+
+    @pytest.fixture(scope='function')
+    def test_df_data(self):
+        """Return the expected molecule/resname columns for the SM25 and SM26 projects."""
+        test_dict = {'molecule' : ['SM25', 'SM26'],
+                    'resname' : ['SM25', 'SM26']}
+        test_df = pd.DataFrame(test_dict).reset_index(drop=True)
+        return test_df
+
+    @pytest.fixture(scope='function')
+    def project_paths_data(self, SM_tmp_dir):
+        """Return the DataFrame that base.project_paths builds from the temporary directory."""
+        # no csv_save_dir is passed, so project_paths() also writes
+        # 'project_paths.csv' to the current working directory
+        project_paths = base.project_paths(parent_directory=SM_tmp_dir)
+        return project_paths
+
+    def test_project_paths(self, test_df_data, project_paths_data):
+        """Directory discovery yields the expected molecule and resname entries."""
+        test_df = test_df_data
+        project_paths = project_paths_data
+
+        # compares rows by position, i.e., expects SM25 in row 0 and SM26 in row 1
+        assert project_paths['molecule'][0] == test_df['molecule'][0]
+        assert project_paths['molecule'][1] == test_df['molecule'][1]
+        assert project_paths['resname'][0] == test_df['resname'][0]
+        assert project_paths['resname'][1] == test_df['resname'][1]
+
+    def test_project_paths_csv_input(self, csv_input_data):
+        """Reading the reference .csv reproduces its content as a DataFrame."""
+        csv_path, csv_df = csv_input_data
+        project_paths = base.project_paths(csv=csv_path)
+
+        pd.testing.assert_frame_equal(project_paths, csv_df)
+
+    def test_automated_project_analysis(self, project_paths_data, caplog):
+        """The DihedralAnalysis workflow runs over all projects and logs completion."""
+        project_paths = project_paths_data
+        # change resname to match topology (every SAMPL7 resname is 'UNK')
+        # only necessary for this dataset, not necessary for normal use
+        project_paths['resname'] = 'UNK'
+
+        base.automated_project_analysis(project_paths, solvents=('water',),
+                                 ensemble_analysis='DihedralAnalysis')
+
+        # 'all analyses completed' is the final log message of automated_project_analysis
+        assert 'all analyses completed' in caplog.text, ('automated_dihedral_analysis '
+               'did not iteratively run to completion for the provided project')
+
+    def test_automated_project_analysis_KeyError(self, project_paths_data, caplog):
+        """An unknown ensemble_analysis name raises KeyError and logs an error."""
+        caplog.clear()
+        caplog.set_level(logging.ERROR, logger='mdpow.workflows.base')
+
+        project_paths = project_paths_data
+        # change resname to match topology (every SAMPL7 resname is 'UNK')
+        # only necessary for this dataset, not necessary for normal use
+        project_paths['resname'] = 'UNK'
+
+        # test error output when raised
+        with pytest.raises(KeyError,
+                           match="Invalid ensemble_analysis 'DarthVaderAnalysis'. "
+                                 "An EnsembleAnalysis type that corresponds to an existing "
+                                 "automated workflow module must be input as a kwarg. ex: "
+                                 "ensemble_analysis='DihedralAnalysis'"):
+            base.automated_project_analysis(project_paths, ensemble_analysis='DarthVaderAnalysis', solvents=('water',))
+
+        # test logger error recording
+        assert "'DarthVaderAnalysis' is an invalid selection" in caplog.text, ('did not catch incorrect '
+               'key specification for workflows.registry that results in KeyError')
diff --git a/mdpow/tests/testing_resources/states/workflows/project_paths.csv b/mdpow/tests/testing_resources/states/workflows/project_paths.csv
@@ -0,0 +1,3 @@
+molecule,resname,path
+SM25,SM25,mdpow/tests/testing_resources/states/workflows/SM25
+SM26,SM26,mdpow/tests/testing_resources/states/workflows/SM26
diff --git a/mdpow/workflows/base.py b/mdpow/workflows/base.py
@@ -0,0 +1,180 @@
+# MDPOW: base.py
+# 2022 Cade Duckworth
+
+"""
+:mod:`mdpow.workflows.base` --- Automated workflow base functions
+=================================================================
+
+To analyze multiple MDPOW projects, provide :func:`project_paths`
+with the top-level directory containing all MDPOW projects' simulation data
+to obtain a :class:`pandas.DataFrame` containing the project information
+and paths. Then, :func:`automated_project_analysis` takes as input the
+aforementioned :class:`pandas.DataFrame` and runs the specified
+:class:`~mdpow.analysis.ensemble.EnsembleAnalysis` for all MDPOW projects
+under the top-level directory provided to :func:`project_paths`.
+
+.. seealso:: :mod:`~mdpow.workflows.registry`
+
+.. autofunction:: project_paths
+.. autofunction:: automated_project_analysis
+
+"""
+
+import os
+import re
+import pandas as pd
+
+from mdpow.workflows import registry
+
+import logging
+
+logger = logging.getLogger('mdpow.workflows.base')
+
+def project_paths(parent_directory=None, csv=None, csv_save_dir=None):
+    """Takes a top directory containing MDPOW projects and determines
+       the molname, resname, and path, of each MDPOW project within.
+
+       Optionally takes a .csv file containing `molname`, `resname`, and
+       `paths`, in that order. 
+
+       :keywords:
+
+       *parent_directory*
+           the path for the location of the top directory 
+           under which the subdirectories of MDPOW simulation
+           data exist, additionally creates a 'project_paths.csv' file
+           for user manipulation of metadata and for future reference
+
+       *csv*
+           .csv file containing the molecule names, resnames,
+           and paths, in that order, for the MDPOW simulation
+           data to be iterated over must contain header of the
+           form: `molecule,resname,path`
+
+       *csv_save_dir*
+           optionally provided directory to save .csv file, otherwise,
+           data will be saved in current working directory
+
+       :returns:
+
+       *project_paths*
+           :class:`pandas.DataFrame` containing MDPOW project metadata
+
+       .. rubric:: Example
+       
+       Typical Workflow::
+
+           project_paths = project_paths(parent_directory='/foo/bar/MDPOW_projects')
+           automated_project_analysis(project_paths)
+
+       or::
+
+           project_paths = project_paths(csv='/foo/bar/MDPOW.csv')
+           automated_project_analysis(project_paths)
+
+    """
+
+    if parent_directory is not None:
+
+        locations = []
+
+        reg_compile = re.compile('FEP')
+        for dirpath, dirnames, filenames in os.walk(parent_directory):
+            result = [dirpath.strip() for dirname in dirnames if  reg_compile.match(dirname)]
+            if result:
+                locations.append(result[0])
+
+        resnames = []
+
+        for loc in locations:
+            res_temp = loc.strip().split('/')
+            resnames.append(res_temp[-1])
+
+        project_paths = pd.DataFrame(
+        {
+            'molecule': resnames,
+            'resname': resnames,
+            'path': locations
+        }
+        )
+        if csv_save_dir is not None:
+            project_paths.to_csv(f'{csv_save_dir}/project_paths.csv', index=False)
+            logger.info(f'project_paths saved under {csv_save_dir}')
+        else:
+            current_directory = os.getcwd()
+            project_paths.to_csv('project_paths.csv', index=False)
+            logger.info(f'project_paths saved under {current_directory}')
+
+    elif csv is not None:
+        locations = pd.read_csv(csv)
+        project_paths = locations.sort_values(by=['molecule', 'resname', 'path']).reset_index(drop=True)
+
+    return project_paths
+
+def automated_project_analysis(project_paths, ensemble_analysis, **kwargs):
+    """Takes a :class:`pandas.DataFrame` created by :func:`~mdpow.workflows.base.project_paths`
+       and iteratively runs the specified :class:`~mdpow.analysis.ensemble.EnsembleAnalysis`
+       for each of the projects by running the associated automated workflow
+       in each project directory returned by :func:`~mdpow.workflows.base.project_paths`.
+
+       Compatibility with more automated analyses in development.
+
+       :keywords:
+
+       *project_paths*
+           :class:`pandas.DataFrame` that provides paths to MDPOW projects
+
+       *ensemble_analysis*
+           name of the :class:`~mdpow.analysis.ensemble.EnsembleAnalysis`
+           that corresponds to the desired automated workflow module
+
+       *kwargs*
+           keyword arguments for the supported automated workflows,
+           see the :mod:`~mdpow.workflows.registry` for all available
+           workflows and their call signatures
+
+       .. rubric:: Example
+
+       A typical workflow is the automated dihedral analysis from 
+       :mod:`mdpow.workflows.dihedrals`, which applies the *ensemble analysis*
+       :class:`~mdpow.analysis.dihedral.DihedralAnalysis` to each project. 
+       The :data:`~mdpow.workflows.registry.registry` contains this automated
+       workflow under the key *"DihedralAnalysis"* and so the automated execution
+       for all `project_paths` (obtained via :func:`project_paths`) is performed by 
+       passing the specific key to :func:`automated_project_analysis`::
+
+           project_paths = project_paths(parent_directory='/foo/bar/MDPOW_projects')
+           automated_project_analysis(project_paths, ensemble_analysis='DihedralAnalysis', **kwargs)
+
+    """
+
+    for row in project_paths.itertuples():
+        molname = row.molecule
+        resname = row.resname
+        dirname = row.path
+
+        logger.info(f'starting {molname}')
+
+        try:
+            registry.registry[ensemble_analysis](dirname=dirname, resname=resname, molname=molname, **kwargs)
+
+            logger.info(f'{molname} completed')
+
+        except KeyError as err:
+            msg = (f"Invalid ensemble_analysis {err}. An EnsembleAnalysis type that corresponds "
+                    "to an existing automated workflow module must be input as a kwarg. "
+                    "ex: ensemble_analysis='DihedralAnalysis'")
+            logger.error(f'{err} is an invalid selection')
+
+            raise KeyError(msg)
+
+        except TypeError as err:
+            msg = (f"Invalid ensemble_analysis {ensemble_analysis}. An EnsembleAnalysis type that "
+                    "corresponds to an existing automated workflow module must be input as a kwarg. "
+                    "ex: ensemble_analysis='DihedralAnalysis'")
+            logger.error(f'workflow module for {ensemble_analysis} does not exist yet')
+
+            raise TypeError(msg)
+
+    logger.info('all analyses completed')
+    return
diff --git a/mdpow/workflows/registry.py b/mdpow/workflows/registry.py
@@ -0,0 +1,53 @@
+# MDPOW: registry.py
+# 2023 Cade Duckworth
+
+"""
+:mod:`mdpow.workflows.registry` --- Registry of currently supported automated workflows
+=======================================================================================
+
+The :mod:`mdpow.workflows.registry` module hosts a dictionary whose keys each name an
+:class:`~mdpow.analysis.ensemble.EnsembleAnalysis` for which a corresponding automated workflow exists.
+
+.. table:: Currently supported automated workflows.
+   :widths: auto
+   :name: workflows_registry
+
+   +-------------------------------+------------------------------------------------------------------------------------------------------+
+   | key/keyword: EnsembleAnalysis | value: <workflow module>.<top-level automated analysis function>                                     |
+   +===============================+======================================================================================================+
+   | DihedralAnalysis              | :any:`dihedrals.automated_dihedral_analysis <mdpow.workflows.dihedrals.automated_dihedral_analysis>` |
+   +-------------------------------+------------------------------------------------------------------------------------------------------+
+
+.. autodata:: registry
+
+.. seealso:: :mod:`~mdpow.workflows.base`
+
+"""
+
+# import analysis
+from mdpow.workflows import dihedrals
+
+# maps the name of an EnsembleAnalysis to its top-level automated workflow function
+registry = {
+
+    'DihedralAnalysis' : dihedrals.automated_dihedral_analysis
+
+}
+
+"""
+In the `registry`, each entry corresponds to an
+:class:`~mdpow.analysis.ensemble.EnsembleAnalysis`
+for which a corresponding automated workflow exists.
+
+Intended for use with :mod:`mdpow.workflows.base` to specify which
+:class:`~mdpow.analysis.ensemble.EnsembleAnalysis` should run iteratively over
+the provided project data directory.
+
+To include a new automated workflow for use with :mod:`mdpow.workflows.base`,
+create a key that is the name of the corresponding
+:class:`~mdpow.analysis.ensemble.EnsembleAnalysis`, with the value defined as
+`<workflow module>.<top-level automated analysis function>`.
+
+The available automated workflows (key-value pairs) are listed in the
+following table: :any:`Currently supported automated workflows. <workflows_registry>`
+
+"""