DeepRank · DaniBodor · May 11, 2023 · Mar 19, 2023 · Mar 19, 2023 · Mar 19, 2023
diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml
@@ -60,7 +60,7 @@ runs:
         # Only way to install msms is through conda
         conda install -c bioconda msms
         # Safest way to install PyTorch and PyTorch Geometric is through conda
-        conda install pytorch torchvision torchaudio cpuonly -c pytorch
+        conda install pytorch==2.0.0 torchvision=0.15.0 torchaudio=2.0.0 cpuonly -c pytorch
         conda install pyg -c pyg
         # Install optional pyg dependencies
         python3 -m pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-$(python3 -c "import torch; print(torch.__version__)")+cpu.html

diff --git a/README.md b/README.md
@@ -59,7 +59,9 @@ Before installing deeprankcore you need to install:
  * [msms](https://ssbio.readthedocs.io/en/latest/instructions/msms.html): `conda install -c bioconda msms`. *For MacOS with M1 chip users*: you can follow [these instructions](https://ssbio.readthedocs.io/en/latest/instructions/msms.html).
  * [dssp](https://swift.cmbi.umcn.nl/gv/dssp/): `sudo apt-get install dssp`
     * See [DSSP docs](https://ssbio.readthedocs.io/en/latest/instructions/dssp.html) for installing it on Mac OSX
- * [pytorch](https://pytorch.org/get-started/locally/): `conda install pytorch torchvision torchaudio cpuonly -c pytorch` or `conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia`, for taking advantage of GPUs.
+ * [pytorch](https://pytorch.org/get-started/locally/): 
+   * CPU only: `conda install pytorch==2.0.0 torchvision==0.15.0 torchaudio==2.0.0 cpuonly -c pytorch`
+   * With GPU: `conda install pytorch==2.0.0 torchvision==0.15.0 torchaudio==2.0.0 pytorch-cuda=11.7 -c pytorch -c nvidia`
  * [pytorch-geometric](https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html): `conda install pyg -c pyg`
  * [Dependencies for pytorch geometric from wheels](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html#installation-from-wheels): `pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-${TORCH}+${CUDA}.html`. 
     - Here, `${TORCH}` and `${CUDA}` should be replaced by the pytorch and CUDA versions installed. You can find these using:

diff --git a/deeprankcore/domain/aminoacidlist.py b/deeprankcore/domain/aminoacidlist.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
 from deeprankcore.molstruct.aminoacid import AminoAcid, Polarity
 
 # All info below sourced from above websites in December 2022 and summarized in deeprankcore/domain/aminoacid_summary.xlsx
@@ -349,3 +351,24 @@
     # selenocysteine,
     # pyrrolysine,
     ]
+
+def convert_aa_nomenclature(aa: str, output_type: Optional[int] = None) -> str:
+    """Converts an amino acid between its name, three letter code and one letter code.
+
+    Args:
+        aa (str): Amino acid as one letter code, three letter code or name (inferred from its length; case-insensitive).
+        output_type (Optional[int], optional): None for the name, 3 for the three letter code, 1 for the one letter code.
+
+    Returns:
+        str: The requested nomenclature of the matching entry in `amino_acids`.
+
+    Raises:
+        ValueError: Raised if `aa` matches no entry in `amino_acids`, or if `output_type` is not None, 1 or 3.
+    """
+    attribute = {1: 'one_letter_code', 3: 'three_letter_code'}.get(len(aa), 'name')
+    found = next((entry for entry in amino_acids if getattr(entry, attribute).lower() == aa.lower()), None)
+    if found is None:
+        raise ValueError(f'{aa} is not a valid amino acid.')
+    if output_type not in (None, 1, 3):  # explicit check: a falsy value such as 0 must not silently select the name
+        raise ValueError(f'output_type {output_type} not recognized. Must be set to None (amino acid name), 1 (one letter code), or 3 (three letter code).')
+    return getattr(found, {None: 'name', 1: 'one_letter_code', 3: 'three_letter_code'}[output_type])
diff --git a/deeprankcore/features/conservation.py b/deeprankcore/features/conservation.py
@@ -9,14 +9,14 @@
 from deeprankcore.molstruct.variant import SingleResidueVariant
 from deeprankcore.utils.graph import Graph
 
-profile_amino_acid_order = sorted(amino_acids, key=lambda aa: aa.one_letter_code)
-
 
 def add_features( # pylint: disable=unused-argument
     pdb_path: str, graph: Graph,
     single_amino_acid_variant: Optional[SingleResidueVariant] = None
     ):
 
+    profile_amino_acid_order = sorted(amino_acids, key=lambda aa: aa.three_letter_code)
+
     for node in graph.nodes:
         if isinstance(node.id, Residue):
             residue = node.id

diff --git a/deeprankcore/molstruct/residue.py b/deeprankcore/molstruct/residue.py
@@ -51,7 +51,7 @@ def get_pssm(self) -> PssmRow:
 
         pssm = self._chain.pssm
         if pssm is None:
-            raise ValueError(f"pssm not set on {self._chain}")
+            raise FileNotFoundError(f'No pssm file found for Chain {self._chain}.')
 
         return pssm[self]
 

diff --git a/deeprankcore/query.py b/deeprankcore/query.py
@@ -17,6 +17,8 @@
 import pdb2sql
 
 import deeprankcore.features
+from deeprankcore.domain.aminoacidlist import convert_aa_nomenclature
+from deeprankcore.features import components, contact
 from deeprankcore.molstruct.aminoacid import AminoAcid
 from deeprankcore.molstruct.atom import Atom
 from deeprankcore.molstruct.residue import get_residue_center
@@ -33,6 +35,46 @@
 _log = logging.getLogger(__name__)
 
 
+def _check_pssm(pdb_path: str, pssm_paths: Dict[str, str]):
+    """Checks whether information stored in PSSM file matches the PDB file.
+
+    Only residues of chains listed in `pssm_paths` are compared, and blank lines in the PSSM files are ignored.
+
+    Args:
+        pdb_path (str): Path to the PDB file.
+        pssm_paths (Dict[str, str]): The paths to the PSSM files, per chain identifier.
+
+    Raises:
+        ValueError: Raised if info between PDB and PSSM doesn't match.
+    """
+
+    # map "<chain id><zero-padded residue number>" to the three letter code listed in the PSSM file
+    pssm_data = {}
+    for chain, pssm_path in pssm_paths.items():
+        with open(pssm_path, encoding='utf-8') as f:
+            rows = [line.split() for line in f.readlines()[1:]]  # the first line is skipped
+        for row in rows:
+            if row:  # a blank (e.g. trailing) line has no columns to read
+                pssm_data[chain + row[0].zfill(4)] = convert_aa_nomenclature(row[1], 3)
+
+    # load ground truth from pdb file; chains without a PSSM file cannot be checked and are left out
+    pdb_truth = pdb2sql.pdb2sql(pdb_path).get_residues()
+    pdb_truth = {res[0] + str(res[2]).zfill(4): res[1] for res in pdb_truth if res[0] in pssm_paths}
+
+    # a residue is either absent from the PSSM data (missing) or listed there as another amino acid (wrong)
+    n_missing = sum(1 for residue in pdb_truth if residue not in pssm_data)
+    n_wrong = sum(1 for residue, amino_acid in pdb_truth.items()
+                  if residue in pssm_data and amino_acid != pssm_data[residue])
+
+    if n_missing + n_wrong > 0:
+        error_message = f'Amino acids in PSSM files do not match pdb file for {pdb_path}.'
+        if n_wrong > 0:
+            error_message = error_message + f'\n\t{n_wrong} entries are incorrect.'
+        if n_missing > 0:
+            error_message = error_message + f'\n\t{n_missing} entries are missing.'
+        raise ValueError(error_message)
+
+
 class Query:
 
     def __init__(self, model_id: str, targets: Optional[Dict[str, Union[float, int]]] = None):
@@ -90,6 +132,7 @@ def _load_structure(
 
         # read the pssm
         if pssm_paths is not None:
+            _check_pssm(pdb_path, pssm_paths)
             for chain in structure.chains:
                 if chain.id in pssm_paths:
                     pssm_path = pssm_paths[chain.id]
@@ -121,6 +164,7 @@ def build(self, feature_modules: List[ModuleType], include_hydrogens: bool = Fal
     def get_query_id(self) -> str:
         raise NotImplementedError("Must be defined in child classes.")
 
+
 class QueryCollection:
     """
     Represents the collection of data queries.
@@ -223,10 +267,10 @@ def _process_one_query(  # pylint: disable=too-many-arguments
             _log.exception(e)
             return None
 
-    def process( # pylint: disable=too-many-arguments, too-many-locals
+    def process( # pylint: disable=too-many-arguments, too-many-locals, dangerous-default-value
         self, 
         prefix: Optional[str] = None,
-        feature_modules: Optional[Union[ModuleType, List[ModuleType]]] = None,
+        feature_modules: Union[ModuleType, List[ModuleType], str, List[str]] = [components, contact],
         cpu_count: Optional[int] = None,
         combine_output: bool = True,
         grid_settings: Optional[GridSettings] = None,
@@ -236,11 +280,11 @@ def process( # pylint: disable=too-many-arguments, too-many-locals
         """
         Args:
             prefix (Optional[str], optional): Prefix for the output files. Defaults to None, which sets ./processed-queries- prefix.
-            feature_modules(Union[ModuleType, List[ModuleType]], optional): Features' module or list of features' modules used to generate features. 
-                Each feature's module must implement the :py:func:`add_features` function, and features' modules can be found (or should be placed
-                in case of a custom made feature) in `deeprankcore.features` folder. 
-                Defaults to None, which means that all available modules in `deeprankcore.features` are used to generate
-                the features. 
+            feature_modules (Union[ModuleType, List[ModuleType], str, List[str]], optional): Features' module or list of features' modules
+                used to generate features (given as string or as an imported module). Each module must implement the :py:func:`add_features` function, 
+                and features' modules can be found (or should be placed in case of a custom made feature) in `deeprankcore.features` folder. 
+                If set to 'all', all available modules in `deeprankcore.features` are used to generate the features.
+                Defaults to only the basic feature modules `deeprankcore.features.components` and `deeprankcore.features.contact`.
             cpu_count (Optional[int], optional): How many processes to be run simultaneously. Defaults to None, which takes all available cpu cores.
             combine_output (bool, optional): For combining the HDF5 files generated by the processes. Defaults to True.
             grid_settings (Optional[:class:`GridSettings`], optional): If valid together with `grid_map_method`, the grid data will be stored as well.
@@ -254,29 +298,32 @@ def process( # pylint: disable=too-many-arguments, too-many-locals
             List[str]: The list of paths of the generated HDF5 files.
         """
 
+        # set defaults
+        if prefix is None:
+            prefix = "processed-queries"
         if cpu_count is None:
-            # returns the number of CPUs in the system
-            cpu_count = os.cpu_count()
+            cpu_count = os.cpu_count()  # returns the number of CPUs in the system
         else:
             cpu_count_system = os.cpu_count()
             if cpu_count > cpu_count_system:
                 _log.warning(f'\nTried to set {cpu_count} CPUs, but only {cpu_count_system} are present in the system.')
                 cpu_count = cpu_count_system
-
         self.cpu_count = cpu_count
-
         _log.info(f'\nNumber of CPUs for processing the queries set to: {self.cpu_count}.')
 
-        if prefix is None:
-            prefix = "processed-queries"
-
-        if feature_modules is None:
+
+        if feature_modules == 'all':
             feature_names = [modname for _, modname, _ in pkgutil.iter_modules(deeprankcore.features.__path__)]
         elif isinstance(feature_modules, list):
-            feature_names = [basename(m.__file__)[:-3] for m in feature_modules]
-        else:
+            feature_names = [basename(m.__file__)[:-3] if isinstance(m,ModuleType) 
+                             else m.replace('.py','') for m in feature_modules]
+        elif isinstance(feature_modules, ModuleType):
             feature_names = [basename(feature_modules.__file__)[:-3]]
-
+        elif isinstance(feature_modules, str):
+            feature_names = [feature_modules.replace('.py','')]
+        else:
+            raise ValueError(f'Feature_modules has received an invalid input type: {type(feature_modules)}.')
+        _log.info(f'\nSelected feature modules: {feature_names}.')
 
         _log.info(f'Creating pool function to process {len(self.queries)} queries...')
         pool_function = partial(self._process_one_query, prefix,
@@ -320,13 +367,13 @@ def __init__(  # pylint: disable=too-many-arguments
         Creates a residue graph from a single residue variant in a .PDB file.
 
         Args:
-            pdb_path (str): The path to the .PDB file.
+            pdb_path (str): The path to the PDB file.
             chain_id (str): The .PDB chain identifier of the variant residue.
             residue_number (int): The number of the variant residue.
             insertion_code (str): The insertion code of the variant residue, set to None if not applicable.
             wildtype_amino_acid (:class:`AminoAcid`): The wildtype amino acid.
             variant_amino_acid (:class:`AminoAcid`): The variant amino acid.
-            pssm_paths (Optional[Dict(str,str)], optional): The paths to the .PSSM files, per chain identifier. Defaults to None.
+            pssm_paths (Optional[Dict(str,str)], optional): The paths to the PSSM files, per chain identifier. Defaults to None.
             radius (float, optional): In Ångström, determines how many residues will be included in the graph. Defaults to 10.0.
             distance_cutoff (Optional[float], optional): Max distance in Ångström between a pair of atoms to consider them as an external edge in the graph.
                 Defaults to 4.5.
@@ -598,9 +645,11 @@ def _load_ppi_atoms(pdb_path: str,
 
 def _load_ppi_pssms(pssm_paths: Union[Dict[str, str], None],
                     chain_id1: str, chain_id2: str,
-                    structure: PDBStructure):
+                    structure: PDBStructure,
+                    pdb_path):
 
     if pssm_paths is not None:
+        _check_pssm(pdb_path, pssm_paths)
         for chain_id in [chain_id1, chain_id2]:
             if chain_id in pssm_paths:
 
@@ -612,7 +661,6 @@ def _load_ppi_pssms(pssm_paths: Union[Dict[str, str], None],
                     chain.pssm = parse_pssm(f, chain)
 
 
-
 class ProteinProteinInterfaceAtomicQuery(Query):
 
     def __init__(  # pylint: disable=too-many-arguments
@@ -694,7 +742,7 @@ def build(self, feature_modules: List[ModuleType], include_hydrogens: bool = Fal
 
         _load_ppi_pssms(self._pssm_paths,
                         self._chain_id1, self._chain_id2,
-                        structure)
+                        structure, self._pdb_path)
 
         # add the features
         for feature_module in feature_modules:
@@ -792,7 +840,7 @@ def build(self, feature_modules: List[ModuleType], include_hydrogens: bool = Fal
 
         _load_ppi_pssms(self._pssm_paths,
                         self._chain_id1, self._chain_id2,
-                        structure)
+                        structure, self._pdb_path)
 
         # add the features
         for feature_module in feature_modules:

diff --git a/deeprankcore/utils/parsing/pssm.py b/deeprankcore/utils/parsing/pssm.py
@@ -5,10 +5,6 @@
 from deeprankcore.molstruct.structure import Chain
 from deeprankcore.utils.pssmdata import PssmRow, PssmTable
 
-amino_acids_by_letter = {
-    amino_acid.one_letter_code: amino_acid for amino_acid in amino_acids
-}
-
 
 def parse_pssm(file_: TextIO, chain: Chain) -> PssmTable:
     """Read the PSSM data.
@@ -21,6 +17,7 @@ def parse_pssm(file_: TextIO, chain: Chain) -> PssmTable:
         PssmTable: The position-specific scoring table, parsed from the pssm file.
     """
 
+    amino_acids_by_letter = {amino_acid.one_letter_code: amino_acid for amino_acid in amino_acids}
     conservation_rows = {}
 
     # Read the pssm header.

diff --git a/deeprankcore/utils/pssmdata.py b/deeprankcore/utils/pssmdata.py
@@ -1,11 +1,7 @@
 from typing import Dict, List, Optional
 
-from deeprankcore.domain.aminoacidlist import amino_acids
 from deeprankcore.molstruct.aminoacid import AminoAcid
 
-amino_acids_by_letter = {
-    amino_acid.one_letter_code: amino_acid for amino_acid in amino_acids
-}
 
 class PssmRow:
     """Holds data for one position-specific scoring matrix row."""

diff --git a/docs/installation.md b/docs/installation.md
@@ -9,7 +9,9 @@ Before installing deeprankcore you need to install:
  * [msms](https://ssbio.readthedocs.io/en/latest/instructions/msms.html): `conda install -c bioconda msms`. *For MacOS with M1 chip users*: you can follow [these instructions](https://ssbio.readthedocs.io/en/latest/instructions/msms.html).
  * [dssp](https://swift.cmbi.umcn.nl/gv/dssp/): `sudo apt-get install dssp`
     * See [DSSP docs](https://ssbio.readthedocs.io/en/latest/instructions/dssp.html) for installing it on Mac OSX
- * [pytorch](https://pytorch.org/get-started/locally/): `conda install pytorch torchvision torchaudio cpuonly -c pytorch` or `conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia`, for taking advantage of GPUs.
+ * [pytorch](https://pytorch.org/get-started/locally/): 
+   * CPU only: `conda install pytorch==2.0.0 torchvision==0.15.0 torchaudio==2.0.0 cpuonly -c pytorch`
+   * With GPU: `conda install pytorch==2.0.0 torchvision==0.15.0 torchaudio==2.0.0 pytorch-cuda=11.7 -c pytorch -c nvidia`
  * [pytorch-geometric](https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html): `conda install pyg -c pyg`
  * [Dependencies for pytorch geometric from wheels](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html#installation-from-wheels): `pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-${TORCH}+${CUDA}.html`. 
     - Here, `${TORCH}` and `${CUDA}` should be replaced by the pytorch and CUDA versions installed. You can find these using:

diff --git a/tests/data/pssm/1ATN/1ATN.A.pdb.deeprank.pssm b/tests/data/pssm/1ATN/1ATN.A.pdb.deeprank.pssm