Merge 5348d56 into dd91cee

DeepRank · Nov 16, 2021 · 3b06616 · 3b06616
2 parents dd91cee + 5348d56
commit 3b06616
Show file tree

Hide file tree

Showing 8 changed files with 5,406 additions and 17 deletions.
diff --git a/deeprank/models/atom.py b/deeprank/models/atom.py
@@ -1,13 +1,15 @@
 
 
 class Atom:
-    def __init__(self, id_, position, chain_id, name, element, residue=None):
+    def __init__(self, id_, position, chain_id, name, element, residue=None, altloc=None, occupancy=None):
         self.id = id_
         self.name = name
         self.element = element
         self.chain_id = chain_id
         self.position = position
         self.residue = residue
+        self.altloc = altloc
+        self.occupancy = occupancy
 
     def __hash__(self):
         return hash(self.id)

diff --git a/deeprank/operate/pdb.py b/deeprank/operate/pdb.py
@@ -34,32 +34,50 @@ def get_atoms(pdb2sql):
     # This is a working dictionary of residues, identified by their chains and numbers.
     residues = {}
 
-    # This is the list of atom objects, that will be returned.
-    atoms = []
+    # This is the dictionary of atom objects, its values will be returned.
+    atoms = {}
 
-    # Iterate over the atom output from pdb2sql
-    request_s = "x,y,z,rowID,name,element,chainID,resSeq,resName"
+    # Iterate over the atom output from pdb2sql, select atoms with highest occupancy.
+    request_s = "x,y,z,rowID,name,element,chainID,resSeq,resName,iCode,altLoc,occ"
+    highest_occupancies = {}
     for row in pdb2sql.get(request_s):
 
         try:
-            x, y, z, atom_number, atom_name, element, chain_id, residue_number, residue_name = row
+            x, y, z, atom_number, atom_name, element, chain_id, residue_number, residue_name, insertion_code, altloc, occ = row
         except:
             raise ValueError("Got unexpected row {} for {}".format(row, request_s))
 
+        atom_id = (chain_id, residue_number, insertion_code, atom_name)
+
+        if insertion_code == "":
+            insertion_code = None  # default value
+
         # Make sure that the residue is in the working directory:
-        residue_id = (chain_id, residue_number)
+        residue_id = (chain_id, residue_number, insertion_code)
         if residue_id not in residues:
-            residues[residue_id] = Residue(int(residue_number), residue_name, chain_id)
+            residues[residue_id] = Residue(int(residue_number), residue_name, chain_id, insertion_code)
 
         # Turn the x,y,z into a vector:
         atom_position = numpy.array([x, y, z])
 
-        # Create the atom object and link it to the residue:
-        atom = Atom(atom_number, atom_position, chain_id, atom_name, element, residues[residue_id])
+        atom_id = (chain_id, residue_number, insertion_code, atom_name)
+
+        # If the occupancy is lower than the previous atom with the same id, skip the atom:
+        if atom_id in highest_occupancies:
+            highest_occ = highest_occupancies[atom_id]
+            if occ <= highest_occ:
+                continue
+
+        # otherwise, overwrite..
+        atoms[atom_id] = Atom(atom_number, atom_position, chain_id, atom_name, element, residues[residue_id], altloc, occ)
+        highest_occupancies[atom_id] = occ
+
+    # Link atoms to residues:
+    for (chain_id, residue_number, insertion_code, atom_name), atom in atoms.items():
+        residue_id = (chain_id, residue_number, insertion_code)
         residues[residue_id].atoms.append(atom)
-        atoms.append(atom)
 
-    return atoms
+    return list(atoms.values())
 
 
 def get_residue_contact_atom_pairs(pdb2sql, chain_id, residue_number, insertion_code, max_interatomic_distance):

diff --git a/test/data/pdb/5MNH/5MNH.pdb b/test/data/pdb/5MNH/5MNH.pdb
diff --git a/test/features/test_neighbour_profile.py b/test/features/test_neighbour_profile.py
@@ -14,12 +14,12 @@
 def test_feature():
     tmp_dir_path = mkdtemp()
 
+    variant = PdbVariantSelection("test/101M.pdb", "A", 25, glycine, tryptophan,
+                                  {'A': "test/101M.A.pdb.pssm"})
+
     try:
         hdf5_path = os.path.join(tmp_dir_path, 'test.hdf5')
 
-        variant = PdbVariantSelection("test/101M.pdb", "A", 25, glycine, tryptophan,
-                                      {'A': "test/101M.A.pdb.pssm"})
-
         with h5py.File(hdf5_path, 'w') as f5:
             group = f5.require_group("features")
             __compute_feature__(variant.pdb_path, group, None, variant)

diff --git a/test/generate/test_gridtools.py b/test/generate/test_gridtools.py
@@ -13,7 +13,7 @@
 from deeprank.generate.GridTools import GridTools
 from deeprank.models.variant import PdbVariantSelection
 from deeprank.features.atomic_contacts import __compute_feature__ as compute_contact_feature
-from deeprank.domain.amino_acid import phenylalanine, tyrosine, valine, unknown_amino_acid
+from deeprank.domain.amino_acid import phenylalanine, tyrosine, valine, aspartate, asparagine, unknown_amino_acid
 
 
 _log = logging.getLogger(__name__)
@@ -102,6 +102,41 @@ def test_atomic_contacts_mapping():
         shutil.rmtree(tmp_dir)
 
 
+def test_nan():
+    "this variant was reported as NaN-causing"
+
+    pdb_path = "test/data/pdb/5MNH/5MNH.pdb"
+
+    variant = PdbVariantSelection(pdb_path, 'A', 153, aspartate, asparagine)
+    variant_name = "NaN-causing"
+
+    feature_types = ["coulomb"]
+
+    tmp_dir = mkdtemp()
+    try:
+        tmp_path = os.path.join(tmp_dir, "test.hdf5")
+        with h5py.File(tmp_path, 'w') as f5:
+
+            variant_group = f5.require_group(variant_name)
+            variant_group.attrs['type'] = 'variant'
+            hdf5data.store_variant(variant_group, variant)
+
+            feature_group = variant_group.require_group('features')
+            raw_feature_group = variant_group.require_group('features_raw')
+
+            compute_contact_feature(pdb_path, feature_group, raw_feature_group, variant)
+
+            # Build the grid and map the features.
+            gridtools = GridTools(variant_group, variant,
+                                  number_of_points=20, resolution=1.0,
+                                  atomic_densities={'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8},
+                                  feature=feature_types,
+                                  contact_distance=8.5,
+                                  try_sparse=False)
+    finally:
+        shutil.rmtree(tmp_dir)
+
+
 def test_feature_mapping():
     """ In this test, we investigate a set of five atoms. We make the grid tool take
         the atoms in a 20 A radius around the first atom and compute the carbon density grid around it.

diff --git a/test/operate/test_pdb.py b/test/operate/test_pdb.py
@@ -17,6 +17,27 @@ def test_xray():
         assert not is_xray(f), "1a6b was identified as x-ray"
 
 
+def test_altloc():
+    pdb_path = "test/data/pdb/5MNH/5MNH.pdb"
+
+    try:
+        pdb = pdb2sql(pdb_path)
+
+        atoms = get_atoms(pdb)
+
+    finally:
+        pdb._close()
+
+    selection = [atom for atom in atoms if atom.residue.number == 153 and
+                                           atom.chain_id == "A" and
+                                           atom.name == "CA"]
+
+    assert len(selection) == 1, "got {} of the same atom".format(len(selection))
+
+    # should be altloc C
+    assert selection[0].altloc == 'C', "got atom {} instead".format(selection[0].altloc)
+
+
 def test_get_atoms():
     pdb_path = os.path.join(pkg_resources.resource_filename(__name__, ''),
                             "../1AK4/native/1AK4.pdb")
@@ -63,6 +84,7 @@ def test_residue_contact_atoms():
         query_residue = _find_residue(atoms, chain_id, residue_number)
 
         contact_atom_pairs = get_residue_contact_atom_pairs(pdb, chain_id, residue_number, None, 8.5)
+        assert len(contact_atom_pairs) > 0, "no contacts found"
 
         # Check for redundancy (we shouldn't see the same set of atoms twice)
         atom_pairs_encountered = []
@@ -101,3 +123,25 @@ def test_residue_contact_atoms():
 
     ok_(neighbour in contact_atoms)
     ok_(distant not in contact_atoms)
+
+
+def test_contacts_101m():
+    pdb_path = "test/101M.pdb"
+    chain_id = "A"
+
+    residue_number = 25
+
+    pdb = pdb2sql(pdb_path)
+    try:
+        contact_atom_pairs = get_residue_contact_atom_pairs(pdb, chain_id, residue_number, None, 10.0)
+
+        assert len(contact_atom_pairs) > 0, "no contacts found"
+
+        for atom1, atom2 in contact_atom_pairs:
+            assert atom1.residue is not None
+            assert len(atom1.residue.atoms) > 0
+
+            assert atom2.residue is not None
+            assert len(atom2.residue.atoms) > 0
+    finally:
+        pdb._close()
diff --git a/test/test_generate.py b/test/test_generate.py
@@ -1,10 +1,12 @@
 import os
+import sys
 import unittest
 from time import time
 import shutil
 from tempfile import mkdtemp
 import logging
 
+from unittest.mock import patch, MagicMock
 import numpy
 import h5py
 from nose.tools import eq_, ok_
@@ -119,3 +121,47 @@ def test_skip_error():
     finally:
         shutil.rmtree(tmp_dir)
 
+
+@patch("deeprank.generate.GridTools.map_features")
+def test_skip_nan(mock_map_features):
+    "NaN features should not be added to the preprocessing"
+
+    number_of_points = 20
+
+    nan_dict = {}
+    for feature_name in ["feature1", "feature2"]:
+
+        grid = numpy.empty((number_of_points, number_of_points, number_of_points))
+        grid[:] = numpy.nan
+
+        nan_dict[feature_name] = grid
+
+    mock_map_features.return_value = nan_dict
+
+    tmp_dir = mkdtemp()
+
+    variants = [PdbVariantSelection("test/101m.pdb", 'A', 25, glycine, alanine)]
+
+    feature_names = ["test.feature.feature1", "test.feature.feature2"]
+    target_names = ["test.target.target1"]
+
+    resolution = 1.0
+    atomic_densities = {'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8}
+    grid_info = {
+        'number_of_points': [number_of_points, number_of_points, number_of_points],
+        'resolution': [resolution, resolution, resolution],
+        'atomic_densities': atomic_densities,
+    }
+
+    hdf5_path = os.path.join(tmp_dir, "test.hdf5")
+
+    try:
+        data_generator = DataGenerator(variants, None, target_names, feature_names, 1, hdf5_path)
+        data_generator.create_database()
+        data_generator.map_features(grid_info)
+
+        # Check that the output file is empty, since nan entries don't belong in the output.
+        with h5py.File(hdf5_path, 'r') as f5:
+            ok_(len(f5.keys()) == 0)
+    finally:
+        shutil.rmtree(tmp_dir)
diff --git a/test/test_learn.py b/test/test_learn.py
@@ -29,7 +29,7 @@ def test_learn():
     feature_modules = ["test.feature.feature1", "test.feature.feature2"]
     target_modules = ["test.target.target1"]
     pdb_path = "test/101M.pdb"
-    pssm_paths = {"A": "101M.A.pdb.pssm"}
+    pssm_paths = {"A": "test/101M.A.pdb.pssm"}
 
     atomic_densities = {'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8}
     grid_info = {