Merge pull request #101 from DeepRank/issue92_atomdensity

Update atom densities feature calculation
DeepRank · Sep 18, 2019 · e53f60b · e53f60b
2 parents 5bc7968 + fb37bed
commit e53f60b
Show file tree

Hide file tree

Showing 15 changed files with 188 additions and 120 deletions.
diff --git a/deeprank/config/__init__.py b/deeprank/config/__init__.py
@@ -5,6 +5,7 @@
 from .chemicals import AA_codes, AA_codes_3to1, AA_codes_1to3
 from .chemicals import AA_codes_pssm_ordered
 from .chemicals import AA_properties
+from .chemicals import atom_vdw_radius, atom_vdw_radius_noH
 
 # Debug
 DEBUG = False

diff --git a/deeprank/config/chemicals.py b/deeprank/config/chemicals.py
@@ -70,4 +70,19 @@
     'THR': 'polar',
     'TRP': 'polar',
     'TYR': 'polar'
-    }
+    }
+
+
+# atom vdw radius
+# William M Haynes. CRC Handbook of Chemistry and Physics.
+# ISBN 9781482208689.
+# URL: https://books.google.no/books?id=bNDMBQAAQBAJ.
+
+atom_vdw_radius_noH = {
+    "C": 1.7,
+    "N": 1.55,
+    "O": 1.52,
+    "S": 1.8,
+    }
+
+atom_vdw_radius = {**atom_vdw_radius_noH, "H": 1.1}
diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py
@@ -803,7 +803,7 @@ def map_features(self, grid_info={},
         >>> grid_info = {
         >>>     'number_of_points' : [30,30,30],
         >>>     'resolution' : [1.,1.,1.],
-        >>>     'atomic_densities' : {'CA':3.5,'N':3.5,'O':3.5,'C':3.5},
+        >>>     'atomic_densities' : {'C':1.7, 'N':1.55, 'O':1.52, 'S':1.8},
         >>> }
         >>>
         >>> database.map_features(grid_info,try_sparse=True,time=False,prog_bar=True)
@@ -1303,7 +1303,7 @@ def _add_pdb(molgrp, pdbfile, name):
                     for line in fi if line.startswith('ATOM')]
         #  PDB default line length is 80
         #  http://www.wwpdb.org/documentation/file-format
-        data = np.array(data).astype('|S73')
+        data = np.array(data).astype('|S78')
         molgrp.create_dataset(name, data=data)
 
 
@@ -1341,6 +1341,8 @@ def _add_aug_pdb(molgrp, pdbfile, name, axis, angle):
         # close the db
         sqldb.close()
 
+        # TODO the output does not obey PDB format
+        # TODO should not strip them!
         # export the data to h5
         data = []
         for d in sqldata:
@@ -1358,6 +1360,7 @@ def _add_aug_pdb(molgrp, pdbfile, name, axis, angle):
             line += '{: 8.3f}'.format(d[7])  # x
             line += '{: 8.3f}'.format(d[8])  # y
             line += '{: 8.3f}'.format(d[9])  # z
+            # TODO add the element
             try:
                 line += '{: 6.2f}'.format(d[10])    # occ
                 line += '{: 6.2f}'.format(d[11])    # temp
@@ -1366,7 +1369,7 @@ def _add_aug_pdb(molgrp, pdbfile, name, axis, angle):
                 line += '{: 6.2f}'.format(0)    # temp
             data.append(line)
 
-        data = np.array(data).astype('|S73')
+        data = np.array(data).astype('|S78')
         molgrp.create_dataset(name, data=data)
 
         return center

diff --git a/deeprank/generate/GridTools.py b/deeprank/generate/GridTools.py
@@ -35,8 +35,8 @@ def __init__(self, molgrp,
             number_of_points(int, optional): number of points we want in
                 each direction of the grid.
             resolution(float, optional): distance(in Angs) between two points.
-            atomic_densities(dict, optional): dictionary of atom types with
-                their vdw radius, e.g. {'CA':1.7, 'C':1.7, 'N':1.55, 'O':1.52}
+            atomic_densities(dict, optional): dictionary of element types with
+                their vdw radius, see deeprank.config.atom_vdw_radius_noH
             atomic_densities_mode(str, optional): Mode for mapping
                 (deprecated must be 'ind').
             feature(None, optional): Name of the features to be mapped.
@@ -336,24 +336,15 @@ def map_atomic_densities(self, only_contact=True):
                      self.sqldb.get('rowID', chainID='B'))
 
         # loop over all the data we want
-        for atomtype, vdw_rad in self.local_tqdm(
+        for elementtype, vdw_rad in self.local_tqdm(
                 self.atomic_densities.items()):
 
             t0 = time()
 
-            # get the contact atom that of the correct type on both chains
-            if only_contact:
-                xyzA = np.array(self.sqldb.get(
-                    'x,y,z', rowID=index[0], name=atomtype))
-                xyzB = np.array(self.sqldb.get(
-                    'x,y,z', rowID=index[1], name=atomtype))
-
-            else:
-                # get the atom that are of the correct type on both chains
-                xyzA = np.array(self.sqldb.get(
-                    'x,y,z', chainID='A', name=atomtype))
-                xyzB = np.array(self.sqldb.get(
-                    'x,y,z', chainID='B', name=atomtype))
+            xyzA = np.array(self.sqldb.get(
+                'x,y,z', rowID=index[0], element=elementtype))
+            xyzB = np.array(self.sqldb.get(
+                'x,y,z', rowID=index[1], element=elementtype))
 
             tprocess = time() - t0
 
@@ -404,16 +395,16 @@ def map_atomic_densities(self, only_contact=True):
 
             # create the final grid : A - B
             if mode == 'diff':
-                self.atdens[atomtype] = atdensA - atdensB
+                self.atdens[elementtype] = atdensA - atdensB
 
             # create the final grid : A + B
             elif mode == 'sum':
-                self.atdens[atomtype] = atdensA + atdensB
+                self.atdens[elementtype] = atdensA + atdensB
 
             # create the final grid : A and B
             elif mode == 'ind':
-                self.atdens[atomtype + '_chainA'] = atdensA
-                self.atdens[atomtype + '_chainB'] = atdensB
+                self.atdens[elementtype + '_chainA'] = atdensA
+                self.atdens[elementtype + '_chainB'] = atdensB
             else:
                 raise ValueError(f'Atomic density mode {mode} not recognized')
 

diff --git a/deeprank/learn/DataSet.py b/deeprank/learn/DataSet.py
@@ -10,6 +10,7 @@
 import numpy as np
 from tqdm import tqdm
 
+from deeprank import config
 from deeprank.config import logger
 from deeprank.generate import MinMaxParam, NormalizeData, NormParam
 from deeprank.tools import pdb2sql, sparse
@@ -65,8 +66,7 @@ def __init__(self, train_database, valid_database=None, test_database=None,
                     Select the features used in the learning.
                     if mapfly is True:
                         {'AtomDensities': 'all', 'Features': 'all'}
-                        {'AtomicDensities': {
-                            'CA': 1.7, 'C': 1.7, 'N': 1.55, 'O': 1.52},
+                        {'AtomicDensities': config.atom_vdw_radius_noH,
                             'Features': ['PSSM_*', 'pssm_ic_*']}
                     if mapfly is False:
                         {'AtomDensities_ind': 'all',
@@ -196,10 +196,10 @@ def __init__(self, train_database, valid_database=None, test_database=None,
     @staticmethod
     def _get_database_name(database):
         """Get the list of hdf5 database file names.
-       
+
         Args:
             database(None, str or list(str)): hdf5 database name(s).
-       
+
         Returns:
             list: hdf5 file names
         """
@@ -310,6 +310,7 @@ def __getitem__(self, index):
         """
 
         fname, mol, angle, axis = self.index_complexes[index]
+        print(fname, mol)
 
         if self.mapfly:
             feature, target = self.map_one_molecule(fname, mol, angle, axis)
@@ -500,7 +501,6 @@ def filter(self, molgrp):
 
         Args:
             molgrp (str): group name of the molecule in the hdf5 file
-           
         Returns:
             bool: True if we keep the complex False otherwise
 
@@ -541,7 +541,7 @@ class parameter self.select_feature examples:
             - {'Feature_ind': ['PSSM_*', 'pssm_ic_*']}
 
             Feature type must be: 'AtomicDensities_ind' or 'Feature_ind'.
-       
+
         Raises:
             KeyError: Wrong feature type.
             KeyError: Wrong feature type.
@@ -632,12 +632,10 @@ def get_raw_feature_name(self):
             class parameter self.select_feature examples:
             - 'all'
             - {'AtomicDensities': 'all', 'Features':all}
-            - {'AtomicDensities': {
-                'CA': 1.7, 'C': 1.7, 'N': 1.55, 'O': 1.52},
+            - {'AtomicDensities': config.atom_vaw_radius_noH,
                'Features': ['PSSM_*', 'pssm_ic_*']}
 
             Feature type must be: 'AtomicDensities' or 'Features'.
-       
         Raises:
             KeyError: Wrong feature type.
             KeyError: Wrong feature type.
@@ -650,8 +648,7 @@ class parameter self.select_feature examples:
         # if we select all the features
         if self.select_feature == "all":
             self.select_feature = {}
-            self.select_feature['AtomicDensities'] = {
-                'CA': 1.7, 'C': 1.7, 'N': 1.55, 'O': 1.52}
+            self.select_feature['AtomicDensities'] = config.atom_vdw_radius_noH
             self.select_feature['Features'] = [
                 name for name in raw_data.keys()]
 
@@ -663,8 +660,8 @@ class parameter self.select_feature examples:
                 # if for a given type we need all the feature
                 if feat_names == 'all':
                     if feat_type == 'AtomicDensities':
-                        self.select_feature['AtomicDensities'] = {
-                            'CA': 1.7, 'C': 1.7, 'N': 1.55, 'O': 1.52}
+                        self.select_feature['AtomicDensities'] = \
+                            config.atom_vdw_radius_noH
                     elif feat_type == 'Features':
                         self.select_feature[feat_type] = list(raw_data.keys())
                     else:
@@ -1220,13 +1217,13 @@ def make_feature_pair(feature, op):
 
     def get_grid(self, mol_data):
         """Get meshed grids and number of pointgs
-       
+
         Args:
             mol_data(h5 group): HDF5 moleucle group
-       
+
         Raises:
             ValueError: Grid points not found in mol_data.
-       
+
         Returns:
             tuple, tuple: meshgrid, npts
         """
@@ -1269,9 +1266,9 @@ def get_grid(self, mol_data):
     def map_atomic_densities(
             self, feat_names, mol_data, grid, npts, angle, axis):
         """Map atomic densities.
-       
+
         Args:
-            feat_names(dict): Atom type and vdw radius
+            feat_names(dict): Element type and vdw radius
             mol_data(h5 group): HDF5 molecule group
             grid(tuple): mesh grid of x,y,z
             npts(tuple): number of points on axis x,y,z
@@ -1289,16 +1286,21 @@ def map_atomic_densities(
             center = [np.mean(g) for g in grid]
 
         densities = []
-        for atomtype, vdw_rad in feat_names.items():
+        for elementtype, vdw_rad in feat_names.items():
 
             # get pos of the contact atoms of correct type
-            xyzA = np.array(sql.get('x,y,z', rowID=index[0], name=atomtype))
-            xyzB = np.array(sql.get('x,y,z', rowID=index[1], name=atomtype))
+            xyzA = np.array(sql.get(
+                'x,y,z', rowID=index[0], element=elementtype))
+            xyzB = np.array(sql.get(
+                'x,y,z', rowID=index[1], element=elementtype))
 
             # rotate if necessary
             if angle is not None:
-                xyzA = self._rotate_coord(xyzA, center, angle, axis)
-                xyzB = self._rotate_coord(xyzB, center, angle, axis)
+                if xyzA != np.array([]):
+                    xyzA = self._rotate_coord(xyzA, center, angle, axis)
+
+                if xyzB != np.array([]):
+                    xyzB = self._rotate_coord(xyzB, center, angle, axis)
 
             # init the grid
             atdensA = np.zeros(npts)

diff --git a/deeprank/tools/README.md b/deeprank/tools/README.md
@@ -4,7 +4,7 @@ Here are located all the generic tools used during one or across multiple steps
 
 ## PDB2SQL
 
-The file pdb2sql.py contains a class named pdb2sql that allows using sqlite3 to manipulate PDB files. The use of SQL queries makes it very easy to extract information from the PDB file using only one line of code. 
+The file pdb2sql.py contains a class named pdb2sql that allows using sqlite3 to manipulate PDB files. The use of SQL queries makes it very easy to extract information from the PDB file using only one line of code.
 
 ### Create a SQl data base
 
@@ -27,11 +27,11 @@ After its creation the database contains 13 columns and one line for each atom i
   * resName : the name of the residue the atom belongs to
   * chaiID  : the ID of the chain the atom belongs to
   * resSeq  : the residue number the atom belongs to
-  * iCode   : Code for insertion of residue 
+  * iCode   : Code for insertion of residue
   * x       : x coordinate of the atom
   * y       : y coordinate of the atom
   * z       : z coordinate of the atom
-  * occ     : occupancy 
+  * occ     : occupancy
   * temp    : temperature factor
 
 
@@ -138,19 +138,19 @@ sqldb.close(rmdb=False)
 
 ## FeatureClass
 
-The file FeatureClass.py contain a super class that all feature calculations should subclass. So far the super class only contains one method **FatureClass.export_data()** that is used to export the data of the feature to a file. This ensure that we keep the same syntax for all the features. The class has 3 attributes 
+The file FeatureClass.py contain a super class that all feature calculations should subclass. So far the super class only contains one method **FatureClass.export_data()** that is used to export the data of the feature to a file. This ensure that we keep the same syntax for all the features. The class has 3 attributes
 
 
   * self.type         : "Atomic" or "Residue"
   * self.feature_data : dictionary {feature_name : feature_dict}
 
     feature_name is the name of the feature e.g. 'coulomb' or 'vdwaals'
-    
+
     feature_dict is a dictionary. The format of the key depends on the type of feature
-    
+
     residue-based feature
     {(chainID, residue_name(3-letter), residue_number) : [values1, values2, ....]}
-    
+
     atomic-based feature
     {(chainID, residue_name(3-letter), residue_number, atom_name) : [values1, values2, ....]}
 
@@ -191,11 +191,11 @@ The file atomic_feature.py contains a class named atomicFeature that allows comp
   * a file containing the vdw parameters
   * evantually a patch file for the force field parameters
 
-An example of use is provided in ./example/grid/atomicfeature.py. 
+An example of use is provided in ./example/grid/atomicfeature.py.
 
-```python 
+```python
 from deeprank.tools import atomicFeature
-  
+
 PDB = 'complex.pdb'
 FF = './forcefield/'
 
@@ -225,4 +225,4 @@ atfeat.sqldb.close()
 ```
 
 
-In this example we compute the pair interactions and the atomic charges of the complex given in the example folder and using the force field parameters also located there. The pair interactions are outputed on the screen. For the charges, the contact atom list is extended to all the residues that contains at least one contact atom. 
+In this example we compute the pair interactions and the atomic charges of the complex given in the example folder and using the force field parameters also located there. The pair interactions are outputed on the screen. For the charges, the contact atom list is extended to all the residues that contains at least one contact atom.