From e54334b78d0095accb744b848c53ddfb85fda3b0 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Wed, 11 Sep 2019 16:28:49 +0200
Subject: [PATCH 01/14] add element column and get_element func

---
 deeprank/tools/pdb2sql.py | 49 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

diff --git a/deeprank/tools/pdb2sql.py b/deeprank/tools/pdb2sql.py
index 8e0dce0f..3abbfddb 100644
--- a/deeprank/tools/pdb2sql.py
+++ b/deeprank/tools/pdb2sql.py
@@ -113,7 +113,8 @@ def _create_sql(self):
                     'y': 'REAL',
                     'z': 'REAL',
                     'occ': 'REAL',
-                    'temp': 'REAL'
+                    'temp': 'REAL',
+                    'element': 'TEXT' 
                     }
 
         # delimtier of the column format
@@ -131,7 +132,9 @@ def _create_sql(self):
             'y': [38, 46],
             'z': [46, 54],
             'occ': [54, 60],
-            'temp': [60, 66]}
+            'temp': [60, 66],
+            'element': [76,78]
+            }
 
         if self.no_extra:
             del self.col['occ']
@@ -230,6 +233,10 @@ def _create_sql(self):
                 elif coltype == 'REAL':
                     data_col = float(data_col)
 
+                # get element if it does not exist
+                if colname == "element" and not data_col: 
+                    data_col = pdb2sql._get_element(line)
+
                 # append keep the comma !!
                 # we need proper tuple
                 at += (data_col,)
@@ -240,6 +247,42 @@ def _create_sql(self):
         # push in the database
         self.c.executemany(f'INSERT INTO ATOM VALUES ({qm})', data_atom)
 
+    @staticmethod
+    def _get_element(pdb_line):
+        """Get element type from the atom type of a pdb line
+        
+        Notes:
+            Atom type occupies 13-16th columns of a PDB line.
+            http://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
+            Four situations exist:
+                13 14 15 16
+                   C  A      The element is C
+                C  A         The element is Ca   
+                1  H  G      The element is H
+                H  E  2  1   The element is H
+
+        Args:
+            pdb_line(str): one PDB ATOM line
+        
+        Returns:
+            [str]: element name
+        """
+
+        first_char = pdb_line[12].strip()
+        last_char = pdb_line[15].strip()
+        if first_char:
+            if first_char in "0123456789":
+                elem = pdb_line[13]
+            elif first_char == "H" and last_char:
+                elem = "H"
+            else:
+                elem = pdb_line[12:14]
+            
+        else:
+            elem = pdb_line[13]
+        return elem
+
+
     def _fix_chainID(self):
         """Fix the chain ID if necessary.
 
@@ -1077,6 +1120,8 @@ def exportpdb(self, fname, **kwargs):
         # write each line
         # the PDB format is pretty strict
         # http://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
+        # TODO make sure the output of atom type on correct position.
+        # TODO use exportpdb in DataGenerator
         f = open(fname, 'w')
         for d in data:
             line = 'ATOM  '

From 3b434488b2fbb0674ca82b795d21ce3b3d7933a0 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Wed, 11 Sep 2019 16:39:04 +0200
Subject: [PATCH 02/14] change atom density to element level

---
 deeprank/generate/GridTools.py | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/deeprank/generate/GridTools.py b/deeprank/generate/GridTools.py
index 4ddc08e3..6a8b4426 100644
--- a/deeprank/generate/GridTools.py
+++ b/deeprank/generate/GridTools.py
@@ -336,24 +336,15 @@ def map_atomic_densities(self, only_contact=True):
                      self.sqldb.get('rowID', chainID='B'))
 
         # loop over all the data we want
-        for atomtype, vdw_rad in self.local_tqdm(
+        for elementtype, vdw_rad in self.local_tqdm(
                 self.atomic_densities.items()):
 
             t0 = time()
 
-            # get the contact atom that of the correct type on both chains
-            if only_contact:
-                xyzA = np.array(self.sqldb.get(
-                    'x,y,z', rowID=index[0], name=atomtype))
-                xyzB = np.array(self.sqldb.get(
-                    'x,y,z', rowID=index[1], name=atomtype))
-
-            else:
-                # get the atom that are of the correct type on both chains
-                xyzA = np.array(self.sqldb.get(
-                    'x,y,z', chainID='A', name=atomtype))
-                xyzB = np.array(self.sqldb.get(
-                    'x,y,z', chainID='B', name=atomtype))
+            xyzA = np.array(self.sqldb.get(
+                'x,y,z', rowID=index[0], element=elementtype))
+            xyzB = np.array(self.sqldb.get(
+                'x,y,z', rowID=index[1], element=elementtype))
 
             tprocess = time() - t0
 
@@ -404,16 +395,16 @@ def map_atomic_densities(self, only_contact=True):
 
             # create the final grid : A - B
             if mode == 'diff':
-                self.atdens[atomtype] = atdensA - atdensB
+                self.atdens[elementtype] = atdensA - atdensB
 
             # create the final grid : A + B
             elif mode == 'sum':
-                self.atdens[atomtype] = atdensA + atdensB
+                self.atdens[elementtype] = atdensA + atdensB
 
             # create the final grid : A and B
             elif mode == 'ind':
-                self.atdens[atomtype + '_chainA'] = atdensA
-                self.atdens[atomtype + '_chainB'] = atdensB
+                self.atdens[elementtype + '_chainA'] = atdensA
+                self.atdens[elementtype + '_chainB'] = atdensB
             else:
                 raise ValueError(f'Atomic density mode {mode} not recognized')
 

From ff241d60cafb390962d90b1cbe1f1e5c661f1b11 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Wed, 11 Sep 2019 16:58:49 +0200
Subject: [PATCH 03/14] add atom vdw radius

---
 deeprank/config/chemicals.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/deeprank/config/chemicals.py b/deeprank/config/chemicals.py
index 4c537d28..c39b003b 100644
--- a/deeprank/config/chemicals.py
+++ b/deeprank/config/chemicals.py
@@ -70,4 +70,17 @@
     'THR': 'polar',
     'TRP': 'polar',
     'TYR': 'polar'
-    }
\ No newline at end of file
+    }
+
+
+# atom vdw radius
+# https://en.wikipedia.org/wiki/Van_der_Waals_radius
+
+atom_vdw_radius_noH = {
+    "C": 1.7,
+    "N": 1.55,
+    "O": 1.52,
+    "S": 1.8,
+    }
+
+atom_vdw_radius = {**atom_vdw_radius_noH, "H": 1.1}
\ No newline at end of file

From 57c0d424147444e85fc5f2612a5ba0412612d2e8 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Wed, 11 Sep 2019 16:59:16 +0200
Subject: [PATCH 04/14] change PDB line length from 73 to 78

---
 deeprank/generate/DataGenerator.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py
index 706ec583..5c838ef3 100644
--- a/deeprank/generate/DataGenerator.py
+++ b/deeprank/generate/DataGenerator.py
@@ -1303,7 +1303,7 @@ def _add_pdb(molgrp, pdbfile, name):
                     for line in fi if line.startswith('ATOM')]
         #  PDB default line length is 80
         #  http://www.wwpdb.org/documentation/file-format
-        data = np.array(data).astype('|S73')
+        data = np.array(data).astype('|S78')
         molgrp.create_dataset(name, data=data)
 
 
@@ -1341,6 +1341,8 @@ def _add_aug_pdb(molgrp, pdbfile, name, axis, angle):
         # close the db
         sqldb.close()
 
+        # TODO the output does not obey PDB format
+        # TODO should not strip them!
         # export the data to h5
         data = []
         for d in sqldata:
@@ -1358,6 +1360,7 @@ def _add_aug_pdb(molgrp, pdbfile, name, axis, angle):
             line += '{: 8.3f}'.format(d[7])  # x
             line += '{: 8.3f}'.format(d[8])  # y
             line += '{: 8.3f}'.format(d[9])  # z
+            # TODO add the element
             try:
                 line += '{: 6.2f}'.format(d[10])    # occ
                 line += '{: 6.2f}'.format(d[11])    # temp
@@ -1366,7 +1369,7 @@ def _add_aug_pdb(molgrp, pdbfile, name, axis, angle):
                 line += '{: 6.2f}'.format(0)    # temp
             data.append(line)
 
-        data = np.array(data).astype('|S73')
+        data = np.array(data).astype('|S78')
         molgrp.create_dataset(name, data=data)
 
         return center

From dc551745d3295edd4b715e6b1798643c3ce0605f Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Thu, 12 Sep 2019 17:14:35 +0200
Subject: [PATCH 05/14] update chemicals

---
 deeprank/config/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deeprank/config/__init__.py b/deeprank/config/__init__.py
index cb0eb546..fdde521a 100644
--- a/deeprank/config/__init__.py
+++ b/deeprank/config/__init__.py
@@ -5,6 +5,7 @@
 from .chemicals import AA_codes, AA_codes_3to1, AA_codes_1to3 
 from .chemicals import AA_codes_pssm_ordered
 from .chemicals import AA_properties
+from .chemicals import atom_vdw_radius, atom_vdw_radius_noH
 
 # Debug
 DEBUG = False

From 47e952720d1da6fe30a0a2dc078f5dea81cc6e5e Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Thu, 12 Sep 2019 17:18:44 +0200
Subject: [PATCH 06/14] Update atom density values

---
 deeprank/generate/DataGenerator.py |  2 +-
 deeprank/generate/GridTools.py     |  4 +--
 deeprank/utils/launch.py           |  7 +---
 example/generate_dataset.py        |  2 +-
 example/learn.py                   |  2 +-
 example/learn_batch.py             |  2 +-
 example/learn_batch_new.py         |  2 +-
 test/test_learn.py                 | 58 +++++++-----------------------
 8 files changed, 21 insertions(+), 58 deletions(-)

diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py
index 5c838ef3..921b8249 100644
--- a/deeprank/generate/DataGenerator.py
+++ b/deeprank/generate/DataGenerator.py
@@ -803,7 +803,7 @@ def map_features(self, grid_info={},
         >>> grid_info = {
         >>>     'number_of_points' : [30,30,30],
         >>>     'resolution' : [1.,1.,1.],
-        >>>     'atomic_densities' : {'CA':3.5,'N':3.5,'O':3.5,'C':3.5},
+        >>>     'atomic_densities' : {'C':1.7, 'N':1.55, 'O':1.52, 'S':1.8},
         >>> }
         >>>
         >>> database.map_features(grid_info,try_sparse=True,time=False,prog_bar=True)
diff --git a/deeprank/generate/GridTools.py b/deeprank/generate/GridTools.py
index 6a8b4426..37d2483d 100644
--- a/deeprank/generate/GridTools.py
+++ b/deeprank/generate/GridTools.py
@@ -35,8 +35,8 @@ def __init__(self, molgrp,
             number_of_points(int, optional): number of points we want in
                 each direction of the grid.
             resolution(float, optional): distance(in Angs) between two points.
-            atomic_densities(dict, optional): dictionary of atom types with
-                their vdw radius, e.g. {'CA':1.7, 'C':1.7, 'N':1.55, 'O':1.52}
+            atomic_densities(dict, optional): dictionary of element types with
+                their vdw radius, see deeprank.config.atom_vdw_radius_noH 
             atomic_densities_mode(str, optional): Mode for mapping
                 (deprecated must be 'ind').
             feature(None, optional): Name of the features to be mapped.
diff --git a/deeprank/utils/launch.py b/deeprank/utils/launch.py
index 9a06e668..99b8805b 100755
--- a/deeprank/utils/launch.py
+++ b/deeprank/utils/launch.py
@@ -59,12 +59,7 @@ def generate(LIST_NAME, clean=False):
                 1.,
                 1.,
                 1.],
-            'atomic_densities': {
-                'CA': 3.5,
-                'CB': 3.5,
-                'N': 3.5,
-                'O': 3.5,
-                'C': 3.5},
+            'atomic_densities': {'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8},
             'atomic_densities_mode': 'diff',
             'feature_mode': 'sum'}
 
diff --git a/example/generate_dataset.py b/example/generate_dataset.py
index a9949026..7255c761 100644
--- a/example/generate_dataset.py
+++ b/example/generate_dataset.py
@@ -48,7 +48,7 @@
 # grid_info = {
 #   'number_of_points' : [30,30,30],
 #   'resolution' : [1.,1.,1.],
-#   'atomic_densities' : {'CA':3.5,'N':3.5,'O':3.5,'C':3.5},
+#   'atomic_densities': {'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8},
 # }
 
 # generate the grid
diff --git a/example/learn.py b/example/learn.py
index 5227de4d..92498f42 100644
--- a/example/learn.py
+++ b/example/learn.py
@@ -34,7 +34,7 @@
                            10, 10, 10], 'resolution': [
                            3, 3, 3]},
 
-                   select_feature={'AtomicDensities': {'CA': 1.7, 'C': 1.7, 'N': 1.55, 'O': 1.52},
+                   select_feature={'AtomicDensities': {'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8},
                                    'Features': ['coulomb', 'vdwaals', 'charge', 'PSSM_*']},
 
                    # select_target='DOCKQ',  # regression
diff --git a/example/learn_batch.py b/example/learn_batch.py
index 8b8b1c27..c97f5cf3 100644
--- a/example/learn_batch.py
+++ b/example/learn_batch.py
@@ -148,7 +148,7 @@ def main():
                                6, 6, 6], 'resolution': [
                                5, 5, 5]},
 
-                       #            select_feature={'AtomicDensities' : {'CA':1.7, 'C':1.7, 'N':1.55, 'O':1.52},
+                       #            select_feature={'AtomicDensities' : {'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8},
                        #                			'Features'        : ['coulomb','vdwaals','charge','PSSM_*'] },
                        #           select_feature = 'all',
                        select_feature={'Features': ['PSSM_*']},
diff --git a/example/learn_batch_new.py b/example/learn_batch_new.py
index 05907f94..91602ed2 100644
--- a/example/learn_batch_new.py
+++ b/example/learn_batch_new.py
@@ -161,7 +161,7 @@ def main():
                                6, 6, 6], 'resolution': [
                                5, 5, 5]},
 
-                       #            select_feature={'AtomicDensities' : {'CA':1.7, 'C':1.7, 'N':1.55, 'O':1.52},
+                       #            select_feature={'AtomicDensities' : {'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8},
                        #                			'Features'        : ['coulomb','vdwaals','charge','PSSM_*'] },
                        #select_feature = 'all',
                        select_feature={'Feature_ind': ['coulomb']},
diff --git a/test/test_learn.py b/test/test_learn.py
index 4f292c20..ebfe5b93 100644
--- a/test/test_learn.py
+++ b/test/test_learn.py
@@ -43,33 +43,18 @@ def test_learn_3d_reg_mapfly():
             mapfly=True,
             use_rotation=1,
             grid_info={
-                'number_of_points': (
-                    10,
-                    10,
-                    10),
-                'resolution': (
-                    3,
-                    3,
-                    3)},
+                'number_of_points': (10, 10, 10),
+                'resolution': (3, 3, 3)},
             select_feature={
-                'AtomicDensities': {
-                    'CA': 1.7,
-                    'C': 1.7,
-                    'N': 1.55,
-                    'O': 1.52},
-                'Features': [
-                    'coulomb',
-                    'vdwaals',
-                    'charge',
-                    'PSSM_*']},
+                'AtomicDensities': {'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8},
+                'Features': ['coulomb', 'vdwaals', 'charge', 'PSSM_*']},
             select_target='DOCKQ',
             tqdm=True,
             normalize_features=False,
             normalize_targets=False,
             clip_features=False,
             pair_chain_feature=np.add,
-            dict_filter={
-                'DOCKQ': '<1'})
+            dict_filter={'DOCKQ': '<1'})
         # dict_filter={'IRMSD':'<4. or >10.'})
 
         # create the networkt
@@ -81,6 +66,7 @@ def test_learn_3d_reg_mapfly():
             nepoch=5,
             divide_trainset=0.8,
             train_batch_size=5,
+            preshuffle_seed=2019,
             num_workers=0)
 
     @unittest.skipIf(skip, "torch fails on Travis")
@@ -109,25 +95,17 @@ def test_learn_3d_reg():
             test_database=None,
             mapfly=False,
             use_rotation=2,
-            grid_shape=(
-                30,
-                30,
-                30),
+            grid_shape=(30, 30, 30),
             select_feature={
                 'AtomicDensities_ind': 'all',
-                'Feature_ind': [
-                    'coulomb',
-                    'vdwaals',
-                    'charge',
-                    'PSSM_*']},
+                'Feature_ind': [ 'coulomb', 'vdwaals', 'charge', 'PSSM_*']},
             select_target='DOCKQ',
             tqdm=True,
             normalize_features=True,
             normalize_targets=True,
             clip_features=False,
             pair_chain_feature=np.add,
-            dict_filter={
-                'DOCKQ': '<1.'})
+            dict_filter={ 'DOCKQ': '<1.'})
         # dict_filter={'IRMSD':'<4. or >10.'})
 
         # create the networkt
@@ -140,6 +118,7 @@ def test_learn_3d_reg():
             divide_trainset=0.8,
             train_batch_size=5,
             num_workers=0,
+            preshuffle_seed=2019,
             save_model='all')
 
     @unittest.skipIf(skip, "Torch fails on Travis")
@@ -163,17 +142,10 @@ def test_learn_3d_class():
             valid_database=None,
             test_database=None,
             mapfly=False,
-            grid_shape=(
-                30,
-                30,
-                30),
+            grid_shape=( 30, 30, 30),
             select_feature={
                 'AtomicDensities_ind': 'all',
-                'Feature_ind': [
-                    'coulomb',
-                    'vdwaals',
-                    'charge',
-                    'PSSM_*']},
+                'Feature_ind': [ 'coulomb', 'vdwaals', 'charge', 'PSSM_*']},
             select_target='BIN_CLASS',
             tqdm=True,
             normalize_features=True,
@@ -220,11 +192,7 @@ def test_learn_2d_reg():
             mapfly=False,
             select_feature={
                 'AtomicDensities_ind': 'all',
-                'Feature_ind': [
-                    'coulomb',
-                    'vdwaals',
-                    'charge',
-                    'PSSM_*']},
+                'Feature_ind': [ 'coulomb', 'vdwaals', 'charge', 'PSSM_*']},
             select_target='DOCKQ',
             tqdm=True,
             normalize_features=True,

From 10705c492e2289822afa15a6b0357901e1767ce9 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Thu, 12 Sep 2019 17:19:14 +0200
Subject: [PATCH 07/14] Update atom density values in test_generate_cuda.py

---
 test/test_generate_cuda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_generate_cuda.py b/test/test_generate_cuda.py
index 438d55f3..2e4d7ff7 100644
--- a/test/test_generate_cuda.py
+++ b/test/test_generate_cuda.py
@@ -43,7 +43,7 @@ def test_generate_cuda():
         grid_info = {
             'number_of_points': [30, 30, 30],
             'resolution': [1., 1., 1.],
-            'atomic_densities': {'CA': 3.5, 'N': 3.5, 'O': 3.5, 'C': 3.5},
+            'atomic_densities': {'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8},
         }
 
         # tune the kernel

From 92f7b2c6f844742b75ab15e44f84637ee652b92f Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Thu, 12 Sep 2019 17:19:46 +0200
Subject: [PATCH 08/14] Update mapfly atom density feature

---
 deeprank/learn/DataSet.py | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/deeprank/learn/DataSet.py b/deeprank/learn/DataSet.py
index 8c011a5e..7be7fd81 100644
--- a/deeprank/learn/DataSet.py
+++ b/deeprank/learn/DataSet.py
@@ -10,6 +10,7 @@
 import numpy as np
 from tqdm import tqdm
 
+from deeprank import config
 from deeprank.config import logger
 from deeprank.generate import MinMaxParam, NormalizeData, NormParam
 from deeprank.tools import pdb2sql, sparse
@@ -65,8 +66,7 @@ def __init__(self, train_database, valid_database=None, test_database=None,
                     Select the features used in the learning.
                     if mapfly is True:
                         {'AtomDensities': 'all', 'Features': 'all'}
-                        {'AtomicDensities': {
-                            'CA': 1.7, 'C': 1.7, 'N': 1.55, 'O': 1.52},
+                        {'AtomicDensities': config.atom_vdw_radius_noH,
                             'Features': ['PSSM_*', 'pssm_ic_*']}
                     if mapfly is False:
                         {'AtomDensities_ind': 'all', 
@@ -310,6 +310,7 @@ def __getitem__(self, index):
         """
 
         fname, mol, angle, axis = self.index_complexes[index]
+        print(fname, mol)
 
         if self.mapfly:
             feature, target = self.map_one_molecule(fname, mol, angle, axis)
@@ -632,8 +633,7 @@ def get_raw_feature_name(self):
             class parameter self.select_feature examples:
             - 'all'
             - {'AtomicDensities': 'all', 'Features':all}
-            - {'AtomicDensities': {
-                'CA': 1.7, 'C': 1.7, 'N': 1.55, 'O': 1.52},
+            - {'AtomicDensities': config.atom_vaw_radius_noH,
                'Features': ['PSSM_*', 'pssm_ic_*']}
 
             Feature type must be: 'AtomicDensities' or 'Features'.
@@ -650,8 +650,7 @@ class parameter self.select_feature examples:
         # if we select all the features
         if self.select_feature == "all":
             self.select_feature = {}
-            self.select_feature['AtomicDensities'] = {
-                'CA': 1.7, 'C': 1.7, 'N': 1.55, 'O': 1.52}
+            self.select_feature['AtomicDensities'] = config.atom_vdw_radius_noH
             self.select_feature['Features'] = [
                 name for name in raw_data.keys()]
 
@@ -663,8 +662,8 @@ class parameter self.select_feature examples:
                 # if for a given type we need all the feature
                 if feat_names == 'all':
                     if feat_type == 'AtomicDensities':
-                        self.select_feature['AtomicDensities'] = {
-                            'CA': 1.7, 'C': 1.7, 'N': 1.55, 'O': 1.52}
+                        self.select_feature['AtomicDensities'] = \
+                            config.atom_vdw_radius_noH
                     elif feat_type == 'Features':
                         self.select_feature[feat_type] = list(raw_data.keys())
                     else:
@@ -683,7 +682,7 @@ class parameter self.select_feature examples:
                                 match = name.split('*')[0]
                                 possible_names = list(raw_data.keys())
                                 match_names = [
-                                    n for n in possible_names
+                                    n for n in possible_nasdfsfasdfasdfmes
                                     if n.startswith(match)]
                                 self.select_feature[feat_type] += match_names
                             else:
@@ -1271,7 +1270,7 @@ def map_atomic_densities(
         """Map atomic densities.
         
         Args:
-            feat_names(dict): Atom type and vdw radius
+            feat_names(dict): Element type and vdw radius
             mol_data(h5 group): HDF5 molecule group
             grid(tuple): mesh grid of x,y,z
             npts(tuple): number of points on axis x,y,z
@@ -1289,16 +1288,21 @@ def map_atomic_densities(
             center = [np.mean(g) for g in grid]
 
         densities = []
-        for atomtype, vdw_rad in feat_names.items():
+        for elementtype, vdw_rad in feat_names.items():
 
             # get pos of the contact atoms of correct type
-            xyzA = np.array(sql.get('x,y,z', rowID=index[0], name=atomtype))
-            xyzB = np.array(sql.get('x,y,z', rowID=index[1], name=atomtype))
+            xyzA = np.array(sql.get(
+                'x,y,z', rowID=index[0], element=elementtype))
+            xyzB = np.array(sql.get(
+                'x,y,z', rowID=index[1], element=elementtype))
 
             # rotate if necessary
             if angle is not None:
-                xyzA = self._rotate_coord(xyzA, center, angle, axis)
-                xyzB = self._rotate_coord(xyzB, center, angle, axis)
+                if xyzA != np.array([]):
+                    xyzA = self._rotate_coord(xyzA, center, angle, axis)
+
+                if xyzB != np.array([]):
+                    xyzB = self._rotate_coord(xyzB, center, angle, axis)
 
             # init the grid
             atdensA = np.zeros(npts)

From 1952531cb7e90eb6ec6e412f3032f0329321b057 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Thu, 12 Sep 2019 17:20:48 +0200
Subject: [PATCH 09/14] Update test_generate.py

---
 test/test_generate.py | 56 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 4 deletions(-)

diff --git a/test/test_generate.py b/test/test_generate.py
index c0736456..df197793 100644
--- a/test/test_generate.py
+++ b/test/test_generate.py
@@ -1,5 +1,6 @@
 import os
 import unittest
+import numpy as np
 from time import time
 
 from deeprank.generate import *
@@ -16,6 +17,9 @@
 class TestGenerateData(unittest.TestCase):
     """Test the data generation process."""
 
+    # set random seed to make results repeatable
+    np.random.seed(2019)
+
     h5file = ['./1ak4.hdf5', 'native.hdf5']
     pdb_source = ['./1AK4/decoys/', './1AK4/native/']
     # pdb_native is only used to calculate i-RMSD, dockQ and so on. The native
@@ -65,9 +69,9 @@ def test_1_generate(self):
 
             # map the features
             grid_info = {
-                'number_of_points': [30, 30, 30],
-                'resolution': [1., 1., 1.],
-                'atomic_densities': {'CA': 3.5, 'N': 3.5, 'O': 3.5, 'C': 3.5},
+                'number_of_points': [10, 10, 10],
+                'resolution': [3., 3., 3.],
+                'atomic_densities': {'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8},
             }
 
             t0 = time()
@@ -86,6 +90,44 @@ def test_1_generate(self):
             norm.get()
             print(' ' * 25 + '--> Done in %f s.' % (time() - t0))
 
+    def test_1_generate_mapfly(self):
+        """Generate the database."""
+
+        # clean old files
+        files = [
+            '1ak4_mapfly.hdf5',
+            '1ak4_mapfly.pckl'
+            ]
+        for f in files:
+            if os.path.isfile(f):
+                os.remove(f)
+
+        h5 = "./1ak4_mapfly.hdf5"
+        src = self.pdb_source[0]
+
+        # init the data assembler
+
+        database = DataGenerator(
+            pdb_source=src,
+            pdb_native=self.pdb_native,
+            pssm_source='./1AK4/pssm_new/',
+            # data_augmentation=1,
+            compute_targets=[
+                'deeprank.targets.dockQ',
+                'deeprank.targets.binary_class'],
+            compute_features=[
+                'deeprank.features.AtomicFeature',
+                'deeprank.features.FullPSSM',
+                'deeprank.features.PSSM_IC',
+                'deeprank.features.BSA',
+                'deeprank.features.ResidueDensity'],
+            hdf5=h5)
+
+        # create new files
+        print('{:25s}'.format('Create new database') + database.hdf5)
+        database.create_database(prog_bar=True)
+    
+
     def test_2_add_target(self):
         """Add a target (e.g., class labels) to the database."""
 
@@ -145,4 +187,10 @@ def test_4_add_feature(self):
 
 
 if __name__ == "__main__":
-    unittest.main()
+    # unittest.main()
+
+    inst = TestGenerateData()
+    inst.test_1_generate()
+    inst.test_1_generate_mapfly()
+    inst.test_3_add_unique_target()
+    inst.test_4_add_feature()

From 2a593ca1640f57295a2e73c2788fdf07c7bc9b76 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Mon, 16 Sep 2019 11:48:37 +0200
Subject: [PATCH 10/14] fix typo

---
 deeprank/learn/DataSet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deeprank/learn/DataSet.py b/deeprank/learn/DataSet.py
index 7be7fd81..ec2bb3dd 100644
--- a/deeprank/learn/DataSet.py
+++ b/deeprank/learn/DataSet.py
@@ -682,7 +682,7 @@ class parameter self.select_feature examples:
                                 match = name.split('*')[0]
                                 possible_names = list(raw_data.keys())
                                 match_names = [
-                                    n for n in possible_nasdfsfasdfasdfmes
+                                    n for n in possible_names
                                     if n.startswith(match)]
                                 self.select_feature[feat_type] += match_names
                             else:

From 334308577afb14094e19dab84027cd728e18d05b Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Mon, 16 Sep 2019 13:27:16 +0200
Subject: [PATCH 11/14] remove trailing whitespace

---
 deeprank/config/__init__.py    |  2 +-
 deeprank/features/FullPSSM.py  |  2 +-
 deeprank/generate/GridTools.py |  2 +-
 deeprank/learn/DataSet.py      | 64 +++++++++++++++++-----------------
 deeprank/learn/NeuralNet.py    | 54 ++++++++++++++--------------
 deeprank/tools/README.md       | 22 ++++++------
 deeprank/tools/pdb2sql.py      | 12 +++----
 test/test_generate.py          |  2 +-
 8 files changed, 80 insertions(+), 80 deletions(-)

diff --git a/deeprank/config/__init__.py b/deeprank/config/__init__.py
index fdde521a..a11cdb88 100644
--- a/deeprank/config/__init__.py
+++ b/deeprank/config/__init__.py
@@ -2,7 +2,7 @@
 import logging.config
 
 from . import logger_settings
-from .chemicals import AA_codes, AA_codes_3to1, AA_codes_1to3 
+from .chemicals import AA_codes, AA_codes_3to1, AA_codes_1to3
 from .chemicals import AA_codes_pssm_ordered
 from .chemicals import AA_properties
 from .chemicals import atom_vdw_radius, atom_vdw_radius_noH
diff --git a/deeprank/features/FullPSSM.py b/deeprank/features/FullPSSM.py
index 22a1eb2e..44e79f4e 100644
--- a/deeprank/features/FullPSSM.py
+++ b/deeprank/features/FullPSSM.py
@@ -238,7 +238,7 @@ def __compute_feature__(pdb_data, featgrp, featgrp_raw, out_type='pssmvalue'):
                     f"check 'config.PATH_PSSM_SOURCE'")
     else:
         path = config.PATH_PSSM_SOURCE
- 
+
     mol_name = os.path.split(featgrp.name)[0]
     mol_name = mol_name.lstrip('/')
 
diff --git a/deeprank/generate/GridTools.py b/deeprank/generate/GridTools.py
index 37d2483d..a3d0aeed 100644
--- a/deeprank/generate/GridTools.py
+++ b/deeprank/generate/GridTools.py
@@ -36,7 +36,7 @@ def __init__(self, molgrp,
                 each direction of the grid.
             resolution(float, optional): distance(in Angs) between two points.
             atomic_densities(dict, optional): dictionary of element types with
-                their vdw radius, see deeprank.config.atom_vdw_radius_noH 
+                their vdw radius, see deeprank.config.atom_vdw_radius_noH
             atomic_densities_mode(str, optional): Mode for mapping
                 (deprecated must be 'ind').
             feature(None, optional): Name of the features to be mapped.
diff --git a/deeprank/learn/DataSet.py b/deeprank/learn/DataSet.py
index ec2bb3dd..9066e3c7 100644
--- a/deeprank/learn/DataSet.py
+++ b/deeprank/learn/DataSet.py
@@ -37,7 +37,7 @@ def __init__(self, train_database, valid_database=None, test_database=None,
         '''Generates the dataset needed for pytorch.
 
         This class hanldes the data generated by deeprank.generate to be
-        used in the deep learning part of DeepRank. 
+        used in the deep learning part of DeepRank.
 
         Args:
             train_database (list(str)): names of the hdf5 files used for
@@ -46,30 +46,30 @@ def __init__(self, train_database, valid_database=None, test_database=None,
             valid_database (list(str)): names of the hdf5 files used for
                 the validation.
                 Example : ['1ACB.hdf5','4JHF.hdf5',...]
-            test_database (list(str)): names of the hdf5 files used for 
+            test_database (list(str)): names of the hdf5 files used for
                 the test.
                 Example : ['7CEI.hdf5']
 
-            mapfly (bool): do we compute the map in the batch 
+            mapfly (bool): do we compute the map in the batch
                 preparation or read them
 
             grid_info(dict) : grid information to map the feature on the
                 fly. if None the original grid points are used.
-                Example: 
+                Example:
                     {'number_of_points': [X,Y,Z], 'resolution': [X,Y,Z]}
 
             use_rotation (int): number of rotations to use.
                 Example: 0 (use only original data)
                 Default: None  (use all data of the database)
 
-            select_feature (dict or 'all', optional): 
+            select_feature (dict or 'all', optional):
                     Select the features used in the learning.
                     if mapfly is True:
                         {'AtomDensities': 'all', 'Features': 'all'}
                         {'AtomicDensities': config.atom_vdw_radius_noH,
                             'Features': ['PSSM_*', 'pssm_ic_*']}
                     if mapfly is False:
-                        {'AtomDensities_ind': 'all', 
+                        {'AtomDensities_ind': 'all',
                             'Feature_ind': 'all'}
                         {'Feature_ind': ['PSSM_*', 'pssm_ic_*']}
                     Default : 'all'
@@ -80,31 +80,31 @@ def __init__(self, train_database, valid_database=None, test_database=None,
                 Default : True
             normalize_targets (Bool, optional): normalize targets or not
                 Default : True
-            target_ordering (str): 'lower' (the lower the better) or 
+            target_ordering (str): 'lower' (the lower the better) or
                 'higher' (the higher the better)
-                By default is not specified (None) and the code tries 
+                By default is not specified (None) and the code tries
                 to identify it. If identification fails 'lower' is used.
 
             dict_filter (None or dict, optional): Specify if we filter
                 the complexes based on target values
-                Example : {'IRMSD' : '<4. or >10'} 
+                Example : {'IRMSD' : '<4. or >10'}
                     (select complexes with IRMSD lower than 4 or larger than 10)
                 Default : None
-            pair_chain_feature (None or callable, optional): 
+            pair_chain_feature (None or callable, optional):
                 method to pair features of chainA and chainB
                 Example : np.sum (sum the chainA and chainB features)
-            transform_to_2D (bool, optional):  
+            transform_to_2D (bool, optional):
                 Boolean to use 2d maps instead of full 3d
                 Default : False
             projection (int): Projection axis from 3D to 2D:
                 Mapping : 0 -> yz, 1 -> xz, 2 -> xy
                 Default = 0
-            grid_shape (None or tuple(int), optional): 
+            grid_shape (None or tuple(int), optional):
                 Shape of the grid in the hdf5 file. Is not necessary
                 if the grid points are still present in the HDF5 file.
-            clip_features (bool, optional): 
+            clip_features (bool, optional):
                 Remove too large values of the grid.
-                Can be needed for native complexes where the coulomb 
+                Can be needed for native complexes where the coulomb
                 feature might be too large
             clip_factor (float, optional): the features are clipped at:
                 +/-mean + clip_factor * std
@@ -115,7 +115,7 @@ def __init__(self, train_database, valid_database=None, test_database=None,
         Examples:
             >>> from deeprank.learn import *
             >>> train_database = '1ak4.hdf5'
-            >>> data_set = DataSet(train_database, 
+            >>> data_set = DataSet(train_database,
             >>>                    valid_database = None,
             >>>                    test_database = None,
             >>>                    grid_shape=(30,30,30),
@@ -196,10 +196,10 @@ def __init__(self, train_database, valid_database=None, test_database=None,
     @staticmethod
     def _get_database_name(database):
         """Get the list of hdf5 database file names.
-        
+
         Args:
             database(None, str or list(str)): hdf5 database name(s).
-        
+
         Returns:
             list: hdf5 file names
         """
@@ -363,8 +363,8 @@ def check_hdf5_files(database):
     def create_index_molecules(self):
         """Create the indexing of each molecule in the dataset.
 
-        Create the indexing: 
-        [('1ak4.hdf5,1AK4_100w),...,('1fqj.hdf5,1FGJ_400w)] 
+        Create the indexing:
+        [('1ak4.hdf5,1AK4_100w),...,('1fqj.hdf5,1FGJ_400w)]
         This allows to refer to one complex with its index in the list.
 
         Raises:
@@ -501,7 +501,7 @@ def filter(self, molgrp):
 
         Args:
             molgrp (str): group name of the molecule in the hdf5 file
-            
+
         Returns:
             bool: True if we keep the complex False otherwise
 
@@ -542,7 +542,7 @@ class parameter self.select_feature examples:
             - {'Feature_ind': ['PSSM_*', 'pssm_ic_*']}
 
             Feature type must be: 'AtomicDensities_ind' or 'Feature_ind'.
-        
+
         Raises:
             KeyError: Wrong feature type.
             KeyError: Wrong feature type.
@@ -637,7 +637,7 @@ class parameter self.select_feature examples:
                'Features': ['PSSM_*', 'pssm_ic_*']}
 
             Feature type must be: 'AtomicDensities' or 'Features'.
-        
+
         Raises:
             KeyError: Wrong feature type.
             KeyError: Wrong feature type.
@@ -668,7 +668,7 @@ class parameter self.select_feature examples:
                         self.select_feature[feat_type] = list(raw_data.keys())
                     else:
                         raise KeyError(
-                            f'Wrong feature type {feat_type}. ' 
+                            f'Wrong feature type {feat_type}. '
                             f'It should be "AtomicDensities" or "Features".')
 
                 else:
@@ -689,7 +689,7 @@ class parameter self.select_feature examples:
                                 self.select_feature[feat_type] += [name]
                     else:
                         raise KeyError(
-                            f'Wrong feature type {feat_type}. ' 
+                            f'Wrong feature type {feat_type}. '
                             f'It should be "AtomicDensities" or "Features".')
 
         f5.close()
@@ -746,7 +746,7 @@ def get_input_shape(self):
 
         Note:
             self.data_shape : shape of the raw 3d data set
-            self.input_shape: input size of the CNN 
+            self.input_shape: input size of the CNN
                               (potentially after 2d transformation)
         """
 
@@ -770,7 +770,7 @@ def get_grid_shape(self):
         """Get the shape of the matrices.
 
         Raises:
-            ValueError: If no grid shape is provided or is present in 
+            ValueError: If no grid shape is provided or is present in
                 the HDF5 file
         """
         if self.mapfly is False:
@@ -1075,7 +1075,7 @@ def load_one_molecule(self, fname, mol=None):
                 logger.error(
                     f'Feature type {feat_type} not found in file {fname} '
                     f'for molecule {mol}.\n'
-                    f'Possible feature types are :\n\t' + 
+                    f'Possible feature types are :\n\t' +
                     '\n\t'.join(list(mol_data['mapped_features'].keys()))
                     )
                 raise ValueError(feat_type, ' not supported')
@@ -1219,13 +1219,13 @@ def make_feature_pair(feature, op):
 
     def get_grid(self, mol_data):
         """Get meshed grids and number of pointgs
-        
+
         Args:
             mol_data(h5 group): HDF5 moleucle group
-        
+
         Raises:
             ValueError: Grid points not found in mol_data.
-        
+
         Returns:
             tuple, tuple: meshgrid, npts
         """
@@ -1268,7 +1268,7 @@ def get_grid(self, mol_data):
     def map_atomic_densities(
             self, feat_names, mol_data, grid, npts, angle, axis):
         """Map atomic densities.
-        
+
         Args:
             feat_names(dict): Element type and vdw radius
             mol_data(h5 group): HDF5 molecule group
@@ -1276,7 +1276,7 @@ def map_atomic_densities(
             npts(tuple): number of points on axis x,y,z
             angle(float): rotation angle
             axis(list): rotation axis
-        
+
         Returns:
             list : atomic densities of each atom type on each chain
         """
diff --git a/deeprank/learn/NeuralNet.py b/deeprank/learn/NeuralNet.py
index 07f15a3e..6bc0587c 100644
--- a/deeprank/learn/NeuralNet.py
+++ b/deeprank/learn/NeuralNet.py
@@ -38,19 +38,19 @@ def __init__(self, data_set, model,
         """Train a Convolutional Neural Network for DeepRank.
 
         Args:
-            data_set (deeprank.DataSet or str): Data set used for 
+            data_set (deeprank.DataSet or str): Data set used for
                 training or testing.
                 - deeprank.DataSet for training;
-                - str (e.g. 'xxx.hdf5') for testing when pretrained 
+                - str (e.g. 'xxx.hdf5') for testing when pretrained
                     model is loaded.
 
-            model (nn.Module): Definition of the NN to use. 
+            model (nn.Module): Definition of the NN to use.
                 Must subclass nn.Module.
                 See examples in model2d.py and model3d.py
 
-            model_type (srt): Type of model we want to use. 
+            model_type (str): Type of model we want to use.
                 Must be '2d' or '3d'.
-                If we specify a 2d model, the data set is automatically 
+                If we specify a 2d model, the data set is automatically
                 converted to the correct format.
 
             task (str 'ref' or 'class'): Task to perform.
@@ -59,16 +59,16 @@ def __init__(self, data_set, model,
                 The loss function, the target datatype and plot functions
                 will be autmatically adjusted depending on the task.
 
-            pretrained_model (str): Saved model to be used for further 
+            pretrained_model (str): Saved model to be used for further
                 training or testing.
 
             cuda (bool): Use CUDA.
 
-            ngpu (int): number of GPU to be used. 
+            ngpu (int): number of GPU to be used.
 
             plot (bool): Plot the prediction results.
 
-            save_hitrate (bool): Save and plot hit rate. 
+            save_hitrate (bool): Save and plot hit rate.
 
             save_classmetrics (bool): Save and plot classification metrics.
                 Classification metrics include:
@@ -82,12 +82,12 @@ def __init__(self, data_set, model,
 
         Examples:
             >>> # create the network
-            >>> model = NeuralNet(data_set, cnn, 
+            >>> model = NeuralNet(data_set, cnn,
             ...                   model_type='3d', task='reg',
             ...                   plot=True, save_hitrate=True,
             ...                   outdir='./out/')
             >>> # start the training
-            >>> model.train(nepoch = 50, divide_trainset=0.8, 
+            >>> model.train(nepoch = 50, divide_trainset=0.8,
             ...             train_batch_size = 5, num_workers=0)
         """
 
@@ -212,7 +212,7 @@ def __init__(self, data_set, model,
 
         # load parameters of pretrained model if provided
         if self.pretrained_model:
-            # a prefix 'module.' is added to parameter names if 
+            # a prefix 'module.' is added to parameter names if
             # torch.nn.DataParallel was used
             # https://pytorch.org/docs/stable/nn.html#torch.nn.DataParallel
             if self.state['cuda']:
@@ -282,12 +282,12 @@ def train(self,
               num_workers=1,
               save_model='best',
               save_epoch='intermediate'):
-        """Perform a simple training of the model. 
+        """Perform a simple training of the model.
 
         Args:
-            nepoch (int, optional): number of iterations 
+            nepoch (int, optional): number of iterations
 
-            divide_trainset (list, optional): the percentage assign to 
+            divide_trainset (list, optional): the percentage assign to
                 the training, validation and test set.
                 Examples: [0.7, 0.2, 0.1], [0.8, 0.2], None
 
@@ -295,21 +295,21 @@ def train(self,
 
             train_batch_size (int, optional): size of the batch
 
-            preshuffle (bool, optional): preshuffle the dataset before 
+            preshuffle (bool, optional): preshuffle the dataset before
                 dividing it.
 
             preshuffle_seed (int, optional): set random seed for preshuffle
 
-            export_intermediate (bool, optional): export data at 
+            export_intermediate (bool, optional): export data at
                 intermediate epochs.
 
             num_workers (int, optional): number of workers to be used to
                 prepare the batch data
 
-            save_model (str, optional): 'best' or 'all', save only the 
+            save_model (str, optional): 'best' or 'all', save only the
                 best model or all models.
 
-            save_epoch (str, optional): 'intermediate' or 'all', 
+            save_epoch (str, optional): 'intermediate' or 'all',
                 save the epochs data to HDF5.
 
         """
@@ -371,7 +371,7 @@ def test(self, hdf5='test_data.hdf5'):
 
         Args:
             hdf5 (str, optional): hdf5 file to store the test results
-    
+
         Examples:
             >>> # adress of the database
             >>> database = '1ak4.hdf5'
@@ -484,18 +484,18 @@ def load_data_params(self):
         self.data_set.grid_info = self.state['grid_info']
 
     def _divide_dataset(self, divide_set, preshuffle, preshuffle_seed):
-        """Divide the data set into training, validation and test 
+        """Divide the data set into training, validation and test
         according to the percentage in divide_set.
 
         Args:
-            divide_set (list(float)): percentage used for 
+            divide_set (list(float)): percentage used for
                 training/validation/test.
                 Example: [0.8, 0.1, 0.1], [0.8, 0.2]
             preshuffle (bool): shuffle the dataset before dividing it
             preshuffle_seed (int, optional): set random seed
 
         Returns:
-            list(int),list(int),list(int): Indices of the 
+            list(int),list(int),list(int): Indices of the
                 training/validation/test set.
         """
         # if user only provided one number
@@ -507,7 +507,7 @@ def _divide_dataset(self, divide_set, preshuffle, preshuffle_seed):
         if len(divide_set) == 3 and self.data_set.test_database is not None:
             divide_set = [divide_set[0], 1. - divide_set[0]]
             logger.info(f'   : test data set AND test in training set detected\n'
-                        f'   : Divide training set as ' 
+                        f'   : Divide training set as '
                         f'{divide_set[0]} train {divide_set[1]} valid.\n'
                         f'   : Keep test set for testing')
 
@@ -548,9 +548,9 @@ def _train(self, index_train, index_valid, index_test,
             nepoch (int, optional): numbr of epoch
             train_batch_size (int, optional): size of the batch
             export_intermediate (bool, optional):export itnermediate data
-            num_workers (int, optional): number of workers pytorch 
+            num_workers (int, optional): number of workers pytorch
                 uses to create the batch size
-            save_epoch (str,optional): 'intermediate' or 'all' 
+            save_epoch (str,optional): 'intermediate' or 'all'
             save_model (str, optional): 'all' or 'best'
 
         Returns:
@@ -1024,7 +1024,7 @@ def plot_hit_rate(self, figname):
         """Plot the hit rate of the different training/valid/test sets.
 
         The hit rate is defined as:
-            The percentage of positive(near-native) decoys that are 
+            The percentage of positive(near-native) decoys that are
             included among the top m decoys.
 
         Args:
@@ -1176,7 +1176,7 @@ def _export_epoch_hdf5(self, epoch, data):
         """Export the epoch data to the hdf5 file.
 
         Export the data of a given epoch in train/valid/test group.
-        In each group are stored the predcited values (outputs), 
+        In each group are stored the predicted values (outputs),
         ground truth (targets) and molecule name (mol).
 
         Args:
diff --git a/deeprank/tools/README.md b/deeprank/tools/README.md
index f914c5f0..29a4b815 100644
--- a/deeprank/tools/README.md
+++ b/deeprank/tools/README.md
@@ -4,7 +4,7 @@ Here are located all the generic tools used during one or across multiple steps
 
 ## PDB2SQL
 
-The file pdb2sql.py contains a class named pdb2sql that allows using sqlite3 to manipulate PDB files. The use of SQL queries makes it very easy to extract information from the PDB file using only one line of code. 
+The file pdb2sql.py contains a class named pdb2sql that allows using sqlite3 to manipulate PDB files. The use of SQL queries makes it very easy to extract information from the PDB file using only one line of code.
 
 ### Create a SQl data base
 
@@ -27,11 +27,11 @@ After its creation the database contains 13 columns and one line for each atom i
   * resName : the name of the residue the atom belongs to
   * chaiID  : the ID of the chain the atom belongs to
   * resSeq  : the residue number the atom belongs to
-  * iCode   : Code for insertion of residue 
+  * iCode   : Code for insertion of residue
   * x       : x coordinate of the atom
   * y       : y coordinate of the atom
   * z       : z coordinate of the atom
-  * occ     : occupancy 
+  * occ     : occupancy
   * temp    : temperature factor
 
 
@@ -138,19 +138,19 @@ sqldb.close(rmdb=False)
 
 ## FeatureClass
 
-The file FeatureClass.py contain a super class that all feature calculations should subclass. So far the super class only contains one method **FatureClass.export_data()** that is used to export the data of the feature to a file. This ensure that we keep the same syntax for all the features. The class has 3 attributes 
+The file FeatureClass.py contains a super class that all feature calculations should subclass. So far the super class only contains one method **FeatureClass.export_data()** that is used to export the data of the feature to a file. This ensures that we keep the same syntax for all the features. The class has 3 attributes
 
 
   * self.type         : "Atomic" or "Residue"
   * self.feature_data : dictionary {feature_name : feature_dict}
 
     feature_name is the name of the feature e.g. 'coulomb' or 'vdwaals'
-    
+
     feature_dict is a dictionary. The format of the key depends on the type of feature
-    
+
     residue-based feature
     {(chainID, residue_name(3-letter), residue_number) : [values1, values2, ....]}
-    
+
     atomic-based feature
     {(chainID, residue_name(3-letter), residue_number, atom_name) : [values1, values2, ....]}
 
@@ -191,11 +191,11 @@ The file atomic_feature.py contains a class named atomicFeature that allows comp
   * a file containing the vdw parameters
   * evantually a patch file for the force field parameters
 
-An example of use is provided in ./example/grid/atomicfeature.py. 
+An example of use is provided in ./example/grid/atomicfeature.py.
 
-```python 
+```python
 from deeprank.tools import atomicFeature
-  
+
 PDB = 'complex.pdb'
 FF = './forcefield/'
 
@@ -225,4 +225,4 @@ atfeat.sqldb.close()
 ```
 
 
-In this example we compute the pair interactions and the atomic charges of the complex given in the example folder and using the force field parameters also located there. The pair interactions are outputed on the screen. For the charges, the contact atom list is extended to all the residues that contains at least one contact atom. 
+In this example we compute the pair interactions and the atomic charges of the complex given in the example folder and using the force field parameters also located there. The pair interactions are output on the screen. For the charges, the contact atom list is extended to all the residues that contain at least one contact atom.
diff --git a/deeprank/tools/pdb2sql.py b/deeprank/tools/pdb2sql.py
index 3abbfddb..9b6a40c7 100644
--- a/deeprank/tools/pdb2sql.py
+++ b/deeprank/tools/pdb2sql.py
@@ -114,7 +114,7 @@ def _create_sql(self):
                     'z': 'REAL',
                     'occ': 'REAL',
                     'temp': 'REAL',
-                    'element': 'TEXT' 
+                    'element': 'TEXT'
                     }
 
         # delimtier of the column format
@@ -234,7 +234,7 @@ def _create_sql(self):
                     data_col = float(data_col)
 
                 # get element if it does not exist
-                if colname == "element" and not data_col: 
+                if colname == "element" and not data_col:
                     data_col = pdb2sql._get_element(line)
 
                 # append keep the comma !!
@@ -250,20 +250,20 @@ def _create_sql(self):
     @staticmethod
     def _get_element(pdb_line):
         """Get element type from the atom type of a pdb line
-        
+
         Notes:
             Atom type occupies 13-16th columns of a PDB line.
             http://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
             Four situations exist:
                 13 14 15 16
                    C  A      The element is C
-                C  A         The element is Ca   
+                C  A         The element is Ca
                 1  H  G      The element is H
                 H  E  2  1   The element is H
 
         Args:
             pdb_line(str): one PDB ATOM line
-        
+
         Returns:
             [str]: element name
         """
@@ -277,7 +277,7 @@ def _get_element(pdb_line):
                 elem = "H"
             else:
                 elem = pdb_line[12:14]
-            
+
         else:
             elem = pdb_line[13]
         return elem
diff --git a/test/test_generate.py b/test/test_generate.py
index df197793..bfc25e9a 100644
--- a/test/test_generate.py
+++ b/test/test_generate.py
@@ -126,7 +126,7 @@ def test_1_generate_mapfly(self):
         # create new files
         print('{:25s}'.format('Create new database') + database.hdf5)
         database.create_database(prog_bar=True)
-    
+
 
     def test_2_add_target(self):
         """Add a target (e.g., class labels) to the database."""

From ec2a6eb427645b114bd5b31f05f163958b81edff Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Mon, 16 Sep 2019 13:38:56 +0200
Subject: [PATCH 12/14] fix syntax error

---
 deeprank/learn/DataSet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deeprank/learn/DataSet.py b/deeprank/learn/DataSet.py
index 9066e3c7..b6730c68 100644
--- a/deeprank/learn/DataSet.py
+++ b/deeprank/learn/DataSet.py
@@ -897,7 +897,7 @@ def _read_norm(self):
 
             # if the file doesn't exist we create it
             if not os.path.isfile(fdata):
-                logger.info("      Computing norm for ", f5)
+                logger.info(f"      Computing norm for {f5}")
                 norm = NormalizeData(f5, shape=self.grid_shape)
                 norm.get()
 

From 0700e4769994e67344ce69c2e0a0b75ba181e3b9 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Tue, 17 Sep 2019 13:21:10 +0200
Subject: [PATCH 13/14] Update chemicals.py

---
 deeprank/config/chemicals.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/deeprank/config/chemicals.py b/deeprank/config/chemicals.py
index c39b003b..e102fe00 100644
--- a/deeprank/config/chemicals.py
+++ b/deeprank/config/chemicals.py
@@ -74,7 +74,9 @@
 
 
 # atom vdw radius
-# https://en.wikipedia.org/wiki/Van_der_Waals_radius
+# William M Haynes. CRC Handbook of Chemistry and Physics. 
+# ISBN 9781482208689. 
+# URL: https://books.google.no/books?id=bNDMBQAAQBAJ.
 
 atom_vdw_radius_noH = {
     "C": 1.7,
@@ -83,4 +85,4 @@
     "S": 1.8,
     }
 
-atom_vdw_radius = {**atom_vdw_radius_noH, "H": 1.1}
\ No newline at end of file
+atom_vdw_radius = {**atom_vdw_radius_noH, "H": 1.1}

From 4bd1a58c6225df8acb7ba2047fe71456194c3021 Mon Sep 17 00:00:00 2001
From: Cunliang Geng <c.geng@esciencecenter.nl>
Date: Tue, 17 Sep 2019 13:51:11 +0200
Subject: [PATCH 14/14] Update chemicals.py

---
 deeprank/config/chemicals.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deeprank/config/chemicals.py b/deeprank/config/chemicals.py
index e102fe00..42641cba 100644
--- a/deeprank/config/chemicals.py
+++ b/deeprank/config/chemicals.py
@@ -74,8 +74,8 @@
 
 
 # atom vdw radius
-# William M Haynes. CRC Handbook of Chemistry and Physics. 
-# ISBN 9781482208689. 
+# William M Haynes. CRC Handbook of Chemistry and Physics.
+# ISBN 9781482208689.
 # URL: https://books.google.no/books?id=bNDMBQAAQBAJ.
 
 atom_vdw_radius_noH = {