From e8841fe4ba3c9cf246dd2eb5b531f6b5b5122fed Mon Sep 17 00:00:00 2001
From: NicoRenaud <nicolas.gm.renaud@gmail.com>
Date: Tue, 21 Apr 2020 13:51:05 +0200
Subject: [PATCH 1/3] raise last exception when unable to load molecule and
 print mol/file name

---
 deeprank/learn/DataSet.py | 175 ++++++++++++++++++++++++--------------
 1 file changed, 109 insertions(+), 66 deletions(-)

diff --git a/deeprank/learn/DataSet.py b/deeprank/learn/DataSet.py
index 1f479351..850e0f39 100644
--- a/deeprank/learn/DataSet.py
+++ b/deeprank/learn/DataSet.py
@@ -254,10 +254,12 @@ def process_dataset(self):
         self.check_hdf5_files(self.train_database)
 
         if self.valid_database:
-            self.valid_database = self.check_hdf5_files(self.valid_database)
+            self.valid_database = self.check_hdf5_files(
+                self.valid_database)
 
         if self.test_database:
-            self.test_database = self.check_hdf5_files(self.test_database)
+            self.test_database = self.check_hdf5_files(
+                self.test_database)
 
         # create the indexing system
         # alows to associate each mol to an index
@@ -288,13 +290,16 @@ def process_dataset(self):
 
         logger.info('\n')
         logger.info("   Data Set Info:")
-        logger.info(f'   Training set        : {self.ntrain} conformations')
+        logger.info(
+            f'   Training set        : {self.ntrain} conformations')
         if self.data_augmentation is not None:
             logger.info(
                 f'   Augmentation        : {self.data_augmentation} rotations')
 
-        logger.info(f'   Validation set      : {self.nvalid} conformations')
-        logger.info(f'   Test set            : {self.ntest} conformations')
+        logger.info(
+            f'   Validation set      : {self.nvalid} conformations')
+        logger.info(
+            f'   Test set            : {self.ntest} conformations')
         logger.info(f'   Number of channels  : {self.input_shape[0]}')
         logger.info(f'   Grid Size           : {self.data_shape[1]}, '
                     f'{self.data_shape[2]}, {self.data_shape[3]}')
@@ -316,31 +321,36 @@ def __getitem__(self, index):
         Returns:
             dict: {'mol':[fname,mol],'feature':feature,'target':target}
         """
-
         fname, mol, angle, axis = self.index_complexes[index]
-        print(fname, mol)
+        try:
 
-        if self.mapfly:
-            feature, target = self.map_one_molecule(fname, mol, angle, axis)
-        else:
-            feature, target = self.load_one_molecule(fname, mol)
+            if self.mapfly:
+                feature, target = self.map_one_molecule(
+                    fname, mol, angle, axis)
+            else:
+                feature, target = self.load_one_molecule(fname, mol)
 
-        if self.clip_features:
-            feature = self._clip_feature(feature)
+            if self.clip_features:
+                feature = self._clip_feature(feature)
 
-        if self.normalize_features:
-            feature = self._normalize_feature(feature)
+            if self.normalize_features:
+                feature = self._normalize_feature(feature)
 
-        if self.normalize_targets:
-            target = self._normalize_target(target)
+            if self.normalize_targets:
+                target = self._normalize_target(target)
 
-        if self.pair_chain_feature:
-            feature = self.make_feature_pair(feature, self.pair_chain_feature)
+            if self.pair_chain_feature:
+                feature = self.make_feature_pair(
+                    feature, self.pair_chain_feature)
 
-        if self.transform:
-            feature = self.convert2d(feature, self.proj2D)
+            if self.transform:
+                feature = self.convert2d(feature, self.proj2D)
 
-        return {'mol': [fname, mol], 'feature': feature, 'target': target}
+            return {'mol': [fname, mol], 'feature': feature, 'target': target}
+
+        except:
+            raise
+            print('Unable to load molecule %s from %s' % (mol, fname))
 
     @staticmethod
     def check_hdf5_files(database):
@@ -385,7 +395,8 @@ def create_index_molecules(self):
         # Training dataset
         desc = '{:25s}'.format('   Train dataset')
         if self.tqdm:
-            data_tqdm = tqdm(self.train_database, desc=desc, file=sys.stdout)
+            data_tqdm = tqdm(self.train_database,
+                             desc=desc, file=sys.stdout)
         else:
             logger.info('   Train dataset')
             data_tqdm = self.train_database
@@ -400,11 +411,13 @@ def create_index_molecules(self):
                 mol_names = self._select_pdb(mol_names)
                 for k in mol_names:
                     if self.filter(fh5[k]):
-                        self.index_complexes += [(fdata, k, None, None)]
+                        self.index_complexes += [(fdata,
+                                                  k, None, None)]
                         for irot in range(self.data_augmentation):
                             axis, angle = pdb2sql.transform.get_rot_axis_angle(
                                 self.rotation_seed)
-                            self.index_complexes += [(fdata, k, angle, axis)]
+                            self.index_complexes += [
+                                (fdata, k, angle, axis)]
                 fh5.close()
             except Exception:
                 logger.exception(f'Ignore file: {fdata}')
@@ -413,7 +426,8 @@ def create_index_molecules(self):
         self.index_train = list(range(self.ntrain))
 
         if self.ntrain == 0:
-            raise ValueError('No avaiable training data after filtering')
+            raise ValueError(
+                'No avaiable training data after filtering')
 
         # Validation dataset
         if self.valid_database:
@@ -421,7 +435,7 @@ def create_index_molecules(self):
             desc = '{:25s}'.format('   Validation dataset')
             if self.tqdm:
                 data_tqdm = tqdm(self.valid_database,
-                                desc=desc, file=sys.stdout)
+                                 desc=desc, file=sys.stdout)
             else:
                 data_tqdm = self.valid_database
                 logger.info('   Validation dataset')
@@ -470,7 +484,8 @@ def create_index_molecules(self):
                     logger.exception(f'Ignore file: {fdata}')
 
         self.ntot = len(self.index_complexes)
-        self.index_test = list(range(self.ntrain + self.nvalid, self.ntot))
+        self.index_test = list(
+            range(self.ntrain + self.nvalid, self.ntot))
         self.ntest = self.ntot - self.ntrain - self.nvalid
 
     def _select_pdb(self, mol_names):
@@ -492,7 +507,7 @@ def _select_pdb(self, mol_names):
             if self.use_rotation > 0:
                 for i in range(self.use_rotation):
                     fnames_augmented += list(filter(lambda x:
-                        re.search('_r%03d$' % (i + 1), x), mol_names))
+                                                    re.search('_r%03d$' % (i + 1), x), mol_names))
                 selected_mol_names = fnames_original + fnames_augmented
             else:
                 selected_mol_names = fnames_original
@@ -532,11 +547,13 @@ def filter(self, molgrp):
                 ops = ['>', '<', '==', '<=', '>=']
                 new_cond_vals = cond_vals
                 for o in ops:
-                    new_cond_vals = new_cond_vals.replace(o, 'val' + o)
+                    new_cond_vals = new_cond_vals.replace(
+                        o, 'val' + o)
                 if not eval(new_cond_vals):
                     return False
             else:
-                raise ValueError('Conditions not supported', cond_vals)
+                raise ValueError(
+                    'Conditions not supported', cond_vals)
 
         return True
 
@@ -570,7 +587,8 @@ class parameter self.select_feature examples:
 
             # loop over the feat types and add all the feat_names
             for feat_type, feat_names in mapped_data.items():
-                self.select_feature[feat_type] = [name for name in feat_names]
+                self.select_feature[feat_type] = [
+                    name for name in feat_names]
 
         # if a selection was made
         else:
@@ -630,7 +648,8 @@ class parameter self.select_feature examples:
                         # (we probably relaod a pretrained model)
                         # and we simply append the feaature name
                         else:
-                            self.select_feature[feat_type].append(name)
+                            self.select_feature[feat_type].append(
+                                name)
 
         f5.close()
 
@@ -672,7 +691,8 @@ class parameter self.select_feature examples:
                         self.select_feature['AtomicDensities'] = \
                             config.atom_vdw_radius_noH
                     elif feat_type == 'Features':
-                        self.select_feature[feat_type] = list(raw_data.keys())
+                        self.select_feature[feat_type] = list(
+                            raw_data.keys())
                     else:
                         raise KeyError(
                             f'Wrong feature type {feat_type}. '
@@ -719,7 +739,8 @@ def print_possible_features(self):
             logger.info('\nYour selection was:')
             for feat_type, feat in self.select_feature.items():
                 if feat_type not in list(mapgrp.keys()):
-                    logger.info('== \x1b[0;37;41m' + feat_type + '\x1b[0m')
+                    logger.info(
+                        '== \x1b[0;37;41m' + feat_type + '\x1b[0m')
                 else:
                     logger.info('== %s' % feat_type)
                     if isinstance(feat, str):
@@ -729,7 +750,7 @@ def print_possible_features(self):
                             logger.info('  -- %s' % f)
 
         logger.info("You don't need to specify _chainA _chainB for each feature. " +
-              "The code will append it automatically")
+                    "The code will append it automatically")
 
     def get_pairing_feature(self):
         """Creates the index of paired features."""
@@ -766,7 +787,8 @@ def get_input_shape(self):
         self.data_shape = feature.shape
 
         if self.pair_chain_feature:
-            feature = self.make_feature_pair(feature, self.pair_chain_feature)
+            feature = self.make_feature_pair(
+                feature, self.pair_chain_feature)
 
         if self.transform:
             feature = self.convert2d(feature, self.proj2D)
@@ -826,9 +848,11 @@ def compute_norm(self):
 
             # get the feature/target
             if self.mapfly:
-                feature, target = self.map_one_molecule(fname, mol=molname)
+                feature, target = self.map_one_molecule(
+                    fname, mol=molname)
             else:
-                feature, target = self.load_one_molecule(fname, mol=molname)
+                feature, target = self.load_one_molecule(
+                    fname, mol=molname)
 
             # create the norm isntances at the first passage
             if first:
@@ -857,8 +881,10 @@ def compute_norm(self):
                 self.param_norm['features'][ifeat].std = 1
 
             # store as array for fast access
-            self.feature_mean.append(self.param_norm['features'][ifeat].mean)
-            self.feature_std.append(self.param_norm['features'][ifeat].std)
+            self.feature_mean.append(
+                self.param_norm['features'][ifeat].mean)
+            self.feature_std.append(
+                self.param_norm['features'][ifeat].std)
 
         self.target_min = self.param_norm['targets'].min[0]
         self.target_max = self.param_norm['targets'].max[0]
@@ -876,7 +902,8 @@ def get_norm(self):
         for feat_type, feat_names in self.select_feature.items():
             self.param_norm['features'][feat_type] = {}
             for name in feat_names:
-                self.param_norm['features'][feat_type][name] = NormParam()
+                self.param_norm['features'][feat_type][name] = NormParam(
+                )
         self.param_norm['targets'][self.select_target] = MinMaxParam()
 
         # read the normalization
@@ -917,20 +944,25 @@ def _read_norm(self):
                     mean = data['features'][feat_type][name].mean
                     var = data['features'][feat_type][name].var
                     if var == 0:
-                        logger.info('  : STD is null for %s in %s' % (name, f5))
-                    self.param_norm['features'][feat_type][name].add(mean, var)
+                        logger.info(
+                            '  : STD is null for %s in %s' % (name, f5))
+                    self.param_norm['features'][feat_type][name].add(
+                        mean, var)
 
             # handle the target
             minv = data['targets'][self.select_target].min
             maxv = data['targets'][self.select_target].max
-            self.param_norm['targets'][self.select_target].update(minv)
-            self.param_norm['targets'][self.select_target].update(maxv)
+            self.param_norm['targets'][self.select_target].update(
+                minv)
+            self.param_norm['targets'][self.select_target].update(
+                maxv)
 
         # process the std
         nfile = len(self.train_database)
         for feat_types, feat_dict in self.param_norm['features'].items():
             for feat in feat_dict:
-                self.param_norm['features'][feat_types][feat].process(nfile)
+                self.param_norm['features'][feat_types][feat].process(
+                    nfile)
                 if self.param_norm['features'][feat_types][feat].std == 0:
                     logger.info(
                         '  Final STD Null for %s/%s. Changed it to 1' %
@@ -958,7 +990,8 @@ def _get_target_ordering(self, order):
             elif self.select_target in NA_list:
                 self.target_ordering = None
             else:
-                warnings.warn('  Target ordering unidentified. lower assumed')
+                warnings.warn(
+                    '  Target ordering unidentified. lower assumed')
                 self.target_ordering = 'lower'
 
     def backtransform_target(self, data):
@@ -1077,14 +1110,16 @@ def load_one_molecule(self, fname, mol=None):
 
             # see if the feature exists
             if 'mapped_features/' + feat_type in mol_data.keys():
-                feat_dict = mol_data.get('mapped_features/' + feat_type)
+                feat_dict = mol_data.get(
+                    'mapped_features/' + feat_type)
             else:
                 logger.error(
                     f'Feature type {feat_type} not found in file {fname} '
                     f'for molecule {mol}.\n'
                     f'Possible feature types are :\n\t' +
-                    '\n\t'.join(list(mol_data['mapped_features'].keys()))
-                    )
+                    '\n\t'.join(
+                        list(mol_data['mapped_features'].keys()))
+                )
                 raise ValueError(feat_type, ' not supported')
 
             # loop through all the desired feat names
@@ -1099,9 +1134,10 @@ def load_one_molecule(self, fname, mol=None):
                         f'{mol} and feature type {feat_type}.\n'
                         f'Possible feature are :\n\t' +
                         '\n\t'.join(list(
-                            mol_data['mapped_features/' + feat_type].keys()
-                            ))
-                        )
+                            mol_data['mapped_features/' +
+                                     feat_type].keys()
+                        ))
+                    )
 
                 # check its sparse attribute
                 # if true get a FLAN
@@ -1215,12 +1251,14 @@ def make_feature_pair(feature, op):
             raise ValueError('Operation not callable', op)
 
         nFeat = len(feature)
-        pair_indexes = list(np.arange(nFeat).reshape(int(nFeat / 2), 2))
+        pair_indexes = list(
+            np.arange(nFeat).reshape(int(nFeat / 2), 2))
 
         outtype = feature.dtype
         new_feat = []
         for ind in pair_indexes:
-            new_feat.append(op(feature[ind[0], ...], feature[ind[1], ...]))
+            new_feat.append(
+                op(feature[ind[0], ...], feature[ind[1], ...]))
 
         return np.array(new_feat).astype(outtype)
 
@@ -1247,7 +1285,8 @@ def get_grid(self, mol_data):
 
             except BaseException:
 
-                raise ValueError("Grid points not found in the data file")
+                raise ValueError(
+                    "Grid points not found in the data file")
 
         else:
 
@@ -1306,10 +1345,12 @@ def map_atomic_densities(
             # rotate if necessary
             if angle is not None:
                 if xyzA != np.array([]):
-                    xyzA = pdb2sql.transform.rot_xyz_around_axis(xyzA, axis, angle, center)
+                    xyzA = pdb2sql.transform.rot_xyz_around_axis(
+                        xyzA, axis, angle, center)
 
                 if xyzB != np.array([]):
-                    xyzB = pdb2sql.transform.rot_xyz_around_axis(xyzB, axis, angle, center)
+                    xyzB = pdb2sql.transform.rot_xyz_around_axis(
+                        xyzB, axis, angle, center)
 
             # init the grid
             atdensA = np.zeros(npts)
@@ -1345,7 +1386,8 @@ def _densgrid(center, vdw_radius, grid, npts):
         """
 
         x0, y0, z0 = center
-        dd = np.sqrt((grid[0] - x0)**2 + (grid[1] - y0)**2 + (grid[2] - z0)**2)
+        dd = np.sqrt((grid[0] - x0)**2 +
+                     (grid[1] - y0)**2 + (grid[2] - z0)**2)
         dgrid = np.zeros(npts)
 
         dgrid[dd < vdw_radius] = np.exp(
@@ -1362,7 +1404,6 @@ def map_feature(self, feat_names, mol_data, grid, npts, angle, axis):
 
         __vectorize__ = False
 
-
         if angle is not None:
             center = [np.mean(g) for g in grid]
 
@@ -1382,14 +1423,15 @@ def map_feature(self, feat_names, mol_data, grid, npts, angle, axis):
             feat_value = data[:, 4]
 
             if angle is not None:
-                pos = pdb2sql.transform.rot_xyz_around_axis(pos, axis, angle, center)
+                pos = pdb2sql.transform.rot_xyz_around_axis(
+                    pos, axis, angle, center)
 
             if __vectorize__ or __vectorize__ == 'both':
 
                 for chainID in [0, 1]:
                     tmp_feat_vect[chainID] = np.sum(
-                        vmap(pos[chain == chainID,:],
-                        feat_value[chain == chainID]),
+                        vmap(pos[chain == chainID, :],
+                             feat_value[chain == chainID]),
                         0)
 
             if not __vectorize__ or __vectorize__ == 'both':
@@ -1431,7 +1473,8 @@ def _featgrid(center, value, grid, npts):
         beta = 0.5 / (sigma**2)
         cutoff = 5. * beta
 
-        dd = np.sqrt((grid[0] - x0)**2 + (grid[1] - y0)**2 + (grid[2] - z0)**2)
+        dd = np.sqrt((grid[0] - x0)**2 +
+                     (grid[1] - y0)**2 + (grid[2] - z0)**2)
 
         dd[dd < cutoff] = value * np.exp(-beta * dd[dd < cutoff])
         dd[dd > cutoff] = 0
@@ -1440,4 +1483,4 @@ def _featgrid(center, value, grid, npts):
         #dgrid[dd<cutoff] = value*np.exp(-beta*dd[dd<cutoff])
         # print(np.allclose(dgrid,dd))
 
-        return dd
\ No newline at end of file
+        return dd

From 70f2f66d525d6ff4cc4b55a0f74703d9a71c7bd7 Mon Sep 17 00:00:00 2001
From: LilySnow <me.lixue@gmail.com>
Date: Mon, 27 Apr 2020 15:19:47 +0200
Subject: [PATCH 2/3] added error report if mapped_features grp does not exist

---
 deeprank/learn/DataSet.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/deeprank/learn/DataSet.py b/deeprank/learn/DataSet.py
index 850e0f39..f66864a1 100644
--- a/deeprank/learn/DataSet.py
+++ b/deeprank/learn/DataSet.py
@@ -1104,6 +1104,12 @@ def load_one_molecule(self, fname, mol=None):
         # get the mol
         mol_data = fh5.get(mol)
 
+        # xue:
+        if 'mapped_features' not in mol_data.keys():
+            logger.error(f"xue: Error: mol: {mol} in {fname} does not have mapped_features ")
+            fh5.close()
+            sys.exit()
+
         # get the features
         feature = []
         for feat_type, feat_names in self.select_feature.items():

From 84810d9168ba68dafd2177f02f5d73be727af811 Mon Sep 17 00:00:00 2001
From: LilySnow <me.lixue@gmail.com>
Date: Sat, 16 May 2020 09:30:24 +0200
Subject: [PATCH 3/3] fixed a bug in read_epoch_data() in plot_utils.py

---
 deeprank/utils/plot_utils.py | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/deeprank/utils/plot_utils.py b/deeprank/utils/plot_utils.py
index af5ff73f..a71158d1 100755
--- a/deeprank/utils/plot_utils.py
+++ b/deeprank/utils/plot_utils.py
@@ -46,25 +46,23 @@ def read_epoch_data(DR_h5FL, epoch):
     0  Test  1AVX_ranair-it0_5286       0  0.503823  /home/lixue/DBs/BM5-haddock24/hdf5/000_1AVX.hdf5
     1  Test     1AVX_ti5-itw_354w       1  0.502845  /home/lixue/DBs/BM5-haddock24/hdf5/000_1AVX.hdf5
     """
-    print (f"-> Read epoch data from {DR_h5FL} into df")
+    print (f"-> Read epoch {epoch} data from {DR_h5FL} into df")
 
     # -- 1. read deeprank output data for the specific epoch
     h5 = h5py.File(DR_h5FL, 'r')
+
+    keys = list(h5.keys())
+    last_epoch_key = list(filter(lambda x: 'epoch_' in x, keys))[-1]
+
     if epoch is None:
-        print(f"epoch is not provided. Use the last epoch data.")
-        keys = list(h5.keys())
-        last_epoch_key = list(filter(lambda x: 'epoch_' in x, keys))[-1]
+        print(f"epoch is not provided. Use the last epoch data: {last_epoch_key}.")
+        epoch_key = last_epoch_key
     else:
-        last_epoch_key = 'epoch_%04d' % epoch
-        if last_epoch_key not in h5:
-            print(
-                'Incorrect epcoh name\n Possible options are: ' +
-                ' '.join(
-                    list(
-                        h5.keys())))
-            h5.close()
-            return
-    data = h5[last_epoch_key]
+        epoch_key = 'epoch_%04d' % epoch
+        if epoch_key not in h5:
+            print('Incorrect epoch name. Use the last epoch data: {last_epoch_key}.')
+            epoch_key = last_epoch_key
+    data = h5[epoch_key]
 
     # -- 2. convert into pd.DataFrame
     labels = list(data)  # labels = ['train', 'test', 'valid']