From e8841fe4ba3c9cf246dd2eb5b531f6b5b5122fed Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Tue, 21 Apr 2020 13:51:05 +0200 Subject: [PATCH 1/3] raise last exception when unable to load molecule and print mol/file name --- deeprank/learn/DataSet.py | 175 ++++++++++++++++++++++++-------------- 1 file changed, 109 insertions(+), 66 deletions(-) diff --git a/deeprank/learn/DataSet.py b/deeprank/learn/DataSet.py index 1f479351..850e0f39 100644 --- a/deeprank/learn/DataSet.py +++ b/deeprank/learn/DataSet.py @@ -254,10 +254,12 @@ def process_dataset(self): self.check_hdf5_files(self.train_database) if self.valid_database: - self.valid_database = self.check_hdf5_files(self.valid_database) + self.valid_database = self.check_hdf5_files( + self.valid_database) if self.test_database: - self.test_database = self.check_hdf5_files(self.test_database) + self.test_database = self.check_hdf5_files( + self.test_database) # create the indexing system # alows to associate each mol to an index @@ -288,13 +290,16 @@ def process_dataset(self): logger.info('\n') logger.info(" Data Set Info:") - logger.info(f' Training set : {self.ntrain} conformations') + logger.info( + f' Training set : {self.ntrain} conformations') if self.data_augmentation is not None: logger.info( f' Augmentation : {self.data_augmentation} rotations') - logger.info(f' Validation set : {self.nvalid} conformations') - logger.info(f' Test set : {self.ntest} conformations') + logger.info( + f' Validation set : {self.nvalid} conformations') + logger.info( + f' Test set : {self.ntest} conformations') logger.info(f' Number of channels : {self.input_shape[0]}') logger.info(f' Grid Size : {self.data_shape[1]}, ' f'{self.data_shape[2]}, {self.data_shape[3]}') @@ -316,31 +321,36 @@ def __getitem__(self, index): Returns: dict: {'mol':[fname,mol],'feature':feature,'target':target} """ - fname, mol, angle, axis = self.index_complexes[index] - print(fname, mol) + try: - if self.mapfly: - feature, target = self.map_one_molecule(fname, mol, angle, axis) - else: - feature, target = self.load_one_molecule(fname, mol) + if self.mapfly: + feature, target = self.map_one_molecule( + fname, mol, angle, axis) + else: + feature, target = self.load_one_molecule(fname, mol) - if self.clip_features: - feature = self._clip_feature(feature) + if self.clip_features: + feature = self._clip_feature(feature) - if self.normalize_features: - feature = self._normalize_feature(feature) + if self.normalize_features: + feature = self._normalize_feature(feature) - if self.normalize_targets: - target = self._normalize_target(target) + if self.normalize_targets: + target = self._normalize_target(target) - if self.pair_chain_feature: - feature = self.make_feature_pair(feature, self.pair_chain_feature) + if self.pair_chain_feature: + feature = self.make_feature_pair( + feature, self.pair_chain_feature) - if self.transform: - feature = self.convert2d(feature, self.proj2D) + if self.transform: + feature = self.convert2d(feature, self.proj2D) - return {'mol': [fname, mol], 'feature': feature, 'target': target} + return {'mol': [fname, mol], 'feature': feature, 'target': target} + + except: + raise + print('Unable to load molecule %s from %s' % (mol, fname)) @staticmethod def check_hdf5_files(database): @@ -385,7 +395,8 @@ def create_index_molecules(self): # Training dataset desc = '{:25s}'.format(' Train dataset') if self.tqdm: - data_tqdm = tqdm(self.train_database, desc=desc, file=sys.stdout) + data_tqdm = tqdm(self.train_database, + desc=desc, file=sys.stdout) else: logger.info(' Train dataset') data_tqdm = self.train_database @@ -400,11 +411,13 @@ def create_index_molecules(self): mol_names = self._select_pdb(mol_names) for k in mol_names: if self.filter(fh5[k]): - self.index_complexes += [(fdata, k, None, None)] + self.index_complexes += [(fdata, + k, None, None)] for irot in range(self.data_augmentation): axis, angle = pdb2sql.transform.get_rot_axis_angle( self.rotation_seed) - self.index_complexes += [(fdata, k, angle, axis)] + self.index_complexes += [ + (fdata, k, angle, axis)] fh5.close() except Exception: logger.exception(f'Ignore file: {fdata}') @@ -413,7 +426,8 @@ def create_index_molecules(self): self.index_train = list(range(self.ntrain)) if self.ntrain == 0: - raise ValueError('No avaiable training data after filtering') + raise ValueError( + 'No avaiable training data after filtering') # Validation dataset if self.valid_database: @@ -421,7 +435,7 @@ def create_index_molecules(self): desc = '{:25s}'.format(' Validation dataset') if self.tqdm: data_tqdm = tqdm(self.valid_database, - desc=desc, file=sys.stdout) + desc=desc, file=sys.stdout) else: data_tqdm = self.valid_database logger.info(' Validation dataset') @@ -470,7 +484,8 @@ def create_index_molecules(self): logger.exception(f'Ignore file: {fdata}') self.ntot = len(self.index_complexes) - self.index_test = list(range(self.ntrain + self.nvalid, self.ntot)) + self.index_test = list( + range(self.ntrain + self.nvalid, self.ntot)) self.ntest = self.ntot - self.ntrain - self.nvalid def _select_pdb(self, mol_names): @@ -492,7 +507,7 @@ def _select_pdb(self, mol_names): if self.use_rotation > 0: for i in range(self.use_rotation): fnames_augmented += list(filter(lambda x: - re.search('_r%03d$' % (i + 1), x), mol_names)) + re.search('_r%03d$' % (i + 1), x), mol_names)) selected_mol_names = fnames_original + fnames_augmented else: selected_mol_names = fnames_original @@ -532,11 +547,13 @@ def filter(self, molgrp): ops = ['>', '<', '==', '<=', '>='] new_cond_vals = cond_vals for o in ops: - new_cond_vals = new_cond_vals.replace(o, 'val' + o) + new_cond_vals = new_cond_vals.replace( + o, 'val' + o) if not eval(new_cond_vals): return False else: - raise ValueError('Conditions not supported', cond_vals) + raise ValueError( + 'Conditions not supported', cond_vals) return True @@ -570,7 +587,8 @@ class parameter self.select_feature examples: # loop over the feat types and add all the feat_names for feat_type, feat_names in mapped_data.items(): - self.select_feature[feat_type] = [name for name in feat_names] + self.select_feature[feat_type] = [ + name for name in feat_names] # if a selection was made else: @@ -630,7 +648,8 @@ class parameter self.select_feature examples: # (we probably relaod a pretrained model) # and we simply append the feaature name else: - self.select_feature[feat_type].append(name) + self.select_feature[feat_type].append( + name) f5.close() @@ -672,7 +691,8 @@ class parameter self.select_feature examples: self.select_feature['AtomicDensities'] = \ config.atom_vdw_radius_noH elif feat_type == 'Features': - self.select_feature[feat_type] = list(raw_data.keys()) + self.select_feature[feat_type] = list( + raw_data.keys()) else: raise KeyError( f'Wrong feature type {feat_type}. ' @@ -719,7 +739,8 @@ def print_possible_features(self): logger.info('\nYour selection was:') for feat_type, feat in self.select_feature.items(): if feat_type not in list(mapgrp.keys()): - logger.info('== \x1b[0;37;41m' + feat_type + '\x1b[0m') + logger.info( + '== \x1b[0;37;41m' + feat_type + '\x1b[0m') else: logger.info('== %s' % feat_type) if isinstance(feat, str): @@ -729,7 +750,7 @@ def print_possible_features(self): logger.info(' -- %s' % f) logger.info("You don't need to specify _chainA _chainB for each feature. " + - "The code will append it automatically") + "The code will append it automatically") def get_pairing_feature(self): """Creates the index of paired features.""" @@ -766,7 +787,8 @@ def get_input_shape(self): self.data_shape = feature.shape if self.pair_chain_feature: - feature = self.make_feature_pair(feature, self.pair_chain_feature) + feature = self.make_feature_pair( + feature, self.pair_chain_feature) if self.transform: feature = self.convert2d(feature, self.proj2D) @@ -826,9 +848,11 @@ def compute_norm(self): # get the feature/target if self.mapfly: - feature, target = self.map_one_molecule(fname, mol=molname) + feature, target = self.map_one_molecule( + fname, mol=molname) else: - feature, target = self.load_one_molecule(fname, mol=molname) + feature, target = self.load_one_molecule( + fname, mol=molname) # create the norm isntances at the first passage if first: @@ -857,8 +881,10 @@ def compute_norm(self): self.param_norm['features'][ifeat].std = 1 # store as array for fast access - self.feature_mean.append(self.param_norm['features'][ifeat].mean) - self.feature_std.append(self.param_norm['features'][ifeat].std) + self.feature_mean.append( + self.param_norm['features'][ifeat].mean) + self.feature_std.append( + self.param_norm['features'][ifeat].std) self.target_min = self.param_norm['targets'].min[0] self.target_max = self.param_norm['targets'].max[0] @@ -876,7 +902,8 @@ def get_norm(self): for feat_type, feat_names in self.select_feature.items(): self.param_norm['features'][feat_type] = {} for name in feat_names: - self.param_norm['features'][feat_type][name] = NormParam() + self.param_norm['features'][feat_type][name] = NormParam( + ) self.param_norm['targets'][self.select_target] = MinMaxParam() # read the normalization @@ -917,20 +944,25 @@ def _read_norm(self): mean = data['features'][feat_type][name].mean var = data['features'][feat_type][name].var if var == 0: - logger.info(' : STD is null for %s in %s' % (name, f5)) - self.param_norm['features'][feat_type][name].add(mean, var) + logger.info( + ' : STD is null for %s in %s' % (name, f5)) + self.param_norm['features'][feat_type][name].add( + mean, var) # handle the target minv = data['targets'][self.select_target].min maxv = data['targets'][self.select_target].max - self.param_norm['targets'][self.select_target].update(minv) - self.param_norm['targets'][self.select_target].update(maxv) + self.param_norm['targets'][self.select_target].update( + minv) + self.param_norm['targets'][self.select_target].update( + maxv) # process the std nfile = len(self.train_database) for feat_types, feat_dict in self.param_norm['features'].items(): for feat in feat_dict: - self.param_norm['features'][feat_types][feat].process(nfile) + self.param_norm['features'][feat_types][feat].process( + nfile) if self.param_norm['features'][feat_types][feat].std == 0: logger.info( ' Final STD Null for %s/%s. Changed it to 1' % @@ -958,7 +990,8 @@ def _get_target_ordering(self, order): elif self.select_target in NA_list: self.target_ordering = None else: - warnings.warn(' Target ordering unidentified. lower assumed') + warnings.warn( + ' Target ordering unidentified. lower assumed') self.target_ordering = 'lower' def backtransform_target(self, data): @@ -1077,14 +1110,16 @@ def load_one_molecule(self, fname, mol=None): # see if the feature exists if 'mapped_features/' + feat_type in mol_data.keys(): - feat_dict = mol_data.get('mapped_features/' + feat_type) + feat_dict = mol_data.get( + 'mapped_features/' + feat_type) else: logger.error( f'Feature type {feat_type} not found in file {fname} ' f'for molecule {mol}.\n' f'Possible feature types are :\n\t' + - '\n\t'.join(list(mol_data['mapped_features'].keys())) - ) + '\n\t'.join( + list(mol_data['mapped_features'].keys())) + ) raise ValueError(feat_type, ' not supported') # loop through all the desired feat names @@ -1099,9 +1134,10 @@ def load_one_molecule(self, fname, mol=None): f'{mol} and feature type {feat_type}.\n' f'Possible feature are :\n\t' + '\n\t'.join(list( - mol_data['mapped_features/' + feat_type].keys() - )) - ) + mol_data['mapped_features/' + + feat_type].keys() + )) + ) # check its sparse attribute # if true get a FLAN @@ -1215,12 +1251,14 @@ def make_feature_pair(feature, op): raise ValueError('Operation not callable', op) nFeat = len(feature) - pair_indexes = list(np.arange(nFeat).reshape(int(nFeat / 2), 2)) + pair_indexes = list( + np.arange(nFeat).reshape(int(nFeat / 2), 2)) outtype = feature.dtype new_feat = [] for ind in pair_indexes: - new_feat.append(op(feature[ind[0], ...], feature[ind[1], ...])) + new_feat.append( + op(feature[ind[0], ...], feature[ind[1], ...])) return np.array(new_feat).astype(outtype) @@ -1247,7 +1285,8 @@ def get_grid(self, mol_data): except BaseException: - raise ValueError("Grid points not found in the data file") + raise ValueError( + "Grid points not found in the data file") else: @@ -1306,10 +1345,12 @@ def map_atomic_densities( # rotate if necessary if angle is not None: if xyzA != np.array([]): - xyzA = pdb2sql.transform.rot_xyz_around_axis(xyzA, axis, angle, center) + xyzA = pdb2sql.transform.rot_xyz_around_axis( + xyzA, axis, angle, center) if xyzB != np.array([]): - xyzB = pdb2sql.transform.rot_xyz_around_axis(xyzB, axis, angle, center) + xyzB = pdb2sql.transform.rot_xyz_around_axis( + xyzB, axis, angle, center) # init the grid atdensA = np.zeros(npts) @@ -1345,7 +1386,8 @@ def _densgrid(center, vdw_radius, grid, npts): """ x0, y0, z0 = center - dd = np.sqrt((grid[0] - x0)**2 + (grid[1] - y0)**2 + (grid[2] - z0)**2) + dd = np.sqrt((grid[0] - x0)**2 + + (grid[1] - y0)**2 + (grid[2] - z0)**2) dgrid = np.zeros(npts) dgrid[dd < vdw_radius] = np.exp( @@ -1362,7 +1404,6 @@ def map_feature(self, feat_names, mol_data, grid, npts, angle, axis): __vectorize__ = False - if angle is not None: center = [np.mean(g) for g in grid] @@ -1382,14 +1423,15 @@ def map_feature(self, feat_names, mol_data, grid, npts, angle, axis): feat_value = data[:, 4] if angle is not None: - pos = pdb2sql.transform.rot_xyz_around_axis(pos, axis, angle, center) + pos = pdb2sql.transform.rot_xyz_around_axis( + pos, axis, angle, center) if __vectorize__ or __vectorize__ == 'both': for chainID in [0, 1]: tmp_feat_vect[chainID] = np.sum( - vmap(pos[chain == chainID,:], - feat_value[chain == chainID]), + vmap(pos[chain == chainID, :], + feat_value[chain == chainID]), 0) if not __vectorize__ or __vectorize__ == 'both': @@ -1431,7 +1473,8 @@ def _featgrid(center, value, grid, npts): beta = 0.5 / (sigma**2) cutoff = 5. * beta - dd = np.sqrt((grid[0] - x0)**2 + (grid[1] - y0)**2 + (grid[2] - z0)**2) + dd = np.sqrt((grid[0] - x0)**2 + + (grid[1] - y0)**2 + (grid[2] - z0)**2) dd[dd < cutoff] = value * np.exp(-beta * dd[dd < cutoff]) dd[dd > cutoff] = 0 @@ -1440,4 +1483,4 @@ def _featgrid(center, value, grid, npts): #dgrid[dd Date: Mon, 27 Apr 2020 15:19:47 +0200 Subject: [PATCH 2/3] added error report if mapped_features grp does not exist --- deeprank/learn/DataSet.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/deeprank/learn/DataSet.py b/deeprank/learn/DataSet.py index 850e0f39..f66864a1 100644 --- a/deeprank/learn/DataSet.py +++ b/deeprank/learn/DataSet.py @@ -1104,6 +1104,12 @@ def load_one_molecule(self, fname, mol=None): # get the mol mol_data = fh5.get(mol) + # xue: + if 'mapped_features' not in mol_data.keys(): + logger.error(f"xue: Error: mol: {mol} in {fname} does not have mapped_features ") + fh5.close() + sys.exit() + # get the features feature = [] for feat_type, feat_names in self.select_feature.items(): From 84810d9168ba68dafd2177f02f5d73be727af811 Mon Sep 17 00:00:00 2001 From: LilySnow Date: Sat, 16 May 2020 09:30:24 +0200 Subject: [PATCH 3/3] fixed a bug in read_epoch_data() in plot_utils.py --- deeprank/utils/plot_utils.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/deeprank/utils/plot_utils.py b/deeprank/utils/plot_utils.py index af5ff73f..a71158d1 100755 --- a/deeprank/utils/plot_utils.py +++ b/deeprank/utils/plot_utils.py @@ -46,25 +46,23 @@ def read_epoch_data(DR_h5FL, epoch): 0 Test 1AVX_ranair-it0_5286 0 0.503823 /home/lixue/DBs/BM5-haddock24/hdf5/000_1AVX.hdf5 1 Test 1AVX_ti5-itw_354w 1 0.502845 /home/lixue/DBs/BM5-haddock24/hdf5/000_1AVX.hdf5 """ - print (f"-> Read epoch data from {DR_h5FL} into df") + print (f"-> Read epoch {epoch} data from {DR_h5FL} into df") # -- 1. read deeprank output data for the specific epoch h5 = h5py.File(DR_h5FL, 'r') + + keys = list(h5.keys()) + last_epoch_key = list(filter(lambda x: 'epoch_' in x, keys))[-1] + if epoch is None: - print(f"epoch is not provided. Use the last epoch data.") - keys = list(h5.keys()) - last_epoch_key = list(filter(lambda x: 'epoch_' in x, keys))[-1] + print(f"epoch is not provided. Use the last epoch data: {last_epoch_key}.") + epoch_key = last_epoch_key else: - last_epoch_key = 'epoch_%04d' % epoch - if last_epoch_key not in h5: - print( - 'Incorrect epcoh name\n Possible options are: ' + - ' '.join( - list( - h5.keys()))) - h5.close() - return - data = h5[last_epoch_key] + epoch_key = 'epoch_%04d' % epoch + if epoch_key not in h5: + print('Incorrect epoch name. Use the last epoch data: {last_epoch_key}.') + epoch_key = last_epoch_key + data = h5[epoch_key] # -- 2. convert into pd.DataFrame labels = list(data) # labels = ['train', 'test', 'valid']