From e251f8ddaffa82cad390a43fb9a17e26f148497b Mon Sep 17 00:00:00 2001 From: Li Xue Date: Tue, 7 May 2019 16:36:17 +0200 Subject: [PATCH 01/38] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e6476906..8b73f5b3 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Minimal information to install the module * clone the repository `git clone https://github.com/DeepRank/deeprank.git` * go there `cd deeprank` * install the module `pip install -e ./` + * install freesasa-python [install developer's version from https://github.com/freesasa/freesasa-python] * go int the test dir `cd test` * run the test suite `pytest` From b8cb3347261816d37155db30f6e8e851de548efe Mon Sep 17 00:00:00 2001 From: cunlianggeng Date: Mon, 8 Jul 2019 14:34:12 +0200 Subject: [PATCH 02/38] Update .gitignore Ignore Mac .DS_Store file --- .gitignore | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 6671065b..08ffc680 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ -Byte-compiled / optimized +Byte-compiled / optimized *__pycache__* -# distribution +# distribution deeprank.egg-info database* @@ -9,7 +9,7 @@ database* example/grid/ELEC example/grid/VDW example/grid/data_viz -example/grid/input +example/grid/input example/workflow/test_out # datafile @@ -26,3 +26,6 @@ example/workflow/test_out #docs/_build #docs/_static #docs/_templates + +# Mac OSX files +.DS_Store From 7df93d135cca2c4c66deddb49be51c5aed5ab31f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 20 Feb 2020 09:17:25 +0100 Subject: [PATCH 03/38] Update issue templates Added bug report tempalte --- .github/ISSUE_TEMPLATE/bug-report.md | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug-report.md diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md new file mode 100644 index 00000000..85257fbd --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -0,0 +1,32 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**Environment:** +- OS system: +- Version: +- Branch commit ID: +- Inputs: + +**To Reproduce** +Steps/commands to reproduce the behaviour: + 1. + 2. + 3. + +**Expected Results** +A clear and concise description of what you expected to happen. + +**Actual Results or Error Info** +If applicable, add screenshots to help explain your problem. + +**Additional Context** +Add any other context about the problem here. From 476ad17c3dbb7014695008e004eb9f73ee5c9eaf Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 20 Feb 2020 09:21:29 +0100 Subject: [PATCH 04/38] Update issue templates Added feature request issue template --- .github/ISSUE_TEMPLATE/feature_request.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..bbcbbe7d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. From 08d8d82278508c93b05f0b9aaea0571b8edde6fb Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Tue, 24 Mar 2020 13:24:15 +0100 Subject: [PATCH 05/38] change to _close() for pdb2sql --- deeprank/features/AtomicFeature.py | 4 +- deeprank/features/BSA.py | 6 +- deeprank/features/FullPSSM.py | 2 +- deeprank/features/NaivePSSM.py | 2 +- deeprank/features/ResidueDensity.py | 6 +- deeprank/generate/DataGenerator.py | 99 ++++++++++++++++++++++++----- deeprank/generate/GridTools.py | 4 +- deeprank/learn/DataSet.py | 2 +- deeprank/tools/sasa.py | 4 +- deeprank/utils/visualize3Ddata.py | 2 +- example/generate_dataset.py | 1 + test/2OUL/test.py | 2 +- test/test_atomic_features.py | 4 +- 13 files changed, 102 insertions(+), 36 deletions(-) diff --git a/deeprank/features/AtomicFeature.py b/deeprank/features/AtomicFeature.py index eb20f322..cde7dce8 100644 --- a/deeprank/features/AtomicFeature.py +++ b/deeprank/features/AtomicFeature.py @@ -61,7 +61,7 @@ def __init__(self, pdbfile, param_charge=None, param_vdw=None, >>> atfeat.evaluate_pair_interaction(save_interactions=test_name) >>> >>> # close the db - >>> atfeat.sqldb.close() + >>> atfeat.sqldb._close() """ super().__init__("Atomic") @@ -939,7 +939,7 @@ def __compute_feature__(pdb_data, featgrp, featgrp_raw): atfeat.export_data_hdf5(featgrp_raw) # close - atfeat.sqldb.close() + atfeat.sqldb._close() ######################################################################## diff --git a/deeprank/features/BSA.py b/deeprank/features/BSA.py index cdf5ac03..37d8ac6b 100644 --- a/deeprank/features/BSA.py +++ b/deeprank/features/BSA.py @@ -31,7 +31,7 @@ def __init__(self, pdb_data, chainA='A', chainB='B'): >>> bsa = BSA('1AK4.pdb') >>> bsa.get_structure() >>> bsa.get_contact_residue_sasa() - >>> bsa.sql.close() + >>> bsa.sql._close() """ self.pdb_data = pdb_data self.sql = pdb2sql.interface(pdb_data) @@ -156,7 +156,7 @@ def __compute_feature__(pdb_data, featgrp, featgrp_raw): bsa.export_data_hdf5(featgrp_raw) # close the file - bsa.sql.close() + bsa.sql._close() ######################################################################## @@ -177,7 +177,7 @@ def __compute_feature__(pdb_data, featgrp, featgrp_raw): bsa = BSA(pdb_file) bsa.get_structure() bsa.get_contact_residue_sasa() - bsa.sql.close() + bsa.sql._close() pprint(bsa.feature_data) print() diff --git a/deeprank/features/FullPSSM.py b/deeprank/features/FullPSSM.py index a8715662..4e6572a1 100644 --- a/deeprank/features/FullPSSM.py +++ b/deeprank/features/FullPSSM.py @@ -180,7 +180,7 @@ def get_feature_value(self, cutoff=5.5): # get interface contact residues # ctc_res = {"A":[chain 1 residues], "B": [chain2 residues]} ctc_res = sql.get_contact_residues(cutoff=cutoff) - sql.close() + sql._close() ctc_res = ctc_res["A"] + ctc_res["B"] # handle with small interface or no interface diff --git a/deeprank/features/NaivePSSM.py b/deeprank/features/NaivePSSM.py index 603131df..0e509814 100644 --- a/deeprank/features/NaivePSSM.py +++ b/deeprank/features/NaivePSSM.py @@ -159,7 +159,7 @@ def get_feature_value(self, contact_only=True): contact_residue = sql.get_contact_residue(cutoff=5.5) contact_residue = contact_residue["A"] + contact_residue["B"] - sql.close() + sql._close() pssm_data_xyz = {} pssm_data = {} diff --git a/deeprank/features/ResidueDensity.py b/deeprank/features/ResidueDensity.py index bf784a04..9965a7fb 100644 --- a/deeprank/features/ResidueDensity.py +++ b/deeprank/features/ResidueDensity.py @@ -86,7 +86,7 @@ def get(self, cutoff=5.5): # handle with small interface or no interface if total_ctc == 0: # first close the sql - self.sql.close() + self.sql._close() raise ValueError( f"No residue contact found with the cutoff {cutoff}Å. " @@ -179,7 +179,7 @@ def __compute_feature__(pdb_data, featgrp, featgrp_raw): resdens.export_data_hdf5(featgrp_raw) # close sql - resdens.sql.close() + resdens.sql._close() ######################################################################## # @@ -203,7 +203,7 @@ def __compute_feature__(pdb_data, featgrp, featgrp_raw): resdens.get() resdens.extract_features() - resdens.sql.close() + resdens.sql._close() pprint(resdens.feature_data) print() diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py index e42e0509..faad6dd9 100644 --- a/deeprank/generate/DataGenerator.py +++ b/deeprank/generate/DataGenerator.py @@ -33,7 +33,7 @@ def _printif(string, cond): return print(string) if cond else None class DataGenerator(object): def __init__(self, pdb_select=None, pdb_source=None, - pdb_native=None, pssm_source=None, + pdb_native=None, pssm_source=None, align=None, compute_targets=None, compute_features=None, data_augmentation=None, hdf5='database.h5', mpi_comm=None): """Generate the data (features/targets/maps) required for deeprank. @@ -77,6 +77,7 @@ def __init__(self, pdb_select=None, pdb_source=None, self.pdb_select = pdb_select or [] self.pdb_source = pdb_source or [] self.pdb_native = pdb_native or [] + self.align = align or None if pssm_source is not None: config.PATH_PSSM_SOURCE = pssm_source @@ -285,9 +286,9 @@ def create_database( molgrp.attrs['type'] = 'molecule' # add the ref and the complex - self._add_pdb(molgrp, cplx, 'complex') + self._add_pdb(molgrp, cplx, 'complex', self.align) if ref is not None: - self._add_pdb(molgrp, ref, 'native') + self._add_pdb(molgrp, ref, 'native', self.align) if verbose: self.logger.info( @@ -403,10 +404,14 @@ def create_database( # copy the ref into it if ref is not None: - self._add_pdb(molgrp, ref, 'native') + self._add_pdb(molgrp, ref, 'native', self.align) # get the rotation axis and angle - axis, angle = pdb2sql.transform.get_rot_axis_angle(random_seed) + if self.align is None: + axis, angle = pdb2sql.transform.get_rot_axis_angle(random_seed) + else: + axis, angle = self._get_aligned_rotation_axis_angle(random_seed, + self.align) # create the new pdb and get molecule center # molecule center is the origin of rotation @@ -733,7 +738,7 @@ def _get_grid_center(pdb, contact_distance): center_contact = np.mean( np.array(sqldb.get('x,y,z', rowID=contact_atoms)), 0) - sqldb.close() + sqldb._close() return center_contact @@ -1302,7 +1307,7 @@ def _compute_targets(targ_list, pdb_data, targrp): # ==================================================================================== @staticmethod - def _add_pdb(molgrp, pdbfile, name): + def _add_pdb(molgrp, pdbfile, name, dict_align): """Add a pdb to a molgrp. Args: @@ -1310,12 +1315,30 @@ def _add_pdb(molgrp, pdbfile, name): pdbfile (str): psb file to add name (str): dataset name in the hdf5 molgroup """ - # read the pdb and extract the ATOM lines - with open(pdbfile, 'r') as fi: - data = [line.split('\n')[0] - for line in fi if line.startswith('ATOM')] - # PDB default line length is 80 - # http://www.wwpdb.org/documentation/file-format + + # no alignement + if dict_align is None: + # read the pdb and extract the ATOM lines + with open(pdbfile, 'r') as fi: + data = [line.split('\n')[0] + for line in fi if line.startswith('ATOM')] + # PDB default line length is 80 + # http://www.wwpdb.org/documentation/file-format + + # some alignement + elif isinstance(dict_align, dict): + + if dict_align['selection'] == 'interface': + sqldb = pdb2sql.align.align_interface(pdbfile, + plane=dict_align['plane'], + export=False) + + else: + sqldb = pdb2sq.align.align(pdbfile, axis=dict_align['axis'], + export=False, + **dict_align['selection']) + data = sqldb.sql2pdb() + data = np.array(data).astype('|S78') molgrp.create_dataset(name, data=data) @@ -1326,8 +1349,50 @@ def _add_pdb(molgrp, pdbfile, name): # # ==================================================================================== - # add a rotated pdb structure to the database + @staticmethod + def _get_aligned_rotation_axis_angle(random_seed, dict_align): + """Returns the axis and angle of rotation for data + augmentation with aligned complexes + + Arguments: + random_seed {int} -- random seed of rotation + dict_align {dict} -- the dict describing the alignement + + Returns: + list(float): axis of rotation + float: angle of rotation + """ + if seed is not None: + np.random.seed(seed) + + angle = 2 * np.pi * np.random.rand() + + if 'plane' in dict_align.keys(): + if dict_align['plane'] == 'xy': + axis = [0.,0.,1.] + elif dict_align['plane'] == 'xz': + axis = [0.,1.,0.] + elif dict_align['plane'] == 'yz': + axis = [1.,0.,0.] + else: + raise ValueError("plane must be xy, xz or yz") + + elif 'axis' in dict_align.keys(): + if dict_align['axis'] == 'x': + axis = [1.,0.,0.] + elif dict_align['axis'] == 'y': + axis = [0.,1.,0.] + elif dict_align['axis'] == 'z': + axis = [0.,0.,1.] + else: + raise ValueError("axis must be x, y or z") + else: + raise ValueError('dict_align must contains plane or axis') + + return axis, angle + + # add a rotated pdb structure to the database @staticmethod def _add_aug_pdb(molgrp, pdbfile, name, axis, angle): """Add augmented pdbs to the dataset. @@ -1337,12 +1402,12 @@ def _add_aug_pdb(molgrp, pdbfile, name, axis, angle): pdbfile (str): pdb file name name (str): name of the dataset axis (list(float)): axis of rotation - angle (folat): angle of rotation + angle (float): angle of rotation Returns: list(float): center of the molecule """ - # create tthe sqldb and extract positions + # create the sqldb and extract positions sqldb = pdb2sql.pdb2sql(pdbfile) # rotate the positions @@ -1358,7 +1423,7 @@ def _add_aug_pdb(molgrp, pdbfile, name, axis, angle): molgrp.create_dataset(name, data=data) # close the db - sqldb.close() + sqldb._close() return center diff --git a/deeprank/generate/GridTools.py b/deeprank/generate/GridTools.py index 421851b1..0b7f7b70 100644 --- a/deeprank/generate/GridTools.py +++ b/deeprank/generate/GridTools.py @@ -168,7 +168,7 @@ def create_new_data(self): self.add_all_atomic_densities() # cloe the db file - self.sqldb.close() + self.sqldb._close() ################################################################ @@ -199,7 +199,7 @@ def update_feature(self): self.add_all_atomic_densities() # cloe the db file - self.sqldb.close() + self.sqldb._close() ################################################################ diff --git a/deeprank/learn/DataSet.py b/deeprank/learn/DataSet.py index 91049f72..1f479351 100644 --- a/deeprank/learn/DataSet.py +++ b/deeprank/learn/DataSet.py @@ -1325,7 +1325,7 @@ def map_atomic_densities( densities += [atdensA, atdensB] - sql.close() + sql._close() return densities diff --git a/deeprank/tools/sasa.py b/deeprank/tools/sasa.py index 2616513f..f97eb27c 100644 --- a/deeprank/tools/sasa.py +++ b/deeprank/tools/sasa.py @@ -83,7 +83,7 @@ def get_residue_center(self, chainA='A', chainB='B'): for r in resB[:, :2]: if tuple(r) not in self.resinfo[chainB]: self.resinfo[chainB].append(tuple(r)) - sql.close() + sql._close() def get_residue_carbon_beta(self, chainA='A', chainB='B'): """Extract the position of the carbon beta of each residue. @@ -104,7 +104,7 @@ def get_residue_carbon_beta(self, chainA='A', chainB='B'): 'resSeq,resName,x,y,z', name='CB', chainID=chainB)) - sql.close() + sql._close() assert len(resA[:, 0].astype(np.int).tolist()) == len( np.unique(resA[:, 0].astype(np.int)).tolist()) diff --git a/deeprank/utils/visualize3Ddata.py b/deeprank/utils/visualize3Ddata.py index 096d4c11..9206920c 100755 --- a/deeprank/utils/visualize3Ddata.py +++ b/deeprank/utils/visualize3Ddata.py @@ -57,7 +57,7 @@ def visualize3Ddata(hdf5=None, mol_name=None, out=None): # create the pdb file sqldb = pdb2sql.pdb2sql(molgrp['complex'][:]) sqldb.exportpdb(outdir + '/complex.pdb') - sqldb.close() + sqldb._close() # get the grid grid = {} diff --git a/example/generate_dataset.py b/example/generate_dataset.py index 7255c761..51a10ca5 100644 --- a/example/generate_dataset.py +++ b/example/generate_dataset.py @@ -24,6 +24,7 @@ pdb_source=pdb_source, pdb_native=pdb_native, pssm_source=pssm_source, + align={"selection":'interface', "plane":'xy'}, data_augmentation=0, compute_targets=[ 'deeprank.targets.dockQ', diff --git a/test/2OUL/test.py b/test/2OUL/test.py index 39428b23..56555856 100644 --- a/test/2OUL/test.py +++ b/test/2OUL/test.py @@ -42,4 +42,4 @@ # atfeat.compute_vdw_interchain_only(contact_only=False) # # close the db -# atfeat.sqldb.close() +# atfeat.sqldb._close() diff --git a/test/test_atomic_features.py b/test/test_atomic_features.py index 6f13734e..f1a9ad65 100644 --- a/test/test_atomic_features.py +++ b/test/test_atomic_features.py @@ -103,7 +103,7 @@ def test_atomic_haddock(): atfeat.compute_vdw_interchain_only(contact_only=False) # close the db - atfeat.sqldb.close() + atfeat.sqldb._close() # @staticmethod # def test_atomic_zdock(): @@ -142,7 +142,7 @@ def test_atomic_haddock(): # atfeat.compute_vdw_interchain_only(contact_only=False) # # close the db - # atfeat.sqldb.close() + # atfeat.sqldb._close() if __name__ == '__main__': From 488377e65db111c8c22e92d7b0a94d22c590be0c Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Tue, 24 Mar 2020 13:30:57 +0100 Subject: [PATCH 06/38] install align branch of pdb2sql --- .travis.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index c587a554..91e8e086 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,12 +16,19 @@ before_install: # Useful for debugging any issues with conda - conda info -a - # python + # python - conda install python=3.7 # install openmpi for mpi4py - sudo apt-get install libopenmpi-dev openmpi-bin + # pdb2sql align branch + # to be removed when align is merged + - git clone -b align --single-branch https://github.com/DeepRank/pdb2sql.git + - cd pdb2sql + - pip install -e ./ + - cd ../ + install: - pip install -qe .[test] From f4e7b8924073b8d2b0239e6bca6a179202ff7155 Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Tue, 24 Mar 2020 13:56:12 +0100 Subject: [PATCH 07/38] choose chains in interface align --- deeprank/generate/DataGenerator.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py index faad6dd9..3eec87ef 100644 --- a/deeprank/generate/DataGenerator.py +++ b/deeprank/generate/DataGenerator.py @@ -13,6 +13,8 @@ from deeprank.config import logger from deeprank.generate import GridTools as gt import pdb2sql +from pdb2sql.align import align as align_along_axis +from pdb2sql.align import align_interface try: from tqdm import tqdm @@ -1329,14 +1331,19 @@ def _add_pdb(molgrp, pdbfile, name, dict_align): elif isinstance(dict_align, dict): if dict_align['selection'] == 'interface': - sqldb = pdb2sql.align.align_interface(pdbfile, - plane=dict_align['plane'], - export=False) + if np.all([k in dict_align for k in ['chain1','chain2']]): + chains = {'chain1':dict_align['chain1'], + 'chain2':dict_align['chain2']} + else: + chains = {} + sqldb = align_interface(pdbfile, + plane=dict_align['plane'], + export=False, **chains) else: - sqldb = pdb2sq.align.align(pdbfile, axis=dict_align['axis'], - export=False, - **dict_align['selection']) + sqldb = align_along_axis(pdbfile, axis=dict_align['axis'], + export=False, + **dict_align['selection']) data = sqldb.sql2pdb() data = np.array(data).astype('|S78') From 19a43bc28d8e3fd756f153a49537663ef752d26a Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Wed, 25 Mar 2020 12:12:39 +0100 Subject: [PATCH 08/38] remove possible - in molname --- deeprank/generate/DataGenerator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py index 3eec87ef..642d18d3 100644 --- a/deeprank/generate/DataGenerator.py +++ b/deeprank/generate/DataGenerator.py @@ -242,6 +242,7 @@ def create_database( # names of the molecule mol_name = os.path.splitext(os.path.basename(cplx))[0] + mol_name = mol_name.replace('-', '_') mol_aug_name_list = [] try: @@ -895,6 +896,7 @@ def map_features(self, grid_info={}, for m in modes: if m not in grid_info: grid_info[m] = 'ind' + ################################################################ # ################################################################ From 34c7979f761cabafaefaaa0fd8f88023e8d508a1 Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Wed, 25 Mar 2020 12:41:57 +0100 Subject: [PATCH 09/38] export optional in DataGenerator --- deeprank/generate/DataGenerator.py | 17 +++++++++++++---- example/generate_dataset.py | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py index 642d18d3..4bb69387 100644 --- a/deeprank/generate/DataGenerator.py +++ b/deeprank/generate/DataGenerator.py @@ -1332,19 +1332,28 @@ def _add_pdb(molgrp, pdbfile, name, dict_align): # some alignement elif isinstance(dict_align, dict): + if 'selection' not in dict_align.keys(): + dict_align['selection'] = {} + + if 'export' not in dict_align.keys(): + dict_align['export'] = False + if dict_align['selection'] == 'interface': + if np.all([k in dict_align for k in ['chain1','chain2']]): chains = {'chain1':dict_align['chain1'], 'chain2':dict_align['chain2']} else: chains = {} - sqldb = align_interface(pdbfile, - plane=dict_align['plane'], - export=False, **chains) + + sqldb = align_interface(pdbfile, plane=dict_align['plane'], + export = dict_align['export'], + **chains) else: + sqldb = align_along_axis(pdbfile, axis=dict_align['axis'], - export=False, + export = dict_align['export'], **dict_align['selection']) data = sqldb.sql2pdb() diff --git a/example/generate_dataset.py b/example/generate_dataset.py index 51a10ca5..1b62583f 100644 --- a/example/generate_dataset.py +++ b/example/generate_dataset.py @@ -24,7 +24,7 @@ pdb_source=pdb_source, pdb_native=pdb_native, pssm_source=pssm_source, - align={"selection":'interface', "plane":'xy'}, + align={"axis":'z'}, data_augmentation=0, compute_targets=[ 'deeprank.targets.dockQ', From 970e17de4ef2d54c6c102a9b68b9c1a8a6611a69 Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Wed, 25 Mar 2020 15:27:35 +0100 Subject: [PATCH 10/38] fixed data augmentation --- deeprank/generate/DataGenerator.py | 83 +++++++++++++++++------------- example/generate_dataset.py | 4 +- 2 files changed, 50 insertions(+), 37 deletions(-) diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py index 4bb69387..45de2cca 100644 --- a/deeprank/generate/DataGenerator.py +++ b/deeprank/generate/DataGenerator.py @@ -289,9 +289,9 @@ def create_database( molgrp.attrs['type'] = 'molecule' # add the ref and the complex - self._add_pdb(molgrp, cplx, 'complex', self.align) + self._add_pdb(molgrp, cplx, 'complex') if ref is not None: - self._add_pdb(molgrp, ref, 'native', self.align) + self._add_pdb(molgrp, ref, 'native') if verbose: self.logger.info( @@ -399,7 +399,7 @@ def create_database( f' with {self.data_augmentation} times...') # loop over the complexes - for _, mol_aug_name in enumerate(mol_aug_name_list): + for mol_aug_name in mol_aug_name_list: # crete a subgroup for the molecule molgrp = self.f5.require_group(mol_aug_name) @@ -407,7 +407,7 @@ def create_database( # copy the ref into it if ref is not None: - self._add_pdb(molgrp, ref, 'native', self.align) + self._add_pdb(molgrp, ref, 'native') # get the rotation axis and angle if self.align is None: @@ -417,7 +417,7 @@ def create_database( self.align) # create the new pdb and get molecule center - # molecule center is the origin of rotation + # molecule center is the origin of rotation) mol_center = self._add_aug_pdb( molgrp, cplx, 'complex', axis, angle) @@ -1309,9 +1309,8 @@ def _compute_targets(targ_list, pdb_data, targrp): # ADD PDB FILE # # ==================================================================================== - - @staticmethod - def _add_pdb(molgrp, pdbfile, name, dict_align): + + def _add_pdb(self, molgrp, pdbfile, name): """Add a pdb to a molgrp. Args: @@ -1321,7 +1320,7 @@ def _add_pdb(molgrp, pdbfile, name, dict_align): """ # no alignement - if dict_align is None: + if self.align is None: # read the pdb and extract the ATOM lines with open(pdbfile, 'r') as fi: data = [line.split('\n')[0] @@ -1330,36 +1329,47 @@ def _add_pdb(molgrp, pdbfile, name, dict_align): # http://www.wwpdb.org/documentation/file-format # some alignement - elif isinstance(dict_align, dict): + elif isinstance(self.align, dict): - if 'selection' not in dict_align.keys(): - dict_align['selection'] = {} + sqldb = self._get_aligned_sqldb(pdbfile, self.align) + data = sqldb.sql2pdb() + + data = np.array(data).astype('|S78') + molgrp.create_dataset(name, data=data) - if 'export' not in dict_align.keys(): - dict_align['export'] = False + @staticmethod + def _get_aligned_sqldb(pdbfile, dict_align): + """return a sqldb of the pdb that is aligned as specified in the dict + + Arguments: + pdbfile {str} -- path ot the pdb + dict_align {dict} -- dictionanry of options to align the pdb + """ + if 'selection' not in dict_align.keys(): + dict_align['selection'] = {} - if dict_align['selection'] == 'interface': + if 'export' not in dict_align.keys(): + dict_align['export'] = False - if np.all([k in dict_align for k in ['chain1','chain2']]): - chains = {'chain1':dict_align['chain1'], - 'chain2':dict_align['chain2']} - else: - chains = {} - - sqldb = align_interface(pdbfile, plane=dict_align['plane'], - export = dict_align['export'], - **chains) + if dict_align['selection'] == 'interface': + if np.all([k in dict_align for k in ['chain1','chain2']]): + chains = {'chain1':dict_align['chain1'], + 'chain2':dict_align['chain2']} else: + chains = {} + + sqldb = align_interface(pdbfile, plane=dict_align['plane'], + export = dict_align['export'], + **chains) - sqldb = align_along_axis(pdbfile, axis=dict_align['axis'], - export = dict_align['export'], - **dict_align['selection']) - data = sqldb.sql2pdb() + else: - data = np.array(data).astype('|S78') - molgrp.create_dataset(name, data=data) + sqldb = align_along_axis(pdbfile, axis=dict_align['axis'], + export = dict_align['export'], + **dict_align['selection']) + return sqldb # ==================================================================================== # @@ -1381,8 +1391,8 @@ def _get_aligned_rotation_axis_angle(random_seed, dict_align): float: angle of rotation """ - if seed is not None: - np.random.seed(seed) + if random_seed is not None: + np.random.seed(random_seed) angle = 2 * np.pi * np.random.rand() @@ -1411,8 +1421,7 @@ def _get_aligned_rotation_axis_angle(random_seed, dict_align): return axis, angle # add a rotated pdb structure to the database - @staticmethod - def _add_aug_pdb(molgrp, pdbfile, name, axis, angle): + def _add_aug_pdb(self, molgrp, pdbfile, name, axis, angle): """Add augmented pdbs to the dataset. Args: @@ -1421,12 +1430,16 @@ def _add_aug_pdb(molgrp, pdbfile, name, axis, angle): name (str): name of the dataset axis (list(float)): axis of rotation angle (float): angle of rotation + dict_align (dict) : dict for alignement of the original pdb Returns: list(float): center of the molecule """ # create the sqldb and extract positions - sqldb = pdb2sql.pdb2sql(pdbfile) + if self.align is None: + sqldb = pdb2sql.pdb2sql(pdbfile) + else: + sqldb = self._get_aligned_sqldb(pdbfile, self.align) # rotate the positions pdb2sql.transform.rot_axis(sqldb, axis, angle) diff --git a/example/generate_dataset.py b/example/generate_dataset.py index 1b62583f..2b4a4f08 100644 --- a/example/generate_dataset.py +++ b/example/generate_dataset.py @@ -24,8 +24,8 @@ pdb_source=pdb_source, pdb_native=pdb_native, pssm_source=pssm_source, - align={"axis":'z'}, - data_augmentation=0, + align={"axis":'z','export':True}, + data_augmentation=2, compute_targets=[ 'deeprank.targets.dockQ', 'deeprank.targets.binary_class'], From 684ce11071010cee37a442968af5f1fd6054844f Mon Sep 17 00:00:00 2001 From: Nicolas Renaud Date: Wed, 25 Mar 2020 15:43:54 +0100 Subject: [PATCH 11/38] Update generate_dataset.py set export to False --- example/generate_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/generate_dataset.py b/example/generate_dataset.py index 2b4a4f08..3cf24189 100644 --- a/example/generate_dataset.py +++ b/example/generate_dataset.py @@ -24,7 +24,7 @@ pdb_source=pdb_source, pdb_native=pdb_native, pssm_source=pssm_source, - align={"axis":'z','export':True}, + align={"axis":'z','export':False}, data_augmentation=2, compute_targets=[ 'deeprank.targets.dockQ', From 4ff7834e6d74e3e656266695bfbe55d1d40e4505 Mon Sep 17 00:00:00 2001 From: LilySnow Date: Thu, 26 Mar 2020 20:10:59 +0100 Subject: [PATCH 12/38] call Rscript from plot_utils.py instead of r2py --- deeprank/utils/cal_hitrate_successrate.py | 108 +++++- deeprank/utils/plot_utils.py | 399 +++++++++++++--------- 2 files changed, 331 insertions(+), 176 deletions(-) diff --git a/deeprank/utils/cal_hitrate_successrate.py b/deeprank/utils/cal_hitrate_successrate.py index 171cf243..fbb6b80c 100644 --- a/deeprank/utils/cal_hitrate_successrate.py +++ b/deeprank/utils/cal_hitrate_successrate.py @@ -3,9 +3,63 @@ from deeprank.learn import rankingMetrics +def cal_hitrate_successrate(df): + """calculate the hit rate and success_rate of the different train/valid/test + sets with HS (haddock scores) + + The hit rate is defined as: + the percentage of positive decoys that are included among the top m decoys. + a positive decoy is a native-like one with a i-rmsd <= 4A + + Steps: + 0. Input data: + + label caseID modelID target DR HS + 0 Test 1AVX 1AVX_ranair-it0_5286 0 0.503823 6.980802 + 1 Test 1AVX 1AVX_ti5-itw_354w 1 0.502845 -95.158100 + 2 Test 1AVX 1AVX_ranair-it0_6223 0 0.511688 -11.961460 + + + 1. For each case, calculate hit rate and success. Success is a binary, indicating whether this case is success when evaluating its top N models. + + caseID success_DR hitRate_DR success_HS hitRate_HS + 1ZHI 1 0.1 0 0.01 + 1ZHI 1 0.2 1 0.3 + ... + + 1ACB 0 0 1 0.3 + 1ACB 1 0.2 1 0.4 + ... + 2. Calculate success rate and hit rate over all cases. + """ + + # -- 1. calculate success rate (SR) and hit rate (HR) + HR_SR_per_case = evaluate(df) + HR_SR_per_case = add_rank(HR_SR_per_case, groupby = ['label', 'caseID']) + HR_SR_per_case = add_perc(HR_SR_per_case, groupby = ['label', 'caseID']) + + HR_SR_ave = ave_evaluate(HR_SR_per_case) + HR_SR_ave = add_rank(HR_SR_ave, groupby = 'label') + + HR_SR_ave.label =pd.Categorical(HR_SR_ave.label, categories=["Train","Valid", "Test"]) + + # -- 2. write to tsv files + outFL1 = 'hitrate_successrate_per_case.tsv' + HR_SR_per_case.to_csv(outFL1, index = False, sep = '\t', float_format = '%.2f') + + outFL2 = 'hitrate_successrate_ave.tsv' + HR_SR_ave.to_csv(outFL2, index = False, sep="\t", float_format = '%.2f') + + print("") + print (f"{outFL1} generated") + print (f"{outFL2} generated") + + #-- 3. return value + return HR_SR_per_case, HR_SR_ave + def evaluate(data): - """Calculate hit rate and success. + """Calculate hit rate and success rate for each case. data: a data frame. @@ -112,12 +166,12 @@ def ave_evaluate(data): # l = 'Train', 'Test' or 'Valid' top_N = min(num_models[l]) - print(f"Calculate hitrate/successrate over {num_cases[l]} cases on top 1-{top_N} models.") + print(f"Calculate hitrate/successrate over {num_cases[l]} cases in {l} on top 1-{top_N} models.") perf_ave = pd.DataFrame() perf_ave['label'] = [l] * top_N - for col in perf.columns[perf.columns.str.contains('^(hitRate_|success)')]: + for col in perf.columns[perf.columns.str.contains('^(hitRate_|success|perc)', case= False)]: # col = 'success_HS', 'hitRate_HS', 'success_DR', 'hitRate_DR' perf_ave[col] = np.zeros(top_N) @@ -165,8 +219,10 @@ def count(df): return num_cases, num_models -def add_rank(df): +def add_rank(df, groupby = ['label', 'caseID']): """ + groupby (list or str): add rank by the specified column(s) -> ['label', 'caseID'] + INPUT (a data frame): label success_DR hitRate_DR success_HS hitRate_HS Test 0.0 0.000000 0.0 0.000000 @@ -177,6 +233,41 @@ def add_rank(df): OUTPUT: label success_DR hitRate_DR success_HS hitRate_HS rank + Test 0.0 0.000000 0.0 0.000000 1 + Test 0.0 0.000000 1.0 0.012821 2 + + Train 0.0 0.000000 1.0 0.012821 1 + Train 0.0 0.000000 1.0 0.025641 2 + """ + + # -- add the 'rank' column to df + frames = [] # dfs for train/valid/test, respectively + rank = [] + for _, df_per_grp in df.groupby(groupby): + num_mol = len(df_per_grp) + rank_raw = np.array(range(num_mol)) + 1 + tmp_df = df_per_grp.copy() + tmp_df['rank'] = rank_raw + frames.append(tmp_df) + + new_df = pd.concat(frames) + + return new_df + +def add_perc(df, groupby = ['label', 'caseID']): + """ + groupby (list or str): add perc by the specified column(s) -> ['label', 'caseID'] + + INPUT (a data frame): + label success_DR hitRate_DR success_HS hitRate_HS + Test 0.0 0.000000 0.0 0.000000 + Test 0.0 0.000000 1.0 0.012821 + + Train 0.0 0.000000 1.0 0.012821 + Train 0.0 0.000000 1.0 0.025641 + + OUTPUT: + label success_DR hitRate_DR success_HS hitRate_HS perc Test 0.0 0.000000 0.0 0.000000 0.000949 Test 0.0 0.000000 1.0 0.012821 0.001898 @@ -184,14 +275,13 @@ def add_rank(df): Train 0.0 0.000000 1.0 0.025641 0.003795 """ - # -- add the 'rank' column to df + # -- add the 'perc' column to df frames = [] # dfs for train/valid/test, respectively rank = [] - for _, df_per_label in df.groupby('label'): - num_mol = len(df_per_label) + for _, df_per_grp in df.groupby(groupby): + num_mol = len(df_per_grp) rank_raw = np.array(range(num_mol)) + 1 - tmp_df = df_per_label.copy() - tmp_df['rank'] = rank_raw + tmp_df = df_per_grp.copy() tmp_df['perc'] = rank_raw/num_mol frames.append(tmp_df) diff --git a/deeprank/utils/plot_utils.py b/deeprank/utils/plot_utils.py index 3cd0112e..a4d22510 100755 --- a/deeprank/utils/plot_utils.py +++ b/deeprank/utils/plot_utils.py @@ -17,16 +17,14 @@ import numpy as np import pandas as pd -import rpy2.robjects as ro import torch import torch.nn.functional as F -from cal_hitrate_successrate import add_rank, ave_evaluate, evaluate -from rpy2.rinterface import RRuntimeWarning -from rpy2.robjects import pandas2ri -from rpy2.robjects.lib.ggplot2 import * +from cal_hitrate_successrate import add_rank, add_perc, ave_evaluate, evaluate, cal_hitrate_successrate +import subprocess +from shlex import quote, split import pdb -warnings.filterwarnings("ignore", category=RRuntimeWarning) +#warnings.filterwarnings("ignore", category=RRuntimeWarning) USAGE = __doc__.format(sys.argv[0]) @@ -39,52 +37,6 @@ def zip_equal(*iterables): yield combo -def plot_boxplot(df, figname=None, inverse=False): - """Plot a boxplot of predictions vs. targets. Useful to visualize the - performance of the training algorithm. This is only useful in - classification tasks. - - INPUT (pd.DataFrame): - - label modelID target DR sourceFL - Test 1AVX_ranair-it0_5286 0 0.503823 /home/lixue/DBs/BM5-haddock24/hdf5/000_1AVX.hdf5 - Test 1AVX_ti5-itw_354w 1 0.502845 /home/lixue/DBs/BM5-haddock24/hdf5/000_1AVX.hdf5 - Test 1AVX_ranair-it0_6223 0 0.511688 /home/lixue/DBs/BM5-haddock24/hdf5/000_1AVX.hdf5 - """ - - print('\n --> Box Plot : ', figname, '\n') - - data = df - - font_size = 20 - # line = "#1F3552" - - text_style = element_text(size=font_size, family="Tahoma", face="bold") - - colormap_raw = [['0', 'ivory3'], - ['1', 'steelblue']] - - colormap = ro.StrVector([elt[1] for elt in colormap_raw]) - colormap.names = ro.StrVector([elt[0] for elt in colormap_raw]) - - p = ggplot(data) + \ - aes_string(x='target', y='DR', fill='target') + \ - geom_boxplot(width=0.2, alpha=0.7) + \ - facet_grid(ro.Formula('.~label')) +\ - scale_fill_manual(values=colormap) + \ - theme_bw() +\ - theme(**{'plot.title': text_style, - 'text': text_style, - 'axis.title': text_style, - 'axis.text.x': element_text(size=font_size), - 'legend.position': 'right'}) +\ - scale_x_discrete(name="Target") - - # p.plot() - ggplot2.ggsave(figname, dpi=100) - return p - - def read_epoch_data(DR_h5FL, epoch): """# read epoch data into a data frame. @@ -274,52 +226,29 @@ def plot_HS_iRMSD(df, figname=None): return p -def plot_successRate_hitRate(df, figname=None, inverse=False): - """Plot the hit rate and success_rate of the different training/valid/test - sets with HS (haddock scores) - - The hit rate is defined as: - the percentage of positive decoys that are included among the top m decoys. - a positive decoy is a native-like one with a i-rmsd <= 4A - - Args: - DR_h5FL (str): the hdf5 file generated by DeepRank. - HS_h5FL (str): the hdf5 file that saves data from haddock *.stats files - figname (str): filename for the plot - perc (binary): True -> top N% (over all models); False -> top N (up to minimum # of models of all cases) - - Steps: - 0. Input data: - - label caseID modelID target DR HS - 0 Test 1AVX 1AVX_ranair-it0_5286 0 0.503823 6.980802 - 1 Test 1AVX 1AVX_ti5-itw_354w 1 0.502845 -95.158100 - 2 Test 1AVX 1AVX_ranair-it0_6223 0 0.511688 -11.961460 +def plot_boxplot(dataFL, figname=None): + """Plot a boxplot of predictions vs. targets. Useful to visualize the + performance of the training algorithm. This is only useful in + classification tasks. + INPUT (pd.DataFrame): - 1. For each case, calculate hit rate and success. Success is a binary, indicating whether this case is success when evaluating its top N models. + label caseID modelID target DR HS + Test 1YVB 1YVB_ranair-it0_4286 0 0.56818 4.04629 + Test 1PPE 1PPE_ranair-it0_2999 0 0.56486 50.17506 + """ - caseID success_DR hitRate_DR success_HS hitRate_HS - 1ZHI 1 0.1 0 0.01 - 1ZHI 1 0.2 1 0.3 - ... + print('\n --> Box Plot : ', figname, '\n') - 1ACB 0 0 1 0.3 - 1ACB 1 0.2 1 0.4 - ... - 2. Calculate success rate and hit rate over all cases. - """ + command = f'Rscript boxplot.R {dataFL} {figname}' + print(command) + command = split(command) + subprocess.check_call(command) - # -- 1. calculate success rate and hit rate - performance_per_case = evaluate(df) - performance_ave = ave_evaluate(performance_per_case) - performance_ave = add_rank(performance_ave) - # -- 2. plot - plot_evaluation(performance_ave, figname) +def plot_successRate_hitRate(df, figname): -def plot_evaluation(df, figname): ''' INPUT: label success_DR hitRate_DR success_HS hitRate_HS rank perc @@ -336,24 +265,18 @@ def plot_evaluation(df, figname): # ---------- hit rate plot ------- figname1 = figname + '.hitRate.png' - print(f'\n --> Hit Rate plot:', figname1, '\n') - hit_rate_plot(df) - ggplot2.ggsave(figname1, height=7*3, width=7 * 1.2*3, dpi=50) + hit_rate_plot(df, figname = figname1) # ---------- success rate plot ------- figname2 = figname + '.successRate.png' - print(f'\n --> Success Rate plot:', figname2, '\n') + success_rate_plot(df, figname = figname2) - success_rate_plot(df) - ggplot2.ggsave(figname2, height=7*3, width=7 * 1.2*3, dpi=50) - - -def hit_rate_plot(df, sep = True): +def hit_rate_plot(df, figname ='hitrate.png'): ''' - sep: True -> plot train/valid/test in 3 panels. False -> all in one panel. + plot train/valid/test in 3 panels. - INPUT: - label success_DR hitRate_DR success_HS hitRate_HS rank perc + input: + label success_dr hitRate_dr success_hs hitRate_hs rank perc Test 0.0 0.000000 0.0 0.000000 1 0.000949 Test 0.0 0.000000 1.0 0.012821 2 0.001898 @@ -362,41 +285,31 @@ def hit_rate_plot(df, sep = True): ''' + print(f'\n --> hit rate plot:', figname, '\n') + # -- melt df df_melt = pd.melt(df, id_vars=['label', 'rank']) - idx1 = df_melt.variable.str.contains('^hitRate') + idx1 = df_melt.variable.str.contains('^hitRate', case = False) df_tmp = df_melt.loc[idx1, :].copy() df_tmp.columns = ['label', 'rank', 'Methods', 'hit_rate'] tmp = list(df_tmp['Methods']) df_tmp.loc[:, 'Methods'] = [ - re.sub('hitRate_', '', x) for x in tmp] # success_DR -> DR + re.sub('hitrate_', '', x) for x in tmp] # success_dr -> dr - font_size = 40 - breaks = pd.to_numeric(np.arange(0, 1.01, 0.25)) - #xlabels = list(map(lambda x: str('%d' % (x * 100)) +' % ', np.arange(0, 1.01, 0.25))) - text_style = element_text(size=font_size, family="Tahoma", face="bold") + #-- write to tsv file + dataFL = 'hitrate_melted.tsv' + df_tmp.to_csv(dataFL, sep='\t', index = False) + print(f'{dataFL} generated') - p = ggplot(df_tmp) + \ - aes_string(x='rank', y='hit_rate', color='label', linetype='Methods') + \ - facet_grid(ro.Formula('label ~.')) +\ - geom_line(size=1) + \ - labs(**{'x': 'Top N models', 'y': 'Hit Rate'}) + \ - theme_bw() + \ - theme(**{ - 'legend.position': 'right', - 'plot.title': text_style, - 'text': text_style, - 'axis.text.x': element_text(size=font_size), - 'axis.text.y': element_text(size=font_size)}) +\ - labs(**{'colour': "Sets"}) #change legend title to 'Sets' - - # scale_x_continuous(**{'breaks': breaks, 'labels': xlabels}) - - return p + #-- plot + command = f'Rscript hitrate_plot.R {dataFL} {figname}' + print(command) + command = split(command) + subprocess.check_call(command) -def success_rate_plot(df): +def success_rate_plot(df, figname): """ INPUT: a pandas data frame label success_DR hitRate_DR success_HS hitRate_HS rank perc @@ -408,9 +321,11 @@ def success_rate_plot(df): """ + print(f'\n --> Success Rate plot:', figname, '\n') + # -- melt df df_melt = pd.melt(df, id_vars=['label', 'rank']) - idx1 = df_melt.variable.str.contains('^success_') + idx1 = df_melt.variable.str.contains('^success_', case = False) df_tmp = df_melt.loc[idx1, :].copy() df_tmp.columns = ['label', 'rank', 'Methods', 'success_rate'] @@ -418,29 +333,16 @@ def success_rate_plot(df): df_tmp.loc[:, 'Methods'] = [ re.sub('success_', '', x) for x in tmp] # success_DR -> DR - font_size = 40 -# breaks = pd.to_numeric(np.arange(0, 1.01, 0.25)) -# xlabels = list(map(lambda x: str('%d' % (x * 100)) + -# ' % ', np.arange(0, 1.01, 0.25))) - text_style = element_text(size=font_size, family="Tahoma", face="bold") - - p = ggplot(df_tmp) + \ - aes_string(x='rank', y='success_rate', color='label', linetype='Methods') + \ - facet_grid(ro.Formula('label ~.')) +\ - geom_line(size=1) + \ - labs(**{'x': 'Top N models', 'y': 'Success Rate'}) + \ - theme_bw() + \ - theme(**{'legend.position': 'right', - 'plot.title': text_style, - 'text': text_style, - 'axis.text.x': element_text(size=font_size), - 'axis.text.y': element_text(size=font_size)}) +\ - labs(**{'colour': "Sets"}) #change legend title to 'Sets' -# scale_x_continuous(**{'breaks': breaks, 'labels': xlabels}) - -# p.plot() - return p + # -- write to tsv file + dataFL = 'successrate_melted.tsv' + df_tmp.to_csv(dataFL, sep = '\t', index = False) + print(f'{dataFL} generated') + #-- plot + command = f'Rscript successrate_plot.R {dataFL} {figname}' + print(command) + command = split(command) + subprocess.check_call(command) def get_irmsd(source_hdf5, modelIDs): @@ -484,7 +386,7 @@ def filter_models(df, label = 'Test', scenario = 'ranair'): print (f"-> Keep models for {scenario} in the {label} set") idx1 = df.label == label - idx2 = df.modelID.str.contains( scenario ) + idx2 = df.modelID.str.contains( scenario , case = False) throw = idx1 & ~idx2 df = df[~throw] return df @@ -621,8 +523,11 @@ def hit_statistics(df): Test 1AVX_ti5-itw_354w 1 0.502845 /home/lixue/DBs/BM5-haddock24/hdf5/000_1AVX.hdf5 3.668682 """ + df['target'] = df['target'].astype(int) + grouped = df.groupby('label') + # # -- 1. count num_hit based on i-rmsd # num_hits = grouped['irmsd'].apply(lambda x: len(x[x <= 4])) # num_models = grouped.apply(len) @@ -634,7 +539,7 @@ def hit_statistics(df): # print("") # -- 2. count num_hit based on the 'target' column - num_hits = grouped['target'].apply(lambda x: len(x[x == '1'])) + num_hits = grouped['target'].apply(lambda x: len(x[x == 1])) num_models = grouped.apply(len) for label, _ in grouped: @@ -647,7 +552,7 @@ def hit_statistics(df): df_tmp['caseID'] = df['modelID'].apply(get_caseID) grp1 = df_tmp.groupby(['label', 'caseID']) - num_hits = grp1['target'].apply(lambda x: sum(x.astype(int))) # the number of hits for each case + num_hits = grp1['target'].apply(lambda x: sum(x)) # the number of hits for each case grp2 = num_hits.groupby('label') num_cases_total = grp2.apply(len) num_cases_wo_hit = grp2.apply(lambda x: len(x[x == 0])) @@ -685,38 +590,198 @@ def get_caseID(modelID): caseID = tmp[0] return caseID +#------------------------------------- +#--- BEGIN: functions not used ------- + +def plot_boxplot_r2py(df, figname=None, inverse=False): + """Plot a boxplot of predictions vs. targets. Useful to visualize the + performance of the training algorithm. This is only useful in + classification tasks. + + INPUT (pd.DataFrame): + + label modelID target DR sourceFL + Test 1AVX_ranair-it0_5286 0 0.503823 /home/lixue/DBs/BM5-haddock24/hdf5/000_1AVX.hdf5 + Test 1AVX_ti5-itw_354w 1 0.502845 /home/lixue/DBs/BM5-haddock24/hdf5/000_1AVX.hdf5 + Test 1AVX_ranair-it0_6223 0 0.511688 /home/lixue/DBs/BM5-haddock24/hdf5/000_1AVX.hdf5 + """ + + pandas2ri.activate() + + print('\n --> Box Plot : ', figname, '\n') + + data = df + + font_size = 20 + # line = "#1F3552" + + text_style = element_text(size=font_size, family="Tahoma", face="bold") + + colormap_raw = [['0', 'ivory3'], + ['1', 'steelblue']] + + colormap = ro.StrVector([elt[1] for elt in colormap_raw]) + colormap.names = ro.StrVector([elt[0] for elt in colormap_raw]) -def main(HS_h5FL='/home/lixue/DBs/BM5-haddock24/stats/stats.h5'): # on alembick -#def main(HS_h5FL='/projects/0/deeprank/BM5/docked_models/stats.h5'): # on cartesius + p = ggplot(data) + \ + aes_string(x='target', y='DR', fill='target') + \ + geom_boxplot(width=0.2, alpha=0.7) + \ + facet_grid(ro.Formula('.~label')) +\ + scale_fill_manual(values=colormap) + \ + theme_bw() +\ + theme(**{'plot.title': text_style, + 'text': text_style, + 'axis.title': text_style, + 'axis.text.x': element_text(size=font_size), + 'legend.position': 'right'}) +\ + scale_x_discrete(name="Target") + + # p.plot() + ggplot2.ggsave(figname, dpi=100) + return p + + + +def hit_rate_plot_r2py(df, figname ='hitrate.png'): + ''' + plot train/valid/test in 3 panels. + + input: + label success_dr hitrate_dr success_hs hitrate_hs rank perc + test 0.0 0.000000 0.0 0.000000 1 0.000949 + test 0.0 0.000000 1.0 0.012821 2 0.001898 + + train 0.0 0.000000 1.0 0.012821 1 0.002846 + train 0.0 0.000000 1.0 0.025641 2 0.003795 + + ''' + pandas2ri.activate() + print(f'\n --> hit rate plot:', figname, '\n') + + # -- melt df + df_melt = pd.melt(df, id_vars=['label', 'rank']) + idx1 = df_melt.variable.str.contains('^hitrate', case = False) + df_tmp = df_melt.loc[idx1, :].copy() + df_tmp.columns = ['label', 'rank', 'methods', 'hit_rate'] + + tmp = list(df_tmp['methods']) + df_tmp.loc[:, 'methods'] = [ + re.sub('hitrate_', '', x) for x in tmp] # success_dr -> dr + + font_size = 40 + breaks = pd.to_numeric(np.arange(0, 1.01, 0.25)) + #xlabels = list(map(lambda x: str('%d' % (x * 100)) +' % ', np.arange(0, 1.01, 0.25))) + text_style = element_text(size=font_size, family="tahoma", face="bold") + + df_tmp.to_csv('hitrate_melted.tsv', sep='\t', index = false) + print('hitrate_melted.tsv generated') + p = ggplot(df_tmp) + \ + aes_string(x='rank', y='hit_rate', color='label', linetype='methods') + \ + facet_grid(ro.formula("label~.")) +\ + geom_line(size=1) + \ + labs(**{'x': 'top n models', 'y': 'hit rate'}) + \ + theme_bw() + \ + theme(**{ + 'legend.position': 'right', + 'plot.title': text_style, + 'text': text_style, + 'axis.text.x': element_text(size=font_size), + 'axis.text.y': element_text(size=font_size)}) +\ + labs(**{'colour': "sets"}) #change legend title to 'sets' + + # scale_x_continuous(**{'breaks': breaks, 'labels': xlabels}) + + ggplot2.ggsave(figname, height=7*3, width=7 * 1.2*3, dpi=50) + +def success_rate_plot_r2py(df): + """ + INPUT: a pandas data frame + label success_DR hitRate_DR success_HS hitRate_HS rank perc + Test 0.0 0.000000 0.0 0.000000 1 0.000949 + Test 0.0 0.000000 1.0 0.012821 2 0.001898 + + Train 0.0 0.000000 1.0 0.012821 1 0.002846 + Train 0.0 0.000000 1.0 0.025641 2 0.003795 + + """ + + pandas2ri.activate() + + # -- melt df + df_melt = pd.melt(df, id_vars=['label', 'rank']) + idx1 = df_melt.variable.str.contains('^success_', case = False) + df_tmp = df_melt.loc[idx1, :].copy() + df_tmp.columns = ['label', 'rank', 'Methods', 'success_rate'] + + tmp = list(df_tmp['Methods']) + df_tmp.loc[:, 'Methods'] = [ + re.sub('success_', '', x) for x in tmp] # success_DR -> DR + + font_size = 40 +# breaks = pd.to_numeric(np.arange(0, 1.01, 0.25)) +# xlabels = list(map(lambda x: str('%d' % (x * 100)) + +# ' % ', np.arange(0, 1.01, 0.25))) + text_style = element_text(size=font_size, family="Tahoma", face="bold") + + df_tmp.to_csv('successrate_melted.tsv', sep='\t', index = False) + print('successrate_melted.tsv generated') + + p = ggplot(df_tmp) + \ + aes_string(x='rank', y='success_rate', color='label', linetype='Methods') + \ + facet_grid(ro.Formula('label ~.')) +\ + geom_line(size=1) + \ + labs(**{'x': 'Top N models', 'y': 'Success Rate'}) + \ + theme_bw() + \ + theme(**{'legend.position': 'right', + 'plot.title': text_style, + 'text': text_style, + 'axis.text.x': element_text(size=font_size), + 'axis.text.y': element_text(size=font_size)}) +\ + labs(**{'colour': "Sets"}) #change legend title to 'Sets' +# scale_x_continuous(**{'breaks': breaks, 'labels': xlabels}) + + ggplot2.ggsave(figname, height=7*3, width=7 * 1.2*3, dpi=50) + +#----------------------------------- +#--- END: functions not used ------- + + +#def main(HS_h5FL='/home/lixue/DBs/BM5-haddock24/stats/stats.h5'): # on alembick +def main(HS_h5FL='/projects/0/deeprank/BM5/docked_models/stats.h5'): # on cartesius if len(sys.argv) != 5: sys.exit(USAGE) # the output h5 file from deeprank: 'epoch_data.hdf5' deeprank_h5FL = sys.argv[1] epoch = int(sys.argv[2]) # 9 - scenario = sys.argv[3] # cm, ranair, refb, ti5, ti + scenario = sys.argv[3] # cm, ranair, refb, ti5, ti, or it0, it1, itw or other patterns in the modelID figname = sys.argv[4] + #-- read deeprank.hdf5 and HS.hdf5 to a pandas df df = prepare_df(deeprank_h5FL, HS_h5FL, epoch, scenario) + rawdataFL=f'{scenario}.rawdata.tsv' + df.to_csv(rawdataFL, sep = '\t', index = False, float_format = '%.5f') + print(f'{rawdataFL} generated.\n') # -- report the number of hits for train/valid/test hit_statistics(df) - #-- plot - pandas2ri.activate() -#-- -#Note: plot_HS_iRMSD and plot_DR_iRMSD disabled due to the long running time. -# plot_HS_iRMSD(df, figname=f"{figname}.epo{epoch}.{scenario}.irsmd_HS.png") -# plot_DR_iRMSD(df, figname=f"{figname}.epo{epoch}.{scenario}.irsmd_HS.png") -#-- - plot_boxplot(df, figname=f"{figname}.epo{epoch}.{scenario}.boxplot.png", inverse = False) - plot_successRate_hitRate(df[['label', + #-- calculate hit rate and success rate + hitrate_successrate_per_case, hitrate_successrate_df = cal_hitrate_successrate(df[['label', 'caseID', 'modelID', 'target', 'DR', - 'HS']].copy(), - figname=f"{figname}.epo{epoch}.{scenario}", - inverse=False) + 'HS']].copy()) + #-- plot + plot_successRate_hitRate(hitrate_successrate_df, figname=f"{figname}.epo{epoch}.{scenario}") + plot_boxplot(rawdataFL, figname=f"{figname}.epo{epoch}.{scenario}.boxplot.png") + + #-- + #Note: plot_HS_iRMSD and plot_DR_iRMSD disabled due to the long running time. + # plot_HS_iRMSD(df, figname=f"{figname}.epo{epoch}.{scenario}.irsmd_HS.png") + # plot_DR_iRMSD(df, figname=f"{figname}.epo{epoch}.{scenario}.irsmd_HS.png") + #-- + if __name__ == '__main__': From 5c33022f4a512fb6776c63bf3d7a081bd4ec1ef6 Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Tue, 31 Mar 2020 10:46:06 +0200 Subject: [PATCH 13/38] change install for new pdb2sql --- .travis.yml | 7 ------- setup.py | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index 91e8e086..d387d189 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,13 +22,6 @@ before_install: # install openmpi for mpi4py - sudo apt-get install libopenmpi-dev openmpi-bin - # pdb2sql align branch - # to be removed when align is merged - - git clone -b align --single-branch https://github.com/DeepRank/pdb2sql.git - - cd pdb2sql - - pip install -e ./ - - cd ../ - install: - pip install -qe .[test] diff --git a/setup.py b/setup.py index 42fd4cb8..a8bd4100 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,7 @@ 'matplotlib', 'torchsummary', 'torch < 1.4.0', - 'pdb2sql >= 0.2.1', + 'pdb2sql >= 0.3.0', 'freesasa==2.0.3.post7;platform_system=="Linux"', 'freesasa==2.0.3.post6;platform_system=="Darwin"' ], From 6c77a0ad41239e7753b76c9ec304accebdc1127d Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Tue, 31 Mar 2020 10:53:21 +0200 Subject: [PATCH 14/38] syntax --- deeprank/generate/DataGenerator.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py index 45de2cca..c8cb1799 100644 --- a/deeprank/generate/DataGenerator.py +++ b/deeprank/generate/DataGenerator.py @@ -79,7 +79,7 @@ def __init__(self, pdb_select=None, pdb_source=None, self.pdb_select = pdb_select or [] self.pdb_source = pdb_source or [] self.pdb_native = pdb_native or [] - self.align = align or None + self.align = align if pssm_source is not None: config.PATH_PSSM_SOURCE = pssm_source @@ -1325,8 +1325,6 @@ def _add_pdb(self, molgrp, pdbfile, name): with open(pdbfile, 'r') as fi: data = [line.split('\n')[0] for line in fi if line.startswith('ATOM')] - # PDB default line length is 80 - # http://www.wwpdb.org/documentation/file-format # some alignement elif isinstance(self.align, dict): @@ -1334,6 +1332,8 @@ def _add_pdb(self, molgrp, pdbfile, name): sqldb = self._get_aligned_sqldb(pdbfile, self.align) data = sqldb.sql2pdb() + # PDB default line length is 80 + # http://www.wwpdb.org/documentation/file-format data = np.array(data).astype('|S78') molgrp.create_dataset(name, data=data) @@ -1353,14 +1353,14 @@ def _get_aligned_sqldb(pdbfile, dict_align): if dict_align['selection'] == 'interface': - if np.all([k in dict_align for k in ['chain1','chain2']]): - chains = {'chain1':dict_align['chain1'], - 'chain2':dict_align['chain2']} + if np.all([k in dict_align for k in ['chain1', 'chain2']]): + chains = {'chain1' : dict_align['chain1'], + 'chain2' : dict_align['chain2']} else: chains = {} sqldb = align_interface(pdbfile, plane=dict_align['plane'], - export = dict_align['export'], + export=dict_align['export'], **chains) else: From fd01c21cabd3c67908f4fda4c5d979c8272fbb0c Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Tue, 31 Mar 2020 11:13:47 +0200 Subject: [PATCH 15/38] align in doc --- docs/deeprank.generate.rst | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/docs/deeprank.generate.rst b/docs/deeprank.generate.rst index b5995baa..d3e8f4bc 100644 --- a/docs/deeprank.generate.rst +++ b/docs/deeprank.generate.rst @@ -66,6 +66,37 @@ Example: The details of the different submodule are listed here. The only module that really needs to be used is ``DataGenerator`` and ``NormalizeData``. The ``GridTools`` class should not be directly used by inexperienced users. + +Structure Alignement +---------------------------------------- + +All the complexes contained in the dataset can be aligned similarly to facilitate and improve the training of the model. This can easily be done using the `align` option of the `DataGenerator` for example to align all the complexe along the 'z' direction one can use: + +>>> database = DataGenerator(pdb_source=pdb_source, pdb_native=pdb_native, pssm_source=pssm_source, +>>> align={"axis":'z'}, data_augmentation=2, +>>> compute_targets=[ ... ], compute_features=[ ... ], ... ) + + +Other options are possbile, for example if you would like to have the alignement done only using a subpart of the complex, say the chains A and B you can use : + +>>> database = DataGenerator(pdb_source=pdb_source, pdb_native=pdb_native, pssm_source=pssm_source, +>>> align={"axis":'z', "selection": {"chainID":["A","B"]} }, data_augmentation=2, +>>> compute_targets=[ ... ], compute_features=[ ... ], ... ) + +All the selection offered by `pdb2sql` can be used in the `align` dictionnary e.g. : "resId":[1,2,3], "resName":['VAL','LEU'], ... Only the atoms selected will be aligned in the give direction. + +You can also try to align the interface between two chains in a given plane. This can be done using : + +>>> database = DataGenerator(pdb_source=pdb_source, pdb_native=pdb_native, pssm_source=pssm_source, +>>> align={"plane":'xy', "selection":"interface"}, data_augmentation=2, +>>> compute_targets=[ ... ], compute_features=[ ... ], ... ) + +which by default will use the interface between the first two chains. If you have more than two chains in the complex and want to specify wich chains are forming the interface to be aligned you can use : + +>>> database = DataGenerator(pdb_source=pdb_source, pdb_native=pdb_native, pssm_source=pssm_source, +>>> align={"plane":'xy', "selection":"interface", "chain1":'A', "chain2":'C'}, data_augmentation=2, +>>> compute_targets=[ ... ], compute_features=[ ... ], ... ) + DataGenerator ---------------------------------------- From 31a611010cefb2508feef28abe6b48ca3e893936 Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Tue, 31 Mar 2020 11:25:43 +0200 Subject: [PATCH 16/38] added test --- example/generate_dataset.py | 2 +- test/test_generate.py | 65 ++++++++++++++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/example/generate_dataset.py b/example/generate_dataset.py index 3cf24189..7eea9c82 100644 --- a/example/generate_dataset.py +++ b/example/generate_dataset.py @@ -24,7 +24,7 @@ pdb_source=pdb_source, pdb_native=pdb_native, pssm_source=pssm_source, - align={"axis":'z','export':False}, + align={"axis":'z'}, data_augmentation=2, compute_targets=[ 'deeprank.targets.dockQ', diff --git a/test/test_generate.py b/test/test_generate.py index 323faa41..bbe3f70d 100644 --- a/test/test_generate.py +++ b/test/test_generate.py @@ -183,9 +183,72 @@ def test_4_add_feature(self): print(' ' * 25 + '--> Done in %f s.' % (time() - t0)) + def test_5_align(self): + """create a database where all the complex are aligned in the z direction.""" + + # clean old files + files = [ + '1ak4_aligned.hdf5', + '1ak4_aligned_norm.pckl'] + + for f in files: + if os.path.isfile(f): + os.remove(f) + + database = DataGenerator( + pdb_source='./1AK4/decoys/', + pdb_native=self.pdb_native, + pssm_source='./1AK4/pssm_new/', + align={"axis":'z'}, + data_augmentation=1, + compute_targets=['deeprank.targets.dockQ'], + compute_features=['deeprank.features.AtomicFeature'], + hdf5='./1ak4_aligned.hdf5') + + # create the database + if not os.path.isfile(database.hdf5): + t0 = time() + print('{:25s}'.format('Create new database') + database.hdf5) + database.create_database(prog_bar=True, random_seed=2019) + print(' ' * 25 + '--> Done in %f s.' % (time() - t0)) + else: + print('{:25s}'.format('Use existing database') + database.hdf5) + + def test_6_align_interface(self): + """create a database where all the interface are aligned in the xy plane.""" + + # clean old files + files = [ + '1ak4_aligned_interface.hdf5', + '1ak4_aligned_interface_norm.pckl'] + + for f in files: + if os.path.isfile(f): + os.remove(f) + + database = DataGenerator( + pdb_source='./1AK4/decoys/', + pdb_native=self.pdb_native, + pssm_source='./1AK4/pssm_new/', + align={"plane":'xy', "selection":'interface'}, + data_augmentation=1, + compute_targets=['deeprank.targets.dockQ'], + compute_features=['deeprank.features.AtomicFeature], + hdf5='./1ak4_aligned_interface.hdf5') + + # create the database + if not os.path.isfile(database.hdf5): + t0 = time() + print('{:25s}'.format('Create new database') + database.hdf5) + database.create_database(prog_bar=True, random_seed=2019) + print(' ' * 25 + '--> Done in %f s.' % (time() - t0)) + else: + print('{:25s}'.format('Use existing database') + database.hdf5) + + if __name__ == "__main__": - # unittest.main() + # unittest.main() inst = TestGenerateData() inst.test_1_generate() inst.test_1_generate_mapfly() From 7504da7d1feaf1d8fde615252a92560ce0628805 Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Tue, 31 Mar 2020 11:40:42 +0200 Subject: [PATCH 17/38] fix typo --- test/test_generate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_generate.py b/test/test_generate.py index bbe3f70d..fa63f135 100644 --- a/test/test_generate.py +++ b/test/test_generate.py @@ -233,7 +233,7 @@ def test_6_align_interface(self): align={"plane":'xy', "selection":'interface'}, data_augmentation=1, compute_targets=['deeprank.targets.dockQ'], - compute_features=['deeprank.features.AtomicFeature], + compute_features=['deeprank.features.AtomicFeature'], hdf5='./1ak4_aligned_interface.hdf5') # create the database From 5f68419ff832dd0b0555a17e78d3b96ffc5030d5 Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Tue, 31 Mar 2020 16:25:13 +0200 Subject: [PATCH 18/38] fix code reviews --- deeprank/generate/DataGenerator.py | 3 +++ test/test_generate.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py index c8cb1799..f4596cf2 100644 --- a/deeprank/generate/DataGenerator.py +++ b/deeprank/generate/DataGenerator.py @@ -46,6 +46,9 @@ def __init__(self, pdb_select=None, pdb_source=None, pdb_native (list(str), optional): List of folders where to find the native comformations, nust set it if having targets to compute in parameter "compute_targets". pssm_source (list(str), optional): List of folders where to find the PSSM files + align (dict, optional): Dicitionary to align the compexes, + e.g. align = {"selection":{"chainID":["A","B"]},"axis":"z"}} + if "selection" is not specified the entire complex is used for alignement compute_targets (list(str), optional): List of python files computing the targets, "pdb_native" must be set if having targets to compute. compute_features (list(str), optional): List of python files computing the features diff --git a/test/test_generate.py b/test/test_generate.py index fa63f135..3f1d8d0f 100644 --- a/test/test_generate.py +++ b/test/test_generate.py @@ -254,3 +254,5 @@ def test_6_align_interface(self): inst.test_1_generate_mapfly() inst.test_3_add_unique_target() inst.test_4_add_feature() + inst.test_5_align() + inst.test_6_align_interface() \ No newline at end of file From 5a90e9c4614beaf805ce7a110ff9f541405781ee Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Tue, 31 Mar 2020 17:03:03 +0200 Subject: [PATCH 19/38] doc for itnerface alignement --- deeprank/generate/DataGenerator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py index f4596cf2..6de5bb67 100644 --- a/deeprank/generate/DataGenerator.py +++ b/deeprank/generate/DataGenerator.py @@ -48,6 +48,7 @@ def __init__(self, pdb_select=None, pdb_source=None, pssm_source (list(str), optional): List of folders where to find the PSSM files align (dict, optional): Dicitionary to align the compexes, e.g. align = {"selection":{"chainID":["A","B"]},"axis":"z"}} + e.g. align = {"selection":"interface","plane":"xy"} if "selection" is not specified the entire complex is used for alignement compute_targets (list(str), optional): List of python files computing the targets, "pdb_native" must be set if having targets to compute. From 7d7f5fcbe285a69d35f1148c56b0c477401172ab Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Tue, 31 Mar 2020 17:48:32 +0200 Subject: [PATCH 20/38] versioning --- deeprank/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeprank/__version__.py b/deeprank/__version__.py index 99c4176c..541f859d 100644 --- a/deeprank/__version__.py +++ b/deeprank/__version__.py @@ -1 +1 @@ -__version__ = '0.0.1' \ No newline at end of file +__version__ = '0.1.0' \ No newline at end of file From 4a19926742c5311065aa21520de095cb6c40a3fb Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Tue, 31 Mar 2020 18:01:45 +0200 Subject: [PATCH 21/38] fixed setup --- setup.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index a8bd4100..e158c47d 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,8 @@ version=version['__version__'], description='Rank Protein-Protein Interactions using Deep Learning', long_description=readme + '\n\n', - author='Nicolas Renaud et al.', + long_description_content_type='text/markdown', + author='Nicolas Renaud, CunLiang Geng Sonja Georgrievska, Li Xue', url='https://github.com/DeepRank/deeprank', project_urls={ 'Source Code': 'https://github.com/DeepRank/deeprank', @@ -32,8 +33,6 @@ classifiers=[ 'Development Status :: 3 - Alpha', 'Intended Audience :: Science/Research', - 'Intended Audience :: Education', - 'Intended Audience:: Developers', 'License :: OSI Approved :: Apache Software License', 'Natural Language :: English', 'Programming Language :: Python :: 3.7', From 22fdb4fee099bcdcd0b471770297ccad34ad0656 Mon Sep 17 00:00:00 2001 From: Nicolas Renaud Date: Tue, 31 Mar 2020 18:10:23 +0200 Subject: [PATCH 22/38] add doi --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ffba5078..1a97d586 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![Build Status](https://secure.travis-ci.org/DeepRank/deeprank.svg?branch=master)](https://travis-ci.org/DeepRank/deeprank) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/9252e59633cf46a7ada0c3c614c175ea)](https://www.codacy.com/app/NicoRenaud/deeprank?utm_source=github.com&utm_medium=referral&utm_content=DeepRank/deeprank&utm_campaign=Badge_Grade) [![Coverage Status](https://coveralls.io/repos/github/DeepRank/deeprank/badge.svg?branch=master)](https://coveralls.io/github/DeepRank/deeprank?branch=master) -[![Documentation Status](https://readthedocs.org/projects/deeprank/badge/?version=latest)](http://deeprank.readthedocs.io/?badge=latest) +[![Documentation Status](https://readthedocs.org/projects/deeprank/badge/?version=latest)](http://deeprank.readthedocs.io/?badge=latest) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3735042.svg)](https://doi.org/10.5281/zenodo.3735042) The documentation of the module can be found on readthedocs : From 243a48ddad58749952736167e8f1a81f43c34266 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Wed, 1 Apr 2020 08:12:50 +0000 Subject: [PATCH 23/38] Bump freesasa from 2.0.3.post7 to 2.0.5 Bumps [freesasa](https://github.com/mittinatten/freesasa) from 2.0.3.post7 to 2.0.5. - [Release notes](https://github.com/mittinatten/freesasa/releases) - [Changelog](https://github.com/mittinatten/freesasa/blob/master/CHANGELOG.md) - [Commits](https://github.com/mittinatten/freesasa/commits) Signed-off-by: dependabot-preview[bot] --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e158c47d..2851fdf4 100644 --- a/setup.py +++ b/setup.py @@ -49,7 +49,7 @@ 'torch < 1.4.0', 'pdb2sql >= 0.3.0', 'freesasa==2.0.3.post7;platform_system=="Linux"', - 'freesasa==2.0.3.post6;platform_system=="Darwin"' + 'freesasa==2.0.5;platform_system=="Darwin"' ], extras_require={ 'test': ['nose', 'coverage', 'pytest', 'pytest-cov', From cc4ce736b7866ca012cbaba25cc558ccf0570c03 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Wed, 1 Apr 2020 11:54:57 +0000 Subject: [PATCH 24/38] Update torch requirement from <1.4.0 to <1.5.0 Updates the requirements on [torch](https://github.com/pytorch/pytorch) to permit the latest version. - [Release notes](https://github.com/pytorch/pytorch/releases) - [Commits](https://github.com/pytorch/pytorch/compare/v0.1.1...v1.4.0) Signed-off-by: dependabot-preview[bot] --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2851fdf4..72de4e04 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ 'mpi4py', 'matplotlib', 'torchsummary', - 'torch < 1.4.0', + 'torch < 1.5.0', 'pdb2sql >= 0.3.0', 'freesasa==2.0.3.post7;platform_system=="Linux"', 'freesasa==2.0.5;platform_system=="Darwin"' From 106d80f38bebacfbe03e5db8ec90a57e4f56ae2f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 1 Apr 2020 13:55:52 +0200 Subject: [PATCH 25/38] remove torch version restriction --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 72de4e04..ca00595e 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ 'mpi4py', 'matplotlib', 'torchsummary', - 'torch < 1.5.0', + 'torch', 'pdb2sql >= 0.3.0', 'freesasa==2.0.3.post7;platform_system=="Linux"', 'freesasa==2.0.5;platform_system=="Darwin"' From 74ba8132924edd4520c34398189f1d0539aa983a Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Thu, 2 Apr 2020 15:14:40 +0200 Subject: [PATCH 26/38] ignore build and dist --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 7ecacabe..d3973212 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,13 @@ Byte-compiled / optimized *__pycache__* + + # distribution deeprank.egg-info database* +dist +build # specific architure files deeprank/learn/arch_* From 47fc92b11e73a3764c1e80b6eb60e884f1e3037a Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Fri, 3 Apr 2020 15:52:45 +0200 Subject: [PATCH 27/38] added align existing pdb in hdf5 --- deeprank/generate/DataGenerator.py | 91 ++++++++++++++++++++++++++++-- example/generate_dataset.py | 51 +++++++++-------- test/test_generate.py | 9 ++- 3 files changed, 121 insertions(+), 30 deletions(-) diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py index 6de5bb67..05307516 100644 --- a/deeprank/generate/DataGenerator.py +++ b/deeprank/generate/DataGenerator.py @@ -83,10 +83,11 @@ def __init__(self, pdb_select=None, pdb_source=None, self.pdb_select = pdb_select or [] self.pdb_source = pdb_source or [] self.pdb_native = pdb_native or [] + self.pssm_source = pssm_source self.align = align - if pssm_source is not None: - config.PATH_PSSM_SOURCE = pssm_source + if self.pssm_source is not None: + config.PATH_PSSM_SOURCE = self.pssm_source self.compute_targets = compute_targets self.compute_features = compute_features @@ -123,7 +124,7 @@ def __init__(self, pdb_select=None, pdb_source=None, set.intersection(set(pssm_features), set(self.compute_features)): if config.PATH_PSSM_SOURCE is None: raise ValueError( - 'You must provide "pssm_source" to compaute PSSM features.') + 'You must provide "pssm_source" to compute PSSM features.') # get all the conformation path @@ -165,6 +166,7 @@ def create_database( prog_bar=False, contact_distance=8.5, random_seed=None): + """Create the hdf5 file architecture and compute the features/targets. Args: @@ -229,7 +231,12 @@ def create_database( # set metadata to hdf5 file self.f5.attrs['DeepRank_version'] = deeprank.__version__ - + self.f5.attrs['pdb_source'] = [os.path.abspath(f) for f in self.pdb_source] + self.f5.attrs['pdb_native'] = [os.path.abspath(f) for f in self.pdb_native] + self.f5.attrs['pssm_source'] = os.path.abspath(self.pssm_source) + self.f5.attrs['features'] = self.compute_features + self.f5.attrs['targets'] = self.compute_targets + ################################################## # Start generating HDF5 database ################################################## @@ -559,6 +566,7 @@ def add_feature(self, remove_error=True, prog_bar=True): error_flag = False if self.compute_features is not None: + # the internal features molgrp.require_group('features') molgrp.require_group('features_raw') @@ -722,6 +730,80 @@ def add_target(self, prog_bar=False): # close the file f5.close() + def realign_complexes(self, align, compute_features=None, pssm_source=None): + """align all the complexes already present in the HDF5 + + + Arguments: + align {dict} -- alignement dictionary (see __init__) + + Keyword Arguments: + compute_features {list} -- list of features to be computed + if None computes the features specified in + the attrs['features'] of the file (if present) + pssm_source {str} -- path of the pssm files. If None the source specfied in + the attrs['pssm_source'] will be used (if present) (default: {None}) + + Raises: + ValueError: If no PSSM detected + """ + + f5 = h5py.File(self.hdf5,'a') + + mol_names = f5.keys() + self.logger.info(f'\n# Start re creating HDF5 database: {self.hdf5}') + + # deal with the features + if self.compute_features is None: + if compute_features is None: + if 'features' in f5.attrs: + self.compute_features = list(f5.attrs['features']) + else: + self.compute_features = compute_features + + # deal with the pssm source + if self.pssm_source is not None: + config.PATH_PSSM_SOURCE = self.pssm_source + + elif pssm_source is not None: + config.PATH_PSSM_SOURCE = pssm_source + + elif 'pssm_source' in f5.attrs: + config.PATH_PSSM_SOURCE = f5.attrs['pssm_source'] + else : + raise ValueError('No pssm source detected') + + desc = '{:25s}'.format('Add features') + for mol in tqdm(mol_names, desc=desc, ncols=100): + + # align the pdb + molgrp = f5[mol] + pdb = molgrp['complex'][()] + + sqldb = self._get_aligned_sqldb(pdb, align) + data = sqldb.sql2pdb() + + data = np.array(data).astype('|S78') + molgrp['complex'][...] = data + + # remove prexisting features + if 'features' in molgrp: + del molgrp['features'] + if 'features_raw' in molgrp: + del molgrp['features_raw'] + + # the internal features + molgrp.require_group('features') + molgrp.require_group('features_raw') + + # compute features + error_flag = self._compute_features(self.compute_features, + molgrp['complex'][()], + molgrp['features'], + molgrp['features_raw'], + self.logger) + + f5.close() # ==================================================================================== # @@ -1009,6 +1091,7 @@ def remove(self, feature=True, pdb=True, points=True, grid=False): if feature and 'features' in mol_grp: del mol_grp['features'] + del mol_grp['features_raw'] if pdb and 'complex' in mol_grp and 'native' in mol_grp: del mol_grp['complex'] del mol_grp['native'] diff --git a/example/generate_dataset.py b/example/generate_dataset.py index 7eea9c82..3e9049e0 100644 --- a/example/generate_dataset.py +++ b/example/generate_dataset.py @@ -18,31 +18,32 @@ # where to find the pssm pssm_source = '../test/1AK4/pssm_new/' - -# initialize the database -database = DataGenerator( - pdb_source=pdb_source, - pdb_native=pdb_native, - pssm_source=pssm_source, - align={"axis":'z'}, - data_augmentation=2, - compute_targets=[ - 'deeprank.targets.dockQ', - 'deeprank.targets.binary_class'], - compute_features=[ - 'deeprank.features.AtomicFeature', - 'deeprank.features.FullPSSM', - 'deeprank.features.PSSM_IC', - 'deeprank.features.BSA', - 'deeprank.features.ResidueDensity'], - hdf5=h5file, - mpi_comm=comm) - - -# create the database -# compute features/targets for all complexes -print('{:25s}'.format('Create new database') + database.hdf5) -database.create_database(prog_bar=True) +# # initialize the database +# database = DataGenerator( +# pdb_source=pdb_source, +# pdb_native=pdb_native, +# pssm_source=pssm_source, +# data_augmentation=2, +# compute_targets=[ +# 'deeprank.targets.dockQ', +# 'deeprank.targets.binary_class'], +# compute_features=[ +# 'deeprank.features.AtomicFeature', +# 'deeprank.features.FullPSSM', +# 'deeprank.features.PSSM_IC', +# 'deeprank.features.BSA', +# 'deeprank.features.ResidueDensity'], +# hdf5=h5file, +# mpi_comm=comm) + + +# # create the database +# # compute features/targets for all complexes +# print('{:25s}'.format('Create new database') + database.hdf5) +# database.create_database(prog_bar=True) + +newdb = DataGenerator(hdf5=h5file) +newdb.realign_complexes(align={'axis':'z'}) # define the 3D grid diff --git a/test/test_generate.py b/test/test_generate.py index 3f1d8d0f..e9ed44fd 100644 --- a/test/test_generate.py +++ b/test/test_generate.py @@ -245,6 +245,12 @@ def test_6_align_interface(self): else: print('{:25s}'.format('Use existing database') + database.hdf5) + def test_7_realign(self): + '''Realign existing pdbs.''' + database = DataGenerator(hdf5='./1ak4.hdf5') + database.realign_complexes(align={'axis':'z'}) + + if __name__ == "__main__": @@ -255,4 +261,5 @@ def test_6_align_interface(self): inst.test_3_add_unique_target() inst.test_4_add_feature() inst.test_5_align() - inst.test_6_align_interface() \ No newline at end of file + inst.test_6_align_interface() + inst.test_7_realign() \ No newline at end of file From f2d572b01b02462e97cc36d25801c4338ed428ec Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Fri, 3 Apr 2020 15:59:37 +0200 Subject: [PATCH 28/38] removed NaivePSSM --- deeprank/features/NaivePSSM.py | 246 ----------------------------- deeprank/features/__init__.py | 1 - deeprank/generate/DataGenerator.py | 2 - docs/deeprank.features.rst | 9 -- docs/deeprank.generate.rst | 1 - docs/tutorial.rst | 1 - scripts/launch.py | 1 - scripts/simple_generate.py | 1 - test/test_generate_cuda.py | 1 - 9 files changed, 263 deletions(-) delete mode 100644 deeprank/features/NaivePSSM.py diff --git a/deeprank/features/NaivePSSM.py b/deeprank/features/NaivePSSM.py deleted file mode 100644 index 0e509814..00000000 --- a/deeprank/features/NaivePSSM.py +++ /dev/null @@ -1,246 +0,0 @@ -import os -from time import time - -import numpy as np -import pdb2sql - -from deeprank.features import FeatureClass -from deeprank.tools import SASA - - -def printif(string, cond): return print(string) if cond else None - - -########################################################################## -# -# Definition of the class -# -########################################################################## - -class NaivePSSM(FeatureClass): - - def __init__( - self, - mol_name=None, - pdbfile=None, - pssm_path=None, - nmask=17, - nsmooth=3, - debug=False): - """Compute compressed PSSM data. - - The method is adapted from: - Simplified Sequence-based method for ATP-binding prediction using contextual local evolutionary conservation - Algorithms for Molecular Biology 2014 9:7 - - Args: - mol_name (str): name of the molecule - pdbfile (str): name of the dbfile - pssm_path (str): path to the pssm data - nmask (int, optional): - nsmooth (int, optional): - - Example: - - >>> path = '/home/nico/Documents/projects/deeprank/data/HADDOCK/BM4_dimers/PSSM_newformat/' - >>> pssm = NaivePSSM(mol_name = '2ABZ', pdbfile='2ABZ_1w.pdb',pssm_path=path) - >>> - >>> # get the surface accessible solvent area - >>> pssm.get_sasa() - >>> - >>> # get the pssm smoothed sum score - >>> pssm.read_PSSM_data() - >>> pssm.process_pssm_data() - >>> pssm.get_feature_value() - >>> print(pssm.feature_data_xyz) - """ - - super().__init__("Residue") - print("== Warning : Please don't use NaivePSSM as a feature it's very experimental") - - self.mol_name = mol_name - self.pdbfile = pdbfile - self.pssm_path = pssm_path - self.molname = self.get_mol_name(mol_name) - self.nmask = nmask - self.nsmooth = nsmooth - self.debug = debug - - if isinstance(pdbfile, str) and mol_name is None: - self.mol_name = os.path.splitext(pdbfile)[0] - - def get_sasa(self): - """Get the sasa of the residues.""" - sasa = SASA(self.pdbfile) - self.sasa = sasa.neighbor_vector() - - @staticmethod - def get_mol_name(mol_name): - """Get the bared mol name.""" - return mol_name.split('_')[0] - - def read_PSSM_data(self): - """Read the PSSM data.""" - - names = os.listdir(self.pssm_path) - fname = [n for n in names if n.find(self.molname) == 0] - - if len(fname) > 1: - raise ValueError( - 'Multiple PSSM files found for %s in %s', - self.mol_name, - self.pssm_path) - if len(fname) == 0: - raise FileNotFoundError( - 'No PSSM file found for %s in %s', - self.mol_name, - self.pssm_path) - else: - fname = fname[0] - - f = open(self.pssm_path + '/' + fname, 'rb') - data = f.readlines() - f.close() - raw_data = list(map(lambda x: x.decode('utf-8').split(), data)) - - self.res_data = np.array(raw_data)[:, :3] - self.res_data = [(r[0], int(r[1]), r[2]) for r in self.res_data] - self.pssm_data = np.array(raw_data)[:, 3:].astype(np.float) - - def process_pssm_data(self): - """Process the PSSM data.""" - - self.pssm_data = self._mask_pssm(self.pssm_data, nmask=self.nmask) - self.pssm_data = self._filter_pssm(self.pssm_data) - self.pssm_data = self._smooth_pssm( - self.pssm_data, msmooth=self.nsmooth) - self.pssm_data = np.mean(self.pssm_data, 1) - - @staticmethod - def _mask_pssm(pssm_data, nmask=17): - - nres = len(pssm_data) - - masked_pssm = np.copy(pssm_data) - for idata in range(nres): - istart = np.max([idata - nmask, 0]) - iend = np.min([idata + nmask + 1, nres]) - N = 1. / (2 * (iend - 1 - istart)) - masked_pssm[idata, :] -= N * np.sum(pssm_data[istart:iend, :], 0) - return masked_pssm - - @staticmethod - def _filter_pssm(pssm_data): - pssm_data[pssm_data <= 0] = 0 - return pssm_data - - @staticmethod - def _smooth_pssm(pssm_data, msmooth=3): - - nres = len(pssm_data) - smoothed_pssm = np.copy(pssm_data) - for idata in range(nres): - istart = np.max([idata - msmooth, 0]) - iend = np.min([idata + msmooth + 1, nres]) - N = 1. / (2 * (iend - 1 - istart)) - smoothed_pssm[idata, :] = N * np.sum(pssm_data[istart:iend, :], 0) - return smoothed_pssm - - def get_feature_value(self, contact_only=True): - """get the feature value.""" - - sql = pdb2sql.interface(self.pdbfile) - xyz_info = sql.get('chainID,resSeq,resName', name='CB') - xyz = sql.get('x,y,z', name='CB') - - xyz_dict = {} - for pos, info in zip(xyz, xyz_info): - xyz_dict[tuple(info)] = pos - - contact_residue = sql.get_contact_residue(cutoff=5.5) - contact_residue = contact_residue["A"] + contact_residue["B"] - sql._close() - - pssm_data_xyz = {} - pssm_data = {} - - for res, data in zip(self.res_data, self.pssm_data): - - if contact_only and res not in contact_residue: - continue - - if tuple(res) in xyz_dict: - chain = {'A': 0, 'B': 1}[res[0]] - key = tuple([chain] + xyz_dict[tuple(res)]) - sasa = self.sasa[tuple(res)] - - pssm_data[res] = [data * sasa] - pssm_data_xyz[key] = [data * sasa] - else: - printif([tuple(res), ' not found in the pdbfile'], self.debug) - - # if we have no contact atoms - if len(pssm_data_xyz) == 0: - pssm_data_xyz[tuple([0, 0., 0., 0.])] = [0.0] - pssm_data_xyz[tuple([1, 0., 0., 0.])] = [0.0] - - self.feature_data['pssm'] = pssm_data - self.feature_data_xyz['pssm'] = pssm_data_xyz - - -########################################################################## -# -# THE MAIN FUNCTION CALLED IN THE INTERNAL FEATURE CALCULATOR -# -########################################################################## - -def __compute_feature__(pdb_data, featgrp, featgrp_raw): - - if '__PATH_PSSM_SOURCE__' not in globals(): - path = os.path.dirname(os.path.realpath(__file__)) - PSSM = path + '/PSSM/' - else: - PSSM = __PATH_PSSM_SOURCE__ - - mol_name = os.path.split(featgrp.name)[0] - mol_name = mol_name.lstrip('/') - - pssm = NaivePSSM(mol_name, pdb_data, PSSM) - - # get the sasa info - pssm.get_sasa() - - # read the raw data - pssm.read_PSSM_data() - - # get the pssm smoothed sum score - pssm.process_pssm_data() - - # get the feature vales - pssm.get_feature_value() - - # export in the hdf5 file - pssm.export_dataxyz_hdf5(featgrp) - pssm.export_data_hdf5(featgrp_raw) - - -########################################################################## -# -# IF WE JUST TEST THE CLASS -# -########################################################################## -if __name__ == '__main__': - - t0 = time() - path = '/home/nico/Documents/projects/deeprank/data/HADDOCK/BM4_dimers/PSSM_newformat/' - pssm = NaivePSSM(mol_name='2ABZ', pdbfile='2ABZ_1w.pdb', pssm_path=path) - - # get the surface accessible solvent area - pssm.get_sasa() - - # get the pssm smoothed sum score - pssm.read_PSSM_data() - pssm.process_pssm_data() - pssm.get_feature_value() - print(pssm.feature_data_xyz) - print(' Time %f ms' % ((time() - t0) * 1000)) diff --git a/deeprank/features/__init__.py b/deeprank/features/__init__.py index 4e1300e0..5da26024 100644 --- a/deeprank/features/__init__.py +++ b/deeprank/features/__init__.py @@ -2,6 +2,5 @@ from .BSA import BSA from .AtomicFeature import AtomicFeature from .FullPSSM import FullPSSM -from .NaivePSSM import NaivePSSM from .PSSM_IC import PSSM_IC from .ResidueDensity import ResidueDensity diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py index 05307516..d8eb546d 100644 --- a/deeprank/generate/DataGenerator.py +++ b/deeprank/generate/DataGenerator.py @@ -74,7 +74,6 @@ def __init__(self, pdb_select=None, pdb_source=None, >>> data_augmentation=None, >>> compute_targets=['deeprank.targets.dockQ'], >>> compute_features=['deeprank.features.AtomicFeature', - >>> 'deeprank.features.NaivePSSM', >>> 'deeprank.features.PSSM_IC', >>> 'deeprank.features.BSA'], >>> hdf5=h5file) @@ -190,7 +189,6 @@ def create_database( >>> database = DataGenerator(pdb_source=pdb_source,pdb_native=pdb_native,data_augmentation=None, >>> compute_targets = ['deeprank.targets.dockQ'], >>> compute_features = ['deeprank.features.AtomicFeature', - >>> 'deeprank.features.NaivePSSM', >>> 'deeprank.features.PSSM_IC', >>> 'deeprank.features.BSA'], >>> hdf5=h5file) diff --git a/docs/deeprank.features.rst b/docs/deeprank.features.rst index 68bf8a6e..04dadfda 100644 --- a/docs/deeprank.features.rst +++ b/docs/deeprank.features.rst @@ -8,7 +8,6 @@ Feature This module contains all the tools to compute feature values for molecular structure. Each submodule must be subclass ``deeprank.features.FeatureClass`` to inherit the export function. At the moment a few features have already been implemented. These are: - ``AtomicFeatures``:Coulomb, van der Waals interactions and atomic charges - ``BSA`` : Burried Surface area - - ``NaivePSSM`` : A very simple approach for PSSM data - ``PSSM_IC`` : Information content of the PSSM - ``ResidueDensity`` : The residue density for polar/apolar/charged pairs @@ -33,14 +32,6 @@ Burried Surface Area :undoc-members: -NaivePSSM ------------------------------------- - -.. automodule:: deeprank.features.NaivePSSM - :members: - :undoc-members: - - Information Content ----------------------------------- diff --git a/docs/deeprank.generate.rst b/docs/deeprank.generate.rst index d3e8f4bc..5aa64d7e 100644 --- a/docs/deeprank.generate.rst +++ b/docs/deeprank.generate.rst @@ -40,7 +40,6 @@ Example: >>> database = DataGenerator(pdb_source=pdb_source,pdb_native=pdb_native,data_augmentation=None, >>> compute_targets = ['deeprank.targets.dockQ'], >>> compute_features = ['deeprank.features.AtomicFeature', ->>> 'deeprank.features.NaivePSSM', >>> 'deeprank.features.PSSM_IC', >>> 'deeprank.features.BSA'], >>> hdf5=h5file) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index e0cb4aec..e4d2473c 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -39,7 +39,6 @@ We are now ready to initialize the ``DataGenerator`` class included in DeepRank >>> database = DataGenerator(pdb_source=pdb_source,pdb_native=pdb_native, >>> compute_targets = ['deeprank.targets.dockQ'], >>> compute_features = ['deeprank.features.AtomicFeature', ->>> 'deeprank.features.NaivePSSM', >>> 'deeprank.features.PSSM_IC', >>> 'deeprank.features.BSA'], >>> hdf5=h5file) diff --git a/scripts/launch.py b/scripts/launch.py index c20420de..b6d176cc 100755 --- a/scripts/launch.py +++ b/scripts/launch.py @@ -41,7 +41,6 @@ def generate(LIST_NAME, clean=False): 'deeprank.targets.binary_class'], compute_features=[ 'deeprank.features.AtomicFeature', - 'deeprank.features.NaivePSSM', 'deeprank.features.FullPSSM', 'deeprank.features.PSSM_IC', 'deeprank.features.BSA', diff --git a/scripts/simple_generate.py b/scripts/simple_generate.py index cfc3ba77..1495d554 100644 --- a/scripts/simple_generate.py +++ b/scripts/simple_generate.py @@ -19,7 +19,6 @@ 'deeprank.targets.binary_class'], compute_features=[ 'deeprank.features.AtomicFeature', - 'deeprank.features.NaivePSSM', 'deeprank.features.PSSM_IC', 'deeprank.features.BSA', 'deeprank.features.FullPSSM', diff --git a/test/test_generate_cuda.py b/test/test_generate_cuda.py index 2e4d7ff7..24085bb2 100644 --- a/test/test_generate_cuda.py +++ b/test/test_generate_cuda.py @@ -34,7 +34,6 @@ def test_generate_cuda(): compute_targets=['deeprank.targets.dockQ'], compute_features=[ 'deeprank.features.AtomicFeature', - 'deeprank.features.NaivePSSM', 'deeprank.features.PSSM_IC', 'deeprank.features.BSA'], hdf5=self.h5file) From 4485712d5cb525ee965e2bae8370a66da42f9233 Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Fri, 3 Apr 2020 16:04:38 +0200 Subject: [PATCH 29/38] docstring of realign --- deeprank/generate/DataGenerator.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py index d8eb546d..47c610b4 100644 --- a/deeprank/generate/DataGenerator.py +++ b/deeprank/generate/DataGenerator.py @@ -729,8 +729,7 @@ def add_target(self, prog_bar=False): f5.close() def realign_complexes(self, align, compute_features=None, pssm_source=None): - """align all the complexes already present in the HDF5 - + """Align all the complexes already present in the HDF5. Arguments: align {dict} -- alignement dictionary (see __init__) @@ -744,12 +743,21 @@ def realign_complexes(self, align, compute_features=None, pssm_source=None): Raises: ValueError: If no PSSM detected + + Example : + + >>> database = DataGenerator(hdf5='1ak4.hdf5') + >>> # if comute_features and pssm_source are not specified + >>> # the values in hdf5.attrs['features'] and hdf5.attrs['pssm_source'] will be used + >>> database.realign_complex(align={'axis':'x'}, + >>> compute_features['deeprank.features.X'], + >>> pssm_source='./1ak4_pssm/') """ f5 = h5py.File(self.hdf5,'a') mol_names = f5.keys() - self.logger.info(f'\n# Start re creating HDF5 database: {self.hdf5}') + self.logger.info(f'\n# Start aligning the HDF5 database: {self.hdf5}') # deal with the features if self.compute_features is None: @@ -771,6 +779,7 @@ def realign_complexes(self, align, compute_features=None, pssm_source=None): else : raise ValueError('No pssm source detected') + # loop over the complexes desc = '{:25s}'.format('Add features') for mol in tqdm(mol_names, desc=desc, ncols=100): From 8bbf37d634c5bd66c596669e494cacc3b41667c2 Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Fri, 3 Apr 2020 17:05:45 +0200 Subject: [PATCH 30/38] removed mapped features --- deeprank/generate/DataGenerator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py index 47c610b4..bb640d0b 100644 --- a/deeprank/generate/DataGenerator.py +++ b/deeprank/generate/DataGenerator.py @@ -794,11 +794,11 @@ def realign_complexes(self, align, compute_features=None, pssm_source=None): molgrp['complex'][...] = data # remove prexisting features - if 'features' in molgrp: - del molgrp['features'] - if 'features_raw' in molgrp: - del molgrp['features_raw'] - + old_dir = ['features', 'features_raw', 'mapped_features'] + for od in old_dir: + if od in molgrp: + del molgrp[od] + # the internal features molgrp.require_group('features') molgrp.require_group('features_raw') From 1e61cda935233e6cfb2cdcac74e3b7c447c9748f Mon Sep 17 00:00:00 2001 From: LilySnow Date: Sat, 4 Apr 2020 12:48:24 +0200 Subject: [PATCH 31/38] add ploting R code to util --- deeprank/utils/boxplot.R | 39 +++++++++++++++++++++++++++++++ deeprank/utils/hitrate_plot.R | 38 ++++++++++++++++++++++++++++++ deeprank/utils/successrate_plot.R | 31 ++++++++++++++++++++++++ 3 files changed, 108 insertions(+) create mode 100644 deeprank/utils/boxplot.R create mode 100644 deeprank/utils/hitrate_plot.R create mode 100644 deeprank/utils/successrate_plot.R diff --git a/deeprank/utils/boxplot.R b/deeprank/utils/boxplot.R new file mode 100644 index 00000000..0fd0a41f --- /dev/null +++ b/deeprank/utils/boxplot.R @@ -0,0 +1,39 @@ +# INPUT: rawdata.tsv +# +# label caseID modelID target DR HS +# Test 1YVB 1YVB_ranair-it0_4286 0 0.56818 4.04629 +# Test 1PPE 1PPE_ranair-it0_2999 0 0.56486 50.17506 + +library(ggplot2) + +args = commandArgs(trailingOnly=TRUE) +input_FL = args[1] #'it0.rawdata.tsv' +outFL = args[2] #'boxplot.png' + + +data = read.csv(input_FL, sep = '\t') + +data$target = as.character(data$target) +data$label = factor(data$label, levels = c('Train','Valid','Test')) + +print(data[0:5,]) + +font_size = 20 +text_style = element_text(size=font_size, family="Helvetica", face="bold") +colormap = c('0' = 'ivory3', '1' = 'steelblue') + +ggplot(data, aes(x=target, y = DR, fill = target)) + + geom_boxplot(width=0.2,alpha=0.7) + + facet_grid(.~label ) + + scale_fill_manual(values = colormap) + + theme_bw() + + theme(legend.position = 'right', + plot.title= text_style, + text= text_style, + axis.text.x= element_text(size=font_size), + axis.text.y= element_text(size=font_size)) + + scale_x_discrete(name = 'Target') + + +ggsave(outFL, dpi = 100) +print(paste(outFL, 'generated.')) diff --git a/deeprank/utils/hitrate_plot.R b/deeprank/utils/hitrate_plot.R new file mode 100644 index 00000000..7d174f6d --- /dev/null +++ b/deeprank/utils/hitrate_plot.R @@ -0,0 +1,38 @@ +# INPUT: hitrate_melted.tsv +# +# label rank Methods hit_rate +# Test 1 hitRate_DR 0.01 +# Test 2 hitRate_DR 0.02 + +library(ggplot2) + +args = commandArgs(trailingOnly=TRUE) + +hitrate_FL = args[1] #'hitrate_melted.tsv' +outFL = args[2] #'hitrate.png' + +df_tmp = read.csv(hitrate_FL, sep = '\t') +df_tmp$label = factor(df_tmp$label, levels = c('Train','Valid','Test')) + +print(df_tmp[0:5,]) + +font_size = 20 +text_style = element_text(size=font_size, family="Helvetica", face="bold") + + +ggplot(df_tmp) + aes_string(x='rank', y='hit_rate', color='label', linetype='Methods') + + facet_wrap('label~.', scales = 'free') + + geom_line(size=1) + + labs(x= 'Top N models', y= 'Hit Rate') + + theme_bw()+ + theme('legend.position' = 'right', + 'plot.title'= text_style, + 'text'= text_style, + 'axis.text.x'= element_text(size=font_size-6, angle = -45, vjust = 1, hjust= 0), + 'axis.text.y'= element_text(size=font_size-6)) + + labs('colour'= "Sets") #+ +# coord_cartesian(xlim = c(0, 100)) + + +ggsave(outFL, dpi = 100) +print(paste(outFL, 'generated.')) diff --git a/deeprank/utils/successrate_plot.R b/deeprank/utils/successrate_plot.R new file mode 100644 index 00000000..75a67949 --- /dev/null +++ b/deeprank/utils/successrate_plot.R @@ -0,0 +1,31 @@ +library(ggplot2) + +args = commandArgs(trailingOnly=TRUE) + +input_FL = args[1] #'successrate_melted.tsv' +outFL = args[2] #'successrate.png' + +df = read.csv(input_FL, sep = '\t') +df$label = factor(df$label, levels = c('Train','Valid','Test')) + +print(df[0:5,]) + +font_size = 20 +text_style = element_text(size=font_size, family="Helvetica", face="bold") + +ggplot(df) + + aes_string(x='rank', y='success_rate', color='label', linetype='Methods') + + facet_wrap('label ~.', scales = 'free') + + geom_line(size=1) + + labs('x'= 'Top N models', 'y'= 'Success Rate') + + theme_bw() + + theme('legend.position'= 'right', + 'plot.title'= text_style, + 'text'= text_style, + 'axis.text.x'= element_text(size=font_size-6, angle = -45, vjust = 1, hjust= 0), + 'axis.text.y'= element_text(size=font_size-6)) + + labs('colour'= "Sets") #change legend title to 'Sets' +# scale_x_continuous(**{'breaks': breaks, 'labels': xlabels}) + +ggsave(outFL, dpi = 100) +print(paste(outFL, 'generated.')) From 7e10e3f8932e581db2b0481fc0eae2ce8e3d24a8 Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Mon, 6 Apr 2020 11:24:24 +0200 Subject: [PATCH 32/38] do not crash when missing CA/CB atoms --- deeprank/features/BSA.py | 12 +++++++++--- deeprank/features/FullPSSM.py | 8 ++++++++ deeprank/features/ResidueDensity.py | 10 +++++++--- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/deeprank/features/BSA.py b/deeprank/features/BSA.py index 37d8ac6b..cd885ab6 100644 --- a/deeprank/features/BSA.py +++ b/deeprank/features/BSA.py @@ -120,9 +120,15 @@ def get_contact_residue_sasa(self, cutoff=5.5): atcenter = 'CB' if res[2] == 'GLY': atcenter = 'CA' - xyz = self.sql.get( - 'x,y,z', resSeq=res[1], chainID=res[0], name=atcenter)[0] - # xyz = np.mean(self.sql.get('x,y,z',resSeq=r[1],chainID=r[0]),0) + + try : + xyz = self.sql.get( + 'x,y,z', resSeq=res[1], chainID=res[0], name=atcenter)[0] + except IndexError : + warnings .warn('Atom ', atcenter, ' not found for residue ', key[1], \ + '. Use residue center as feature center') + xyz = np.mean(self.sql.get('x,y,z',resSeq=r[1],chainID=r[0]),0) + xyzkey = tuple([chain] + xyz) # put the data in dict diff --git a/deeprank/features/FullPSSM.py b/deeprank/features/FullPSSM.py index 4e6572a1..f97cd8d4 100644 --- a/deeprank/features/FullPSSM.py +++ b/deeprank/features/FullPSSM.py @@ -212,6 +212,14 @@ def get_feature_value(self, cutoff=5.5): f"{self.mol_name}: The following interface residues have " f" no pssm value:\n {ctc_res_wo_pssm}" ) + elif len(pssm_res_set.difference(ctc_res_set)) > 0: + # can happen if CA/CB is missing in the res + pssm_res_wo_ctc = pssm_res_set.difference(ctc_res_set) + ctc_res_with_pssm = pssm_res_set.intersection(ctc_res_set) + warnings.warn( + f"{self.mol_name}: The following interface residues have " + f" no pssm value:\n {pssm_res_wo_ctc}" + ) else: ctc_res_with_pssm = ctc_res diff --git a/deeprank/features/ResidueDensity.py b/deeprank/features/ResidueDensity.py index 9965a7fb..4b9cf71e 100644 --- a/deeprank/features/ResidueDensity.py +++ b/deeprank/features/ResidueDensity.py @@ -129,9 +129,13 @@ def extract_features(self): atcenter = 'CA' # get the xyz of the center atom - xyz = self.sql.get( - 'x,y,z', resSeq=key[1], chainID=key[0], name=atcenter)[0] - #xyz = np.mean(self.sql.get('x,y,z',resSeq=key[1],chainID=key[0]),0).tolist() + try: + xyz = self.sql.get( + 'x,y,z', resSeq=key[1], chainID=key[0], name=atcenter)[0] + except IndexError : + warnings .warn('Atom ', atcenter, ' not found for residue ', key[1], \ + '. Use residue center as feature center') + xyz = np.mean(self.sql.get('x,y,z',resSeq=key[1],chainID=key[0]),0).tolist() xyz_key = tuple([{'A': 0, 'B': 1}[key[0]]] + xyz) self.feature_data_xyz['RCD_total'][xyz_key] = [ From d278b9d39344e08506bf4f9e6eb49e369c6fb6c4 Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Mon, 6 Apr 2020 11:27:30 +0200 Subject: [PATCH 33/38] fixed error message --- deeprank/features/FullPSSM.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeprank/features/FullPSSM.py b/deeprank/features/FullPSSM.py index f97cd8d4..48f9ae2b 100644 --- a/deeprank/features/FullPSSM.py +++ b/deeprank/features/FullPSSM.py @@ -218,7 +218,7 @@ def get_feature_value(self, cutoff=5.5): ctc_res_with_pssm = pssm_res_set.intersection(ctc_res_set) warnings.warn( f"{self.mol_name}: The following interface residues have " - f" no pssm value:\n {pssm_res_wo_ctc}" + f" a CA or CB missing :\n {pssm_res_wo_ctc}" ) else: ctc_res_with_pssm = ctc_res From 08104161fce532768163e8597f0528768dc54fdf Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Mon, 6 Apr 2020 14:36:48 +0200 Subject: [PATCH 34/38] fixed test generate --- test/test_generate.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/test_generate.py b/test/test_generate.py index e9ed44fd..10c28a60 100644 --- a/test/test_generate.py +++ b/test/test_generate.py @@ -1,7 +1,7 @@ import os import unittest from time import time - +import shutil from deeprank.generate import * @@ -247,7 +247,14 @@ def test_6_align_interface(self): def test_7_realign(self): '''Realign existing pdbs.''' - database = DataGenerator(hdf5='./1ak4.hdf5') + + src_name = './1ak4.hdf5' + copy_name = './1ak4_aligned.hdf5' + + os.remove(copy_name) + shutil.copy(src_name,copy_name) + + database = DataGenerator(hdf5=copy_name) database.realign_complexes(align={'axis':'z'}) From 79d4c15e7ee510a8f60e2797e73ec9010033dd16 Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Mon, 6 Apr 2020 14:47:29 +0200 Subject: [PATCH 35/38] fix tet generate --- test/test_generate.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/test_generate.py b/test/test_generate.py index 10c28a60..5a83a91e 100644 --- a/test/test_generate.py +++ b/test/test_generate.py @@ -257,8 +257,6 @@ def test_7_realign(self): database = DataGenerator(hdf5=copy_name) database.realign_complexes(align={'axis':'z'}) - - if __name__ == "__main__": # unittest.main() From 1564d11460b2680d4e44946d8a2a3e1981313573 Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Fri, 10 Apr 2020 14:51:43 +0200 Subject: [PATCH 36/38] intoduced get_residue_center --- deeprank/features/BSA.py | 26 +++--- deeprank/features/FeatureClass.py | 122 ++++++++++++++++++++++------ deeprank/features/FullPSSM.py | 29 ++++--- deeprank/features/ResidueDensity.py | 33 ++++---- deeprank/generate/DataGenerator.py | 2 +- test/test_generate.py | 12 +-- 6 files changed, 152 insertions(+), 72 deletions(-) diff --git a/deeprank/features/BSA.py b/deeprank/features/BSA.py index cd885ab6..1608f00b 100644 --- a/deeprank/features/BSA.py +++ b/deeprank/features/BSA.py @@ -117,19 +117,21 @@ def get_contact_residue_sasa(self, cutoff=5.5): # define the xyz key : (chain,x,y,z) chain = {'A': 0, 'B': 1}[res[0]] - atcenter = 'CB' - if res[2] == 'GLY': - atcenter = 'CA' + # atcenter = 'CB' + # if res[2] == 'GLY': + # atcenter = 'CA' - try : - xyz = self.sql.get( - 'x,y,z', resSeq=res[1], chainID=res[0], name=atcenter)[0] - except IndexError : - warnings .warn('Atom ', atcenter, ' not found for residue ', key[1], \ - '. Use residue center as feature center') - xyz = np.mean(self.sql.get('x,y,z',resSeq=r[1],chainID=r[0]),0) - - xyzkey = tuple([chain] + xyz) + # try : + # xyz = self.sql.get( + # 'x,y,z', resSeq=res[1], chainID=res[0], name=atcenter)[0] + # except IndexError : + # warnings.warn('Atom ', atcenter, ' not found for residue ', key[1], \ + # '. Use residue center as feature center') + # xyz = np.mean(self.sql.get('x,y,z',resSeq=res[1],chainID=res[0]),0) + # xyzkey = tuple([chain] + xyz) + + _, xyz = self.get_residue_center(self.sql, res=res) + xyzkey = tuple([chain] + xyz[0]) # put the data in dict self.bsa_data[res] = [bsa] diff --git a/deeprank/features/FeatureClass.py b/deeprank/features/FeatureClass.py index c29f3d14..07279ed6 100644 --- a/deeprank/features/FeatureClass.py +++ b/deeprank/features/FeatureClass.py @@ -1,12 +1,15 @@ import numpy as np - +import warnings class FeatureClass(object): def __init__(self, feature_type): - """Master class from which all the other feature classes should be - derived. + """Master class from which all the other feature classes should be derived. + + Arguments + feature_type(str): 'Atomic' or 'Residue' + Notes: Each subclass must compute: - self.feature_data: dictionary of features in @@ -28,22 +31,25 @@ def __init__(self, feature_type): {'coulomb': data_dict_clb, 'vdwaals': data_dict_vdw} data_dict_clb = {xyz_info: [values]} xyz_info = (chainNum, x, y, z) - - Args: - feature_type(str): 'Atomic' or 'Residue' """ + self.type = feature_type self.feature_data = {} self.feature_data_xyz = {} def export_data_hdf5(self, featgrp): - """Export the data in human readable format to HDF5's group. - - - For atomic features, the format of the data must be: - {(chainID, resSeq, resName, name): [values]} - - For residue features, the format must be: - {(chainID, resSeq, resName): [values]} + """Export the data in xyz-val format in an HDF5 file group. + + Arguments: + featgrp {[hdf5_group]} -- The hdf5 group of the feature + + Notes: + - For atomic features, the format of the data must be: + {(chainID, resSeq, resName, name): [values]} + - For residue features, the format must be: + {(chainID, resSeq, resName): [values]} """ + # loop through the datadict and name for name, data in self.feature_data.items(): @@ -84,22 +90,14 @@ def export_data_hdf5(self, featgrp): else: featgrp.create_dataset(name + '_raw', data=ds) - ######################################## - # - # export the data in an HDF5 file group - # the format of the data is here - # PRO : fast when mapping - # CON : only usefull for deeprank - # - ######################################## - + def export_dataxyz_hdf5(self, featgrp): """Export the data in xyz-val format in an HDF5 file group. - - For atomic and residue the format of the data must be: - {(chainNum(0 or 1), x, y, z): [values]} + + Arguments: + featgrp {[hdf5_group]} -- The hdf5 group of the feature """ - + # loop through the datadict and name for name, data in self.feature_data_xyz.items(): @@ -112,3 +110,77 @@ def export_dataxyz_hdf5(self, featgrp): old[...] = ds else: featgrp.create_dataset(name, data=ds) + + @staticmethod + def get_residue_center(sql, centers=['CB','CA','mean'], res=None): + """Computes the center of each residue by trying different options + + Arguments: + sql {pdb2sql} -- The pdb2sql instance + + Keyword Arguments: + centers {list} -- list of strings (default: {['CB','CA','mean']}) + res {list} -- list of residue to be considered ([[chainID, resSeq, resName]]) + + Raises: + ValueError: [description] + + Returns: + [type] -- list(res), list(xyz) + """ + + # get all residues if None were provided + # [chainID, resName, resSeq] + if res is None: + res = [tuple(x) for x in sql.get('chainID,resSeq,resName')] + res = sorted(set(res), key=res.index) + + + # make sure that we have a list of res + # even if ony 1 res was provided + # res=[chainID, resSeq, resName] -> res=[[chainID, resSeq, resName]] + elif not isinstance(res[0],list): + res = [res] + + # make sure that we have a list of possible centers + if not isinstance(centers,list): + centers = list(centers) + + xyz = [] + + for r in res: + + for ctr in centers: + + if ctr in ['CB','CA']: + + xyz_res = sql.get('x,y,z', + chainID=r[0], + resSeq=r[1], + resName=r[2], + name=ctr) + + elif ctr == 'mean': + xyz_res = [np.mean(sql.get('x,y,z', + chainID=r[0], + resSeq=r[1], + resName=r[2]),axis=0).tolist()] + + else: + raise ValueError('Center %s not recognized' %c) + + if len(xyz_res) == 0: + continue + + elif len(xyz_res) == 1: + xyz.append(xyz_res[0]) + break + + else: + raise ValueError('Residue center not found') + + if len(xyz) == 0: + raise ValueError('Center not found') + + return res, xyz + diff --git a/deeprank/features/FullPSSM.py b/deeprank/features/FullPSSM.py index 48f9ae2b..7644a829 100644 --- a/deeprank/features/FullPSSM.py +++ b/deeprank/features/FullPSSM.py @@ -166,12 +166,13 @@ def get_feature_value(self, cutoff=5.5): sql = pdb2sql.interface(self.pdb_file) # set achors for all residues and get their xyz - xyz_info = sql.get('chainID,resSeq,resName', name='CB') - xyz_info += sql.get('chainID,resSeq,resName', name='CA', - resName='GLY') + # xyz_info = sql.get('chainID,resSeq,resName', name='CB') + # xyz_info += sql.get('chainID,resSeq,resName', name='CA', + # resName='GLY') + # xyz = sql.get('x,y,z', name='CB') + # xyz += sql.get('x,y,z', name='CA', resName='GLY') - xyz = sql.get('x,y,z', name='CB') - xyz += sql.get('x,y,z', name='CA', resName='GLY') + xyz_info, xyz = self.get_residue_center(sql) xyz_dict = {} for pos, info in zip(xyz, xyz_info): @@ -212,14 +213,16 @@ def get_feature_value(self, cutoff=5.5): f"{self.mol_name}: The following interface residues have " f" no pssm value:\n {ctc_res_wo_pssm}" ) - elif len(pssm_res_set.difference(ctc_res_set)) > 0: - # can happen if CA/CB is missing in the res - pssm_res_wo_ctc = pssm_res_set.difference(ctc_res_set) - ctc_res_with_pssm = pssm_res_set.intersection(ctc_res_set) - warnings.warn( - f"{self.mol_name}: The following interface residues have " - f" a CA or CB missing :\n {pssm_res_wo_ctc}" - ) + + # elif len(pssm_res_set.difference(ctc_res_set)) > 0: + # # can happen if CA/CB is missing in the res + # pssm_res_wo_ctc = pssm_res_set.difference(ctc_res_set) + # ctc_res_with_pssm = pssm_res_set.intersection(ctc_res_set) + # warnings.warn( + # f"{self.mol_name}: The following interface residues have " + # f" a CA or CB missing :\n {pssm_res_wo_ctc}" + # ) + else: ctc_res_with_pssm = ctc_res diff --git a/deeprank/features/ResidueDensity.py b/deeprank/features/ResidueDensity.py index 4b9cf71e..965131d7 100644 --- a/deeprank/features/ResidueDensity.py +++ b/deeprank/features/ResidueDensity.py @@ -123,21 +123,24 @@ def extract_features(self): # total density in raw format self.feature_data['RCD_total'][key] = [res.density['total']] - # get the type of the center - atcenter = 'CB' - if key[2] == 'GLY': - atcenter = 'CA' - - # get the xyz of the center atom - try: - xyz = self.sql.get( - 'x,y,z', resSeq=key[1], chainID=key[0], name=atcenter)[0] - except IndexError : - warnings .warn('Atom ', atcenter, ' not found for residue ', key[1], \ - '. Use residue center as feature center') - xyz = np.mean(self.sql.get('x,y,z',resSeq=key[1],chainID=key[0]),0).tolist() - - xyz_key = tuple([{'A': 0, 'B': 1}[key[0]]] + xyz) + # # get the type of the center + # atcenter = 'CB' + # if key[2] == 'GLY': + # atcenter = 'CA' + + # # get the xyz of the center atom + # try: + # xyz = self.sql.get( + # 'x,y,z', resSeq=key[1], chainID=key[0], name=atcenter)[0] + # except IndexError : + # warnings.warn('Atom ', atcenter, ' not found for residue ', key[1], \ + # '. Use residue center as feature center') + # xyz = np.mean(self.sql.get('x,y,z',resSeq=key[1],chainID=key[0]),0).tolist() + # xyz_key = tuple([{'A': 0, 'B': 1}[key[0]]] + xyz) + + _, xyz = self.get_residue_center(self.sql, res=key) + xyz_key = tuple([{'A': 0, 'B': 1}[key[0]]] + xyz[0]) + self.feature_data_xyz['RCD_total'][xyz_key] = [ res.density['total']] diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py index bb640d0b..1f4d635d 100644 --- a/deeprank/generate/DataGenerator.py +++ b/deeprank/generate/DataGenerator.py @@ -251,7 +251,7 @@ def create_database( # names of the molecule mol_name = os.path.splitext(os.path.basename(cplx))[0] - mol_name = mol_name.replace('-', '_') + #mol_name = mol_name.replace('-', '_') mol_aug_name_list = [] try: diff --git a/test/test_generate.py b/test/test_generate.py index 5a83a91e..d1672ce2 100644 --- a/test/test_generate.py +++ b/test/test_generate.py @@ -262,9 +262,9 @@ def test_7_realign(self): # unittest.main() inst = TestGenerateData() inst.test_1_generate() - inst.test_1_generate_mapfly() - inst.test_3_add_unique_target() - inst.test_4_add_feature() - inst.test_5_align() - inst.test_6_align_interface() - inst.test_7_realign() \ No newline at end of file + # inst.test_1_generate_mapfly() + # inst.test_3_add_unique_target() + # inst.test_4_add_feature() + # inst.test_5_align() + # inst.test_6_align_interface() + # inst.test_7_realign() \ No newline at end of file From ffc29f4e6ab89b2558a3f342c936945cf5fc3ec1 Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Fri, 10 Apr 2020 14:56:50 +0200 Subject: [PATCH 37/38] clean up --- deeprank/features/BSA.py | 14 +------------- deeprank/features/FullPSSM.py | 15 --------------- deeprank/features/ResidueDensity.py | 16 +--------------- deeprank/generate/DataGenerator.py | 1 - 4 files changed, 2 insertions(+), 44 deletions(-) diff --git a/deeprank/features/BSA.py b/deeprank/features/BSA.py index 1608f00b..0c90f543 100644 --- a/deeprank/features/BSA.py +++ b/deeprank/features/BSA.py @@ -117,19 +117,7 @@ def get_contact_residue_sasa(self, cutoff=5.5): # define the xyz key : (chain,x,y,z) chain = {'A': 0, 'B': 1}[res[0]] - # atcenter = 'CB' - # if res[2] == 'GLY': - # atcenter = 'CA' - - # try : - # xyz = self.sql.get( - # 'x,y,z', resSeq=res[1], chainID=res[0], name=atcenter)[0] - # except IndexError : - # warnings.warn('Atom ', atcenter, ' not found for residue ', key[1], \ - # '. Use residue center as feature center') - # xyz = np.mean(self.sql.get('x,y,z',resSeq=res[1],chainID=res[0]),0) - # xyzkey = tuple([chain] + xyz) - + # get the center _, xyz = self.get_residue_center(self.sql, res=res) xyzkey = tuple([chain] + xyz[0]) diff --git a/deeprank/features/FullPSSM.py b/deeprank/features/FullPSSM.py index 7644a829..962532cc 100644 --- a/deeprank/features/FullPSSM.py +++ b/deeprank/features/FullPSSM.py @@ -166,12 +166,6 @@ def get_feature_value(self, cutoff=5.5): sql = pdb2sql.interface(self.pdb_file) # set achors for all residues and get their xyz - # xyz_info = sql.get('chainID,resSeq,resName', name='CB') - # xyz_info += sql.get('chainID,resSeq,resName', name='CA', - # resName='GLY') - # xyz = sql.get('x,y,z', name='CB') - # xyz += sql.get('x,y,z', name='CA', resName='GLY') - xyz_info, xyz = self.get_residue_center(sql) xyz_dict = {} @@ -214,15 +208,6 @@ def get_feature_value(self, cutoff=5.5): f" no pssm value:\n {ctc_res_wo_pssm}" ) - # elif len(pssm_res_set.difference(ctc_res_set)) > 0: - # # can happen if CA/CB is missing in the res - # pssm_res_wo_ctc = pssm_res_set.difference(ctc_res_set) - # ctc_res_with_pssm = pssm_res_set.intersection(ctc_res_set) - # warnings.warn( - # f"{self.mol_name}: The following interface residues have " - # f" a CA or CB missing :\n {pssm_res_wo_ctc}" - # ) - else: ctc_res_with_pssm = ctc_res diff --git a/deeprank/features/ResidueDensity.py b/deeprank/features/ResidueDensity.py index 965131d7..d336b8c3 100644 --- a/deeprank/features/ResidueDensity.py +++ b/deeprank/features/ResidueDensity.py @@ -123,21 +123,7 @@ def extract_features(self): # total density in raw format self.feature_data['RCD_total'][key] = [res.density['total']] - # # get the type of the center - # atcenter = 'CB' - # if key[2] == 'GLY': - # atcenter = 'CA' - - # # get the xyz of the center atom - # try: - # xyz = self.sql.get( - # 'x,y,z', resSeq=key[1], chainID=key[0], name=atcenter)[0] - # except IndexError : - # warnings.warn('Atom ', atcenter, ' not found for residue ', key[1], \ - # '. Use residue center as feature center') - # xyz = np.mean(self.sql.get('x,y,z',resSeq=key[1],chainID=key[0]),0).tolist() - # xyz_key = tuple([{'A': 0, 'B': 1}[key[0]]] + xyz) - + # get the center _, xyz = self.get_residue_center(self.sql, res=key) xyz_key = tuple([{'A': 0, 'B': 1}[key[0]]] + xyz[0]) diff --git a/deeprank/generate/DataGenerator.py b/deeprank/generate/DataGenerator.py index 1f4d635d..be0f564b 100644 --- a/deeprank/generate/DataGenerator.py +++ b/deeprank/generate/DataGenerator.py @@ -251,7 +251,6 @@ def create_database( # names of the molecule mol_name = os.path.splitext(os.path.basename(cplx))[0] - #mol_name = mol_name.replace('-', '_') mol_aug_name_list = [] try: From 1c35cb73a98e9bb09fb29caa44fdf9d7d33c76da Mon Sep 17 00:00:00 2001 From: NicoRenaud Date: Fri, 10 Apr 2020 15:55:56 +0200 Subject: [PATCH 38/38] fix import and uncomment tests --- deeprank/features/FeatureClass.py | 1 - test/test_generate.py | 12 ++++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/deeprank/features/FeatureClass.py b/deeprank/features/FeatureClass.py index 07279ed6..a4060ea6 100644 --- a/deeprank/features/FeatureClass.py +++ b/deeprank/features/FeatureClass.py @@ -1,5 +1,4 @@ import numpy as np -import warnings class FeatureClass(object): diff --git a/test/test_generate.py b/test/test_generate.py index d1672ce2..5a83a91e 100644 --- a/test/test_generate.py +++ b/test/test_generate.py @@ -262,9 +262,9 @@ def test_7_realign(self): # unittest.main() inst = TestGenerateData() inst.test_1_generate() - # inst.test_1_generate_mapfly() - # inst.test_3_add_unique_target() - # inst.test_4_add_feature() - # inst.test_5_align() - # inst.test_6_align_interface() - # inst.test_7_realign() \ No newline at end of file + inst.test_1_generate_mapfly() + inst.test_3_add_unique_target() + inst.test_4_add_feature() + inst.test_5_align() + inst.test_6_align_interface() + inst.test_7_realign() \ No newline at end of file