Skip to content
This repository has been archived by the owner on Jan 26, 2024. It is now read-only.

Commit

Permalink
Merge 5348d56 into dd91cee
Browse files Browse the repository at this point in the history
  • Loading branch information
cbaakman committed Nov 16, 2021
2 parents dd91cee + 5348d56 commit 3b06616
Show file tree
Hide file tree
Showing 8 changed files with 5,406 additions and 17 deletions.
4 changes: 3 additions & 1 deletion deeprank/models/atom.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@


class Atom:
def __init__(self, id_, position, chain_id, name, element, residue=None):
def __init__(self, id_, position, chain_id, name, element, residue=None, altloc=None, occupancy=None):
self.id = id_
self.name = name
self.element = element
self.chain_id = chain_id
self.position = position
self.residue = residue
self.altloc = altloc
self.occupancy = occupancy

def __hash__(self):
return hash(self.id)
Expand Down
40 changes: 29 additions & 11 deletions deeprank/operate/pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,32 +34,50 @@ def get_atoms(pdb2sql):
# This is a working dictionary of residues, identified by their chains and numbers.
residues = {}

# This is the list of atom objects, that will be returned.
atoms = []
# This is the dictionary of atom objects, its values will be returned.
atoms = {}

# Iterate over the atom output from pdb2sql
request_s = "x,y,z,rowID,name,element,chainID,resSeq,resName"
# Iterate over the atom output from pdb2sql, select atoms with highest occupancy.
request_s = "x,y,z,rowID,name,element,chainID,resSeq,resName,iCode,altLoc,occ"
highest_occupancies = {}
for row in pdb2sql.get(request_s):

try:
x, y, z, atom_number, atom_name, element, chain_id, residue_number, residue_name = row
x, y, z, atom_number, atom_name, element, chain_id, residue_number, residue_name, insertion_code, altloc, occ = row
except:
raise ValueError("Got unexpected row {} for {}".format(row, request_s))

atom_id = (chain_id, residue_number, insertion_code, atom_name)

if insertion_code == "":
insertion_code = None # default value

# Make sure that the residue is in the working directory:
residue_id = (chain_id, residue_number)
residue_id = (chain_id, residue_number, insertion_code)
if residue_id not in residues:
residues[residue_id] = Residue(int(residue_number), residue_name, chain_id)
residues[residue_id] = Residue(int(residue_number), residue_name, chain_id, insertion_code)

# Turn the x,y,z into a vector:
atom_position = numpy.array([x, y, z])

# Create the atom object and link it to the residue:
atom = Atom(atom_number, atom_position, chain_id, atom_name, element, residues[residue_id])
atom_id = (chain_id, residue_number, insertion_code, atom_name)

# If the occupancy is lower than the previous atom with the same id, skip the atom:
if atom_id in highest_occupancies:
highest_occ = highest_occupancies[atom_id]
if occ <= highest_occ:
continue

# otherwise, overwrite..
atoms[atom_id] = Atom(atom_number, atom_position, chain_id, atom_name, element, residues[residue_id], altloc, occ)
highest_occupancies[atom_id] = occ

# Link atoms to residues:
for (chain_id, residue_number, insertion_code, atom_name), atom in atoms.items():
residue_id = (chain_id, residue_number, insertion_code)
residues[residue_id].atoms.append(atom)
atoms.append(atom)

return atoms
return list(atoms.values())


def get_residue_contact_atom_pairs(pdb2sql, chain_id, residue_number, insertion_code, max_interatomic_distance):
Expand Down
5,244 changes: 5,244 additions & 0 deletions test/data/pdb/5MNH/5MNH.pdb

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions test/features/test_neighbour_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@
def test_feature():
tmp_dir_path = mkdtemp()

variant = PdbVariantSelection("test/101M.pdb", "A", 25, glycine, tryptophan,
{'A': "test/101M.A.pdb.pssm"})

try:
hdf5_path = os.path.join(tmp_dir_path, 'test.hdf5')

variant = PdbVariantSelection("test/101M.pdb", "A", 25, glycine, tryptophan,
{'A': "test/101M.A.pdb.pssm"})

with h5py.File(hdf5_path, 'w') as f5:
group = f5.require_group("features")
__compute_feature__(variant.pdb_path, group, None, variant)
Expand Down
37 changes: 36 additions & 1 deletion test/generate/test_gridtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from deeprank.generate.GridTools import GridTools
from deeprank.models.variant import PdbVariantSelection
from deeprank.features.atomic_contacts import __compute_feature__ as compute_contact_feature
from deeprank.domain.amino_acid import phenylalanine, tyrosine, valine, unknown_amino_acid
from deeprank.domain.amino_acid import phenylalanine, tyrosine, valine, aspartate, asparagine, unknown_amino_acid


_log = logging.getLogger(__name__)
Expand Down Expand Up @@ -102,6 +102,41 @@ def test_atomic_contacts_mapping():
shutil.rmtree(tmp_dir)


def test_nan():
"this variant was reported as NaN-causing"

pdb_path = "test/data/pdb/5MNH/5MNH.pdb"

variant = PdbVariantSelection(pdb_path, 'A', 153, aspartate, asparagine)
variant_name = "NaN-causing"

feature_types = ["coulomb"]

tmp_dir = mkdtemp()
try:
tmp_path = os.path.join(tmp_dir, "test.hdf5")
with h5py.File(tmp_path, 'w') as f5:

variant_group = f5.require_group(variant_name)
variant_group.attrs['type'] = 'variant'
hdf5data.store_variant(variant_group, variant)

feature_group = variant_group.require_group('features')
raw_feature_group = variant_group.require_group('features_raw')

compute_contact_feature(pdb_path, feature_group, raw_feature_group, variant)

# Build the grid and map the features.
gridtools = GridTools(variant_group, variant,
number_of_points=20, resolution=1.0,
atomic_densities={'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8},
feature=feature_types,
contact_distance=8.5,
try_sparse=False)
finally:
shutil.rmtree(tmp_dir)


def test_feature_mapping():
""" In this test, we investigate a set of five atoms. We make the grid tool take
the atoms in a 20 A radius around the first atom and compute the carbon density grid around it.
Expand Down
44 changes: 44 additions & 0 deletions test/operate/test_pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,27 @@ def test_xray():
assert not is_xray(f), "1a6b was identified as x-ray"


def test_altloc():
pdb_path = "test/data/pdb/5MNH/5MNH.pdb"

try:
pdb = pdb2sql(pdb_path)

atoms = get_atoms(pdb)

finally:
pdb._close()

selection = [atom for atom in atoms if atom.residue.number == 153 and
atom.chain_id == "A" and
atom.name == "CA"]

assert len(selection) == 1, "got {} of the same atom".format(len(selection))

# should be altloc C
assert selection[0].altloc == 'C', "got atom {} instead".format(selection[0].altloc)


def test_get_atoms():
pdb_path = os.path.join(pkg_resources.resource_filename(__name__, ''),
"../1AK4/native/1AK4.pdb")
Expand Down Expand Up @@ -63,6 +84,7 @@ def test_residue_contact_atoms():
query_residue = _find_residue(atoms, chain_id, residue_number)

contact_atom_pairs = get_residue_contact_atom_pairs(pdb, chain_id, residue_number, None, 8.5)
assert len(contact_atom_pairs) > 0, "no contacts found"

# Check for redundancy (we shouldn't see the same set of atoms twice)
atom_pairs_encountered = []
Expand Down Expand Up @@ -101,3 +123,25 @@ def test_residue_contact_atoms():

ok_(neighbour in contact_atoms)
ok_(distant not in contact_atoms)


def test_contacts_101m():
pdb_path = "test/101M.pdb"
chain_id = "A"

residue_number = 25

pdb = pdb2sql(pdb_path)
try:
contact_atom_pairs = get_residue_contact_atom_pairs(pdb, chain_id, residue_number, None, 10.0)

assert len(contact_atom_pairs) > 0, "no contacts found"

for atom1, atom2 in contact_atom_pairs:
assert atom1.residue is not None
assert len(atom1.residue.atoms) > 0

assert atom2.residue is not None
assert len(atom2.residue.atoms) > 0
finally:
pdb._close()
46 changes: 46 additions & 0 deletions test/test_generate.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import os
import sys
import unittest
from time import time
import shutil
from tempfile import mkdtemp
import logging

from unittest.mock import patch, MagicMock
import numpy
import h5py
from nose.tools import eq_, ok_
Expand Down Expand Up @@ -119,3 +121,47 @@ def test_skip_error():
finally:
shutil.rmtree(tmp_dir)


@patch("deeprank.generate.GridTools.map_features")
def test_skip_nan(mock_map_features):
"NaN features should not be added to the preprocessing"

number_of_points = 20

nan_dict = {}
for feature_name in ["feature1", "feature2"]:

grid = numpy.empty((number_of_points, number_of_points, number_of_points))
grid[:] = numpy.nan

nan_dict[feature_name] = grid

mock_map_features.return_value = nan_dict

tmp_dir = mkdtemp()

variants = [PdbVariantSelection("test/101m.pdb", 'A', 25, glycine, alanine)]

feature_names = ["test.feature.feature1", "test.feature.feature2"]
target_names = ["test.target.target1"]

resolution = 1.0
atomic_densities = {'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8}
grid_info = {
'number_of_points': [number_of_points, number_of_points, number_of_points],
'resolution': [resolution, resolution, resolution],
'atomic_densities': atomic_densities,
}

hdf5_path = os.path.join(tmp_dir, "test.hdf5")

try:
data_generator = DataGenerator(variants, None, target_names, feature_names, 1, hdf5_path)
data_generator.create_database()
data_generator.map_features(grid_info)

# Check that the output file is empty, since nan entries don't belong in the output.
with h5py.File(hdf5_path, 'r') as f5:
ok_(len(f5.keys()) == 0)
finally:
shutil.rmtree(tmp_dir)
2 changes: 1 addition & 1 deletion test/test_learn.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_learn():
feature_modules = ["test.feature.feature1", "test.feature.feature2"]
target_modules = ["test.target.target1"]
pdb_path = "test/101M.pdb"
pssm_paths = {"A": "101M.A.pdb.pssm"}
pssm_paths = {"A": "test/101M.A.pdb.pssm"}

atomic_densities = {'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8}
grid_info = {
Expand Down

0 comments on commit 3b06616

Please sign in to comment.