Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: improve amino acid features #272

Merged
merged 28 commits into from Dec 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
1fee844
add non-canonical amino acids to one hot encoding
DaniBodor Dec 8, 2022
aa58cf9
non-canonical aas encoded as canonical counterpart
DaniBodor Dec 8, 2022
46ba65a
rename amino acid charge to property x
DaniBodor Dec 8, 2022
d204f57
improve AminoAcid docstring
DaniBodor Dec 8, 2022
8eb60a4
add mass+pI to aminoacidlist.py; reorder according to 1letter code
DaniBodor Dec 8, 2022
2f95fb8
add mass and pI to features/components.py
DaniBodor Dec 8, 2022
711ba6e
change DIFFCHARGE to DIFFX
DaniBodor Dec 8, 2022
5e849bf
re-index amino acids
DaniBodor Dec 8, 2022
49d7479
correct/update polarity of amino acids
DaniBodor Dec 8, 2022
49667d2
update info on H bonds
DaniBodor Dec 8, 2022
e438800
add source for size
DaniBodor Dec 8, 2022
56725e9
same order of amino acid properties in all modules
DaniBodor Dec 8, 2022
95f9d41
test explicitly all amino acid features
DaniBodor Dec 8, 2022
1a4dfec
fix test_aminoacidlist
DaniBodor Dec 8, 2022
7190a6a
fix import in notebook
DaniBodor Dec 8, 2022
a44d734
add __init__ to neuralnets
DaniBodor Dec 8, 2022
cc264a0
fix charge
DaniBodor Dec 9, 2022
7ddd6a4
add notes on charge of S, T, Y
DaniBodor Dec 9, 2022
0ea69b5
additional comments on Sec/Pyl
DaniBodor Dec 9, 2022
8f79072
add source date
DaniBodor Dec 9, 2022
cab0709
Merge branch 'main' into 271_2_aminoacid_properties_dbodor
DaniBodor Dec 9, 2022
f0332e7
reorder AminoAcid @properties
DaniBodor Dec 9, 2022
49337a7
Merge branch 'main' into 271_aminoacid_properties_dbodor
DaniBodor Dec 9, 2022
7624059
update notebook
DaniBodor Dec 9, 2022
37967aa
update info in aminoacidlist.py
DaniBodor Dec 9, 2022
9199eb4
add excel with summarized amino acid info
DaniBodor Dec 9, 2022
36741cd
improve test_aminoacidlist
DaniBodor Dec 9, 2022
426ba74
update hdf5 files
DaniBodor Dec 9, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Binary file added deeprankcore/domain/aminoacid_summary.xlsx
Binary file not shown.
424 changes: 255 additions & 169 deletions deeprankcore/domain/aminoacidlist.py

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions deeprankcore/domain/nodestorage.py
Expand Up @@ -8,16 +8,20 @@

## residue core features
RESTYPE = "res_type" # one-hot encoding (np.ndarray) of the residue's AminoAcid; former FEATURENAME_AMINOACID
RESCHARGE = "res_charge" # float(<0); former FEATURENAME_CHARGE (was not assigned)
RESSIZE = "res_size" # int; former FEATURENAME_SIZE
RESCHARGE = "charge" # float(<0); former FEATURENAME_CHARGE (was not assigned)
POLARITY = "polarity" # one-hot encoding of the residue's Polarity; former FEATURENAME_POLARITY
RESSIZE = "res_size" # int; former FEATURENAME_SIZE
RESMASS = "res_mass" # float; average residue mass in Daltons
RESPI = "res_pI" # float; isoelectric point of the amino acid
HBDONORS = "hb_donors" # int; former FEATURENAME_HYDROGENBONDDONORS
HBACCEPTORS = "hb_acceptors"# int; former FEATURENAME_HYDROGENBONDACCEPTORS

## variant residue features
VARIANTRES = "variant_res" # one-hot encoding of the variant AminoAcid (the residue's own encoding for non-variant residues); former FEATURENAME_VARIANTAMINOACID
DIFFCHARGE = "diff_charge" # float
DIFFSIZE = "diff_size" # int; former FEATURENAME_SIZEDIFFERENCE
DIFFMASS = "diff_mass" # float; variant mass minus wildtype mass (0 for non-variant residues)
DIFFPI = "diff_pI" # float; variant pI minus wildtype pI (0 for non-variant residues)
DIFFPOLARITY = "diff_polarity" # difference of the variant and wildtype polarity one-hot encodings (zeros for non-variant residues); former FEATURENAME_POLARITYDIFFERENCE
DIFFHBDONORS = "diff_hb_donors" # int; former FEATURENAME_HYDROGENBONDDONORSDIFFERENCE
DIFFHBACCEPTORS = "diff_hb_acceptors" # int; former FEATURENAME_HYDROGENBONDACCEPTORSDIFFERENCE
Expand Down
21 changes: 13 additions & 8 deletions deeprankcore/features/components.py
Expand Up @@ -31,27 +31,32 @@ def add_features( # pylint: disable=unused-argument

node.features[Nfeat.RESTYPE] = residue.amino_acid.onehot
node.features[Nfeat.RESCHARGE] = residue.amino_acid.charge
node.features[Nfeat.RESSIZE] = residue.amino_acid.size
node.features[Nfeat.POLARITY] = residue.amino_acid.polarity.onehot
node.features[Nfeat.HBDONORS] = residue.amino_acid.count_hydrogen_bond_donors
node.features[Nfeat.HBACCEPTORS] = residue.amino_acid.count_hydrogen_bond_acceptors
node.features[Nfeat.RESSIZE] = residue.amino_acid.size
node.features[Nfeat.RESMASS] = residue.amino_acid.mass
node.features[Nfeat.RESPI] = residue.amino_acid.pI
node.features[Nfeat.HBDONORS] = residue.amino_acid.hydrogen_bond_donors
node.features[Nfeat.HBACCEPTORS] = residue.amino_acid.hydrogen_bond_acceptors

if single_amino_acid_variant is not None:

wildtype = single_amino_acid_variant.wildtype_amino_acid
variant = single_amino_acid_variant.variant_amino_acid

if residue == single_amino_acid_variant.residue:
node.features[Nfeat.VARIANTRES] = variant.onehot
node.features[Nfeat.DIFFCHARGE] = variant.charge - wildtype.charge
node.features[Nfeat.DIFFSIZE] = variant.size - wildtype.size
node.features[Nfeat.DIFFPOLARITY] = variant.polarity.onehot - wildtype.polarity.onehot
node.features[Nfeat.DIFFHBDONORS] = variant.count_hydrogen_bond_donors - wildtype.count_hydrogen_bond_donors
node.features[Nfeat.DIFFHBACCEPTORS] = variant.count_hydrogen_bond_acceptors - wildtype.count_hydrogen_bond_acceptors
node.features[Nfeat.DIFFSIZE] = variant.size - wildtype.size
node.features[Nfeat.DIFFMASS] = variant.mass - wildtype.mass
node.features[Nfeat.DIFFPI] = variant.pI - wildtype.pI
node.features[Nfeat.DIFFHBDONORS] = variant.hydrogen_bond_donors - wildtype.hydrogen_bond_donors
node.features[Nfeat.DIFFHBACCEPTORS] = variant.hydrogen_bond_acceptors - wildtype.hydrogen_bond_acceptors
else:
node.features[Nfeat.VARIANTRES] = residue.amino_acid.onehot
node.features[Nfeat.DIFFCHARGE] = 0
node.features[Nfeat.DIFFSIZE] = 0
node.features[Nfeat.DIFFPOLARITY] = np.zeros(residue.amino_acid.polarity.onehot.shape)
node.features[Nfeat.DIFFSIZE] = 0
node.features[Nfeat.DIFFMASS] = 0
node.features[Nfeat.DIFFPI] = 0
node.features[Nfeat.DIFFHBDONORS] = 0
node.features[Nfeat.DIFFHBACCEPTORS] = 0
86 changes: 52 additions & 34 deletions deeprankcore/molstruct/aminoacid.py
Expand Up @@ -5,7 +5,7 @@
class Polarity(Enum):
"a value to express a residue's polarity"

APOLAR = 0
NONPOLAR = 0
POLAR = 1
NEGATIVE_CHARGE = 2
POSITIVE_CHARGE = 3
Expand All @@ -26,35 +26,45 @@ def __init__( # pylint: disable=too-many-arguments
name: str,
three_letter_code: str,
one_letter_code: str,
charge: float,
charge: int,
polarity: Polarity,
size: int,
count_hydrogen_bond_donors: int,
count_hydrogen_bond_acceptors: int,
mass: float,
pI: float,
hydrogen_bond_donors: int,
hydrogen_bond_acceptors: int,
index: int,
):
"""
Args:
name(str): unique name for the amino acid
three_letter_code(str): code of the amino acid, as in PDB
one_letter_code(str): letter of the amino acid, as in fasta
charge(float, optional): the charge property of the amino acid
polarity(deeprank polarity enum, optional): the polarity property of the amino acid
size(int, optional): the number of heavy atoms in the side chain
index(int, optional): the rank of the amino acid, used for computing one-hot encoding
name (str): full name of the amino acid
three_letter_code (str): three-letter code of the amino acid (as in PDB)
one_letter_code (str): one-letter code of the amino acid (as in fasta)
charge (int): charge of the amino acid
polarity (deeprank polarity enum): the polarity of the amino acid
size (int): the number of non-hydrogen atoms in the side chain
mass (float): average residue mass (i.e. mass of amino acid - H2O) in Daltons
pI (float): isoelectric point; pH at which the molecule has no net electric charge
hydrogen_bond_donors (int): number of hydrogen bond donors
hydrogen_bond_acceptors (int): number of hydrogen bond acceptors
index (int): the rank of the amino acid, used for computing one-hot encoding
"""

# amino acid nomenclature
self._name = name
self._three_letter_code = three_letter_code
self._one_letter_code = one_letter_code

# these settings apply to the side chain
self._size = size
# side chain properties
self._charge = charge
self._polarity = polarity
self._count_hydrogen_bond_donors = count_hydrogen_bond_donors
self._count_hydrogen_bond_acceptors = count_hydrogen_bond_acceptors
self._size = size
self._mass = mass
self._pI = pI
self._hydrogen_bond_donors = hydrogen_bond_donors
self._hydrogen_bond_acceptors = hydrogen_bond_acceptors

# one hot encoding
self._index = index

@property
Expand All @@ -70,37 +80,45 @@ def one_letter_code(self) -> str:
return self._one_letter_code

@property
def onehot(self) -> np.ndarray:
if self._index is None:
raise ValueError(
"amino acid {self._name} index is not set, thus no onehot can be computed"
)
def charge(self) -> int:
return self._charge

# assumed that there are only 20 different amino acids
a = np.zeros(20)
a[self._index] = 1.0
@property
def polarity(self) -> Polarity:
return self._polarity

return a
@property
def size(self) -> int:
return self._size

@property
def count_hydrogen_bond_donors(self) -> int:
return self._count_hydrogen_bond_donors
def mass(self) -> float:
return self._mass

@property
def count_hydrogen_bond_acceptors(self) -> int:
return self._count_hydrogen_bond_acceptors
def pI(self) -> float:
return self._pI

@property
def charge(self) -> float:
return self._charge
def hydrogen_bond_donors(self) -> int:
return self._hydrogen_bond_donors

@property
def polarity(self) -> Polarity:
return self._polarity
def hydrogen_bond_acceptors(self) -> int:
return self._hydrogen_bond_acceptors

@property
def size(self) -> int:
return self._size
def onehot(self) -> np.ndarray:
    """Return the one-hot encoding of this amino acid.

    The vector has 20 entries, one per canonical amino acid, with 1.0 at
    position ``self._index`` and 0.0 elsewhere. Selenocysteine and
    pyrrolysine are indexed as cysteine and lysine, respectively.

    Returns:
        np.ndarray: array of length 20 containing a single 1.0.

    Raises:
        ValueError: if the amino acid's index is not set (``None``).
    """
    if self._index is None:
        # f-string so the amino acid's name is interpolated into the message
        # (the placeholder was previously emitted verbatim)
        raise ValueError(
            f"amino acid {self._name} index is not set, thus no onehot can be computed"
        )

    # 20 canonical amino acids
    a = np.zeros(20)
    a[self._index] = 1.0

    return a

@property
def index(self) -> int:
Expand Down
Empty file.
Binary file modified tests/data/hdf5/1ATN_ppi.hdf5
Binary file not shown.