In [39]:
from dataclasses import dataclass

import pandas as pd
import numpy as np

import prody
from biopandas.pdb import PandasPdb
from biotite.sequence import ProteinSequence


from predictability.constants import DATA_ROOT
from predictability.utils import (
    read_fasta,
    coord_distance,
    dist_to_active_site,
    get_buriedness,
    assign_mutations,
    assign_classes,
)

In [40]:
# PDB entry whose structure file is analysed throughout this notebook.
pdb_id = "1ua7"

# All inputs and outputs of this notebook live in the "amylase" folder under
# DATA_ROOT; create that folder if it is not there yet.
data_dir = DATA_ROOT / "amylase"
data_dir.mkdir(exist_ok=True)

In [41]:
# Read the ATOM records of the PDB file and keep only atoms that either have
# no alternate location or belong to alternate location "A".
filename = data_dir / f"{pdb_id}.pdb"
structure = (
    PandasPdb()
    .read_pdb(str(filename))
    .df["ATOM"]
    .loc[lambda d: d.alt_loc.isin(["", "A"])]
)

# One row per distinct (residue_number, residue_name) pair, with the
# three-letter residue name converted to its one-letter code.
residue_characteristics = (
    structure[["residue_number", "residue_name"]]
    .drop_duplicates()
    .assign(
        residue_name=lambda d: d.residue_name.apply(
            ProteinSequence.convert_letter_3to1
        )
    )
)

In [42]:
# Use the first record of the reference FASTA file as the reference sequence.
reference_sequence = list(read_fasta(data_dir / "reference.fasta").values())[0]

# Calculating buriedness

In [43]:
# Buriedness per residue, restricted to chain A and excluding the
# "HOH", "ACI" and "CA" residue names.
buriedness = (
    get_buriedness(filename)
    .loc[lambda d: d.chain_id == "A"]
    .loc[lambda d: ~d.residue_name.isin(["HOH", "ACI", "CA"])]
)

# Attach the buriedness value to each residue (inner join on residue_number).
residue_characteristics = pd.merge(
    residue_characteristics,
    buriedness[["residue_number", "buriedness"]],
    on=["residue_number"],
)

# Calculating number of contacts

In [44]:
# Count, for every residue, how many other residues have their C-alpha atom
# closer than the cutoff distance.
contact_cutoff = 7.3  # same units as the PDB coordinates (presumably Å — confirm)

# One row per C-alpha atom, indexed by residue number; `contacts` starts as NaN
# so the column keeps a float dtype.
ca_atoms = (
    structure.loc[lambda d: d.atom_name == "CA"]
    .copy()
    .assign(contacts=np.nan)[
        ["residue_number", "contacts", "x_coord", "y_coord", "z_coord"]
    ]
    .set_index("residue_number")
)

# Pairwise Euclidean distances are computed directly with NumPy, so this cell
# does not rely on a `distance` helper that is neither imported nor defined in
# this notebook (calling it raises NameError on a fresh kernel).
ca_xyz = ca_atoms[["x_coord", "y_coord", "z_coord"]].to_numpy(dtype=float)
pairwise_dist = np.sqrt(
    ((ca_xyz[:, None, :] - ca_xyz[None, :, :]) ** 2).sum(axis=-1)
)

# A residue is never counted as its own contact: pairs that share a residue
# number are masked out.
residue_ids = ca_atoms.index.to_numpy()
is_other_residue = residue_ids[:, None] != residue_ids[None, :]

ca_atoms["contacts"] = (
    ((pairwise_dist < contact_cutoff) & is_other_residue).sum(axis=1).astype(float)
)

# Attach the contact count to each residue (inner join on the shared
# residue_number column).
residue_characteristics = residue_characteristics.merge(
    ca_atoms.reset_index()[["residue_number", "contacts"]]
)

# Calculating distance to the active site

In [45]:
# Residue numbers treated as the active site.
active_site_residues = [176, 208]

# C-alpha atom rows (as named tuples) of the active-site residues.
active_site = [
    atom
    for atom in structure.itertuples()
    if atom.residue_number in active_site_residues and atom.atom_name == "CA"
]

# Coordinates of every C-alpha atom, indexed by residue number.
as_distances = structure.loc[
    structure.atom_name == "CA",
    ["residue_number", "x_coord", "y_coord", "z_coord"],
].set_index("residue_number")

# Distance of each C-alpha atom to the active site, as defined by the
# project helper `dist_to_active_site`.
as_distances = as_distances.assign(
    distance_to_active_site=[
        dist_to_active_site(ca, active_site) for ca in as_distances.itertuples()
    ]
)

# Attach the distance to each residue (inner join on the shared
# residue_number column).
residue_characteristics = residue_characteristics.merge(
    as_distances.reset_index()[["residue_number", "distance_to_active_site"]]
)

# Determining secondary structure

In [46]:
# Parse the PDB file again with ProDy to obtain its header dictionary, which
# is read below for the helix and sheet ranges.
_, header = prody.parsePDB(str(filename), header=True)

# One row per C-alpha atom, holding only its residue number.
residues = structure.loc[structure.atom_name == "CA", ["residue_number"]].copy()

@dataclass
class Range:
    """Closed interval ``[start, stop]`` that supports the ``in`` operator."""

    start: int  # first value included in the interval
    stop: int  # last value included in the interval

    def __contains__(self, value):
        """Return True when ``value`` lies between ``start`` and ``stop``, inclusive."""
        return self.start <= value <= self.stop

# Residue-number intervals of every helix and sheet listed in the PDB header;
# the last two fields of each header entry are used as start and stop.
ranges = [
    Range(start, stop)
    for _, _, _, _, start, stop in header["helix_range"] + header["sheet_range"]
]

# Flag a residue as "secondary" when its number falls inside any interval.
# The flags are derived from the rows of `residues` itself so that the list is
# aligned with the frame by construction; iterating the unique residue numbers
# of `structure` instead can yield a different length or order (e.g. a residue
# without a C-alpha atom, or a residue number with several C-alpha rows).
residues["is_secondary"] = [
    any(number in interval for interval in ranges)
    for number in residues.residue_number
]

# Attach the flag to each residue (inner join on the shared residue_number
# column).
residue_characteristics = residue_characteristics.merge(residues)

@> 3808 atoms and 1 coordinate set(s) were parsed in 0.02s.


In [47]:
# Binarize each continuous characteristic by comparing it with its own
# 0.5 quantile (the median).
residue_characteristics["is_buried"] = (
    residue_characteristics["buriedness"]
    > residue_characteristics["buriedness"].quantile(0.5)
)
residue_characteristics["is_connected"] = (
    residue_characteristics["contacts"]
    > residue_characteristics["contacts"].quantile(0.5)
)
# Note the direction: "close" means strictly below the median distance.
residue_characteristics["is_close_to_as"] = (
    residue_characteristics["distance_to_active_site"]
    < residue_characteristics["distance_to_active_site"].quantile(0.5)
)

In [48]:
# Persist the per-residue table; the row index is not written to the file.
residue_characteristics.to_csv(
    path_or_buf=data_dir / "structural_characteristics.csv",
    index=False,
)

In [49]:
# Display the assembled per-residue table for inspection (the rendered output
# below is truncated in the middle).
residue_characteristics

Unnamed: 0,residue_number,residue_name,buriedness,contacts,distance_to_active_site,is_secondary,is_buried,is_connected,is_close_to_as
0,4,P,2.122458,4.0,34.299691,False,False,False,False
1,5,S,3.177263,5.0,30.524442,False,False,False,False
2,6,I,6.574809,9.0,28.138897,False,True,False,True
3,7,K,6.422451,6.0,27.714115,False,True,False,True
4,8,S,5.470014,9.0,31.516398,False,True,False,False
...,...,...,...,...,...,...,...,...,...
417,421,V,10.350866,13.0,46.579590,True,True,True,False
418,422,L,9.082451,11.0,49.025293,True,True,True,False
419,423,Y,6.608980,8.0,49.148367,False,True,False,False
420,424,P,2.867646,4.0,51.569397,False,False,False,False


# Assigning structural characteristics to singles

In [50]:
# Load the raw single-mutant measurements and annotate each variant with its
# mutations relative to the reference sequence.
data = pd.read_csv(data_dir / "raw_singles_data.csv").pipe(
    assign_mutations, str(reference_sequence)
)

# Re-read the per-residue feature table written earlier in this notebook.
feature_table = pd.read_csv(data_dir / "structural_characteristics.csv")

# Assign the four binary structural classes to every variant, based on the
# "mutations" column, then store the result.
data = assign_classes(
    data,
    feature_table,
    mutation_col="mutations",
    features=["is_buried", "is_connected", "is_close_to_as", "is_secondary"],
)
data.to_csv(data_dir / "singles.csv")