# Extract pocket residue CA atom coordinates

We extract the coordinates for all pockets' residue CA atoms to be used in other notebooks.

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell is executed, so edits to local packages take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd
from opencadd.databases.klifs import setup_local
from kissim.io import PocketDataFrame



In [3]:
# `_dh` is IPython's directory history; its last entry is taken as the current
# working directory (presumably the folder containing this notebook).
HERE = Path(_dh[-1])  # noqa: F821
# Project data and results folders, located two levels above HERE.
DATA = HERE / "../../data/"
RESULTS = HERE / "../../results/"

In [4]:
# Set up a local KLIFS session from the downloaded dataset folder.
try:
    LOCAL = setup_local(DATA / "external/structures/20210630_KLIFS_HUMAN")
except FileNotFoundError:
    # The dataset above is not available (e.g. on CI): fall back to the KLIFS
    # dataset that is used for CI runs.
    LOCAL = setup_local(DATA / "external/structures/20201223_KLIFS_HUMAN_ABL2")

# NOTE(review): `_database` is a private attribute of the session object and is
# only read here to report the number of structures.
print(f"Number of structures: {len(LOCAL._database)}")

Number of structures: 11693


## Load structure KLIFS IDs of interest

Let's load the structure KLIFS IDs of interest, which are stored in a text file, as a list.

In [5]:
# Read the structure KLIFS IDs from a headerless text file (expected to hold a
# single column of IDs) and convert them to a plain list.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0,
# so the one-column frame is squeezed into a Series explicitly instead.
structure_klifs_ids = (
    pd.read_csv(DATA / "processed/structure_klifs_ids.txt", header=None)
    .squeeze("columns")
    .to_list()
)
print(f"Number of structures: {len(structure_klifs_ids)}")

Number of structures: 5265


## Extract pocket residue CA atom coordinates

In [6]:
def pocket_residue_ca_atom_coordinates(structure_klifs_ids, klifs_session):
    """
    Collect the pocket residue CA atom coordinates for a set of structures.

    Parameters
    ----------
    structure_klifs_ids : iterable
        Structure KLIFS IDs whose pockets shall be processed.
    klifs_session : object
        Session that is passed on to `PocketDataFrame.from_structure_klifs_id`
        (in this notebook: the local KLIFS session).

    Returns
    -------
    pandas.DataFrame
        Columns "atom.x", "atom.y", "atom.z", indexed by
        ("structure.klifs_id", "residue.ix"). Structures for which no pocket is
        returned are skipped. If no structure yields a pocket (including empty
        input), an empty DataFrame with the same index names and columns is
        returned instead of raising on the concatenation of an empty list.
    """

    index_names = ["structure.klifs_id", "residue.ix"]
    coordinate_columns = ["atom.x", "atom.y", "atom.z"]

    coordinates = []

    for structure_klifs_id in structure_klifs_ids:
        pocket = PocketDataFrame.from_structure_klifs_id(structure_klifs_id, klifs_session)
        if pocket is None:
            # Pocket could not be loaded for this structure; skip it.
            continue
        # Drop residue rows with missing values, then attach the CA atom data.
        # The left merge keeps residues without a matching CA atom (NaN coordinates).
        ca_atoms = pocket.residues.dropna().merge(
            pocket.ca_atoms, how="left", on=["residue.id"]
        )
        ca_atoms["structure.klifs_id"] = structure_klifs_id
        coordinates.append(ca_atoms.set_index(index_names)[coordinate_columns])

    if not coordinates:
        # `pd.concat` raises a ValueError on an empty list; return an empty
        # frame with the expected layout so downstream cells still work.
        empty_index = pd.MultiIndex.from_arrays([[], []], names=index_names)
        return pd.DataFrame(index=empty_index, columns=coordinate_columns, dtype=float)

    return pd.concat(coordinates)

In [10]:
# Long-running step: the recorded run below took roughly 17 minutes of wall time.
%time coordinates = pocket_residue_ca_atom_coordinates(structure_klifs_ids, LOCAL)  # noqa: E501

10429: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
6656: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
1987: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
7362: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
5454: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
5458: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
10737: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
1940: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
3544: Structure KLIFS ID unknown to local session. (ValueError: Input values yield no results.)
The following structure could not be loaded into kissim: 1243: Length of values (1284) does not match length of index (1313)
13042: Lo

CPU times: user 16min 36s, sys: 864 ms, total: 16min 37s
Wall time: 16min 37s


## Save coordinates

In [11]:
# Write the coordinates (index included) to a gzip-compressed CSV file.
coordinates.to_csv(  # noqa: F821
    DATA / "processed/pocket_residue_ca_atom_coordinates.csv.gz", compression="gzip"
)

## Load coordinates

In [12]:
# Read the saved file back in; the first row holds the column names and the
# first two columns are restored as the two-level row index.
coordinates = pd.read_csv(
    DATA / "processed/pocket_residue_ca_atom_coordinates.csv.gz",
    header=0,
    index_col=[0, 1],
)
coordinates

Unnamed: 0_level_0,Unnamed: 1_level_0,atom.x,atom.y,atom.z
structure.klifs_id,residue.ix,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3833,1,6.052,14.970,51.859
3833,2,6.264,13.245,48.484
3833,3,2.951,13.925,46.609
3833,4,3.487,11.680,43.579
3833,5,6.188,10.086,41.434
...,...,...,...,...
7219,81,2.003,19.768,33.851
7219,82,3.749,21.637,31.096
7219,83,6.670,19.352,30.231
7219,84,5.273,17.350,27.325


In [13]:
# Count the distinct structure KLIFS IDs present in the first index level.
n_structures = coordinates.index.unique(level="structure.klifs_id").size
print(f"Number of structures: {n_structures}")

Number of structures: 5242
