# Extract pocket residue CA atom coordinates

We extract the coordinates for all pockets' residue CA atoms to be used in other notebooks.

In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# reloaded before code is executed and edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd
from opencadd.databases.klifs import setup_local
from kissim.io import PocketDataFrame

from src.paths import (
    PATH_DATA,
    PATH_RESULTS,
    PATH_DATA_KLIFS_DOWNLOAD,
)



In [3]:
# `_dh` is IPython's directory history; its latest entry is used as this
# notebook's working directory (the name is undefined for linters, hence noqa).
HERE = Path(_dh[-1])  # noqa: F821
# Project data directory as defined in `src.paths`.
DATA = PATH_DATA
# The "all" subdirectory of the project results path.
RESULTS = PATH_RESULTS / "all"

In [4]:
# Set up a KLIFS session that works on the local KLIFS download.
LOCAL = setup_local(PATH_DATA_KLIFS_DOWNLOAD)
# NOTE(review): `_database` is a private attribute of the session object; this
# count may break if opencadd changes its internals.
print(f"Number of structures: {len(LOCAL._database)}")

Number of structures: 11806


## Load structure KLIFS IDs of interest

Let's load the structure KLIFS IDs of interest, saved in a text file, as a list.

In [5]:
# Read the single-column text file of structure KLIFS IDs into a plain list.
# `.squeeze("columns")` replaces `read_csv(..., squeeze=True)`, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
structure_klifs_ids = (
    pd.read_csv(DATA / "processed/structure_klifs_ids_all.txt", header=None)
    .squeeze("columns")
    .to_list()
)
print(f"Number of structures: {len(structure_klifs_ids)}")

Number of structures: 4690


## Extract pocket residue CA atom coordinates

In [6]:
def pocket_residue_ca_atom_coordinates(structure_klifs_ids, klifs_session):
    """
    Collect the pocket residue CA atom coordinates for a set of structures.

    Parameters
    ----------
    structure_klifs_ids : iterable
        Structure KLIFS IDs to process.
    klifs_session : object
        KLIFS session that is passed on to
        `PocketDataFrame.from_structure_klifs_id`.

    Returns
    -------
    pandas.DataFrame
        Columns "atom.x", "atom.y", "atom.z", indexed by
        ("structure.klifs_id", "residue.ix"). Structures for which
        `PocketDataFrame.from_structure_klifs_id` returns None are skipped.
        If no structure yields a pocket (including an empty input), an empty
        frame with the same columns and index names is returned.
    """

    index_names = ["structure.klifs_id", "residue.ix"]
    coordinate_columns = ["atom.x", "atom.y", "atom.z"]

    coordinates_per_structure = []

    for structure_klifs_id in structure_klifs_ids:
        pocket = PocketDataFrame.from_structure_klifs_id(structure_klifs_id, klifs_session)
        if pocket is None:
            continue
        # Drop residue rows with missing values, then attach the CA atoms; the
        # left merge keeps residues without a matching CA atom (NaN coordinates).
        ca_atoms = pocket.residues.dropna().merge(
            pocket.ca_atoms, how="left", on=["residue.id"]
        )
        ca_atoms["structure.klifs_id"] = structure_klifs_id
        coordinates_per_structure.append(ca_atoms.set_index(index_names)[coordinate_columns])

    # `pd.concat` raises a ValueError on an empty list; return an empty frame
    # with the expected schema instead.
    if not coordinates_per_structure:
        return pd.DataFrame(
            columns=coordinate_columns,
            index=pd.MultiIndex.from_arrays([[], []], names=index_names),
            dtype=float,
        )

    return pd.concat(coordinates_per_structure)

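The next cell will take about 15 minutes. Structures for which no pocket can be loaded (e.g., because the local `complex.pdb` or `pocket.pdb` file is missing) are skipped, so the result may contain fewer structures than requested.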

In [7]:
# Long-running step: extracts the CA atom coordinates for every structure KLIFS ID
# using the local KLIFS session; `%time` reports how long the call took.
%time coordinates = pocket_residue_ca_atom_coordinates(structure_klifs_ids, LOCAL)  # noqa: E501

13042: Local complex.pdb or pocket.pdb file missing: /home/dominique/Documents/GitHub/kissim_app/src/../data/external/structures/20210902_KLIFS_HUMAN/HUMAN/GPRK5/6pjx_altA_chainA/complex.pdb


CPU times: user 16min 38s, sys: 1.01 s, total: 16min 39s
Wall time: 16min 40s


## Save coordinates

In [8]:
# Persist the coordinates as a gzip-compressed CSV file so they can be loaded
# later without repeating the extraction.
coordinates.to_csv(  # noqa: F821
    DATA / "processed/pocket_residue_ca_atom_coordinates.csv.gz", compression="gzip"
)

## Load coordinates

In [9]:
# Read the saved file back; the first row is the header and the first two
# columns are used to rebuild the two-level row index.
coordinates = pd.read_csv(
    DATA / "processed/pocket_residue_ca_atom_coordinates.csv.gz",
    header=0,
    index_col=[0, 1],
)
coordinates

Unnamed: 0_level_0,Unnamed: 1_level_0,atom.x,atom.y,atom.z
structure.klifs_id,residue.ix,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,8.332,18.829,52.312
1,2,9.247,16.975,49.126
1,3,6.166,15.784,47.231
1,4,7.642,14.754,43.861
1,5,10.860,13.971,41.981
...,...,...,...,...
13864,81,2.577,18.822,33.603
13864,82,2.393,21.136,30.594
13864,83,5.271,19.480,28.732
13864,84,3.250,18.475,25.666


In [10]:
# Count the distinct structure KLIFS IDs found in the first index level.
n_structures = (
    coordinates.index
    .get_level_values("structure.klifs_id")
    .unique()
    .size
)
print(f"Number of structures: {n_structures}")

Number of structures: 4689
