In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import numpy as np
import json
import MDAnalysis as mda
import nglview as nv
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:

PATH = '../data/sair_pockets'

# Read the pocket metadata inside a context manager so the file handle is closed
# immediately instead of being left open until garbage collection.
with open(os.path.join(PATH, 'pockets_info.json')) as pockets_file:
    pockets = json.load(pockets_file)

df = pd.read_csv(os.path.join(PATH, 'sair_pockets.csv'))

# Box volume (width_x * width_y * width_z) as a fraction of a reference cube with
# edge length 20, i.e. divided by 20**3 = 8000. An earlier comment here said
# "30A^3", which did not match the code; the divisor itself is unchanged.
# Presumably the widths are in Angstrom -- TODO confirm against the CSV producer.
df["volume_norm"] = df["width_x"] * df["width_y"] * df["width_z"] / 20 ** 3
# log10 of the pose count, plotted in the overview figure below.
df["log_n_ligand_poses"] = np.log10(df["n_ligand_poses"])

# Keys of the JSON metadata; the viewer cells index into this list.
pdb_ids = list(pockets.keys())
df.sample(5)

In [None]:

# Overview figure: five summary panels laid out on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(20, 7))
axes = axes.flatten()
# Named axes instead of a running counter `i`, so re-running this cell does not
# overwrite the notebook-level `i` that the viewer cells use to index `pdb_ids`.
ax_volume, ax_poses, ax_scatter, ax_per_protein, ax_multi, ax_spare = axes

# Panel 1: distribution of the normalised pocket box volume.
sns.histplot(df["volume_norm"], bins=50, ax=ax_volume)
ax_volume.set_xlabel("Volume")
ax_volume.set_ylabel("Count")
ax_volume.set_title("Pocket Volume Distribution")

# Panel 2: distribution of log10(number of ligand poses).
sns.histplot(df["log_n_ligand_poses"], bins=50, ax=ax_poses)
ax_poses.set_xlabel("Number of ligand poses (log10)")
ax_poses.set_ylabel("Count")
ax_poses.set_title("Number of ligand poses distribution")

# Panel 3: avg_pIC50 against avg_confidence; marker size encodes volume_norm and
# colour encodes log_n_ligand_poses. This panel previously had no title.
sns.scatterplot(
    data=df,
    x="avg_pIC50",
    y="avg_confidence",
    size="volume_norm",
    hue="log_n_ligand_poses",
    ax=ax_scatter,
)
ax_scatter.set_title("Average pIC50 vs. average confidence")

# Panel 4: number of rows (pockets) per prot_id, for ALL proteins. The x-label used
# to say "(if >1)", but no such filter is applied to this histogram.
prot_n_pocket = df.groupby("prot_id").size()
sns.histplot(prot_n_pocket, bins=50, ax=ax_per_protein)
ax_per_protein.set_xlabel("Number of pockets per protein")
ax_per_protein.set_ylabel("Count")
ax_per_protein.set_title("Number of pockets per protein")

# Panel 5: pose counts per pocket, restricted to proteins with more than one pocket.
prot_multi_pock = prot_n_pocket[prot_n_pocket > 1]
sns.scatterplot(
    data=df[df["prot_id"].isin(prot_multi_pock.index)],
    x="prot_id",
    y="n_ligand_poses",
    ax=ax_multi,
)
# Protein IDs are long categorical labels; rotate them so they do not overlap.
ax_multi.tick_params(axis='x', rotation=90)
ax_multi.set_xlabel("Protein ID (only proteins with >1 pocket)")
ax_multi.set_ylabel("Number of ligand poses")
ax_multi.set_title("Number of ligand poses per pocket")

# The sixth grid slot has no plot; hide its empty frame.
ax_spare.axis("off")

plt.tight_layout()

In [None]:
def get_protein_and_pocket_fpocket(pdb_id):
    """Show a processed structure with one pocket's residues drawn as a surface.

    The pocket is the one whose id is stored under
    ``pockets[pdb_id]["metadata"]["pocket_id"]``. Its atoms are read from
    ``<PATH>/pdb_files/<pdb_id>_processed_out/pockets/pocket<id>_atm.pdb``
    (presumably fpocket output -- confirm against the pipeline that writes it).

    Args:
        pdb_id: Key into the module-level ``pockets`` dict; also the file stem of
            the ``<pdb_id>_processed.pdb`` structure.

    Returns:
        Tuple ``(view, pocket_selection)``: the nglview widget and the selection
        string (``"resi <n> or resi <m> ..."``) passed to the surface
        representation.

    Side effects:
        Prints the number of residues found in the pocket file.
    """
    pocket_id = pockets[pdb_id]["metadata"]["pocket_id"]

    stem = f"{pdb_id}_processed"
    pdb_dir = os.path.join(PATH, "pdb_files")

    u_protein = mda.Universe(os.path.join(pdb_dir, f"{stem}.pdb"))
    pocket_path = os.path.join(pdb_dir, f"{stem}_out", "pockets", f"pocket{pocket_id}_atm.pdb")
    pocket_u = mda.Universe(pocket_path)
    print(len(pocket_u.residues))

    # One "resi <resid>" term per pocket residue, OR-ed together.
    # NOTE(review): "resi" looks like PyMOL-style syntax; NGL's selection language
    # documents bare residue numbers -- confirm this highlights the intended residues.
    pocket_selection = " or ".join(f"resi {residue.resid}" for residue in pocket_u.residues)

    # Move the centre of mass to the origin (optional; only affects positioning).
    centre = u_protein.atoms.center_of_mass()
    u_protein.atoms.translate(-centre)

    view = nv.show_mdanalysis(u_protein)
    view.add_representation("surface", colorScheme="hydrophobicity", selection=pocket_selection)
    return view, pocket_selection


def get_protein_and_vina_box(pdb_id):
    """Show the protein of ``<PATH>/<pdb_id>.pdb`` with the residues inside its box highlighted.

    The box is read from ``pockets[pdb_id]["size"]`` and
    ``pockets[pdb_id]["center"]``; residues with at least one atom inside it are
    selected via ``get_selection`` and drawn as a hydrophobicity-coloured surface.
    The whole protein is additionally drawn as ball+stick.

    Args:
        pdb_id: Key into the module-level ``pockets`` dict and file stem of the PDB
            file under ``PATH``.

    Returns:
        Tuple ``(view, pocket_selection)``: the nglview widget and the selection
        string used for the surface representation.
    """
    u = mda.Universe(os.path.join(PATH, f"{pdb_id}.pdb"))
    protein_atoms = u.select_atoms("protein")
    pocket_info = pockets[pdb_id]
    # New universe containing only the protein atoms.
    u_protein = mda.Merge(protein_atoms)

    # Must run before the translation below: get_selection compares atom positions
    # with the box center, so the atoms have to be in their original coordinates.
    pocket_selection = get_selection(
        np.array(pocket_info["size"]), pocket_info["center"], u_protein
    )
    u_protein.atoms.translate(-u_protein.atoms.center_of_mass())  # Optional: center the protein

    view = nv.show_mdanalysis(u_protein)
    view.add_representation("surface", colorScheme="hydrophobicity", selection=pocket_selection)
    # NGL names this representation "ball+stick"; the previous "balls_and_sticks"
    # is not a registered representation type.
    view.add_representation("ball+stick")
    return view, pocket_selection

In [None]:
def get_selection(size, center, sim):
    """Build a selection string for residues with an atom inside an axis-aligned box.

    Args:
        size: Three box edge lengths (x, y, z); anything ``np.array`` accepts.
        center: Three box center coordinates; anything ``np.array`` accepts.
        sim: Object providing ``select_atoms("all")`` whose result exposes
            ``positions``, boolean-mask indexing and ``residues`` (e.g. an
            MDAnalysis Universe).

    Returns:
        ``"resi <a> or resi <b> ..."`` with one term per residue that has at least
        one atom strictly inside the box; an empty string if no atom is inside.
        Only ``resid`` is used, so residues are not distinguished by chain/segment.
        NOTE(review): "resi" looks like PyMOL-style syntax; NGL's selection
        language documents bare residue numbers -- confirm with the viewer.
    """
    box_center = np.array(center)
    half_extent = np.array(size) / 2

    atoms = sim.select_atoms("all")
    # Per-axis distance of every atom from the box center.
    offsets = np.abs(atoms.positions - box_center.reshape(1, 3))
    # An atom is inside only if it is within the half-extent on all three axes.
    inside = np.all(offsets < half_extent, axis=1)

    terms = []
    for residue in atoms[inside].residues:
        terms.append(f"resi {residue.resid}")
    return " or ".join(terms)

In [None]:
# Index into `pdb_ids` of the structure to display; the viewer cell below reads it.
i = 0

In [None]:
# Advance to the next entry of `pdb_ids`; re-run this cell and the viewer cell
# below to step through structures. Not idempotent: every run moves one further.
# Note: on a top-to-bottom run this executes once after `i = 0`, so the first
# structure displayed is pdb_ids[1], not pdb_ids[0].
i+=1

In [None]:
# Display the complete structure for the entry currently selected by `i`.
pdb_id = pdb_ids[i]
structure_path = os.path.join(PATH, f"{pdb_id}.pdb")
u_protein = mda.Universe(structure_path)
# Optional: shift the centre of mass to the origin.
centre_of_mass = u_protein.atoms.center_of_mass()
u_protein.atoms.translate(-centre_of_mass)
view = nv.show_mdanalysis(u_protein)
view

In [None]:
# Same `pdb_id` as the previous cell, now with the residues inside its pocket box
# highlighted. `pocket_selection2` holds the selection string returned with the view.
view, pocket_selection2 = get_protein_and_vina_box(pdb_id)
view