In [1]:
import pandas as pd
from biopandas.pdb import PandasPdb
from prody import parsePDBHeader
from typing import Optional
import os

In [3]:
import pdbreader

# NOTE(review): hard-coded absolute path to a local PDBbind copy; `df` is rebound by a later
# cell that loads 6fhq through read_pdb_to_dataframe.
df = pdbreader.read_pdb(r"C:\Users\evani\OneDrive\Documenten\Phd\RLMD\PDBbind_v2020_refined\refined-set\1a1e\1a1e_protein.pdb")

In [5]:
import pandas as pd
from biopandas.pdb import PandasPdb
from typing import Optional

def read_pdb_to_dataframe(
    pdb_path: Optional[str] = None,
    model_index: int = 1,
    parse_header: bool = True,
    ) -> "tuple[pd.DataFrame, Optional[dict]]":
    """
    Read a PDB file and return its atom records together with the parsed header.

    Args:
        pdb_path (str, optional): Path to a local PDB file to read. The parameter keeps its
            None default for signature compatibility, but a path is required.
        model_index (int, optional): Index of the model to extract from the PDB file, in case
            it contains multiple models. Defaults to 1.
        parse_header (bool, optional): Whether to parse the PDB header and extract metadata.
            Defaults to True.

    Returns:
        tuple: ``(atoms, header)``. ``atoms`` is a DataFrame with one row per atom: the
            "ATOM" records of the selected model followed by its "HETATM" records (the row
            index is not reset by the concatenation). ``header`` is whatever
            ``parsePDBHeader`` returns for the file, or None when ``parse_header`` is False.

    Raises:
        ValueError: If ``pdb_path`` is None, or if the selected model has no ATOM records.
    """
    if pdb_path is None:
        raise ValueError("pdb_path must be provided")

    model = PandasPdb().read_pdb(pdb_path).get_model(model_index)
    atom_records = model.df["ATOM"]
    if len(atom_records) == 0:
        raise ValueError(f"No model found for index: {model_index}")

    # Parse the header only after the model check so a bad model index fails fast.
    header = parsePDBHeader(pdb_path) if parse_header else None

    return pd.concat([atom_records, model.df["HETATM"]]), header

In [6]:
# Find lowest number of atoms in protein from PDB_bind dataset

def list_files(dir):
    """
    Return the lowest final atom number among all protein PDB files under a directory.

    Walks ``dir`` recursively, loads every file whose name ends with "_protein.pdb" via
    ``read_pdb_to_dataframe`` and reads the ``atom_number`` of its last row, which is used
    here as that structure's atom count.

    Args:
        dir (str): Root directory to search.

    Returns:
        int: The smallest last-row ``atom_number`` over all protein files found.

    Raises:
        ValueError: If no file ending in "_protein.pdb" exists under ``dir``.
    """
    lowest = None
    for root, _dirs, files in os.walk(dir):
        for name in files:
            if not name.endswith("_protein.pdb"):
                continue
            df, _ = read_pdb_to_dataframe(os.path.join(root, name))
            # .iloc[-1] yields a scalar; .tail(1) would give a one-row Series instead.
            last_atom_number = int(df["atom_number"].iloc[-1])
            # Keep a running minimum so the result is a plain int.
            if lowest is None or last_atom_number < lowest:
                lowest = last_atom_number
    if lowest is None:
        raise ValueError(f"No '*_protein.pdb' files found under: {dir}")
    return lowest

In [7]:
# Collect combined protein + ligand file sizes (in bytes) to find the smallest complex
def find_smallest(dir):
    """
    Collect the combined on-disk size of every protein/ligand pair under a directory.

    For each file ending in "_protein.pdb" the protein file size is recorded; if a file with
    the same prefix ending in "_ligand.mol2" sits in the same directory, its size is added.
    Pairing by prefix within one directory keeps the result independent of the order in which
    files are listed and prevents a ligand from a different directory being counted.

    Args:
        dir (str): Root directory to search recursively.

    Returns:
        tuple: ``(sizes, p_names)`` — two parallel lists holding, per protein file, the
            combined size in bytes (``os.stat(...).st_size``) and the protein file name.
    """
    protein_suffix = "_protein.pdb"
    ligand_suffix = "_ligand.mol2"
    sizes = []
    p_names = []
    for root, _dirs, files in os.walk(dir):
        available = set(files)
        for name in files:
            if not name.endswith(protein_suffix):
                continue
            total = os.stat(os.path.join(root, name)).st_size
            # Same complex prefix, ligand suffix: "<id>_protein.pdb" -> "<id>_ligand.mol2".
            ligand_name = name[: -len(protein_suffix)] + ligand_suffix
            if ligand_name in available:
                total += os.stat(os.path.join(root, ligand_name)).st_size
            sizes.append(total)
            p_names.append(name)
    return sizes, p_names

In [8]:
# Scan the local PDBbind copy; `sizes` and `p_names` are parallel lists with one entry per
# "*_protein.pdb" file found by find_smallest.
sizes, p_names = find_smallest(r"C:\Users\evani\OneDrive\Documenten\Phd\RLMD\PDBbind_v2020_refined")

In [9]:
# Smallest protein

import numpy as np

# np.argmin gives the position of the first minimum directly. This avoids rebinding the
# builtin `min` for the rest of the session and comparing a Python list to a NumPy scalar.
min_size = np.min(sizes)
index = int(np.argmin(sizes))
p_names[index]

'6fhq_protein.pdb'

In [10]:
# Load the 6fhq protein; the header returned alongside the atom table is not needed here.
# Single backslashes: inside a raw string "\\" stays two characters and doubles every separator.
df, _ = read_pdb_to_dataframe(r"C:\Users\evani\OneDrive\Documenten\Phd\RLMD\PDBbind_v2020_refined\refined-set\6fhq\6fhq_protein.pdb")

In [11]:
# The last row's `atom_number` is used as the atom count. .tail(1) returns a one-row Series,
# so the printed value also shows the row index and dtype.
print("Number of atoms", df["atom_number"].tail(1))
# Tally atoms per chemical element (displayed as the cell result).
df["element_symbol"].value_counts()

Number of atoms 29    566
Name: atom_number, dtype: int32


element_symbol
C     275
O     110
H      94
N      76
S       9
ZN      2
Name: count, dtype: int64

In [12]:
# Protein 6fhq
import plotly.express as px

# 3D scatter of the 6fhq protein atoms, coloured by element; the title lets the figure
# stand on its own when the notebook is skimmed.
fig = px.scatter_3d(
    df,
    x='x_coord',
    y='y_coord',
    z='z_coord',
    color='element_symbol',
    title='Protein 6fhq: atom positions by element',
)
# Small markers keep individual atoms distinguishable.
fig.update_traces(marker_size=2)

fig.show()

In [20]:
from moldf import read_mol2

# Ligand file of the same complex (6fhq).
# NOTE(review): hard-coded absolute local path.
mol2 = r"C:\Users\evani\OneDrive\Documenten\Phd\RLMD\PDBbind_v2020_refined\refined-set\6fhq\6fhq_ligand.mol2"
# The result is indexed by section name in later cells; ligand["ATOM"] is the per-atom table.
ligand = read_mol2(mol2)

In [28]:
# Derive the element from the SYBYL atom type (the text before the first "."), e.g.
# "C.2" -> "C", "N.am" -> "N". Taking the first character of atom_name would truncate
# two-letter elements such as Cl or Br.
ligand["ATOM"]["type"] = ligand["ATOM"]["atom_type"].str.split(".").str[0]

In [31]:
# Display the ligand atom table, including the `type` column added in the notebook.
ligand["ATOM"]

Unnamed: 0,atom_id,atom_name,x,y,z,atom_type,subst_id,subst_name,charge,type
0,1,C4,19.576,2.654,7.864,C.2,1,DE5,0.2604,C
1,2,C5,19.353001,1.152,8.26,C.3,1,DE5,0.0624,C
2,3,N1,19.145,4.503,4.788,N.2,1,DE5,-0.3233,N
3,4,N2,19.201,2.932,6.566,N.am,1,DE5,-0.1899,N
4,5,C3,19.282,4.189,6.084,C.2,1,DE5,0.2059,C
5,6,N3,19.414,0.913,9.741,N.4,1,DE5,0.2363,N
6,7,C2,19.254,5.832,4.505,C.2,1,DE5,-0.0467,C
7,8,C1,19.496,6.605,5.585,C.2,1,DE5,-0.0079,C
8,9,S1,19.549999,5.63,6.95,S.3,1,DE5,0.0111,S
9,10,O1,19.987,3.496,8.689,O.2,1,DE5,-0.3741,O


In [30]:
# Ligand 6fhq
import plotly.express as px

# 3D scatter of the 6fhq ligand atoms, coloured by the `type` column; the title lets the
# figure stand on its own and distinguishes it from the protein plot.
fig = px.scatter_3d(
    ligand["ATOM"],
    x='x',
    y='y',
    z='z',
    color='type',
    title='Ligand 6fhq: atom positions by element',
)
# Small markers, matching the protein figure.
fig.update_traces(marker_size=2)

fig.show()