# PDBBind 2020

Downloaded from https://pdbbind.oss-cn-hangzhou.aliyuncs.com/download/PDBbind_v2020_PP.tar.gz (Cloud CDN of http://www.pdbbind.org.cn/download/PDBbind_v2020_mol2.tar.gz)

In [3]:
import os
import pandas as pd
from abag_affinity.utils.config import read_yaml, get_data_paths

config = read_yaml("../../abag_affinity/config.yaml")
info_path = os.path.join(config["RESOURCES"]["path"], config["RESOURCES"]["PDBBind"]["folder_path"], config["RESOURCES"]["PDBBind"]["info_file"])

In [4]:
with open(info_path) as f:
    lines = f.readlines()

all_records = []

for line in lines[6:]:
    line = [ token for token in line.split(" ") if token != "" ]
    if "IC50" in line[3]:
        affinity_type = line[3][:4]
        affinity = line[3][5:]
    else:
        affinity_type = line[3][:2]
        affinity = line[3][3:]
    affinity_value = affinity[:-2]
    affinity_unit = affinity[-2:]
    
    all_records.append({
        "pdb": line[0],
        "resolution": line[1],
        "release_year": line[2],
        "affinty": float(affinity_value),
        "affinity_unit": affinity_unit,
        "affinity_type": affinity_type,
        "ligand_name": " ".join(line[6:])
    })
    
summary_df = pd.DataFrame.from_records(all_records)

In [5]:
print("Number of Protein-Protein Interactions with binding affinity >>>", len(summary_df))

Number of Protein-Protein Interactions with binding affinity >>> 2852


In [6]:
summary_df["affinity_unit"].unique()

array(['nM', 'pM', 'uM', 'fM', 'mM'], dtype=object)

In [7]:
convert_unit = {
    'mM': 1e03,
    'uM': 1e06,
    "nM": 1e09,
    'pM': 1e12,
    'fM': 1e15
}
summary_df["Kd"] = summary_df.apply(lambda row: row["affinty"] / convert_unit[row["affinity_unit"]], axis=1)

In [8]:
import numpy as np
gas_constant =  8.31446261815324 # 0.0821

def calc_delta_g(row):
    delta_g = gas_constant * row["temperature_kelvin"] * np.log(row["Kd"])
    return delta_g / 4184 # convert to kcal

In [9]:
summary_df["temperature_kelvin"] = 298.15 # assume temperature of 25° Celcius

summary_df["delta_G"] = summary_df.apply(lambda row: calc_delta_g(row), axis=1)

In [10]:
#summary_df.to_csv(os.path.join(config["DATA"]["path"], config["DATA"]["PDBBind"]["folder_path"], config["DATA"]["PDBBind"]["summary"]), index=False)

In [13]:
from abag_affinity.utils.pdb_reader import read_file
from Bio.SeqUtils import seq1
import warnings
warnings.filterwarnings("ignore")

pdb_path = os.path.join(config["RESOURCES"]["path"], config["RESOURCES"]["PDBBind"]["folder_path"], config["RESOURCES"]["PDBBind"]["pdb_path"])

def read_pdb(pdb_id):
    path = os.path.join(pdb_path, pdb_id + ".ent.pdb")

    structure, header = read_file(pdb_id, path)
    chains = structure.get_chains()
    chain_ids = [chain.id for chain in chains]
    compound_info = header.get("compound")
    if compound_info is None:
        return "No Compound Info"
    
    chain_info = {}
    molecules = []
    for info in compound_info.values():
        if isinstance(info, str):
            print(info)
            return "Invalid Information format"
        if info.get("molecule") is not None:
            molecule = info.get("molecule")
            if "light" in molecule:
                molecule = molecule[:molecule.find("light")]
            if "heavy" in molecule:
                molecule = molecule[:molecule.find("heavy")]
            if molecule in molecules:
                mol_id = molecules.index(molecule)
            else:
                mol_id = len(molecules)
                molecules.append(molecule)
            chain_info[info["chain"]] = mol_id
        else:
            return "No Molecule Info"
    
    for chain in structure.get_chains():
        if chain.id.lower() not in chain_info:
            return "No Info for chain {}".format(chain.id)
    
    if len(molecules) > 2:
        return "Too many molecules"
    
    return chain_info

In [14]:
from tqdm.auto import tqdm
tqdm.pandas()

chain_infos = summary_df["pdb"].progress_apply(lambda id: read_pdb(id))

  0%|          | 0/2852 [00:00<?, ?it/s]

In [15]:
print("{} total structures with two molecules and info available".format(chain_infos.apply(lambda chain_info: not isinstance(chain_info, str)).sum()))
print("{} total structures with more than two molecules and info available".format(chain_infos.apply(lambda chain_info: chain_info == "Too many molecules" ).sum()))

1086 total structures with two molecules and info available
212 total structures with more than two molecules and info available


In [19]:
summary_df["chain_infos"] = chain_infos

In [20]:
summary_df.to_csv(os.path.join(config["RESOURCES"]["path"], config["RESOURCES"]["PDBBind"]["folder_path"], config["RESOURCES"]["PDBBind"]["summary"]), index=False)