In [1]:
import numpy as np
import pandas as pd
import os
from biopandas.mol2 import PandasMol2
from multiprocessing import Process, Queue, Pool
from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser
from Bio import PDB
from scipy.spatial import distance_matrix
import pickle
from rdkit.Chem.rdmolfiles import MolFromSmiles, MolToSmiles

### 1. Preprocessing scPDB dataset

In [2]:
# BlendNet utilizes Pseq2Sites, a ligand binding pocket prediction model, to extract binding pocket sequences.
# To extract data for training Pseq2Sites and select only high-quality compound-protein complexes, binding pockets are examined against the collected datasets.
# Consistent with the original paper of Pseq2Sites, the databases for training ligand-binding pocket prediction utilize scPDB and PDBbind.

# Download complex structure data from scPDB database to extract binding sites data (http://bioinfo-pharma.u-strasbg.fr/scPDB/)
# Save the downloaded complex structure to the ./scPDB/raw_data dir

# For PDBbind, no additional download is needed: the data produced by the previous preprocessing step is reused.

In [3]:
# List the entries of the raw scPDB download directory and report how many there are.
scPDB_dir_list = os.listdir("./scPDB/raw_data")
print(len(scPDB_dir_list))

17594


In [4]:
# One row per raw scPDB entry name, held in a single "scPDB" column.
scPDB_df = pd.DataFrame({"scPDB": list(scPDB_dir_list)})

In [5]:
def get_ligand_code(scPDB):
    """Return the single substructure name stored in an scPDB entry's ligand file.

    Reads ``./scPDB/raw_data/<scPDB>/ligand.mol2`` and collects the distinct
    values of its ``subst_name`` column.

    Returns the name when exactly one distinct value is present, otherwise
    ``None``.
    """
    mol2_file = f"./scPDB/raw_data/{scPDB}/ligand.mol2"
    atom_table = PandasMol2().read_mol2(mol2_file).df
    subst_names = np.unique(atom_table.loc[:, ['subst_name']].values)

    # Entries whose ligand file mixes several substructure names are rejected.
    return subst_names[0] if len(subst_names) == 1 else None

In [2]:
def parallelize_dataframe(df, func, num_partitions=5):
    """Apply ``func`` to ``num_partitions`` slices of ``df`` in parallel.

    Parameters
    ----------
    df : array-like or DataFrame
        Split with ``np.array_split`` into ``num_partitions`` pieces.
    func : callable
        Called once per piece in a worker process; must be picklable.
    num_partitions : int, default 5
        Number of pieces and also the number of worker processes.

    Returns
    -------
    list
        One result per piece, in piece order.
    """
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_partitions)
    try:
        return pool.map(func, df_split)
    finally:
        # Always release the workers, even when func raises in a worker.
        pool.close()
        pool.join()

In [7]:
def get_ligand_code_bulk(df):
    """Run ``get_ligand_code`` on every value of the ``scPDB`` column of one partition."""
    scpdb_ids = df["scPDB"]
    return scpdb_ids.map(get_ligand_code)

In [8]:
# Look up ligand codes for all entries, using 10 partitions (one worker process each).
scPDB_results = parallelize_dataframe(scPDB_df, get_ligand_code_bulk, 10)

In [9]:
# Stitch the per-partition results back into a single object.
scPDB_results = pd.concat(scPDB_results)

In [10]:
# The lambda hands each value back unchanged (None stays None); the assignment adds the column.
scPDB_df["File_lig_code"] = scPDB_results.map(lambda a: a if a is not None else None)

In [11]:
# Discard rows containing any missing value and renumber the index from 0.
scPDB_df = scPDB_df.dropna(axis = 0).reset_index(drop=True)
scPDB_df

Unnamed: 0,scPDB,File_lig_code
0,4nx6_1,FOL201
1,3tqx_1,PLP397
2,4bek_1,XK01447
3,4dca_1,ADP401
4,4pah_1,LNR600
...,...,...
17489,3w2f_1,FAD301
17490,3af0_1,GDP313
17491,4dym_1,IYZ503
17492,2i47_4,KGY1002


In [12]:
def preprocessing(df):
    """Split scPDB entry names and ligand file codes into separate columns.

    The first column of ``df`` is read as the scPDB entry name and the second
    as the ligand code from the mol2 file. The PDB id is the part of the entry
    name before the first ``_``; the ligand code is the first three characters
    of the file code.

    Returns a new DataFrame with columns ``PDB``, ``Lig_code``, ``scPDB`` and
    ``File_lig_code``.
    """
    rows = df.values
    scpdb_ids = [row[0] for row in rows]
    file_codes = [row[1] for row in rows]

    return pd.DataFrame({
        "PDB": [entry.split("_")[0] for entry in scpdb_ids],
        "Lig_code": [code[:3] for code in file_codes],
        "scPDB": scpdb_ids,
        "File_lig_code": file_codes,
    })

In [13]:
# Replace the two-column frame with the four-column frame built by preprocessing().
scPDB_df = preprocessing(scPDB_df)
scPDB_df

Unnamed: 0,PDB,Lig_code,scPDB,File_lig_code
0,4nx6,FOL,4nx6_1,FOL201
1,3tqx,PLP,3tqx_1,PLP397
2,4bek,XK0,4bek_1,XK01447
3,4dca,ADP,4dca_1,ADP401
4,4pah,LNR,4pah_1,LNR600
...,...,...,...,...
17489,3w2f,FAD,3w2f_1,FAD301
17490,3af0,GDP,3af0_1,GDP313
17491,4dym,IYZ,4dym_1,IYZ503
17492,2i47,KGY,2i47_4,KGY1002


In [14]:
# Three-letter residue name -> one-letter code (includes SEC and PYL).
amino_acids_short = {
    "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
    "GLU": "E", "GLN": "Q", "GLY": "G", "HIS": "H", "ILE": "I",
    "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
    "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V",
    "SEC": "U", "PYL": "O",
}

In [15]:
# Keep only rows whose three-letter ligand code is not an amino-acid residue name.
check = [code not in amino_acids_short for code in scPDB_df.Lig_code.values]

scPDB_df = scPDB_df.loc[check,:].dropna(axis = 0).reset_index(drop=True)
scPDB_df

Unnamed: 0,PDB,Lig_code,scPDB,File_lig_code
0,4nx6,FOL,4nx6_1,FOL201
1,3tqx,PLP,3tqx_1,PLP397
2,4bek,XK0,4bek_1,XK01447
3,4dca,ADP,4dca_1,ADP401
4,4pah,LNR,4pah_1,LNR600
...,...,...,...,...
17486,3w2f,FAD,3w2f_1,FAD301
17487,3af0,GDP,3af0_1,GDP313
17488,4dym,IYZ,4dym_1,IYZ503
17489,2i47,KGY,2i47_4,KGY1002


In [17]:
# Checkpoint the filtered table as a tab-separated file (row index not written).
scPDB_df.to_csv("./data/supplementary/scPDB_data.tsv", sep = "\t", index = False)

In [3]:
# Reload the checkpoint written above; the notebook can be resumed from this cell.
scPDB_df = pd.read_csv("./data/supplementary/scPDB_data.tsv", sep = "\t")

In [4]:
def read_file(file):
    """Return all lines of an open text file and close the handle.

    Every caller in this notebook passes a freshly opened file that is not
    used again, so the handle is closed here (as ``fwrite`` does for the
    handles it is given) instead of being left for the garbage collector.
    """
    with file:
        return file.readlines()

def preprocessing_PDB_ligand(lines, sup_dict):
    """Build a ligand-code -> SMILES mapping from tab-separated lines.

    Parameters
    ----------
    lines : iterable of str
        Each line holds the SMILES in its first tab-separated field and the
        ligand code in its second.
    sup_dict : dict
        Ligand code -> SMILES overrides; an entry here wins over the SMILES
        found on the line, even when that SMILES is empty.

    Returns
    -------
    dict
        Ligand code -> SMILES. Lines with an empty SMILES and no override are
        left out, as are lines that do not contain at least two fields.
    """
    results = dict()

    for line in lines:
        # Drop the line terminator so a two-field line does not yield "CODE\n".
        fields = line.rstrip("\r\n").split("\t")

        # Blank or malformed lines carry no ligand code.
        if len(fields) < 2:
            continue

        smiles, code = fields[0], fields[1]

        if code in sup_dict:
            results[code] = sup_dict[code]
        elif smiles == "":
            continue
        else:
            results[code] = smiles

    return results

In [5]:
# Ligand code -> SMILES overrides. Passed as `sup_dict` to preprocessing_PDB_ligand,
# where an entry here takes precedence over the SMILES read from the .smi file.
SMILES_stereo_correct = {"0IU":"C[C@@H](C1=CC=CC=C1)N(CC(=O)N(C)C)C(=O)C[C@@H](CC2=CSC(=N2)N)C(=O)N[C@@H](CC3CCCCC3)[C@H]([C@H](CC(C)C)O)O", 
                     "0QN":"C[C@@H](C(=O)NC1=CC=C(C=C1)C(F)(F)F)NC(=O)[C@H](C(C)C)NC(=O)C(F)(F)F", 
                     "2Z3":"CCCC[C@@H](C(=O)N[C@@H](CC1CCCCC1)C(C(C(=O)NC)(F)F)(O)O)NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)N3CCOCC3", 
                     "0DB":"C1CCC(C1)(C[C@@H](CCC2=CC=CC=C2)C(=O)O)C(=O)N[C@@H](CC3=CNC4=CC=CC=C43)C(=O)O", 
                     "0EM":"CC[C@H](C)CNC(=O)CP(=O)([C@H](CC(C)C)NC(=O)[C@H](CC1=CN=CN1)NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)OC(C)(C)C)O", 
                     "0DY":"CC(C)C[C@@H](C(=O)N[C@@H](CC(=O)N)C(=O)NCC1=CC(=CC=C1)N)C(=O)NO", 
                     "0QB":"C[C@@H](C1=CC=CC=C1)N(CC(=O)N(C)C)C(=O)C[C@@H](CC2=CSC(=N2)N)C(=O)N[C@@H](CC3CCCCC3)[C@H]([C@@H]4CC(N(C4=O)C)(C)C)O", 
                     "0ED":"CC(C)C[C@@H](C(=O)N[C@@H](CC1=CC=CC=C1)C(=O)NCCN2CCOCC2)N[C@H](CCNC(=O)OCC3=CC=CC=C3)C(=O)O", 
                     "0QH":"CC(C)C[C@@H](C(=O)N[N@+](C)(CC1=CC=CC=C1)CC(=O)NC2=CC=C(C=C2)C(C)C)NC(=O)C(F)(F)F", 
                     "0Z0":"C[C@@H](C(=O)NC1=CC=C(C=C1)C(F)(F)F)NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)C(F)(F)F", 
                     "0ZB":"CC(C)C[C@@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC1=CC=CC=C1)C(=O)NO", 
                     "0D3":"C[C@@H](C(=O)NCC(=O)N)NC(=O)[C@H](CC1=CC=CC=C1)CS", 
                     "0ZC":"CC(C)C[C@@H](C(=O)N1CC2=C(C[C@H]1C(=O)O)C3=CC=CC=C3N2)NC(=O)C4=CC=CO4", 
                     "01S":"C[C@@H](C(=O)NCC(=O)N)NC(=O)[C@H](CC(C)C)C(=O)NO", 
                     "0E9":"CC(C)[C@@H](C(=O)N[C@@H](CC1=CC=CC=C1)C([C@H](CC2=CC=CC=C2)NC(=O)[C@H](C(C)C)NC(=O)OCC3=CC=CC=C3)O)NC(=O)OCC4=CC=CC=C4",
                     "0PK":"C[C@@H](C(=O)O)NC(=O)[C@H](CC(C)C)NP(=O)([C@H](CC1=CC=CC=C1)NC(=O)OCC2=CC=CC=C2)O",
                     "0PJ":"CC(C)C[C@@H](C(=O)N[C@@H](CC(C)C)C(=O)O)NP(=O)(CNC(=O)OCC1=CC=CC=C1)O"
                    }    

In [6]:
# Ligand code -> SMILES built from the Components .smi file, with the manual overrides applied.
PDB_ligand_isomeric_dict = preprocessing_PDB_ligand(read_file(open("./data/supplementary/Components-smiles-stereo-oe.smi", "r")), 
                                                  SMILES_stereo_correct)

In [7]:
# Report how many ligand codes have a SMILES entry.
print(f"[Ligand Expo Isomeric] SMILES: {len(PDB_ligand_isomeric_dict)}")

[Ligand Expo Isomeric] SMILES: 40464


In [8]:
def add_SMILES(df, ligand_isomeric_dict):
    """Attach a SMILES string to every row whose ligand code is known.

    Parameters
    ----------
    df : DataFrame
        Its first four columns are read positionally as PDB id, ligand code,
        scPDB entry name and ligand file code.
    ligand_isomeric_dict : dict
        Ligand code -> SMILES.

    Returns
    -------
    dict of lists
        Keys ``PDB``, ``Lig_code``, ``scPDB``, ``OE_stereo_SMILES`` and
        ``File_ligand_code``; rows whose ligand code is missing from
        ``ligand_isomeric_dict`` are dropped.

    Notes
    -----
    An earlier version built a large per-PDB ``correct_ligand_code`` table on
    every call without ever reading it; that dead table has been removed.
    """
    results = {"PDB":[], "Lig_code":[], "scPDB":[], "OE_stereo_SMILES":[], "File_ligand_code":[]}

    for line in df.values:
        pdb, ligand_code, scPDB, file_ligand_code = line[0], line[1], line[2], line[3]

        if ligand_code in ligand_isomeric_dict:
            results['PDB'].append(pdb)
            results['Lig_code'].append(ligand_code)
            results['scPDB'].append(scPDB)
            results['OE_stereo_SMILES'].append(ligand_isomeric_dict[ligand_code])
            results['File_ligand_code'].append(file_ligand_code)

    return results

In [9]:
def convert_rdkit_smiles(df, isomeric = False):
    """Re-write the SMILES in the fourth column of ``df`` with RDKit.

    Parameters
    ----------
    df : DataFrame
        The value at position 3 of each row is parsed as a SMILES string.
    isomeric : bool, default False
        Forwarded to ``MolToSmiles`` as ``isomericSmiles``.

    Returns
    -------
    list
        One kekulized SMILES per row, or ``None`` where RDKit could not parse
        or write the molecule.
    """
    results = list()

    for line in df.values:
        try:
            mol = MolFromSmiles(line[3])
            # MolFromSmiles signals an unparsable SMILES by returning None.
            smiles = None if mol is None else MolToSmiles(mol, isomericSmiles = isomeric, kekuleSmiles = True)
        except Exception:
            # Best effort: a failing molecule is recorded as None. A bare
            # `except:` would also have swallowed KeyboardInterrupt.
            smiles = None
        results.append(smiles)
    return results

In [10]:
# NOTE: add_SMILES returns a plain dict of lists, so scPDB_df is not a DataFrame until the next cell.
scPDB_df = add_SMILES(scPDB_df, PDB_ligand_isomeric_dict)

In [11]:
# Turn the dict of lists back into a DataFrame.
scPDB_df = pd.DataFrame(scPDB_df)
scPDB_df

Unnamed: 0,PDB,Lig_code,scPDB,OE_stereo_SMILES,File_ligand_code
0,4nx6,FOL,4nx6_1,c1cc(ccc1C(=O)N[C@@H](CCC(=O)O)C(=O)O)NCc2cnc3...,FOL201
1,3tqx,PLP,3tqx_1,Cc1c(c(c(cn1)COP(=O)(O)O)C=O)O,PLP397
2,4bek,XK0,4bek_1,C[C@]1(CCSC(=N1)N)c2ccc(cc2)OC,XK01447
3,4dca,ADP,4dca_1,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,ADP401
4,4pah,LNR,4pah_1,c1cc(c(cc1[C@H](CN)O)O)O,LNR600
...,...,...,...,...,...
17484,3w2f,FAD,3w2f_1,Cc1cc2c(cc1C)N(C3=NC(=O)NC(=O)C3=N2)C[C@@H]([C...,FAD301
17485,3af0,GDP,3af0_1,c1nc2c(n1[C@H]3[C@@H]([C@@H]([C@H](O3)CO[P@](=...,GDP313
17486,4dym,IYZ,4dym_1,CC(=O)c1cccc(c1)c2cnc3n2nc(cc3)NCC4CC4,IYZ503
17487,2i47,KGY,2i47_4,CC#CCOc1ccc(cc1)S(=O)(=O)CC2(CCN(CC2)S(=O)(=O)...,KGY1002


In [12]:
# One RDKit SMILES (or None on failure) per row, keeping stereo information.
rdkit_smiles_isomeric = convert_rdkit_smiles(scPDB_df, isomeric = True)

[23:34:19] Explicit valence for atom # 25 N, 4, is greater than permitted
[23:34:20] Explicit valence for atom # 7 Cl, 5, is greater than permitted


In [13]:
# Store the RDKit SMILES next to the original ones.
scPDB_df["RDKit_iso_SMILES"] = rdkit_smiles_isomeric

In [14]:
# Rows where RDKit produced no SMILES hold None and are removed here.
scPDB_df = scPDB_df.dropna(axis = 0).reset_index(drop=True)

In [15]:
# Number of rows left after the SMILES filtering.
print(f"[scPDB] complexes: {len(scPDB_df)}")

[scPDB] complexes: 17487


#### 1.2. Download scPDB dataset

In [31]:
def download_PDB_structure(pdb):
    """Download ``<pdb>.pdb`` from files.rcsb.org unless it is already on disk.

    The file is looked for in, and downloaded to, ``./data/scPDB/protein/``.
    The download target used to come from the module-level ``path`` variable,
    which is rebound several times in this notebook and could therefore differ
    from the directory used by the existence check.
    """
    protein_dir = "./data/scPDB/protein/"
    if not os.path.isfile(f"{protein_dir}{pdb}.pdb"):
        command = f"wget https://files.rcsb.org/download/{pdb}.pdb --directory-prefix={protein_dir}"
        os.system(command)

In [32]:
def get_pdb_info_bulk(df):
    """Run ``download_PDB_structure`` on every value of the ``PDB`` column of one partition."""
    pdb_ids = df["PDB"]
    return pdb_ids.map(download_PDB_structure)

In [33]:
# Directory for the downloaded protein PDB files. `path` is a module-level name that later cells rebind.
path = "./data/scPDB/protein/"

In [34]:
# Executed for its side effect (downloads); the returned values are discarded.
_ = parallelize_dataframe(scPDB_df, get_pdb_info_bulk, 10)

In [16]:
def download_ligand_structure(lig):
    """Download ``<lig>_ideal.pdb`` from files.rcsb.org unless it is already on disk.

    The file is looked for in, and downloaded to, ``./data/scPDB/ligand/``.
    The download target used to come from the module-level ``path`` variable,
    which is rebound several times in this notebook and could therefore differ
    from the directory used by the existence check.
    """
    ligand_dir = "./data/scPDB/ligand/"
    if not os.path.isfile(f"{ligand_dir}{lig}_ideal.pdb"):
        # The URL is keyed by the first character of the code, then the full code.
        command = f"wget https://files.rcsb.org/ligands/{lig[0]}/{lig}/{lig}_ideal.pdb --directory-prefix={ligand_dir}"
        os.system(command)

In [17]:
def get_pdb_info_bulk(df):
    """Run ``download_ligand_structure`` on every value of the ``Lig_code`` column.

    NOTE(review): this reuses the name of the protein-download wrapper defined
    in an earlier cell and replaces it from here on.
    """
    ligand_codes = df["Lig_code"]
    return ligand_codes.map(download_ligand_structure)

In [18]:
# Directory for the downloaded ideal ligand files; rebinds the module-level `path`.
path = "./data/scPDB/ligand/"

In [19]:
# Executed for its side effect (downloads); the returned values are discarded.
_ = parallelize_dataframe(scPDB_df, get_pdb_info_bulk, 10)

--2024-09-04 23:34:27--  https://files.rcsb.org/ligands/F/FOL/FOL_ideal.pdb
--2024-09-04 23:34:27--  https://files.rcsb.org/ligands/X/XK0/XK0_ideal.pdb
--2024-09-04 23:34:27--  https://files.rcsb.org/ligands/P/PLP/PLP_ideal.pdb
--2024-09-04 23:34:27--  https://files.rcsb.org/ligands/A/ADP/ADP_ideal.pdb
--2024-09-04 23:34:27--  https://files.rcsb.org/ligands/L/LNR/LNR_ideal.pdb
Resolving files.rcsb.org (files.rcsb.org)... Resolving files.rcsb.org (files.rcsb.org)... Resolving files.rcsb.org (files.rcsb.org)... Resolving files.rcsb.org (files.rcsb.org)... --2024-09-04 23:34:27--  https://files.rcsb.org/ligands/F/FAD/FAD_ideal.pdb
--2024-09-04 23:34:27--  https://files.rcsb.org/ligands/F/FAD/FAD_ideal.pdb
--2024-09-04 23:34:27--  https://files.rcsb.org/ligands/D/DCP/DCP_ideal.pdb
--2024-09-04 23:34:27--  https://files.rcsb.org/ligands/1/1FV/1FV_ideal.pdb
Resolving files.rcsb.org (files.rcsb.org)... --2024-09-04 23:34:27--  https://files.rcsb.org/ligands/A/ARD/ARD_ideal.pdb
132.249.213.193

In [20]:
def read_file(file):
    """Return all lines of an open text file and close the handle.

    Every caller in this notebook passes a freshly opened file that is not
    used again, so the handle is closed here (as ``fwrite`` does for the
    handles it is given) instead of being left for the garbage collector.
    """
    with file:
        return file.readlines()

def additional_download(df):
    """Fetch ideal ligand structures that are still missing, via mol2.

    For every row of ``df`` (ligand code read from position 1) without a
    ``<code>_ideal.pdb`` in ``./data/scPDB/ligand/``, the ``_ideal.mol2`` file
    is downloaded with ``wget`` and converted to PDB with ``obabel``.
    """
    ligand_dir = "./data/scPDB/ligand/"

    for row in df.values:
        code = row[1]
        if os.path.isfile(f"{ligand_dir}{code}_ideal.pdb"):
            continue

        os.system(f"wget https://files.rcsb.org/ligands/download/{code}_ideal.mol2 --directory-prefix={ligand_dir}")
        os.system(f"obabel -imol2 {ligand_dir}{code}_ideal.mol2 -opdb -O {ligand_dir}{code}_ideal.pdb")

In [21]:
# Second pass for ligands whose ideal PDB file is still missing after the first download.
additional_download(scPDB_df)

### 2. Extract protein and ligand info from PDB file

#### 2.1 scPDB dataset

In [18]:
# Rebinds the module-level `path`; extract_ligand_info reads its PDB files from this directory.
path = "./data/scPDB/protein"

In [19]:
def extract_ligand_info(row):
    """List every occurrence of a row's ligand in the first model of its PDB file.

    Parameters
    ----------
    row : indexable
        ``row[0]`` is the PDB id and ``row[1]`` the ligand residue name.

    Returns
    -------
    str or None
        Comma-joined ``"<residue>_<chain>_<sequence number>"`` identifiers in
        order of first appearance, or ``None`` when the file cannot be read or
        holds no matching HETATM record.

    Notes
    -----
    The file is read from ``{path}/{pdbid}.pdb``, where ``path`` is a
    module-level variable.
    """
    pdbid, lig_code = row[0], row[1]
    try:
        with open(f"{path}/{pdbid}.pdb", "r") as handle:
            lines = handle.readlines()
    except (OSError, UnicodeError):
        # Missing or unreadable structure file: the row gets no ligand info.
        return None

    ligand_info, seen = list(), set()
    model_flag = 0

    for line in lines:
        if "ANISOU" in line:
            continue

        if line[:5].strip() == "MODEL":
            # Stop at the second MODEL record so only the first model is used.
            # (The flag was previously assigned to a misspelled name and never set.)
            if model_flag == 1:
                break
            model_flag = 1

        if "HETATM" in line:
            residue, chain, residue_seq_number = line[17:20].strip(), line[21:22].strip(), line[22:26].strip()

            if residue == lig_code:
                ligand_id = f"{residue}_{chain}_{residue_seq_number}"
                if ligand_id not in seen:
                    seen.add(ligand_id)
                    ligand_info.append(ligand_id)

    if len(ligand_info) == 0:
        return None
    return ",".join(ligand_info)

In [20]:
def extract_ligand_info_bulk(df):
    """Run ``extract_ligand_info`` on every row of one partition."""
    per_row = df.apply(extract_ligand_info, axis = "columns")
    return per_row

In [24]:
# Scan the PDB files for ligand occurrences, using 10 partitions (one worker process each).
scPDB_ligand_info_results = parallelize_dataframe(scPDB_df, extract_ligand_info_bulk, 10)

In [25]:
# Stitch the per-partition results back into a single object.
scPDB_ligand_info_results = pd.concat(scPDB_ligand_info_results)

In [26]:
# The lambda hands each value back unchanged (None stays None); the assignment adds the column.
scPDB_df["Total_Lig_info"] = scPDB_ligand_info_results.map(lambda a: a if a is not None else None)

In [27]:
# Rows for which extract_ligand_info returned None are removed here.
scPDB_df = scPDB_df.dropna(axis = 0).reset_index(drop=True)
scPDB_df

Unnamed: 0,PDB,Lig_code,scPDB,OE_stereo_SMILES,File_ligand_code,RDKit_iso_SMILES,Total_Lig_info
0,4nx6,FOL,4nx6_1,c1cc(ccc1C(=O)N[C@@H](CCC(=O)O)C(=O)O)NCc2cnc3...,FOL201,NC1=NC(=O)C2=NC(CNC3=CC=C(C(=O)N[C@@H](CCC(=O)...,FOL_A_201
1,3tqx,PLP,3tqx_1,Cc1c(c(c(cn1)COP(=O)(O)O)C=O)O,PLP397,CC1=C(O)C(C=O)=C(COP(=O)(O)O)C=N1,"PLP_A_397,PLP_B_397"
2,4bek,XK0,4bek_1,C[C@]1(CCSC(=N1)N)c2ccc(cc2)OC,XK01447,COC1=CC=C([C@]2(C)CCSC(N)=N2)C=C1,XK0_A_1447
3,4dca,ADP,4dca_1,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,ADP401,NC1=C2N=CN([C@@H]3O[C@H](CO[P@](=O)(O)OP(=O)(O...,ADP_A_401
4,4pah,LNR,4pah_1,c1cc(c(cc1[C@H](CN)O)O)O,LNR600,NC[C@H](O)C1=CC(O)=C(O)C=C1,LNR_A_600
5,4to0,DCP,4to0_3,C1[C@@H]([C@H](O[C@H]1N2C=CC(=NC2=O)N)CO[P@@](...,DCP701,NC1=NC(=O)N([C@H]2C[C@H](O)[C@@H](CO[P@@](=O)(...,"DCP_A_701,DCP_B_701,DCP_C_703,DCP_D_702"
6,4l2i,FAD,4l2i_2,Cc1cc2c(cc1C)N(C3=NC(=O)NC(=O)C3=N2)C[C@@H]([C...,FAD301,CC1=CC2=C(C=C1C)N(C[C@H](O)[C@H](O)[C@H](O)CO[...,"FAD_A_401,FAD_B_301"
7,1kbq,FAD,1kbq_6,Cc1cc2c(cc1C)N(C3=NC(=O)NC(=O)C3=N2)C[C@@H]([C...,FAD602,CC1=CC2=C(C=C1C)N(C[C@H](O)[C@H](O)[C@H](O)CO[...,"FAD_A_601,FAD_B_602,FAD_C_603,FAD_D_604"
8,4iwq,1FV,4iwq_1,c1cc(cc(c1)Nc2ncc(c(n2)NCCCNC(=O)C3CCC3)C4CC4)...,1FV701,O=C(NCCCNC1=NC(NC2=CC=CC(CN3CCOCC3)=C2)=NC=C1C...,1FV_A_701
9,3fap,ARD,3fap_1,Cc1ccc(s1)[C@@H]/2C[C@@H]3CC[C@H]([C@@](O3)(C(...,ARD402,CO[C@@H]1C[C@H](C[C@@H](C)[C@@H]2CC(=O)[C@H](C...,ARD_A_402


In [28]:
def count_atom(lines):
    """Count the lines that contain "ATOM" and whose characters 76-77 are neither H nor D."""
    return sum(
        1
        for record in lines
        if "ATOM" in record and record[76:78].strip() not in ("H", "D")
    )

def count_HETATM(lines):
    """Count the lines that contain "HETATM" and whose characters 76-77 are neither H nor D."""
    return sum(
        1
        for record in lines
        if "HETATM" in record and record[76:78].strip() not in ("H", "D")
    )

def read_file(file):
    """Return all lines of an open text file and close the handle.

    Every caller in this notebook passes a freshly opened file that is not
    used again, so the handle is closed here (as ``fwrite`` does for the
    handles it is given) instead of being left for the garbage collector.
    """
    with file:
        return file.readlines()

def fwrite(fw, file):
    """Write every item of ``file`` to ``fw`` on its own line, then close ``fw``.

    Parameters
    ----------
    fw : writable text file object
        Closed before returning, including when a write fails.
    file : iterable
        Items are formatted with ``f"{item}\\n"``.
    """
    with fw:
        fw.writelines(f"{i}\n" for i in file)
    
def get_results(pdbid, ligand_code, ligand_dict, ligand_info_list):
    """Pick the most complete ligand copy and write ligand PDB files for a complex.

    Parameters
    ----------
    pdbid : str
        Used for the output sub-directory ``{des_path}/{pdbid}/``.
    ligand_code : str
        Selects the reference file ``{ideal_ligand_path}/{ligand_code}_ideal.pdb``.
    ligand_dict : dict
        Ligand id (``"<residue>_<chain>_<sequence number>"``) -> list of its
        HETATM lines.
    ligand_info_list : list of str
        Ligand ids that compete for being the final ligand.

    Returns
    -------
    tuple
        ``(final_ligand_id, comma_joined_written_ids)``, or ``(None, None)``
        when the best copy has more counted atoms than the reference, lacks
        six or more of the reference's counted atoms, or no candidate exists.

    Notes
    -----
    ``ideal_ligand_path`` and ``des_path`` are module-level variables. Atom
    counts come from ``count_atom`` / ``count_HETATM``.
    """
    # Atom count of the reference ligand file.
    with open(f"{ideal_ligand_path}/{ligand_code}_ideal.pdb", "r") as handle:
        ideal_ligand_atom_count = count_atom(handle.readlines())

    # Atom count of every candidate copy found in the structure.
    ligand_atom_count = {ligand: count_HETATM(ligand_dict[ligand]) for ligand in ligand_info_list}
    if not ligand_atom_count:
        return None, None

    # max() keeps the first of several equally large candidates, which matches
    # taking the head of a stable descending sort.
    final_ligand_info, final_ligand_atom_count = max(ligand_atom_count.items(), key = lambda item: item[1])

    if ideal_ligand_atom_count < final_ligand_atom_count:
        print(f"Check PDB fild: {pdbid}; Recorded ligand atom count is over ideal atom count.")
        return None, None

    if ideal_ligand_atom_count - final_ligand_atom_count >= 6:
        return None, None

    def _residue_chain(ligand_id):
        parts = ligand_id.split("_")
        return f"{parts[0]}_{parts[1]}"

    # makedirs is safe when several workers handle the same PDB id and also
    # creates missing parent directories.
    os.makedirs(f"{des_path}/{pdbid}/", exist_ok = True)

    fwrite(open(f"{des_path}/{pdbid}/{final_ligand_info}.pdb", "w"), ligand_dict[final_ligand_info])

    write_ligand_list = [final_ligand_info]
    written_residue_chains = {_residue_chain(final_ligand_info)}

    # In addition to the final ligand, write the first non-empty copy for every
    # other residue/chain combination.
    for ligand, ligand_lines in ligand_dict.items():
        if len(ligand_lines) == 0:
            continue
        residue_chain = _residue_chain(ligand)
        if residue_chain in written_residue_chains:
            continue
        written_residue_chains.add(residue_chain)
        fwrite(open(f"{des_path}/{pdbid}/{ligand}.pdb", "w"), ligand_lines)
        write_ligand_list.append(ligand)

    return final_ligand_info, ",".join(write_ligand_list)

def preprocessing_PDB_file(row):
    """Collect the HETATM lines of a row's ligands and hand them to ``get_results``.

    Parameters
    ----------
    row : indexable
        ``row[0]`` is the PDB id, ``row[1]`` the ligand code and ``row[-1]`` a
        comma-separated list of ligand ids
        (``"<residue>_<chain>_<sequence number>"``).

    Returns
    -------
    tuple
        Whatever ``get_results`` returns: ``(final_ligand_id, written_ids)`` or
        ``(None, None)``.

    Notes
    -----
    The structure is read from ``{src_path}/{pdbid}.pdb`` (``src_path`` is a
    module-level variable). Only the first MODEL is scanned and ANISOU lines
    are ignored. Within one ligand, only the first line seen for each atom
    name is kept, and a non-blank character in column 17 is blanked out.
    """
    def pre(line):
        # Blank out column 17 (index 16) of the record.
        return line[:16] + " " + line[17:]

    pdbid, ligand_code, ligand_info_list = row[0], row[1], row[-1].split(",")

    ligand_dict = {ligand: list() for ligand in ligand_info_list}
    seen_atom_names = {ligand: set() for ligand in ligand_info_list}

    with open(f"{src_path}/{pdbid}.pdb", "r") as handle:
        lines = handle.readlines()

    model_flag = 0
    for line in lines:
        if "ANISOU" in line:
            continue

        if line[:5].strip() == "MODEL":
            if model_flag == 1:
                break
            model_flag = 1

        if "HETATM" in line:
            atom_type, residue_chain = line[12:16].strip(), line[16:17].strip()
            residue, chain, residue_seq_number = line[17:20].strip(), line[21:22].strip(), line[22:26].strip()
            ligand_id = f"{residue}_{chain}_{residue_seq_number}"

            if ligand_id in ligand_dict and atom_type not in seen_atom_names[ligand_id]:
                seen_atom_names[ligand_id].add(atom_type)
                record = line.strip()
                ligand_dict[ligand_id].append(record if residue_chain == "" else pre(record))

        elif line[:6].strip() == "CONECT":
            # Stop at the first CONECT record. The previous test compared the
            # whole line with "CONECT", so it never matched a real record.
            break

    return get_results(pdbid, ligand_code, ligand_dict, ligand_info_list)

In [29]:
def preprocessing_PDB_file_bulk(df):
    """Run ``preprocessing_PDB_file`` on every row of one partition."""
    per_row = df.apply(preprocessing_PDB_file, axis = "columns")
    return per_row

In [30]:
# Module-level directories for the scPDB run: source PDB files, reference ligand files, output.
src_path = "./data/scPDB/protein"
ideal_ligand_path = "./data/scPDB/ligand"
des_path = "./data/scPDB/structure"

In [31]:
# Extract and write ligand files for every row, using 10 partitions (one worker process each).
scPDB_ligand_results = parallelize_dataframe(scPDB_df, preprocessing_PDB_file_bulk, 10)

In [32]:
# Stitch the per-partition results back into a single object.
scPDB_ligand_results = pd.concat(scPDB_ligand_results)

In [33]:
# Each result is indexed at 0 (final ligand id) and 1 (comma-joined ids of the written files).
scPDB_df["Final_lig_info"] = scPDB_ligand_results.map(lambda a:a[0] if a is not None else None)
scPDB_df["Check_lig_info"] = scPDB_ligand_results.map(lambda a:a[1] if a is not None else None)

In [34]:
# Rows without a final ligand hold None and are removed here.
scPDB_df = scPDB_df.dropna(axis = 0).reset_index(drop=True)
print(f"[scPDB] Complexes: {len(scPDB_df)}")

[scPDB] Complexes: 10


In [35]:
def preprocessing_multimodel(path):
    """Rewrite a PDB file in place, keeping only its first model.

    Lines containing ``ANISOU`` are dropped, reading stops at the second
    ``MODEL`` record, and every kept line is stripped of surrounding
    whitespace and written back followed by a newline.

    Parameters
    ----------
    path : str
        File to rewrite. The read handle is closed before the file is
        reopened for writing.
    """
    with open(path, "r") as handle:
        lines = handle.readlines()

    write_lines, model_flag = list(), 0
    for line in lines:
        if "ANISOU" in line:
            continue
        if line[:5].strip() == "MODEL":
            if model_flag == 1:
                break
            model_flag = 1
        write_lines.append(line.strip())

    with open(path, "w") as handle:
        handle.writelines(f"{kept}\n" for kept in write_lines)

def remove_HEATM(pdbid):
    """Write a hetero-atom-free copy of a structure, unless it already exists.

    Reads ``{src_path}/{pdbid}.pdb``, saves only residues whose id starts with
    a blank field to ``{protein_des_path}/{pdbid}/{pdbid}_protein.pdb``, then
    passes that file through ``preprocessing_multimodel``. ``src_path`` and
    ``protein_des_path`` are module-level variables.
    """
    protein_file = f"{protein_des_path}/{pdbid}/{pdbid}_protein.pdb"

    # Nothing to do when the output was produced by an earlier run.
    if os.path.isfile(protein_file):
        return

    class NonHetSelect(Select):
        def accept_residue(self, residue):
            return 1 if residue.id[0] == " " else 0

    structure = PDBParser().get_structure(pdbid, f"{src_path}/{pdbid}.pdb")

    writer = PDBIO()
    writer.set_structure(structure)
    writer.save(protein_file, NonHetSelect())

    preprocessing_multimodel(protein_file)

In [36]:
def remove_HEATM_bulk(df):
    """Run ``remove_HEATM`` on every value of the ``PDB`` column of one partition."""
    pdb_ids = df["PDB"]
    return pdb_ids.map(remove_HEATM)

In [37]:
# Module-level directories read by remove_HEATM: source PDB files and per-complex output.
src_path = "./data/scPDB/protein"
protein_des_path = "./data/scPDB/structure"

In [38]:
# Executed for its side effect (writes the *_protein.pdb files); the returned values are discarded.
_ = parallelize_dataframe(scPDB_df, remove_HEATM_bulk, 10)



#### 2.2 PDBbind dataset 

In [45]:
# Load the PDBbind table from the earlier step and drop the listed columns.
PDBbind_df = pd.read_csv("./preprocessed_data/step2_PDBbind_data.tsv", sep = "\t")
PDBbind_df = PDBbind_df.drop(["Labels", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit"], axis = 1)
PDBbind_df

Unnamed: 0,PDB,Lig_code,Total_Lig_info
0,3wka,S0G,S0G_A_603
1,5kgx,7SK,7SK_A_301
2,5wqc,7MA,7MA_A_2001
3,5aen,DP8,DP8_A_611
4,1utp,PBN,PBN_A_1246
...,...,...,...
14538,3fci,3FI,3FI_A_1
14539,5j9l,6HF,6HF_A_501
14540,2q9n,LK5,LK5_A_0
14541,3np7,Z15,"Z15_A_997,Z15_A_998"


In [46]:
# Module-level directories for the PDBbind run: source PDB files, reference ligand files, output.
# (The closing quotes were missing, which made this cell a SyntaxError.)
src_path = "./data/PDB/protein"
ideal_ligand_path = "./data/PDB/ligand"
des_path = "./data/PDB/structure/PDBbind"

In [47]:
# Extract and write ligand files for every PDBbind row, using 10 partitions (one worker process each).
PDBbind_ligand_results = parallelize_dataframe(PDBbind_df, preprocessing_PDB_file_bulk, 10)

In [48]:
# Stitch the per-partition results back into a single object.
PDBbind_ligand_results = pd.concat(PDBbind_ligand_results)

In [49]:
# Each result is indexed at 0 (final ligand id) and 1 (comma-joined ids of the written files).
PDBbind_df["Final_lig_info"] = PDBbind_ligand_results.map(lambda a:a[0] if a is not None else None)
PDBbind_df["Check_lig_info"] = PDBbind_ligand_results.map(lambda a:a[1] if a is not None else None)
PDBbind_df

Unnamed: 0,PDB,Lig_code,Total_Lig_info,Final_lig_info,Check_lig_info
0,3wka,S0G,S0G_A_603,S0G_A_603,S0G_A_603
1,5kgx,7SK,7SK_A_301,7SK_A_301,7SK_A_301
2,5wqc,7MA,7MA_A_2001,7MA_A_2001,7MA_A_2001
3,5aen,DP8,DP8_A_611,DP8_A_611,DP8_A_611
4,1utp,PBN,PBN_A_1246,PBN_A_1246,PBN_A_1246
...,...,...,...,...,...
14538,3fci,3FI,3FI_A_1,3FI_A_1,3FI_A_1
14539,5j9l,6HF,6HF_A_501,6HF_A_501,6HF_A_501
14540,2q9n,LK5,LK5_A_0,LK5_A_0,LK5_A_0
14541,3np7,Z15,"Z15_A_997,Z15_A_998",Z15_A_997,Z15_A_997


In [50]:
# Rows without a final ligand hold None and are removed here.
PDBbind_df = PDBbind_df.dropna(axis = 0).reset_index(drop=True)
print(f"[PDBbind] Complexes: {len(PDBbind_df)}")

[PDBbind] Complexes: 14403


In [51]:
# Module-level directories read by remove_HEATM for the PDBbind run.
src_path = "./data/PDB/protein"
protein_des_path = "./data/PDB/structure/PDBbind"

In [52]:
# Executed for its side effect (writes the *_protein.pdb files); the returned values are discarded.
_ = parallelize_dataframe(PDBbind_df, remove_HEATM_bulk, 10)

In [53]:
# Checkpoint both tables as tab-separated files (row index not written).
scPDB_df.to_csv("./preprocessed_bs_data/step1_scPDB_data.tsv", sep = "\t", index = False)
PDBbind_df.to_csv("./preprocessed_bs_data/step1_PDBbind_data.tsv", sep = "\t", index = False)

### 3. Extract binding sites info

In [54]:
def get_binding_sites(protein_coords, ligand_coords, chain_list, residue_list, threshold):
    """Find the chains and residues lying within two distance cut-offs of a ligand.

    Parameters
    ----------
    protein_coords, ligand_coords : array-like
        Atom coordinates passed to ``scipy.spatial.distance_matrix``.
    chain_list, residue_list : numpy.ndarray
        Per protein atom, its chain label and residue label; indexed with the
        row numbers of the matching protein atoms.
    threshold : sequence of two numbers
        ``threshold[0]`` and ``threshold[1]`` are the two cut-offs (inclusive).

    Returns
    -------
    tuple
        ``(chains_0, residues_0, chains_1, residues_1)``: for each cut-off the
        unique chain labels (unordered list) and the sorted unique residue
        labels.
    """
    pairwise = distance_matrix(protein_coords, ligand_coords)

    def _contacts(cutoff):
        # Row indices of protein atoms with at least one ligand atom in range
        # (a row appears once per ligand atom; the sets remove repeats).
        protein_rows = np.where(pairwise <= cutoff)[0]
        chains = list(set(chain_list[protein_rows]))
        residues = sorted(list(set(residue_list[protein_rows])))
        return chains, residues

    chains_first, residues_first = _contacts(threshold[0])
    chains_second, residues_second = _contacts(threshold[1])

    return chains_first, residues_first, chains_second, residues_second

In [55]:
def get_protein_info(row):
    """Derive chain sequences and ligand-contact residues for one complex.

    Parameters
    ----------
    row : indexable
        ``row[0]`` is the PDB id and ``row[-1]`` a comma-separated list of
        ligand ids; each id names a ``<id>.pdb`` file in ``{path}/{pdbid}/``.

    Returns
    -------
    tuple or None
        ``None`` when no chain yields a sequence, or when no ligand has any
        chain within the second cut-off. Otherwise a 7-tuple:
        comma-joined chain names, comma-joined chain sequences, per-chain
        residue numbers (``,`` inside a chain, ``;`` between chains), then for
        the first cut-off the ``;``-joined chain lists per ligand and the list
        of residue-index lists per ligand, and the same pair for the second
        cut-off.

    Notes
    -----
    Reads the module-level names ``pdb_parser``, ``path`` and
    ``amino_acids_short``. The cut-offs are ``[8., 4.]`` in the units of the
    parsed coordinates (presumably angstroms, judging by the ``8A``/``4A``
    names -- confirm).
    """
    pdbid, ligand_info = row[0], row[-1]
    
    """ Load protein info """
    structure = pdb_parser.get_structure(pdbid, f"{path}/{pdbid}/{pdbid}_protein.pdb")
    protein_atom_coords, chain_results, atom_chain_name_list, pdb_sequence_list, residue_index_list = list(), list(), list(), list(), list()
    # `reindexing` is a zero-based residue counter that runs across all chains
    # (it is not reset per chain); binding-site residues are reported with it.
    atom_residue_list, reindexing = list(), 0
    
    """ Read protein info """
    # Only the first model (structure[0]) is used.
    for chain_name in list(structure[0].child_dict.keys()):
        chain = structure[0][chain_name]
        pdb_sequence, chain_residue_index_list = "", list()
        
        for residue in chain.get_residues():
            # Skip residues whose id has a non-blank first field or a non-blank
            # third field (insertion code).
            if residue.get_id()[0] != ' ' or residue.get_id()[2] != ' ':   # remove HETATM
                continue
            
            # Residues with a name outside amino_acids_short are ignored and do
            # not advance the counter.
            if residue.resname in amino_acids_short.keys():
                pdb_sequence += amino_acids_short[residue.resname]
                chain_residue_index_list.append(str(residue.get_id()[1]) + residue.get_id()[2].strip())
                
                for atom in residue:
                    protein_atom_coords.append(atom.get_coord())
                    atom_chain_name_list.append(chain_name)
                    atom_residue_list.append(reindexing)
                    #atom_residue_list.append(str(residue.get_id()[1]) + residue.get_id()[2].strip())
                    
                reindexing += 1

        # Chains without any accepted residue are left out of the results.
        if len(pdb_sequence) != 0:
            pdb_sequence_list.append(pdb_sequence)
            chain_results.append(chain_name)
            residue_index_list.append(",".join(chain_residue_index_list))

    if ",".join(chain_results) == '':
        return None
    
    # One entry per ligand in each of these four lists.
    chain8A, BS8A, chain4A, BS4A = list(), list(), list(), list()
    
    """ Load ligand info """
    for ligand_code_info in ligand_info.split(","):
        ligand_atom_coords = list()
        ligand_structure = pdb_parser.get_structure(pdbid, f"{path}/{pdbid}/{ligand_code_info}.pdb") 
    
        """ Read ligand info """
        for chain_name in list(ligand_structure[0].child_dict.keys()):
            chain = ligand_structure[0][chain_name]
        
            for residue in chain.get_residues():
                for atom in residue:
                    ligand_atom_coords.append(atom.get_coord())
                    
        """ Get distance (8A, 4A) binding sites """ 
        chain_8A, bs_8A, chain_4A, bs_4A = get_binding_sites(protein_atom_coords, 
                                                                      ligand_atom_coords, 
                                                                      np.array(atom_chain_name_list), 
                                                                      np.array(atom_residue_list), 
                                                                      [8., 4.])
    
        # A ligand without contacts still gets a placeholder entry so the
        # per-ligand lists stay aligned with ligand_info.
        if len(chain_8A) != 0:
            chain8A.append(",".join(chain_8A))
            BS8A.append(bs_8A)        
        else:
            chain8A.append("")
            BS8A.append(list())
        
        if len(chain_4A) != 0:
            chain4A.append(",".join(chain_4A))
            BS4A.append(bs_4A)        
        else:
            chain4A.append("")
            BS4A.append(list())
    
    # Reject the complex when no ligand has any chain within the second cut-off.
    if "".join(chain4A) == "":
        return None
    
    else:
        return ",".join(chain_results), ",".join(pdb_sequence_list), ";".join(residue_index_list), \
                    ";".join(chain8A), BS8A, ";".join(chain4A), BS4A


In [56]:
def get_pocket_chain_info_bulk(df):
    """Run ``get_protein_info`` on every row of one DataFrame partition.

    Used as the ``func`` handed to ``parallelize_dataframe``, so it receives a
    slice of a complex table and must return something ``pd.concat`` can
    stitch together with the results of the other partitions.

    Args:
        df: DataFrame partition; each row is passed to ``get_protein_info``.

    Returns:
        A Series indexed like ``df`` holding the value ``get_protein_info``
        returned for each row. An empty partition yields an empty
        object-dtype Series.
    """
    # An empty partition arises when the table has fewer rows than partitions.
    # DataFrame.apply on an empty frame does not give back an empty Series: it
    # probes the function with a dummy row and may return a DataFrame copy,
    # which would corrupt the concatenated per-row results. Short-circuit.
    if len(df) == 0:
        return pd.Series(index=df.index, dtype=object)

    return df.apply(get_protein_info, axis = 1)

In [57]:
# Module-level Biopython PDB parser shared by the structure-loading code above
# (it calls pdb_parser.get_structure(...)); QUIET=True silences parser warnings.
pdb_parser = PDB.PDBParser(QUIET = True)

#### 3.1 scPDB dataset

In [58]:
# Root directory of the scPDB structure files. The extraction code above reads this
# module-level name when building f"{path}/{pdbid}/{ligand_code_info}.pdb", so it
# must be set before the parallel run below.
path = "./data/scPDB/structure"

In [59]:
# Split scPDB_df into 10 partitions and run get_pocket_chain_info_bulk on each in a
# worker pool; the result is a list with one entry per partition.
scPDB_info_results = parallelize_dataframe(scPDB_df, get_pocket_chain_info_bulk, 10)

In [60]:
# Stitch the per-partition results back into a single pandas object.
scPDB_info_results = pd.concat(scPDB_info_results)

In [61]:
# Spread each per-complex result tuple over one column per field (tuple position ->
# column name, in order); complexes whose result is None get None in every column.
_pocket_fields = ("Chain", "PDB_seqs", "PDB_indexes", "Chain_8A", "BS_8A", "Chain_4A", "BS_4A")
for _pos, _col in enumerate(_pocket_fields):
    scPDB_df[_col] = scPDB_info_results.map(
        lambda info, _pos=_pos: None if info is None else info[_pos]
    )
scPDB_df

Unnamed: 0,PDB,Lig_code,scPDB,OE_stereo_SMILES,File_ligand_code,RDKit_iso_SMILES,Total_Lig_info,Final_lig_info,Check_lig_info,Chain,PDB_seqs,PDB_indexes,Chain_8A,BS_8A,Chain_4A,BS_4A
0,4nx6,FOL,4nx6_1,c1cc(ccc1C(=O)N[C@@H](CCC(=O)O)C(=O)O)NCc2cnc3...,FOL201,NC1=NC(=O)C2=NC(CNC3=CC=C(C(=O)N[C@@H](CCC(=O)...,FOL_A_201,FOL_A_201,FOL_A_201,A,MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLNKPVIMGRHT...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[3, 4, 5, 6, 7, 13, 17, 18, 19, 21, 22, 23, 2...",A,"[[4, 5, 6, 19, 21, 26, 27, 28, 29, 30, 31, 45,..."
1,3tqx,PLP,3tqx_1,Cc1c(c(c(cn1)COP(=O)(O)O)C=O)O,PLP397,CC1=C(O)C(C=O)=C(COP(=O)(O)O)C=N1,"PLP_A_397,PLP_B_397",PLP_A_397,"PLP_A_397,PLP_B_397","A,B",QEILSQLNKEIEGLKKAGLYKSERIITSPQNAEIKVGEKEVLNFCA...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...","A,B;A,B","[[46, 48, 103, 104, 105, 106, 107, 108, 109, 1...","A,B;A,B","[[105, 106, 107, 110, 131, 133, 175, 202, 204,..."
2,4bek,XK0,4bek_1,C[C@]1(CCSC(=N1)N)c2ccc(cc2)OC,XK01447,COC1=CC=C([C@]2(C)CCSC(N)=N2)C=C1,XK0_A_1447,XK0_A_1447,XK0_A_1447,A,RGSFVEMVDNLRGKSGQGYYVEMTVGSPPQTLNILVDTGSSNFAVG...,"57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,7...",A,"[[15, 16, 17, 18, 34, 35, 36, 37, 38, 39, 40, ...",A,"[[34, 36, 38, 39, 75, 112, 119, 122, 220, 222,..."
3,4dca,ADP,4dca_1,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,ADP401,NC1=C2N=CN([C@@H]3O[C@H](CO[P@](=O)(O)OP(=O)(O...,ADP_A_401,ADP_A_401,ADP_A_401,A,LDAEIYEHLNKQIKINELRYLSSGDDSDTFLCNEQYVVKVPKRDSV...,"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,2...",A,"[[19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, ...",A,"[[20, 21, 28, 36, 38, 81, 82, 83, 191, 192, 20..."
4,4pah,LNR,4pah_1,c1cc(c(cc1[C@H](CN)O)O)O,LNR600,NC[C@H](O)C1=CC(O)=C(O)C=C1,LNR_A_600,LNR_A_600,LNR_A_600,A,TVPWFPRTIQELDRFANQILSYGAELDADHPGFKDPVYRARRKQFA...,"117,118,119,120,121,122,123,124,125,126,127,12...",A,"[[130, 131, 132, 133, 134, 135, 137, 138, 146,...",A,"[[131, 137, 168, 173, 208, 213]]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17151,3w2f,FAD,3w2f_1,Cc1cc2c(cc1C)N(C3=NC(=O)NC(=O)C3=N2)C[C@@H]([C...,FAD301,CC1=CC2=C(C=C1C)N(C[C@H](O)[C@H](O)[C@H](O)CO[...,FAD_A_301,FAD_A_301,FAD_A_301,A,TPAITLENPDIKYPLRLIDKEVVNHDTRRFRFALPSPEHILGLPVG...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",A,"[[25, 26, 27, 29, 45, 46, 47, 48, 49, 50, 54, ...",A,"[[47, 61, 62, 63, 64, 78, 79, 80, 82, 83, 86, ..."
17152,3af0,GDP,3af0_1,c1nc2c(n1[C@H]3[C@@H]([C@@H]([C@H](O3)CO[P@](=...,GDP313,NC1=NC2=C(N=CN2[C@@H]2O[C@H](CO[P@](=O)(O)OP(=...,GDP_A_313,GDP_A_313,GDP_A_313,A,EPSPYVEFDRRQWRALRMSTPLALTEEELVGLRGLGEQIDLLEVEE...,"6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22...",A,"[[31, 32, 33, 34, 35, 36, 89, 90, 91, 92, 93, ...",A,"[[92, 93, 94, 95, 96, 97, 98, 173, 174, 232, 2..."
17153,4dym,IYZ,4dym_1,CC(=O)c1cccc(c1)c2cnc3n2nc(cc3)NCC4CC4,IYZ503,CC(=O)C1=CC=CC(C2=CN=C3C=CC(NCC4CC4)=NN23)=C1,IYZ_A_503,IYZ_A_503,IYZ_A_503,A,QITLLECVGKGRYGEVWRGSWQGENVAVKIFSSRDEKSWFRETELY...,"207,208,209,210,211,212,213,214,215,216,217,21...",A,"[[7, 8, 9, 10, 12, 14, 15, 16, 17, 25, 26, 27,...",A,"[[12, 15, 26, 28, 56, 76, 77, 78, 79, 82, 83, ..."
17154,2i47,KGY,2i47_4,CC#CCOc1ccc(cc1)S(=O)(=O)CC2(CCN(CC2)S(=O)(=O)...,KGY1002,CC#CCOC1=CC=C(S(=O)(=O)CC2(C(=O)NO)CCN(S(=O)(=...,"KGY_C_1001,KGY_D_1002",KGY_C_1001,"KGY_C_1001,KGY_D_1002","A,B,C,D",PMKNTCKLLVVADHRFYRYMGRGEESTTTNYLIELIDRVDDIYRNT...,"220,221,222,223,224,225,226,227,228,229,230,23...","C,D;D","[[601, 602, 603, 604, 606, 631, 632, 633, 634,...",C;D,"[[602, 603, 634, 635, 636, 637, 638, 686, 689,..."


In [62]:
# Keep only complexes with no missing value in any column, then renumber rows from 0.
scPDB_df = (
    scPDB_df
    .dropna(axis="index")
    .reset_index(drop=True)
)
print(f"[scPDB] Complexes: {len(scPDB_df)}")

[scPDB] Complexes: 17155


#### 3.2 PDBbind dataset

In [63]:
# Re-point the module-level structure root at the PDBbind files; the extraction code
# above builds its file names from this global, so set it before the parallel run.
path = "./data/PDB/structure/PDBbind"

In [64]:
# Split PDBbind_df into 10 partitions and run get_pocket_chain_info_bulk on each in a
# worker pool; the result is a list with one entry per partition.
PDBbind_info_results = parallelize_dataframe(PDBbind_df, get_pocket_chain_info_bulk, 10)

In [65]:
# Stitch the per-partition results back into a single pandas object.
PDBbind_info_results = pd.concat(PDBbind_info_results)

In [66]:
# Spread each per-complex result tuple over one column per field (tuple position ->
# column name, in order); complexes whose result is None get None in every column.
_pocket_fields = ("Chain", "PDB_seqs", "PDB_indexes", "Chain_8A", "BS_8A", "Chain_4A", "BS_4A")
for _pos, _col in enumerate(_pocket_fields):
    PDBbind_df[_col] = PDBbind_info_results.map(
        lambda info, _pos=_pos: None if info is None else info[_pos]
    )
PDBbind_df

Unnamed: 0,PDB,Lig_code,Total_Lig_info,Final_lig_info,Check_lig_info,Chain,PDB_seqs,PDB_indexes,Chain_8A,BS_8A,Chain_4A,BS_4A
0,3wka,S0G,S0G_A_603,S0G_A_603,S0G_A_603,A,TLRAAVFDLDGVLALPAVFGVLGRTEEALALPRGLLNDAFQKGGPE...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",A,"[[264, 265, 266, 267, 332, 333, 334, 357, 378,...",A,"[[265, 381, 417, 494, 495, 496, 522, 523]]"
1,5kgx,7SK,7SK_A_301,7SK_A_301,7SK_A_301,A,CSPGIWQLDTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFLL...,"56,57,58,59,60,61,62,63,64,66,67,68,69,70,71,7...",A,"[[94, 95, 96, 97, 98, 99, 100, 101, 102, 103, ...",A,"[[97, 98, 99, 100, 102, 103, 107]]"
2,5wqc,7MA,7MA_A_2001,7MA_A_2001,7MA_A_2001,A,FLRYLWREYLHPKEYEWVLIAGYIIVFVVALIGNVLVCVAVWKNHH...,"39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,5...",A,"[[22, 64, 68, 69, 71, 72, 73, 75, 76, 80, 81, ...",A,"[[72, 92, 95, 96, 99, 148, 152, 170, 182, 185,..."
3,5aen,DP8,DP8_A_611,DP8_A_611,DP8_A_611,A,IVDTCSLASPASVCRTKHLHLRCSVDFTRRTLTGTAALTVQSQEDN...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...",A,"[[4, 131, 132, 133, 134, 135, 136, 197, 262, 2...",A,"[[133, 134, 264, 266, 267, 308, 311, 364, 366,..."
4,1utp,PBN,PBN_A_1246,PBN_A_1246,PBN_A_1246,A,IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGI...,"16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,3...",A,"[[0, 1, 23, 24, 37, 39, 40, 80, 83, 117, 121, ...",A,"[[39, 168, 169, 170, 171, 174, 188, 190, 191, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
14398,3fci,3FI,3FI_A_1,3FI_A_1,3FI_A_1,A,MEFFGESWKKHLSGEFGKPYFIKLMGFVAEERKHYTVYPPPHQVFT...,"82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,9...",A,"[[60, 61, 62, 63, 64, 65, 66, 70, 71, 72, 73, ...",A,"[[61, 62, 63, 65, 75, 76, 87, 122, 164, 165, 1..."
14399,5j9l,6HF,6HF_A_501,6HF_A_501,6HF_A_501,A,SLHMIDYKEIEVEEVVGRGGVVCKAWRADVAIKQIESESERKAFIV...,"27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,4...",A,"[[13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 29, ...",A,"[[15, 16, 21, 30, 59, 71, 72, 73, 74, 75, 77, ..."
14400,2q9n,LK5,LK5_A_0,LK5_A_0,LK5_A_0,A,PVSEKQLAEVVANTITPLMKAQSVPGMAVAVIYQGKPHYYTFGKAD...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",A,"[[60, 61, 62, 63, 64, 65, 116, 117, 118, 119, ...",A,"[[62, 117, 118, 148, 150, 202, 219, 315, 316, ..."
14401,3np7,Z15,"Z15_A_997,Z15_A_998",Z15_A_997,Z15_A_997,A,QISVRGLAGVENVTELKKNFNRHLHFTLVKDRNVATPRDYYFALAH...,"12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,2...",A,"[[1, 4, 81, 84, 88, 93, 108, 109, 111, 112, 46...",A,"[[4, 84, 108, 112, 470, 471, 473, 520, 521, 52..."


In [67]:
# Keep only complexes with no missing value in any column, then renumber rows from 0.
PDBbind_df = (
    PDBbind_df
    .dropna(axis="index")
    .reset_index(drop=True)
)
print(f"[PDBbind] Complexes: {len(PDBbind_df)}")

[PDBbind] Complexes: 14401


#### 3.3 CASF2016 dataset

In [68]:
# Load the step-3 CASF2016 label table and copy the three columns handed to the
# pocket-extraction step into a fresh frame. The source column is spelled
# "Check_Lig_info" and is stored here as "Check_lig_info".
CASF2016_df = pd.read_csv("./preprocessed_data/step3_CASF2016_labels.tsv", sep = "\t")
CASF2016_BS_df = pd.DataFrame(
    {
        "PDB": list(CASF2016_df["PDB"].values),
        "Lig_code": list(CASF2016_df["Lig_code"].values),
        "Check_lig_info": list(CASF2016_df["Check_Lig_info"].values),
    }
)

In [69]:
# Re-point the module-level structure root at the CASF2016 files before the parallel run.
# NOTE(review): the trailing slash makes f"{path}/{pdbid}/..." contain "//";
# presumably harmless on POSIX file systems - confirm.
path = "./data/PDB/structure/CASF2016/"

In [70]:
# Split CASF2016_BS_df into 10 partitions and run get_pocket_chain_info_bulk on each in
# a worker pool; the result is a list with one entry per partition.
CASF2016_info_results = parallelize_dataframe(CASF2016_BS_df, get_pocket_chain_info_bulk, 10)

In [71]:
# Stitch the per-partition results back into a single pandas object.
CASF2016_info_results = pd.concat(CASF2016_info_results)

In [72]:
# Spread each per-complex result tuple over one column per field (tuple position ->
# column name, in order); complexes whose result is None get None in every column.
_pocket_fields = ("Chain", "PDB_seqs", "PDB_indexes", "Chain_8A", "BS_8A", "Chain_4A", "BS_4A")
for _pos, _col in enumerate(_pocket_fields):
    CASF2016_BS_df[_col] = CASF2016_info_results.map(
        lambda info, _pos=_pos: None if info is None else info[_pos]
    )

In [73]:
# Keep only complexes with no missing value in any column, then renumber rows from 0.
CASF2016_BS_df = (
    CASF2016_BS_df
    .dropna(axis="index")
    .reset_index(drop=True)
)
CASF2016_BS_df

Unnamed: 0,PDB,Lig_code,Check_lig_info,Chain,PDB_seqs,PDB_indexes,Chain_8A,BS_8A,Chain_4A,BS_4A
0,1bcu,PRL,PRL_H_280,"L,H,I","CGLRPLFEKKSLED,IVEGSDAEIGMSPWQVMLFRKPQELLCGASL...","1,2,3,4,5,6,7,8,9,10,11,12,13,14;16,17,18,19,2...",H,"[[15, 55, 97, 136, 141, 143, 144, 155, 178, 18...",H,"[[184, 185, 186, 187, 190, 208, 210, 211, 213,..."
1,1bzc,TPI,TPI_A_902,A,EMEKEFEQIDKSGSWAAIYQDIRHEASDFPCRVAKLPKNKNRNRYR...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",A,"[[34, 42, 43, 44, 45, 46, 47, 82, 83, 84, 108,...",A,"[[44, 45, 46, 47, 118, 179, 180, 213, 214, 215..."
2,1c5z,BEN,BEN_B_251,"A,B","LKFQCGQKT,IIGGEFTTIENQPWFAAIYRRHVTYVCGGSLMSPCW...","9,10,11,12,13,14,15,16,17;16,17,18,19,20,21,22...",B,"[[9, 10, 35, 48, 50, 51, 92, 95, 131, 135, 136...",B,"[[182, 183, 184, 185, 188, 206, 207, 208, 209,..."
3,1e66,HUX,HUX_A_803,A,SELLVNTKSGKVMGTRVPVLSSHISAFLGIPFAEPPVGNMRFRRPE...,"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,2...",A,"[[65, 68, 71, 74, 75, 76, 77, 78, 79, 80, 81, ...",A,"[[80, 114, 115, 117, 195, 196, 286, 326, 327, ..."
4,1eby,BEB,BEB_B_501,"A,B",PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","A,B","[[7, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 3...","A,B","[[7, 22, 24, 26, 27, 28, 29, 31, 46, 47, 48, 4..."
...,...,...,...,...,...,...,...,...,...,...
274,5aba,UL7,"UL7_A_1291,UL7_B_1291","A,B",SVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKLFCQLAKTC...,"96,97,98,99,100,101,102,103,104,105,106,107,10...",A;B,"[[11, 13, 14, 48, 49, 50, 51, 52, 53, 54, 55, ...",A;B,"[[49, 51, 52, 53, 54, 55, 124, 125, 126, 127, ..."
275,5c28,4XV,4XV_A_803,"A,B",HMSICTSEEWQGLMQFTLPVRLCKEIELFHFDIGPFENMWPGIFVY...,"437,438,439,440,441,442,443,444,445,446,447,44...","A,B","[[77, 78, 82, 186, 188, 224, 226, 227, 228, 22...",A,"[[228, 230, 231, 245, 246, 266, 279, 282]]"
276,5c2h,4XU,4XU_B_803,"A,B",QFTLPVRLCKEIELFHFDIGPFENMWPGIFVYMVHRSCGTSCFELE...,"451,452,453,454,455,456,457,458,459,460,461,46...","A,B","[[225, 384, 385, 386, 390, 425, 428, 430, 431,...",B,"[[386, 536, 538, 539, 553, 554, 557, 573, 574,..."
277,5dwr,5H7,5H7_A_401,A,PLESQYQVGPLLGSGGFGSVYSGIRVSDNLPVAIKHVEKDRISDWG...,"33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,4...",A,"[[9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 2...",A,"[[11, 12, 16, 19, 32, 34, 71, 87, 88, 89, 90, ..."


#### 3.4 CASF2013 dataset

In [74]:
# Load the step-3 CASF2013 label table and copy the three columns handed to the
# pocket-extraction step into a fresh frame. The source column is spelled
# "Check_Lig_info" and is stored here as "Check_lig_info".
CASF2013_df = pd.read_csv("./preprocessed_data/step3_CASF2013_labels.tsv", sep = "\t")
CASF2013_BS_df = pd.DataFrame(
    {
        "PDB": list(CASF2013_df["PDB"].values),
        "Lig_code": list(CASF2013_df["Lig_code"].values),
        "Check_lig_info": list(CASF2013_df["Check_Lig_info"].values),
    }
)

In [75]:
# Re-point the module-level structure root at the CASF2013 files before the parallel run.
# NOTE(review): the trailing slash makes f"{path}/{pdbid}/..." contain "//";
# presumably harmless on POSIX file systems - confirm.
path = "./data/PDB/structure/CASF2013/"

In [76]:
# Split CASF2013_BS_df into 10 partitions and run get_pocket_chain_info_bulk on each in
# a worker pool; the result is a list with one entry per partition.
CASF2013_info_results = parallelize_dataframe(CASF2013_BS_df, get_pocket_chain_info_bulk, 10)

In [77]:
# Stitch the per-partition results back into a single pandas object.
CASF2013_info_results = pd.concat(CASF2013_info_results)

In [78]:
# Spread each per-complex result tuple over one column per field (tuple position ->
# column name, in order); complexes whose result is None get None in every column.
_pocket_fields = ("Chain", "PDB_seqs", "PDB_indexes", "Chain_8A", "BS_8A", "Chain_4A", "BS_4A")
for _pos, _col in enumerate(_pocket_fields):
    CASF2013_BS_df[_col] = CASF2013_info_results.map(
        lambda info, _pos=_pos: None if info is None else info[_pos]
    )

In [79]:
# Keep only complexes with no missing value in any column, then renumber rows from 0.
CASF2013_BS_df = (
    CASF2013_BS_df
    .dropna(axis="index")
    .reset_index(drop=True)
)
CASF2013_BS_df

Unnamed: 0,PDB,Lig_code,Check_lig_info,Chain,PDB_seqs,PDB_indexes,Chain_8A,BS_8A,Chain_4A,BS_4A
0,10gs,VWW,"VWW_A_210,VWW_B_210","A,B",PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...","A,B;A,B","[[5, 6, 7, 8, 9, 10, 11, 12, 15, 31, 32, 33, 3...","A,B;A,B","[[5, 6, 8, 11, 36, 42, 48, 49, 50, 51, 62, 63,..."
1,1bcu,PRL,PRL_H_280,"L,H,I","CGLRPLFEKKSLED,IVEGSDAEIGMSPWQVMLFRKPQELLCGASL...","1,2,3,4,5,6,7,8,9,10,11,12,13,14;16,17,18,19,2...",H,"[[15, 55, 97, 136, 141, 143, 144, 155, 178, 18...",H,"[[184, 185, 186, 187, 190, 208, 210, 211, 213,..."
2,1e66,HUX,HUX_A_803,A,SELLVNTKSGKVMGTRVPVLSSHISAFLGIPFAEPPVGNMRFRRPE...,"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,2...",A,"[[65, 68, 71, 74, 75, 76, 77, 78, 79, 80, 81, ...",A,"[[80, 114, 115, 117, 195, 196, 286, 326, 327, ..."
3,1f8b,DAN,DAN_A_0,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,"82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,9...",A,"[[36, 37, 52, 54, 67, 68, 69, 70, 74, 96, 97, ...",A,"[[36, 37, 69, 70, 96, 140, 142, 164, 194, 195,..."
4,1f8c,4AM,4AM_A_4,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,"82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,9...",A,"[[36, 37, 52, 54, 67, 68, 69, 70, 74, 96, 97, ...",A,"[[36, 37, 69, 70, 96, 140, 142, 164, 194, 195,..."
...,...,...,...,...,...,...,...,...,...,...
172,4djv,0KM,"0KM_A_501,0KM_B_501","A,B",GSFVEMVDNLRGKSGQGYYVEMTVGSPPQTLNILVDTGSSNFAVGA...,"58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,7...",A;B,"[[12, 13, 14, 15, 16, 17, 33, 34, 35, 36, 37, ...",A;B,"[[13, 14, 15, 16, 33, 35, 37, 38, 74, 79, 111,..."
173,4g8m,G8M,"G8M_A_301,G8M_B_901","A,B",ANKTVVVTTILESPYVMMKKNHEMLEGNERYEGYCVDLAAEIAKHC...,"-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17...",A;B,"[[9, 11, 14, 58, 59, 60, 61, 72, 73, 86, 87, 8...",A;B,"[[59, 87, 88, 89, 94, 136, 139, 140, 141, 191,..."
174,4gid,0GH,"0GH_A_501,0GH_B_501,0GH_C_501,0GH_D_501","A,B,C,D",FVEMVDNLRGKSGQGYYVEMTVGSPPQTLNILVDTGSSNFAVGAAP...,"47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,6...",A;B;C;D,"[[9, 10, 11, 12, 13, 14, 15, 31, 32, 33, 34, 3...",A;B;C;D,"[[12, 13, 14, 31, 33, 35, 70, 71, 72, 73, 74, ..."
175,4gqq,0XR,0XR_A_502,A,YSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNE...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",A,"[[232, 233, 234, 235, 236, 237, 238, 240, 241,...",A,"[[234, 242, 243, 253, 255, 283, 284, 285, 286]]"


#### 3.5 CSAR2014 dataset

In [80]:
# Load the step-3 CSAR2014 label table and copy the three columns handed to the
# pocket-extraction step into a fresh frame. The source column is spelled
# "Check_Lig_info" and is stored here as "Check_lig_info".
CSAR2014_df = pd.read_csv("./preprocessed_data/step3_CSAR2014_labels.tsv", sep = "\t")
CSAR2014_BS_df = pd.DataFrame(
    {
        "PDB": list(CSAR2014_df["PDB"].values),
        "Lig_code": list(CSAR2014_df["Lig_code"].values),
        "Check_lig_info": list(CSAR2014_df["Check_Lig_info"].values),
    }
)

In [81]:
# Re-point the module-level structure root at the CSAR2014 files before the parallel run.
# NOTE(review): the trailing slash makes f"{path}/{pdbid}/..." contain "//";
# presumably harmless on POSIX file systems - confirm.
path = "./data/PDB/structure/CSAR2014/"

In [82]:
# Split CSAR2014_BS_df into 10 partitions and run get_pocket_chain_info_bulk on each in
# a worker pool; the result is a list with one entry per partition.
CSAR2014_info_results = parallelize_dataframe(CSAR2014_BS_df, get_pocket_chain_info_bulk, 10)

In [83]:
# Stitch the per-partition results back into a single pandas object.
CSAR2014_info_results = pd.concat(CSAR2014_info_results)

In [84]:
# Spread each per-complex result tuple over one column per field (tuple position ->
# column name, in order); complexes whose result is None get None in every column.
_pocket_fields = ("Chain", "PDB_seqs", "PDB_indexes", "Chain_8A", "BS_8A", "Chain_4A", "BS_4A")
for _pos, _col in enumerate(_pocket_fields):
    CSAR2014_BS_df[_col] = CSAR2014_info_results.map(
        lambda info, _pos=_pos: None if info is None else info[_pos]
    )
CSAR2014_BS_df

Unnamed: 0,PDB,Lig_code,Check_lig_info,Chain,PDB_seqs,PDB_indexes,Chain_8A,BS_8A,Chain_4A,BS_4A
0,4ypw,4FD,4FD_A_301,A,GSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTF...,"-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16...",A,"[[62, 63, 64, 88, 89, 90, 91, 92, 93, 94, 95, ...",A,"[[88, 89, 90, 91, 115, 117, 118, 119, 134, 135..."
1,4ypx,4FG,4FG_A_301,A,RGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFT...,"-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15...",A,"[[89, 90, 91, 92, 93, 94, 95, 96, 97, 114, 115...",A,"[[90, 91, 92, 135, 136, 137, 139, 140, 141, 14..."
2,4ypy,4F9,4F9_A_301,A,SHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFD...,"-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17...",A,"[[87, 88, 89, 90, 91, 92, 93, 94, 95, 112, 113...",A,"[[88, 89, 90, 133, 134, 135, 137, 139, 141, 14..."
3,4ypz,4FL,4FL_A_301,A,SHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFD...,"-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17...",A,"[[87, 88, 89, 90, 91, 92, 93, 94, 95, 112, 113...",A,"[[88, 89, 90, 114, 132, 133, 134, 135, 139, 14..."
4,4yq0,4FM,4FM_A_301,A,GSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTF...,"-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16...",A,"[[62, 63, 64, 69, 88, 89, 90, 91, 92, 93, 94, ...",A,"[[88, 89, 90, 91, 115, 116, 117, 119, 120, 133..."
5,4yq1,4FN,4FN_A_301,A,GLVPRGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNP...,"-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11...",A,"[[67, 68, 69, 92, 93, 94, 95, 96, 97, 98, 99, ...",A,"[[94, 95, 96, 120, 121, 122, 123, 124, 138, 13..."
6,4yq2,EFY,EFY_A_301,A,RGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFT...,"-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15...",A,"[[63, 64, 65, 88, 89, 90, 91, 92, 93, 94, 95, ...",A,"[[90, 91, 92, 116, 117, 118, 120, 134, 135, 13..."
7,4yq3,4G1,4G1_A_301,A,GLVPRGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNP...,"-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11...",A,"[[67, 69, 93, 94, 95, 96, 97, 98, 99, 100, 101...",A,"[[93, 94, 95, 96, 120, 121, 122, 124, 138, 139..."
8,4yq4,4G3,4G3_A_301,A,GSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTF...,"-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16...",A,"[[62, 63, 64, 88, 89, 90, 91, 92, 93, 94, 95, ...",A,"[[88, 89, 90, 91, 115, 117, 118, 119, 134, 135..."
9,4yq5,4G0,4G0_A_301,A,LVPRGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPR...,"-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12...",A,"[[66, 67, 68, 92, 93, 94, 95, 96, 97, 98, 99, ...",A,"[[92, 93, 94, 95, 119, 122, 123, 138, 139, 140..."


In [85]:
# Keep only complexes with no missing value in any column, then renumber rows from 0.
CSAR2014_BS_df = (
    CSAR2014_BS_df
    .dropna(axis="index")
    .reset_index(drop=True)
)
CSAR2014_BS_df

Unnamed: 0,PDB,Lig_code,Check_lig_info,Chain,PDB_seqs,PDB_indexes,Chain_8A,BS_8A,Chain_4A,BS_4A
0,4ypw,4FD,4FD_A_301,A,GSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTF...,"-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16...",A,"[[62, 63, 64, 88, 89, 90, 91, 92, 93, 94, 95, ...",A,"[[88, 89, 90, 91, 115, 117, 118, 119, 134, 135..."
1,4ypx,4FG,4FG_A_301,A,RGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFT...,"-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15...",A,"[[89, 90, 91, 92, 93, 94, 95, 96, 97, 114, 115...",A,"[[90, 91, 92, 135, 136, 137, 139, 140, 141, 14..."
2,4ypy,4F9,4F9_A_301,A,SHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFD...,"-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17...",A,"[[87, 88, 89, 90, 91, 92, 93, 94, 95, 112, 113...",A,"[[88, 89, 90, 133, 134, 135, 137, 139, 141, 14..."
3,4ypz,4FL,4FL_A_301,A,SHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFD...,"-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17...",A,"[[87, 88, 89, 90, 91, 92, 93, 94, 95, 112, 113...",A,"[[88, 89, 90, 114, 132, 133, 134, 135, 139, 14..."
4,4yq0,4FM,4FM_A_301,A,GSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTF...,"-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16...",A,"[[62, 63, 64, 69, 88, 89, 90, 91, 92, 93, 94, ...",A,"[[88, 89, 90, 91, 115, 116, 117, 119, 120, 133..."
5,4yq1,4FN,4FN_A_301,A,GLVPRGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNP...,"-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11...",A,"[[67, 68, 69, 92, 93, 94, 95, 96, 97, 98, 99, ...",A,"[[94, 95, 96, 120, 121, 122, 123, 124, 138, 13..."
6,4yq2,EFY,EFY_A_301,A,RGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFT...,"-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15...",A,"[[63, 64, 65, 88, 89, 90, 91, 92, 93, 94, 95, ...",A,"[[90, 91, 92, 116, 117, 118, 120, 134, 135, 13..."
7,4yq3,4G1,4G1_A_301,A,GLVPRGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNP...,"-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11...",A,"[[67, 69, 93, 94, 95, 96, 97, 98, 99, 100, 101...",A,"[[93, 94, 95, 96, 120, 121, 122, 124, 138, 139..."
8,4yq4,4G3,4G3_A_301,A,GSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTF...,"-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16...",A,"[[62, 63, 64, 88, 89, 90, 91, 92, 93, 94, 95, ...",A,"[[88, 89, 90, 91, 115, 117, 118, 119, 134, 135..."
9,4yq5,4G0,4G0_A_301,A,LVPRGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPR...,"-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12...",A,"[[66, 67, 68, 92, 93, 94, 95, 96, 97, 98, 99, ...",A,"[[92, 93, 94, 95, 119, 122, 123, 138, 139, 140..."


#### 3.6 CSAR2012 dataset 

In [86]:
# Load the step-3 CSAR2012 label table and copy the three columns handed to the
# pocket-extraction step into a fresh frame. The source column is spelled
# "Check_Lig_info" and is stored here as "Check_lig_info".
CSAR2012_df = pd.read_csv("./preprocessed_data/step3_CSAR2012_labels.tsv", sep = "\t")
CSAR2012_BS_df = pd.DataFrame(
    {
        "PDB": list(CSAR2012_df["PDB"].values),
        "Lig_code": list(CSAR2012_df["Lig_code"].values),
        "Check_lig_info": list(CSAR2012_df["Check_Lig_info"].values),
    }
)

In [87]:
# Re-point the module-level structure root at the CSAR2012 files before the parallel run.
# NOTE(review): the trailing slash makes f"{path}/{pdbid}/..." contain "//";
# presumably harmless on POSIX file systems - confirm.
path = "./data/PDB/structure/CSAR2012/"

In [88]:
# Split CSAR2012_BS_df into 10 partitions and run get_pocket_chain_info_bulk on each in
# a worker pool; the result is a list with one entry per partition.
CSAR2012_info_results = parallelize_dataframe(CSAR2012_BS_df, get_pocket_chain_info_bulk, 10)

In [89]:
# Stitch the per-partition results back into a single pandas object.
CSAR2012_info_results = pd.concat(CSAR2012_info_results)

In [90]:
# Spread each per-complex result tuple over one column per field (tuple position ->
# column name, in order); complexes whose result is None get None in every column.
_pocket_fields = ("Chain", "PDB_seqs", "PDB_indexes", "Chain_8A", "BS_8A", "Chain_4A", "BS_4A")
for _pos, _col in enumerate(_pocket_fields):
    CSAR2012_BS_df[_col] = CSAR2012_info_results.map(
        lambda info, _pos=_pos: None if info is None else info[_pos]
    )
CSAR2012_BS_df

Unnamed: 0,PDB,Lig_code,Check_lig_info,Chain,PDB_seqs,PDB_indexes,Chain_8A,BS_8A,Chain_4A,BS_4A
0,4fud,6UP,6UP_A_301,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[0, 1, 30, 45, 91, 93, 96, 136, 141, 144, 149...",A,"[[191, 192, 194, 197, 215, 217, 218, 220, 221,..."
1,4fue,7UP,7UP_A_301,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[0, 1, 19, 29, 30, 43, 44, 45, 46, 47, 48, 49...",A,"[[45, 48, 49, 93, 191, 192, 193, 194, 197, 215..."
2,4fu7,1UP,1UP_A_305,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[0, 1, 30, 45, 46, 90, 91, 93, 96, 136, 141, ...",A,"[[191, 192, 193, 194, 197, 215, 216, 217, 218,..."
3,4fu8,2UP,2UP_A_301,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[0, 1, 30, 45, 91, 93, 96, 136, 141, 144, 149...",A,"[[191, 192, 193, 194, 197, 215, 216, 217, 218,..."
4,4fu9,675,675_A_313,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[0, 1, 29, 30, 43, 44, 45, 46, 47, 48, 49, 50...",A,"[[45, 49, 93, 191, 192, 193, 194, 197, 215, 21..."
5,4fub,4UP,4UP_A_301,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[1, 30, 44, 45, 46, 47, 48, 49, 86, 88, 89, 9...",A,"[[45, 93, 191, 192, 194, 197, 215, 217, 218, 2..."
6,4fuc,239,239_A_301,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[0, 1, 29, 30, 43, 44, 45, 46, 47, 48, 49, 50...",A,"[[45, 48, 49, 86, 93, 191, 192, 193, 194, 197,..."
7,4fsm,HK1,HK1_A_301,A,VPFVEDWDLVQTLGEGEVQLAVNRVTEEAVAVKIVNIKKEICINKM...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,21,22,23...",A,"[[10, 11, 12, 13, 14, 16, 17, 18, 19, 28, 29, ...",A,"[[12, 17, 30, 52, 68, 69, 70, 71, 72, 73, 74, ..."
8,4fsw,HK6,HK6_A_301,A,VPFVEDWDLVQTLGEGGEVQLAVNRVTEEAVAVKIVNIKKEICINK...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,21,22...",A,"[[10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 30, ...",A,"[[12, 18, 31, 70, 71, 72, 73, 75, 76, 122]]"
9,4ft5,H2K,H2K_A_300,A,VPFVEDWDLVQTLGEVQLAVNRVTEEAVAVKIVDMNIKKEICINKM...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,22,23,24,25...",A,"[[10, 11, 12, 13, 14, 15, 16, 17, 26, 27, 28, ...",A,"[[12, 13, 15, 28, 30, 39, 52, 68, 69, 70, 71, ..."


In [91]:
# Keep only complexes with no missing value in any column, then renumber rows from 0.
CSAR2012_BS_df = (
    CSAR2012_BS_df
    .dropna(axis="index")
    .reset_index(drop=True)
)
CSAR2012_BS_df

Unnamed: 0,PDB,Lig_code,Check_lig_info,Chain,PDB_seqs,PDB_indexes,Chain_8A,BS_8A,Chain_4A,BS_4A
0,4fud,6UP,6UP_A_301,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[0, 1, 30, 45, 91, 93, 96, 136, 141, 144, 149...",A,"[[191, 192, 194, 197, 215, 217, 218, 220, 221,..."
1,4fue,7UP,7UP_A_301,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[0, 1, 19, 29, 30, 43, 44, 45, 46, 47, 48, 49...",A,"[[45, 48, 49, 93, 191, 192, 193, 194, 197, 215..."
2,4fu7,1UP,1UP_A_305,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[0, 1, 30, 45, 46, 90, 91, 93, 96, 136, 141, ...",A,"[[191, 192, 193, 194, 197, 215, 216, 217, 218,..."
3,4fu8,2UP,2UP_A_301,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[0, 1, 30, 45, 91, 93, 96, 136, 141, 144, 149...",A,"[[191, 192, 193, 194, 197, 215, 216, 217, 218,..."
4,4fu9,675,675_A_313,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[0, 1, 29, 30, 43, 44, 45, 46, 47, 48, 49, 50...",A,"[[45, 49, 93, 191, 192, 193, 194, 197, 215, 21..."
5,4fub,4UP,4UP_A_301,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[1, 30, 44, 45, 46, 47, 48, 49, 86, 88, 89, 9...",A,"[[45, 93, 191, 192, 194, 197, 215, 217, 218, 2..."
6,4fuc,239,239_A_301,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[0, 1, 29, 30, 43, 44, 45, 46, 47, 48, 49, 50...",A,"[[45, 48, 49, 86, 93, 191, 192, 193, 194, 197,..."
7,4fsm,HK1,HK1_A_301,A,VPFVEDWDLVQTLGEGEVQLAVNRVTEEAVAVKIVNIKKEICINKM...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,21,22,23...",A,"[[10, 11, 12, 13, 14, 16, 17, 18, 19, 28, 29, ...",A,"[[12, 17, 30, 52, 68, 69, 70, 71, 72, 73, 74, ..."
8,4fsw,HK6,HK6_A_301,A,VPFVEDWDLVQTLGEGGEVQLAVNRVTEEAVAVKIVNIKKEICINK...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,21,22...",A,"[[10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 30, ...",A,"[[12, 18, 31, 70, 71, 72, 73, 75, 76, 122]]"
9,4ft5,H2K,H2K_A_300,A,VPFVEDWDLVQTLGEVQLAVNRVTEEAVAVKIVDMNIKKEICINKM...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,22,23,24,25...",A,"[[10, 11, 12, 13, 14, 15, 16, 17, 26, 27, 28, ...",A,"[[12, 13, 15, 28, 30, 39, 52, 68, 69, 70, 71, ..."


#### 3.7 CSARset1 dataset

In [92]:
# Load the step-3 CSARset1 label table and copy the three columns handed to the
# pocket-extraction step into a fresh frame. The source column is spelled
# "Check_Lig_info" and is stored here as "Check_lig_info".
CSARset1_df = pd.read_csv("./preprocessed_data/step3_CSARset1_labels.tsv", sep = "\t")
CSARset1_BS_df = pd.DataFrame(
    {
        "PDB": list(CSARset1_df["PDB"].values),
        "Lig_code": list(CSARset1_df["Lig_code"].values),
        "Check_lig_info": list(CSARset1_df["Check_Lig_info"].values),
    }
)

In [93]:
# Re-point the module-level structure root at the CSARset1 files before the parallel run.
# NOTE(review): the trailing slash makes f"{path}/{pdbid}/..." contain "//";
# presumably harmless on POSIX file systems - confirm.
path = "./data/PDB/structure/CSARset1/"

In [94]:
# Split CSARset1_BS_df into 10 partitions and run get_pocket_chain_info_bulk on each in
# a worker pool; the result is a list with one entry per partition.
CSARset1_info_results = parallelize_dataframe(CSARset1_BS_df, get_pocket_chain_info_bulk, 10)

In [95]:
# Stitch the per-partition results back into a single pandas object.
CSARset1_info_results = pd.concat(CSARset1_info_results)

In [96]:
# Spread each per-complex result tuple over one column per field (tuple position ->
# column name, in order); complexes whose result is None get None in every column.
_pocket_fields = ("Chain", "PDB_seqs", "PDB_indexes", "Chain_8A", "BS_8A", "Chain_4A", "BS_4A")
for _pos, _col in enumerate(_pocket_fields):
    CSARset1_BS_df[_col] = CSARset1_info_results.map(
        lambda info, _pos=_pos: None if info is None else info[_pos]
    )
CSARset1_BS_df

Unnamed: 0,PDB,Lig_code,Check_lig_info,Chain,PDB_seqs,PDB_indexes,Chain_8A,BS_8A,Chain_4A,BS_4A
0,2are,MAN,"MAN_A_253,MAN_B_253","A,B",QDSLSFGFPTFPSDQKNLIFQGDAQIKNNAVQLTKTDSNGNPVAST...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A;B,"[[43, 44, 80, 82, 83, 84, 85, 86, 101, 102, 10...",A;B,"[[84, 85, 104, 105, 131, 136, 137, 219, 220, 2..."
1,2oag,DLI,DLI_B_4000,"A,B,C,D",SRKTYTLTDYLKNTYRLKLYSLRWISDHEYLYKQENNILVFNAEYG...,"39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,5...",B,"[[812, 813, 888, 889, 890, 891, 892, 893, 894,...",B,"[[812, 892, 893, 894, 896, 1044, 1045, 1234, 1..."
2,2jbj,G88,G88_A_1768,A,NMKAFLDELKAENIKKFLYNFTQIPHLAGTEQNFQLAKQIQSQWKE...,"57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,7...",A,"[[107, 148, 152, 153, 154, 197, 198, 199, 200,...",A,"[[153, 200, 318, 328, 365, 366, 368, 369, 394,..."
3,2pwd,NOJ,"NOJ_A_8000,NOJ_B_8001","A,B",KPGAPWWKSAVFYQVYPRSFKDTNGDGIGDFKGLTEKLDYLKGLGI...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A;B,"[[13, 15, 49, 51, 52, 59, 60, 61, 62, 63, 64, ...",A;B,"[[60, 63, 103, 144, 163, 197, 199, 253, 325, 3..."
4,2pwg,CTS,"CTS_A_8000,CTS_B_8001","A,B",PGAPWWKSAVFYQVYPRSFKDTNGDGIGDFKGLTEKLDYLKGLGID...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",A;B,"[[14, 48, 50, 51, 58, 59, 60, 61, 62, 63, 97, ...",A;B,"[[59, 62, 102, 143, 162, 166, 198, 199, 252, 2..."
...,...,...,...,...,...,...,...,...,...,...
151,3ekr,PY9,"PY9_A_901,PY9_B_901","A,B",QPMEEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNSSDA...,"10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,2...",A;B,"[[34, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, ...",A;B,"[[38, 41, 42, 45, 48, 83, 86, 87, 88, 96, 174,..."
152,3ene,NPZ,NPZ_A_1,A,SEESQAFQRQLTALIGYDVTDVSNVHDDELEFTRRGLVTPRMAEVA...,"144,145,146,147,148,149,150,151,152,153,154,15...",A,"[[565, 567, 569, 570, 571, 573, 575, 592, 593,...",A,"[[575, 594, 596, 601, 604, 630, 642, 643, 644,..."
153,3eqr,T74,"T74_A_1,T74_B_1","A,B",LTCLIGEKDLRLLEKLGDGSFGVVRRGEWDAPSGKTVSVAVKCLKP...,"117,118,119,120,121,122,123,124,125,126,127,12...","A;A,B","[[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, ...",A;B,"[[15, 17, 23, 39, 40, 41, 54, 58, 67, 80, 82, ..."
154,3f8c,HT1,HT1_A_127,A,EIPKEMLRAQTNVILLNVLKQGDNYVYGIIKQVKEASNGEMELNEA...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...",A,"[[0, 2, 5, 7, 8, 9, 10, 11, 12, 13, 15, 16, 53...",A,"[[5, 12, 86, 90]]"


In [97]:
# Keep only complexes with no missing value in any column, then renumber rows from 0.
CSARset1_BS_df = (
    CSARset1_BS_df
    .dropna(axis="index")
    .reset_index(drop=True)
)
CSARset1_BS_df

Unnamed: 0,PDB,Lig_code,Check_lig_info,Chain,PDB_seqs,PDB_indexes,Chain_8A,BS_8A,Chain_4A,BS_4A
0,2are,MAN,"MAN_A_253,MAN_B_253","A,B",QDSLSFGFPTFPSDQKNLIFQGDAQIKNNAVQLTKTDSNGNPVAST...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A;B,"[[43, 44, 80, 82, 83, 84, 85, 86, 101, 102, 10...",A;B,"[[84, 85, 104, 105, 131, 136, 137, 219, 220, 2..."
1,2oag,DLI,DLI_B_4000,"A,B,C,D",SRKTYTLTDYLKNTYRLKLYSLRWISDHEYLYKQENNILVFNAEYG...,"39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,5...",B,"[[812, 813, 888, 889, 890, 891, 892, 893, 894,...",B,"[[812, 892, 893, 894, 896, 1044, 1045, 1234, 1..."
2,2jbj,G88,G88_A_1768,A,NMKAFLDELKAENIKKFLYNFTQIPHLAGTEQNFQLAKQIQSQWKE...,"57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,7...",A,"[[107, 148, 152, 153, 154, 197, 198, 199, 200,...",A,"[[153, 200, 318, 328, 365, 366, 368, 369, 394,..."
3,2pwd,NOJ,"NOJ_A_8000,NOJ_B_8001","A,B",KPGAPWWKSAVFYQVYPRSFKDTNGDGIGDFKGLTEKLDYLKGLGI...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A;B,"[[13, 15, 49, 51, 52, 59, 60, 61, 62, 63, 64, ...",A;B,"[[60, 63, 103, 144, 163, 197, 199, 253, 325, 3..."
4,2pwg,CTS,"CTS_A_8000,CTS_B_8001","A,B",PGAPWWKSAVFYQVYPRSFKDTNGDGIGDFKGLTEKLDYLKGLGID...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",A;B,"[[14, 48, 50, 51, 58, 59, 60, 61, 62, 63, 97, ...",A;B,"[[59, 62, 102, 143, 162, 166, 198, 199, 252, 2..."
...,...,...,...,...,...,...,...,...,...,...
151,3ekr,PY9,"PY9_A_901,PY9_B_901","A,B",QPMEEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNSSDA...,"10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,2...",A;B,"[[34, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, ...",A;B,"[[38, 41, 42, 45, 48, 83, 86, 87, 88, 96, 174,..."
152,3ene,NPZ,NPZ_A_1,A,SEESQAFQRQLTALIGYDVTDVSNVHDDELEFTRRGLVTPRMAEVA...,"144,145,146,147,148,149,150,151,152,153,154,15...",A,"[[565, 567, 569, 570, 571, 573, 575, 592, 593,...",A,"[[575, 594, 596, 601, 604, 630, 642, 643, 644,..."
153,3eqr,T74,"T74_A_1,T74_B_1","A,B",LTCLIGEKDLRLLEKLGDGSFGVVRRGEWDAPSGKTVSVAVKCLKP...,"117,118,119,120,121,122,123,124,125,126,127,12...","A;A,B","[[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, ...",A;B,"[[15, 17, 23, 39, 40, 41, 54, 58, 67, 80, 82, ..."
154,3f8c,HT1,HT1_A_127,A,EIPKEMLRAQTNVILLNVLKQGDNYVYGIIKQVKEASNGEMELNEA...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...",A,"[[0, 2, 5, 7, 8, 9, 10, 11, 12, 13, 15, 16, 53...",A,"[[5, 12, 86, 90]]"


#### 3.8 CSARset2 dataset

In [98]:
# Load the CSARset2 label table and copy its PDB id, ligand code and ligand-info columns.
CSARset2_df = pd.read_csv("./preprocessed_data/step3_CSARset2_labels.tsv", sep="\t")
CSARset2_BS_df = pd.DataFrame(
    {
        "PDB": list(CSARset2_df.PDB.values),
        "Lig_code": list(CSARset2_df.Lig_code.values),
        # The source column is spelled "Check_Lig_info"; it is stored here as "Check_lig_info".
        "Check_lig_info": list(CSARset2_df.Check_Lig_info.values),
    }
)

In [99]:
# Directory holding the CSARset2 structure files.
# NOTE(review): `path` is not passed to any call in the following cells; presumably
# get_pocket_chain_info_bulk (or a helper it calls) reads it as a notebook global - confirm.
path = "./data/PDB/structure/CSARset2/"

In [100]:
# Split the table into 10 partitions and run get_pocket_chain_info_bulk on each in a
# 10-worker pool (see parallelize_dataframe); the result is a list with one item per partition.
CSARset2_info_results = parallelize_dataframe(CSARset2_BS_df, get_pocket_chain_info_bulk, 10)

In [101]:
# Stitch the per-partition results back together; note the name is rebound from the
# list of partition results to the single concatenated pandas object.
CSARset2_info_results = pd.concat(CSARset2_info_results)

In [102]:
# Unpack each result record by position (0..6) into its own column.
# A record that is None yields None in every column, so the row can be dropped afterwards.
CSARset2_BS_df["Chain"] = CSARset2_info_results.map(lambda rec: None if rec is None else rec[0])
CSARset2_BS_df["PDB_seqs"] = CSARset2_info_results.map(lambda rec: None if rec is None else rec[1])
CSARset2_BS_df["PDB_indexes"] = CSARset2_info_results.map(lambda rec: None if rec is None else rec[2])
CSARset2_BS_df["Chain_8A"] = CSARset2_info_results.map(lambda rec: None if rec is None else rec[3])
CSARset2_BS_df["BS_8A"] = CSARset2_info_results.map(lambda rec: None if rec is None else rec[4])
CSARset2_BS_df["Chain_4A"] = CSARset2_info_results.map(lambda rec: None if rec is None else rec[5])
CSARset2_BS_df["BS_4A"] = CSARset2_info_results.map(lambda rec: None if rec is None else rec[6])

In [103]:
# Discard every row that still holds a missing value (e.g. the None placeholders
# written when no result record was available), then renumber the rows from 0.
CSARset2_BS_df = (
    CSARset2_BS_df
    .dropna(axis="index")
    .reset_index(drop=True)
)
CSARset2_BS_df

Unnamed: 0,PDB,Lig_code,Check_lig_info,Chain,PDB_seqs,PDB_indexes,Chain_8A,BS_8A,Chain_4A,BS_4A
0,1a8i,GLS,GLS_A_998,A,QEKRKQISVRGLAGVENVTELKKNFNRHLHFTLVKDRNVATPRDYY...,"7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,2...",A,"[[77, 81, 126, 127, 128, 129, 130, 131, 132, 1...",A,"[[128, 129, 132, 267, 354, 355, 432, 461, 550,..."
1,1a99,PUT,"PUT_A_371,PUT_B_371,PUT_C_371,PUT_D_371","A,B,C,D",QKTLHIYNWSDYIAPDTVANFEKETGIKVVYDVFDSNEVLEGKLMA...,"29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,4...",A;B;C;D,"[[7, 8, 9, 10, 11, 12, 33, 34, 35, 36, 54, 55,...",A;B;C;D,"[[8, 9, 10, 11, 215, 218, 247, 249, 285], [349..."
2,1ax0,A2G,A2G_A_401,A,VETISFSFSEFEPGNDNLTLQGAALITQSGVLQLTKINQNGMPAWD...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[44, 45, 84, 85, 86, 87, 88, 89, 103, 104, 10...",A,"[[88, 105, 106, 130, 132, 134, 216, 217, 218, ..."
3,1b6l,PI4,PI4_A_201,"A,B",PQITLWKRPLVTIRIGGQLKEALLDTGADDTVIEEMNLPGKWKPKM...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","A,B","[[7, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 4...","A,B","[[24, 26, 27, 28, 29, 47, 48, 49, 80, 82, 104,..."
4,1b6m,PI6,PI6_B_201,"A,B",PQITLWKRPLVTIRIGGQLKEALLDTGADDTVIEEMNLPGKWKPKM...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","A,B","[[7, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 4...","A,B","[[7, 22, 24, 26, 27, 29, 47, 48, 49, 79, 80, 8..."
...,...,...,...,...,...,...,...,...,...,...
137,2qrk,AMP,AMP_A_500,A,AVTLHLRAETKPLEARAALTPTTVKKLIAKGFKIYVEDSPQSTFNI...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...",A,"[[127, 131, 132, 133, 134, 195, 196, 197, 198,...",A,"[[196, 197, 199, 200, 201, 202, 224, 225, 228,..."
138,2nn1,M28,"M28_A_311,M28_B_312","A,B",PDWGYDDKNGPEQWSKLYPIANGNNQSPVDIKTSETKHDTSLKPIS...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...","A,B;A,B","[[2, 4, 26, 27, 59, 61, 64, 66, 88, 89, 90, 91...",A;B,"[[64, 91, 93, 116, 140, 194, 195, 196, 197, 20..."
139,2pou,I7A,I7A_A_1000,A,HHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLS...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...",A,"[[2, 4, 26, 27, 59, 61, 62, 63, 64, 88, 89, 90...",A,"[[61, 64, 89, 91, 93, 116, 118, 127, 137, 139,..."
140,3cd0,6HI,"6HI_B_1,6HI_C_4,6HI_D_3","A,B,C,D",EPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETLIE...,"441,442,443,444,445,446,447,448,449,450,451,45...","A,B;C,D;C,D","[[118, 119, 120, 121, 122, 123, 124, 125, 126,...","A,B;C,D;C,D","[[118, 119, 120, 121, 123, 124, 127, 294, 310,..."


#### 3.9 Astex dataset

In [104]:
# Load the Astex label table and copy its PDB id, ligand code and ligand-info columns.
Astex_df = pd.read_csv("./preprocessed_data/step3_Astex_labels.tsv", sep="\t")
Astex_BS_df = pd.DataFrame(
    {
        "PDB": list(Astex_df.PDB.values),
        "Lig_code": list(Astex_df.Lig_code.values),
        # The source column is spelled "Check_Lig_info"; it is stored here as "Check_lig_info".
        "Check_lig_info": list(Astex_df.Check_Lig_info.values),
    }
)

In [105]:
# Directory holding the Astex structure files.
# NOTE(review): `path` is not passed to any call in the following cells; presumably
# get_pocket_chain_info_bulk (or a helper it calls) reads it as a notebook global - confirm.
path = "./data/PDB/structure/Astex/"

In [106]:
# Split the table into 10 partitions and run get_pocket_chain_info_bulk on each in a
# 10-worker pool (see parallelize_dataframe); the result is a list with one item per partition.
Astex_info_results = parallelize_dataframe(Astex_BS_df, get_pocket_chain_info_bulk, 10)

In [107]:
# Stitch the per-partition results back together; note the name is rebound from the
# list of partition results to the single concatenated pandas object.
Astex_info_results = pd.concat(Astex_info_results)

In [108]:
# Unpack each result record by position (0..6) into its own column.
# A record that is None yields None in every column, so the row can be dropped afterwards.
Astex_BS_df["Chain"] = Astex_info_results.map(lambda rec: None if rec is None else rec[0])
Astex_BS_df["PDB_seqs"] = Astex_info_results.map(lambda rec: None if rec is None else rec[1])
Astex_BS_df["PDB_indexes"] = Astex_info_results.map(lambda rec: None if rec is None else rec[2])
Astex_BS_df["Chain_8A"] = Astex_info_results.map(lambda rec: None if rec is None else rec[3])
Astex_BS_df["BS_8A"] = Astex_info_results.map(lambda rec: None if rec is None else rec[4])
Astex_BS_df["Chain_4A"] = Astex_info_results.map(lambda rec: None if rec is None else rec[5])
Astex_BS_df["BS_4A"] = Astex_info_results.map(lambda rec: None if rec is None else rec[6])

In [109]:
# Discard every row that still holds a missing value (e.g. the None placeholders
# written when no result record was available), then renumber the rows from 0.
Astex_BS_df = (
    Astex_BS_df
    .dropna(axis="index")
    .reset_index(drop=True)
)
Astex_BS_df

Unnamed: 0,PDB,Lig_code,Check_lig_info,Chain,PDB_seqs,PDB_indexes,Chain_8A,BS_8A,Chain_4A,BS_4A
0,1gm8,SOX,SOX_B_1559,"A,B",SSSEIKIVRDEYGMPHIYANDTWHLFYGYGYVVAQDRLFQMEMARR...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...","A,B","[[138, 139, 142, 143, 144, 145, 146, 147, 207,...","A,B","[[139, 143, 146, 207, 229, 230, 273, 274, 275,..."
1,1gpk,HUP,HUP_A_1540,A,SELLVNTKSGKVMGTRVPVLSSHISAFLGIPFAEPPVGNMRFRRPE...,"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,2...",A,"[[65, 66, 68, 76, 77, 79, 80, 81, 82, 110, 111...",A,"[[80, 112, 113, 114, 115, 117, 118, 119, 120, ..."
2,1hnn,SKF,"SKF_A_3001,SKF_B_3002","A,B",AVASAYQRFEPRAYLRNNYAPPRGDLCNPNGVGPWKLRCLAQTFAT...,"22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,3...",A;B,"[[5, 8, 13, 14, 16, 17, 18, 19, 21, 22, 23, 25...",A;B,"[[13, 17, 18, 22, 31, 35, 160, 197, 200, 236, ..."
3,1hp0,AD3,"AD3_A_1315,AD3_B_1316","A,B",SAKNVVLDHDGNLDDFVAMVLLASNTEKVRLIGALCTDADCFVENG...,"0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A;B,"[[8, 9, 10, 11, 12, 13, 14, 15, 39, 78, 79, 81...",A;B,"[[11, 13, 14, 39, 78, 82, 136, 163, 172, 183, ..."
4,1hq2,PH2,PH2_A_181,A,TVAYIAIGSNLASPLEQVNAALKALGDIPESHILTVSSFYRTPPLG...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[5, 6, 7, 8, 9, 39, 40, 41, 42, 43, 44, 45, 4...",A,"[[41, 42, 43, 44, 52, 54, 88, 94, 122]]"
...,...,...,...,...,...,...,...,...,...,...
70,1ywr,LI9,LI9_A_361,A,RPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGHRV...,"5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,...",A,"[[23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, ...",A,"[[25, 30, 46, 48, 99, 100, 101, 102, 103, 104,..."
71,1z95,198,198_A_501,A,IFLNVLEAIEPGVVCAGHDNNQPDSFAALLSSLNELGERQLVHVVK...,"672,673,674,675,676,677,678,679,680,681,682,68...",A,"[[13, 15, 28, 29, 30, 31, 32, 33, 34, 35, 36, ...",A,"[[32, 33, 35, 36, 39, 66, 69, 70, 73, 74, 77, ..."
72,2bm2,PM2,"PM2_A_3211,PM2_B_3211,PM2_C_3211,PM2_D_3211","A,B,C,D",IVGGQEAPRSKWPWQVSLRVHGWMHFCGGSLIHPQWVLTAAHCVGL...,"16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,3...","A,C;B,D;A,C;B,D","[[1, 41, 77, 78, 79, 119, 127, 152, 155, 163, ...","A,C;B,D;A,C;B,D","[[79, 169, 170, 171, 172, 175, 193, 195, 196, ..."
73,2br1,PFP,PFP_A_1277,A,AVPFVEDWDLVQTLGEGAYGEVQLAVNRVTEEAVAVKIVDMKRCPE...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",A,"[[11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, ...",A,"[[13, 14, 15, 21, 34, 79, 80, 81, 82, 83, 85, ..."


#### 3.10 COACH420 dataset

In [110]:
# Load the COACH420 label table and copy its PDB id, ligand code and ligand-info columns.
COACH420_df = pd.read_csv("./preprocessed_data/step5_COACH420_labels.tsv", sep="\t")
COACH420_BS_df = pd.DataFrame(
    {
        "PDB": list(COACH420_df.PDB.values),
        "Lig_code": list(COACH420_df.Lig_code.values),
        # The source column is spelled "Check_Lig_info"; it is stored here as "Check_lig_info".
        "Check_lig_info": list(COACH420_df.Check_Lig_info.values),
    }
)

In [111]:
# Directory holding the COACH420 structure files.
# NOTE(review): `path` is not passed to any call in the following cells; presumably
# get_pocket_chain_info_bulk (or a helper it calls) reads it as a notebook global - confirm.
path = "./data/PDB/structure/COACH420/"

In [112]:
# Split the table into 10 partitions and run get_pocket_chain_info_bulk on each in a
# 10-worker pool (see parallelize_dataframe); the result is a list with one item per partition.
COACH420_info_results = parallelize_dataframe(COACH420_BS_df, get_pocket_chain_info_bulk, 10)

In [113]:
# Stitch the per-partition results back together; note the name is rebound from the
# list of partition results to the single concatenated pandas object.
COACH420_info_results = pd.concat(COACH420_info_results)

In [114]:
# Unpack each result record by position (0..6) into its own column.
# A record that is None yields None in every column, so the row can be dropped afterwards.
COACH420_BS_df["Chain"] = COACH420_info_results.map(lambda rec: None if rec is None else rec[0])
COACH420_BS_df["PDB_seqs"] = COACH420_info_results.map(lambda rec: None if rec is None else rec[1])
COACH420_BS_df["PDB_indexes"] = COACH420_info_results.map(lambda rec: None if rec is None else rec[2])
COACH420_BS_df["Chain_8A"] = COACH420_info_results.map(lambda rec: None if rec is None else rec[3])
COACH420_BS_df["BS_8A"] = COACH420_info_results.map(lambda rec: None if rec is None else rec[4])
COACH420_BS_df["Chain_4A"] = COACH420_info_results.map(lambda rec: None if rec is None else rec[5])
COACH420_BS_df["BS_4A"] = COACH420_info_results.map(lambda rec: None if rec is None else rec[6])

In [115]:
# Discard every row that still holds a missing value (e.g. the None placeholders
# written when no result record was available), then renumber the rows from 0.
COACH420_BS_df = (
    COACH420_BS_df
    .dropna(axis="index")
    .reset_index(drop=True)
)
COACH420_BS_df

Unnamed: 0,PDB,Lig_code,Check_lig_info,Chain,PDB_seqs,PDB_indexes,Chain_8A,BS_8A,Chain_4A,BS_4A
0,1a7x,FKA,FKA_B_201,"A,B",GVQVETISPGDGRTFPKRGQTCVVHYTGMLEDGKKFDSSRDRNKPF...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","A,B","[[51, 52, 53, 54, 55, 56, 57, 76, 77, 78, 79, ...","A,B","[[53, 77, 78, 80, 81, 82, 132, 142, 143, 148, ..."
1,1afk,PAP,"PAP_A_125,PAP_B_125","A,B",KETAAAKFERQHMDSSTSAASSSNYCNQMMKSRNLTKDRCKPVNTF...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A;B,"[[1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 34, 40, 42, ...",A;B,"[[3, 6, 10, 11, 40, 64, 66, 68, 70, 108, 110, ..."
2,1atl,0QI,"0QI_A_301,0QI_B_311","A,B",LPQRYIELVVVADHRVFMKYNSDLNTIRTRVHEIVNFINGFYRSLN...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...",A;B,"[[68, 100, 101, 102, 103, 104, 105, 106, 107, ...",A;B,"[[103, 104, 105, 106, 139, 140, 143, 149, 162,..."
3,1b8u,NAD,NAD_A_334,A,KTPMRVAVTGAAGQICYSLLFRIANGDMLGKDQPVILQLLEIPNEK...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...",A,"[[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 38, 39,...",A,"[[8, 9, 12, 13, 14, 39, 40, 41, 87, 88, 89, 10..."
4,1b8u,OAA,OAA_A_350,A,KTPMRVAVTGAAGQICYSLLFRIANGDMLGKDQPVILQLLEIPNEK...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...",A,"[[131, 155, 158, 159, 162, 186, 187, 188, 189,...",A,"[[158, 162, 187, 188, 189, 224, 225, 226, 227,..."
...,...,...,...,...,...,...,...,...,...,...
318,7dfr,FOL,FOL_A_161,A,MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLDKPVIMGRHT...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[3, 4, 5, 6, 7, 13, 19, 21, 23, 24, 25, 26, 2...",A,"[[4, 5, 6, 19, 26, 27, 29, 30, 31, 45, 49, 53,..."
319,7dfr,NAP,NAP_A_164,A,MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLDKPVIMGRHT...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[4, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 1...",A,"[[5, 6, 13, 14, 15, 16, 17, 18, 19, 21, 42, 43..."
320,7est,0Z2,0Z2_E_1,E,VVGGTEAQRNSWPSQISLQYRSWAHTCGGTLIRQNWVMTAAHCVDR...,"16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,3...",E,"[[0, 25, 26, 27, 39, 41, 42, 44, 78, 80, 81, 8...",E,"[[41, 83, 158, 174, 175, 177, 178, 196, 197, 1..."
321,830c,RS1,"RS1_A_1,RS1_B_1","A,B",YNVFPRTLKWSKMNLTYRIVNYTPDMTHSEVEKAFKKAFKVWSDVT...,"104,105,106,107,108,109,110,111,112,113,114,11...","A;A,B","[[72, 73, 74, 75, 78, 79, 80, 81, 82, 83, 84, ...",A;B,"[[79, 80, 81, 82, 83, 84, 114, 115, 118, 119, ..."


#### 3.11 HOLO4K dataset

In [116]:
# Load the HOLO4K label table and copy its PDB id, ligand code and ligand-info columns.
HOLO4K_df = pd.read_csv("./preprocessed_data/step5_HOLO4K_labels.tsv", sep="\t")
HOLO4K_BS_df = pd.DataFrame(
    {
        "PDB": list(HOLO4K_df.PDB.values),
        "Lig_code": list(HOLO4K_df.Lig_code.values),
        # The source column is spelled "Check_Lig_info"; it is stored here as "Check_lig_info".
        "Check_lig_info": list(HOLO4K_df.Check_Lig_info.values),
    }
)

In [117]:
# Directory holding the HOLO4K structure files.
# NOTE(review): `path` is not passed to any call in the following cells; presumably
# get_pocket_chain_info_bulk (or a helper it calls) reads it as a notebook global - confirm.
path = "./data/PDB/structure/HOLO4K/"

In [118]:
# Split the table into 10 partitions and run get_pocket_chain_info_bulk on each in a
# 10-worker pool (see parallelize_dataframe); the result is a list with one item per partition.
HOLO4K_info_results = parallelize_dataframe(HOLO4K_BS_df, get_pocket_chain_info_bulk, 10)

In [119]:
# Stitch the per-partition results back together; note the name is rebound from the
# list of partition results to the single concatenated pandas object.
HOLO4K_info_results = pd.concat(HOLO4K_info_results)

In [120]:
# Unpack each result record by position (0..6) into its own column.
# A record that is None yields None in every column, so the row can be dropped afterwards.
HOLO4K_BS_df["Chain"] = HOLO4K_info_results.map(lambda rec: None if rec is None else rec[0])
HOLO4K_BS_df["PDB_seqs"] = HOLO4K_info_results.map(lambda rec: None if rec is None else rec[1])
HOLO4K_BS_df["PDB_indexes"] = HOLO4K_info_results.map(lambda rec: None if rec is None else rec[2])
HOLO4K_BS_df["Chain_8A"] = HOLO4K_info_results.map(lambda rec: None if rec is None else rec[3])
HOLO4K_BS_df["BS_8A"] = HOLO4K_info_results.map(lambda rec: None if rec is None else rec[4])
HOLO4K_BS_df["Chain_4A"] = HOLO4K_info_results.map(lambda rec: None if rec is None else rec[5])
HOLO4K_BS_df["BS_4A"] = HOLO4K_info_results.map(lambda rec: None if rec is None else rec[6])

In [121]:
# Discard every row that still holds a missing value (e.g. the None placeholders
# written when no result record was available), then renumber the rows from 0.
HOLO4K_BS_df = (
    HOLO4K_BS_df
    .dropna(axis="index")
    .reset_index(drop=True)
)
HOLO4K_BS_df

Unnamed: 0,PDB,Lig_code,Check_lig_info,Chain,PDB_seqs,PDB_indexes,Chain_8A,BS_8A,Chain_4A,BS_4A
0,121p,GCP,GCP_A_167,A,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A,"[[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,...",A,"[[10, 11, 12, 13, 14, 15, 16, 17, 27, 28, 29, ..."
1,12as,AMP,"AMP_A_332,AMP_B_332","A,B",AYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLS...,"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,2...",A;B,"[[42, 44, 45, 67, 68, 70, 92, 94, 96, 99, 100,...",A;B,"[[96, 99, 105, 106, 107, 110, 112, 244, 245, 2..."
2,13pk,3PG,"3PG_A_423,3PG_B_423","A,B,C,D",EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKV...,"5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,...",A;B,"[[17, 18, 19, 20, 21, 22, 23, 34, 56, 57, 58, ...",A;B,"[[19, 21, 34, 57, 59, 60, 130, 163, 164, 167, ..."
3,13pk,ADP,"ADP_A_421,ADP_B_421,ADP_C_421,ADP_D_421","A,B,C,D",EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKV...,"5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,...",A;B;C;D,"[[21, 23, 34, 60, 210, 211, 212, 213, 214, 215...",A;B;C;D,"[[212, 213, 214, 218, 236, 237, 240, 289, 309,..."
4,16pk,BIS,BIS_A_499,A,EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKV...,"5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,...",A,"[[19, 21, 34, 57, 130, 162, 163, 164, 166, 210...",A,"[[212, 213, 218, 236, 237, 240, 254, 289, 309,..."
...,...,...,...,...,...,...,...,...,...,...
3908,9gss,GTX,"GTX_A_211,GTX_B_210","A,B",PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...","A,B;A,B","[[5, 6, 7, 8, 9, 10, 11, 12, 15, 31, 32, 33, 3...","A,B;A,B","[[5, 6, 11, 33, 36, 42, 48, 49, 50, 51, 62, 63..."
3909,9ldb,NAD,"NAD_A_401,NAD_B_401","A,B",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,19,2...","A,B;A,B","[[23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 49, ...",A;B,"[[25, 27, 28, 29, 50, 51, 52, 93, 94, 95, 96, ..."
3910,9ldb,OXM,OXM_B_402,"A,B",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,19,2...","A,B","[[62, 65, 352, 420, 421, 425, 427, 430, 455, 4...",B,"[[421, 427, 457, 488, 512, 551, 561]]"
3911,9ldt,NAD,"NAD_A_401,NAD_B_401","A,B",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,19,2...","A,B;A,B","[[23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 49, ...",A;B,"[[25, 27, 28, 29, 50, 51, 52, 93, 94, 95, 96, ..."


In [122]:
# Persist every step-2 binding-site table as a tab-separated file (row index omitted),
# one file per dataset, named after the dataset it contains.
scPDB_df.to_csv("./preprocessed_bs_data/step2_scPDB_data.tsv", sep = "\t", index = False)
PDBbind_df.to_csv("./preprocessed_bs_data/step2_PDBbind_data.tsv", sep = "\t", index = False)
CASF2016_BS_df.to_csv("./preprocessed_bs_data/step2_CASF2016_data.tsv", sep = "\t", index = False)
CASF2013_BS_df.to_csv("./preprocessed_bs_data/step2_CASF2013_data.tsv", sep = "\t", index = False)
CSAR2014_BS_df.to_csv("./preprocessed_bs_data/step2_CSAR2014_data.tsv", sep = "\t", index = False)
# Fixed: the CSAR2012 table used to be written to "step2_CASF2012_data.tsv", which mislabels
# the dataset and disagrees with the step-3 pickle name. Any downstream reader that opened
# the old file name must be pointed at the corrected one.
CSAR2012_BS_df.to_csv("./preprocessed_bs_data/step2_CSAR2012_data.tsv", sep = "\t", index = False)
CSARset1_BS_df.to_csv("./preprocessed_bs_data/step2_CSARset1_data.tsv", sep = "\t", index = False)
CSARset2_BS_df.to_csv("./preprocessed_bs_data/step2_CSARset2_data.tsv", sep = "\t", index = False)
Astex_BS_df.to_csv("./preprocessed_bs_data/step2_Astex_data.tsv", sep = "\t", index = False)
COACH420_BS_df.to_csv("./preprocessed_bs_data/step2_COACH420_data.tsv", sep = "\t", index = False)
HOLO4K_BS_df.to_csv("./preprocessed_bs_data/step2_HOLO4K_data.tsv", sep = "\t", index = False)

In [123]:
def make_index_dict(chain, indexes):
    """Number every residue label consecutively across all chains.

    Args:
        chain: Comma-separated chain identifiers, e.g. "A,B".
        indexes: Residue labels, comma-separated within a chain and
            ";"-separated between chains, in the same order as ``chain``.

    Returns:
        dict: Maps "{chain}_{residue}" to its 0-based running position.
        Chains and index groups are paired with ``zip``, so any surplus on
        either side is ignored; a repeated key keeps its last position.
    """
    labelled_residues = (
        f"{chain_id}_{residue}"
        for chain_id, residue_group in zip(chain.split(","), indexes.split(";"))
        for residue in residue_group.split(",")
    )
    return {label: position for position, label in enumerate(labelled_residues)}

def sort_chain(pocket_chain, chain):
    """Return the pocket's chains in the order they appear in ``chain``.

    Args:
        pocket_chain: Comma-separated chain identifiers belonging to the pocket.
        chain: Comma-separated chain identifiers of the whole structure; its
            order determines the order of the result.

    Returns:
        list: Identifiers from ``chain`` that also occur in ``pocket_chain``.
    """
    pocket_members = pocket_chain.split(",")
    return [chain_id for chain_id in chain.split(",") if chain_id in pocket_members]

def convert_bs(pocket_index, reindexing_dict, chain, indexes):
    """Translate flat pocket positions back into per-chain residue labels.

    Args:
        pocket_index: Iterable of flat positions (values of ``reindexing_dict``)
            that belong to the pocket.
        reindexing_dict: Mapping "{chain}_{residue}" -> flat position, as
            produced by ``make_index_dict`` for the same ``chain``/``indexes``.
        chain: Comma-separated chain identifiers.
        indexes: Residue labels, comma-separated within a chain and
            ";"-separated between chains, in the same order as ``chain``.

    Returns:
        str: Residue labels that are part of the pocket, comma-separated within
        a chain and ";"-separated between chains. Chains without any pocket
        residue are left out; the result is "" when nothing matches.

    Raises:
        KeyError: If a "{chain}_{residue}" key is missing from ``reindexing_dict``.
    """
    # Build the lookup once: set membership is O(1), whereas the list scan
    # was repeated for every residue of every chain.
    pocket_positions = set(pocket_index)

    per_chain = []
    for chain_id, residue_group in zip(chain.split(","), indexes.split(";")):
        hits = [
            residue
            for residue in residue_group.split(",")
            if reindexing_dict[f"{chain_id}_{residue}"] in pocket_positions
        ]
        # Skip chains that contribute no residue so no empty ";;" segments appear.
        if hits:
            per_chain.append(",".join(hits))

    return ";".join(per_chain)

In [124]:
def preprocessing_df(df):
    """Turn a binding-site table into a nested dictionary keyed by complex.

    Args:
        df: Table exposing the columns PDB, Lig_code, Chain, PDB_seqs,
            PDB_indexes, Check_lig_info, Chain_8A, BS_8A, Chain_4A and BS_4A.

    Returns:
        dict: One entry per "{PDB}_{Lig_code}" key (a repeated key overwrites
        the earlier entry). Each entry holds "Chain", "PDB_seqs" and
        "PDB_indexes" copied from the row, plus "BS_8A" and "BS_4A", which map
        every ligand listed in Check_lig_info to a tuple
        ``(sorted pocket chains, residue-label string)``.
    """
    column_values = (
        df.PDB.values,
        df.Lig_code.values,
        df.Chain.values,
        df.PDB_seqs.values,
        df.PDB_indexes.values,
        df.Check_lig_info.values,
        df.Chain_8A.values,
        df.BS_8A.values,
        df.Chain_4A.values,
        df.BS_4A.values,
    )

    collected = {}
    for (pdbid, ligand_code, chain, sequence, index,
         ligand_info, chain_8A, BS_8A, chain_4A, BS_4A) in zip(*column_values):
        # Flat position of every "{chain}_{residue}" label of this structure.
        reindexing_dict = make_index_dict(chain, index)

        entry = {
            "Chain": chain,
            "PDB_seqs": sequence,
            "PDB_indexes": index,
            "BS_8A": {},
            "BS_4A": {},
        }
        collected[f"{pdbid}_{ligand_code}"] = entry

        # Pocket chains are stored as one ";"-separated group per ligand.
        chain_groups_8A = chain_8A.split(";")
        chain_groups_4A = chain_4A.split(";")

        # The n-th ligand is paired with the n-th chain group and the n-th pocket list.
        for position, ligand in enumerate(ligand_info.split(",")):
            pocket_chains_8A = sort_chain(chain_groups_8A[position], chain)
            pocket_residues_8A = convert_bs(BS_8A[position], reindexing_dict, chain, index)

            pocket_chains_4A = sort_chain(chain_groups_4A[position], chain)
            pocket_residues_4A = convert_bs(BS_4A[position], reindexing_dict, chain, index)

            entry["BS_8A"][ligand] = (pocket_chains_8A, pocket_residues_8A)
            entry["BS_4A"][ligand] = (pocket_chains_4A, pocket_residues_4A)

    return collected

In [128]:
# Convert the scPDB table into the nested per-complex dictionary (see preprocessing_df).
# NOTE(review): this cell shows execution count 128 although it sits before cells 125/126,
# i.e. it was run out of order; confirm the notebook still works with Restart & Run All.
scPDB_dict = preprocessing_df(scPDB_df)

In [125]:
# Convert the PDBbind table into the nested per-complex dictionary (see preprocessing_df).
PDBbind_dict = preprocessing_df(PDBbind_df)

In [126]:
# Convert each remaining dataset table into its nested per-complex dictionary
# (see preprocessing_df); one dictionary per dataset, same structure for all.
CASF2016_BS_dict = preprocessing_df(CASF2016_BS_df)
CASF2013_BS_dict = preprocessing_df(CASF2013_BS_df)
CSAR2014_BS_dict = preprocessing_df(CSAR2014_BS_df)
CSAR2012_BS_dict = preprocessing_df(CSAR2012_BS_df)
CSARset1_BS_dict = preprocessing_df(CSARset1_BS_df)
CSARset2_BS_dict = preprocessing_df(CSARset2_BS_df)
Astex_BS_dict = preprocessing_df(Astex_BS_df)
COACH420_BS_dict = preprocessing_df(COACH420_BS_df)
HOLO4K_BS_dict = preprocessing_df(HOLO4K_BS_df)

In [129]:
# Report how many "{PDB}_{Lig_code}" entries each dictionary holds. Because
# preprocessing_df overwrites repeated keys, a count can be lower than the row count
# of the table it was built from.
print(f"[scPDB] {len(scPDB_dict)}")
print(f"[PDBbind] {len(PDBbind_dict)}")
print(f"[CASF2016] {len(CASF2016_BS_dict)}")
print(f"[CASF2013] {len(CASF2013_BS_dict)}")
print(f"[CSAR2014] {len(CSAR2014_BS_dict)}")
print(f"[CSAR2012] {len(CSAR2012_BS_dict)}")
print(f"[CSARset1] {len(CSARset1_BS_dict)}")
print(f"[CSARset2] {len(CSARset2_BS_dict)}")
print(f"[Astex] {len(Astex_BS_dict)}")
print(f"[COACH420] {len(COACH420_BS_dict)}")
print(f"[HOLO4K] {len(HOLO4K_BS_dict)}")

[scPDB] 17155
[PDBbind] 14401
[CASF2016] 279
[CASF2013] 177
[CSAR2014] 46
[CSAR2012] 55
[CSARset1] 156
[CSARset2] 142
[Astex] 75
[COACH420] 323
[HOLO4K] 3913


### 4. Save data

In [130]:
# Serialize every per-complex dictionary to its own step-3 pickle file.
# Each file is opened in binary write mode and closed by the `with` block.
# NOTE(review): pickle files should only be loaded again from trusted locations,
# since unpickling can execute arbitrary code.
with open("./preprocessed_bs_data/step3_scPDB_data.pkl", "wb") as f:
    pickle.dump(scPDB_dict, f)
    
with open("./preprocessed_bs_data/step3_PDBbind_data.pkl", "wb") as f:
    pickle.dump(PDBbind_dict, f)

with open("./preprocessed_bs_data/step3_CASF2016_data.pkl", "wb") as f:
    pickle.dump(CASF2016_BS_dict, f)
    
with open("./preprocessed_bs_data/step3_CASF2013_data.pkl", "wb") as f:
    pickle.dump(CASF2013_BS_dict, f)

with open("./preprocessed_bs_data/step3_CSAR2014_data.pkl", "wb") as f:
    pickle.dump(CSAR2014_BS_dict, f)
    
with open("./preprocessed_bs_data/step3_CSAR2012_data.pkl", "wb") as f:
    pickle.dump(CSAR2012_BS_dict, f)
    
with open("./preprocessed_bs_data/step3_CSARset1_data.pkl", "wb") as f:
    pickle.dump(CSARset1_BS_dict, f)
    
with open("./preprocessed_bs_data/step3_CSARset2_data.pkl", "wb") as f:
    pickle.dump(CSARset2_BS_dict, f)
    
with open("./preprocessed_bs_data/step3_Astex_data.pkl", "wb") as f:
    pickle.dump(Astex_BS_dict, f)
    
with open("./preprocessed_bs_data/step3_COACH420_data.pkl", "wb") as f:
    pickle.dump(COACH420_BS_dict, f)
    
with open("./preprocessed_bs_data/step3_HOLO4K_data.pkl", "wb") as f:
    pickle.dump(HOLO4K_BS_dict, f)