In [1]:
import os
import numpy as np
import pandas as pd
from Bio import PDB
from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser
from scipy.spatial import distance_matrix
from multiprocessing import Process, Queue, Pool

### 1. Load data

In [2]:
PDBbind_df = pd.read_csv("./preprocessed_data/step2_PDBbind_data.tsv", sep = "\t")
CASF2016_df = pd.read_csv("./preprocessed_data/step2_CASF2016_data.tsv", sep = "\t")
CASF2013_df = pd.read_csv("./preprocessed_data/step2_CASF2013_data.tsv", sep = "\t")
CSAR2014_df = pd.read_csv("./preprocessed_data/step2_CSAR2014_data.tsv", sep = "\t")
CSAR2012_df = pd.read_csv("./preprocessed_data/step2_CSAR2012_data.tsv", sep = "\t")
CSARset1_df = pd.read_csv("./preprocessed_data/step2_CSARset1_data.tsv", sep = "\t")
CSARset2_df = pd.read_csv("./preprocessed_data/step2_CSARset2_data.tsv", sep = "\t")
Astex_df = pd.read_csv("./preprocessed_data/step2_Astex_data.tsv", sep = "\t")
COACH420_df = pd.read_csv("./preprocessed_data/step2_COACH420_data.tsv", sep = "\t")
HOLO4K_df = pd.read_csv("./preprocessed_data/step2_HOLO4K_data.tsv", sep = "\t")

In [3]:
def get_complex_info(df):
    return [f"{i}_{j}" for i, j in zip(df.iloc[:,0].values, df.iloc[:,1].values)]

In [4]:
PDBbind_complex, CASF2016_complex, CASF2013_complex, CSAR2014_complex, CSAR2012_complex, CSARset1_complex, CSARset2_complex, Astex_complex, COACH420_complex, HOLO4K_complex = get_complex_info(PDBbind_df), get_complex_info(CASF2016_df), get_complex_info(CASF2013_df), get_complex_info(CSAR2014_df), get_complex_info(CSAR2012_df), get_complex_info(CSARset1_df), get_complex_info(CSARset2_df), get_complex_info(Astex_df), get_complex_info(COACH420_df), get_complex_info(HOLO4K_df)
print(f"[pdbbind] Complexes: {len(PDBbind_complex)}")
print(f"[CASF2016] Complexes: {len(CASF2016_complex)}")
print(f"[CASF2013] Complexes: {len(CASF2013_complex)}")
print(f"[CSAR2014] Complexes: {len(CSAR2014_complex)}")
print(f"[CSAR2012] Complexes: {len(CSAR2012_complex)}")
print(f"[CSARset1] Complexes: {len(CSARset1_complex)}")
print(f"[CSARset2] Complexes: {len(CSARset2_complex)}")
print(f"[Astex] Complexes: {len(Astex_complex)}")
print(f"[COACH420] Complexes: {len(COACH420_complex)}")
print(f"[HOLO4K] Complexes: {len(HOLO4K_complex)}")

[pdbbind] Complexes: 14543
[CASF2016] Complexes: 279
[CASF2013] Complexes: 177
[CSAR2014] Complexes: 46
[CSAR2012] Complexes: 55
[CSARset1] Complexes: 156
[CSARset2] Complexes: 142
[Astex] Complexes: 75
[COACH420] Complexes: 358
[HOLO4K] Complexes: 4219


### 2. Preprocessing PDB file

In [5]:
def count_atom(lines):
    return len([line for line in lines if "ATOM" in line and line[76:78].strip() != "H" and line[76:78].strip() != "D"])

def count_HETATM(lines):
    return len([line for line in lines if "HETATM" in line and line[76:78].strip() != "H" and line[76:78].strip() != "D"])

def read_file(file):
    return file.readlines()

def fwrite(fw, file):
    for i in file:
        fw.write(f"{i}\n")
    fw.close()    
    
def get_results(pdbid, ligand_code, ligand_dict, ligand_info_list):
    ligand_atom_count = dict()
    
    """ Get Ideal ligand """
    ideal_lines = read_file(open(f"{ideal_ligand_path}/{ligand_code}_ideal.pdb", "r"))
    ideal_ligand_atom_count = count_atom(ideal_lines)  
    
    """ Get final ligand info """
    for ligand in ligand_info_list:
        ligand_atom_count[ligand] = count_HETATM(ligand_dict[ligand])
    
    sorted_ligand_atom_count = sorted(ligand_atom_count.items(), key = lambda item: item[1], reverse = True)
    final_ligand_info, final_ligand_atom_count = sorted_ligand_atom_count[0][0], sorted_ligand_atom_count[0][1]    
    
    write_ligand_list, tmp = list(), list()
    
    if ideal_ligand_atom_count >= final_ligand_atom_count:
        if ideal_ligand_atom_count - final_ligand_atom_count < 6:
            if not os.path.isdir(f"{des_path}/{pdbid}/"):
                command = f"mkdir {des_path}/{pdbid}/"
                os.system(command)
            
            for ligand in list(ligand_dict.keys()):
                if len(ligand_dict[ligand]) != 0:
                    if f'{ligand.split("_")[0]}_{ligand.split("_")[1]}' not in tmp:
                        tmp.append(f'{ligand.split("_")[0]}_{ligand.split("_")[1]}')
                        
                        fwrite(open(f"{des_path}/{pdbid}/{ligand}.pdb", "w"), ligand_dict[ligand])
                        write_ligand_list.append(ligand)
            
            return ",".join(write_ligand_list)
        
        else:
            return None
        
    elif ideal_ligand_atom_count < final_ligand_atom_count:
        print(f"Check PDB fild: {pdbid}; Recorded ligand atom count is over ideal atom count.")
        return None

def preprocessing_PDB_file(row):
    def pre(line):
        return line[:16] + " " + line[17:]
    
    ligand_dict, ligand_atom_types_dict, model_flag = dict(), dict(), 0
    pdbid, ligand_code, ligand_info_list = row[0], row[1], row[-1].split(",")
    
    for ligand in ligand_info_list:
        ligand_dict[ligand] = list()
        ligand_atom_types_dict[ligand] = list()
        
    lines = read_file(open(f"{src_path}/{pdbid}.pdb", "r"))

    for line in lines:
        if "ANISOU" not in line:
            if line[:5].strip() == "MODEL":
                if model_flag == 1:
                    break
                model_flag = 1 
                
            if "HETATM" in line:
                atom_type, residue_chain, residue, chain, residue_seq_number = line[12:16].strip(), line[16:17].strip(), line[17:20].strip(), line[21:22].strip(), line[22:26].strip()

                if f"{residue}_{chain}_{residue_seq_number}" in ligand_dict:

                    if len(ligand_atom_types_dict[f"{residue}_{chain}_{residue_seq_number}"]) == 0:
                        ligand_atom_types_dict[f"{residue}_{chain}_{residue_seq_number}"].append(atom_type)
                        
                        if residue_chain == "":
                            ligand_dict[f"{residue}_{chain}_{residue_seq_number}"].append(line.strip())
                        else:
                            ligand_dict[f"{residue}_{chain}_{residue_seq_number}"].append(pre(line.strip()))  
                    
                    else:
                        if atom_type not in ligand_atom_types_dict[f"{residue}_{chain}_{residue_seq_number}"]:
                            ligand_atom_types_dict[f"{residue}_{chain}_{residue_seq_number}"].append(atom_type)

                            if residue_chain == "":
                                ligand_dict[f"{residue}_{chain}_{residue_seq_number}"].append(line.strip())
                            else:
                                ligand_dict[f"{residue}_{chain}_{residue_seq_number}"].append(pre(line.strip())) 
                        
            elif line[:].strip() == "CONECT":

                ligand_code = get_results(pdbid, ligand_code, ligand_dict, ligand_info_list)

                return ligand_code

    ligand_code = get_results(pdbid, ligand_code, ligand_dict, ligand_info_list)

    return ligand_code

In [6]:
def copy_file(pdbid, src_path, des_path):

    if not os.path.isfile(f"{des_path}/{pdbid}.pdb"):
        command = f"cp {src_path}/{pdbid}.pdb {des_path}/{pdbid}.pdb"
        os.system(command)

def preprocessing_multimodel(path):
    write_lines, model_flag = list(), 0
    
    lines = read_file(open(path, "r"))
    
    for line in lines:
        if "ANISOU" not in line:
            if line[:5].strip() == "MODEL":
                if model_flag == 1:
                    break
                model_flag = 1     
            write_lines.append(line.strip())
    fwrite(open(path, "w"), write_lines)

def remove_HEATM(pdbid):
    class NonHetSelect(Select):
        def accept_residue(self, residue):
            return 1 if residue.id[0] == " " else 0 

    if not os.path.isfile(f"{protein_des_path}/{pdbid}/{pdbid}_protein.pdb"):

        src_file = f"{src_path}/{pdbid}.pdb"
        des_file = f"{protein_des_path}/{pdbid}/{pdbid}_protein.pdb"

        pdb = PDBParser().get_structure(pdbid, src_file)
        io = PDBIO()
        io.set_structure(pdb)
        io.save(des_file, NonHetSelect())
        
        preprocessing_multimodel(f"{protein_des_path}/{pdbid}/{pdbid}_protein.pdb")

In [7]:
def parallelize_dataframe(df, func, num_partitions=5):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_partitions)
    results = pool.map(func, df_split)
    pool.close()
    pool.join()
    return results

In [8]:
def preprocessing_PDB_file_bulk(df):
    return df.apply(preprocessing_PDB_file, axis = 1)

In [9]:
def remove_HEATM_bulk(df):
    return df.PDB.map(remove_HEATM)

#### 2.1 PDBbind dataset get final ligand info

In [10]:
src_path = "./data/PDB/protein"
ideal_ligand_path = "./data/PDB/ligand"

des_path = "./data/PDB/structure/PDBbind"

In [11]:
PDBbind_ligand_results = parallelize_dataframe(PDBbind_df, preprocessing_PDB_file_bulk, 10)

In [12]:
PDBbind_ligand_results = pd.concat(PDBbind_ligand_results)

In [13]:
PDBbind_df["Check_Lig_info"] = PDBbind_ligand_results.map(lambda a:a if a is not None else None)

In [14]:
PDBbind_df = PDBbind_df.dropna(axis = 0).reset_index(drop=True)
PDBbind_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info,Check_Lig_info
0,3wka,S0G,4.54,CN1C(=C(C(=O)NC1=O)N2CCCCC2)N,CN1C(N)=C(N2CCCCC2)C(=O)NC1=O,29.00,uM,4.54,S0G_A_603,S0G_A_603
1,5kgx,7SK,4.80,CC(C)(C)O[C@@H](c1c(c2ccccc2n1C)c3ccc4c(c3)CCC...,CN1C([C@H](OC(C)(C)C)C(=O)O)=C(C2=CC=C3OCCCC3=...,16.00,uM,4.80,7SK_A_301,7SK_A_301
2,5wqc,7MA,8.96,CCN(Cc1cccnc1)C(=O)CN(c2ccc(nc2)OC)S(=O)(=O)c3...,CCN(CC1=CC=CN=C1)C(=O)CN(C1=CC=C(OC)N=C1)S(=O)...,1.10,nM,8.96,7MA_A_2001,7MA_A_2001
3,5aen,DP8,7.22,CN(C)CCOc1ccc(cc1)Oc2ccccc2,CN(C)CCOC1=CC=C(OC2=CC=CC=C2)C=C1,0.06,uM,7.22,DP8_A_611,DP8_A_611
4,1utp,PBN,1.44,c1ccc(cc1)CCCCN,NCCCCC1=CC=CC=C1,36.00,mM,1.44,PBN_A_1246,PBN_A_1246
...,...,...,...,...,...,...,...,...,...,...
14398,3fci,3FI,5.89,c1cc(cc(c1)C(=O)O)\C=N\OCCCNCC2=CC(=O)NC(=O)N2,O=C(O)C1=CC=CC(/C=N/OCCCNCC2=CC(=O)NC(=O)N2)=C1,1.30,uM,5.89,3FI_A_1,3FI_A_1
14399,5j9l,6HF,7.47,CN1CCN(CC1)c2ccc(cc2)Nc3nc4c(cc[nH]4)c(n3)Oc5c...,C=CC(=O)NC1=CC=C(OC2=NC(NC3=CC=C(N4CCN(C)CC4)C...,34.00,nM,7.47,6HF_A_501,6HF_A_501
14400,2q9n,LK5,6.37,CCCCO[C@@H]1CCC[C@@H]2C1=C(N[C@@H]2[C@@H](CC)C...,CCCCO[C@@H]1CCC[C@@H]2C1=C(C(=O)O)N[C@@H]2[C@H...,430.00,nM,6.37,LK5_A_0,LK5_A_0
14401,3np7,Z15,4.02,c1c(c(cc(c1O)Cl)O)[C@H]2[C@@H]([C@H]([C@@H]([C...,OC[C@H]1O[C@@H](C2=CC(O)=C(Cl)C=C2O)[C@H](O)[C...,95.00,uM,4.02,"Z15_A_997,Z15_A_998",Z15_A_997


#### 2.2 PDBbind dataset get protein info

In [15]:
src_path = "./data/PDB/protein"
protein_des_path = "./data/PDB/structure/PDBbind"

In [16]:
_ = parallelize_dataframe(PDBbind_df, remove_HEATM_bulk, 10)

#### 2.3 CASF2016 dataset get final ligand info

In [17]:
src_path = "./data/PDB/protein"
ideal_ligand_path = "./data/PDB/ligand"
des_path = "./data/PDB/structure/CASF2016"

In [18]:
CASF2016_ligand_results = parallelize_dataframe(CASF2016_df, preprocessing_PDB_file_bulk, 10)

In [19]:
CASF2016_ligand_results = pd.concat(CASF2016_ligand_results)

In [20]:
CASF2016_df["Check_Lig_info"] = CASF2016_ligand_results.map(lambda a:a if a is not None else None)

In [21]:
CASF2016_df = CASF2016_df.dropna(axis = 0).reset_index(drop=True)
CASF2016_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info,Check_Lig_info
0,1bcu,PRL,3.28,c1cc(cc2c1cc3ccc(cc3n2)N)N,NC1=CC2=NC3=C(C=CC(N)=C3)C=C2C=C1,0.53,mM,3.28,PRL_H_280,PRL_H_280
1,1bzc,TPI,4.92,c1cc(cc2c1cc(cc2)C(F)(F)P(=O)(O)O)C(=O)N[C@@H]...,NC(=O)[C@H](CCC(=O)O)NC(=O)C1=CC2=C(C=C1)C=C(C...,12.00,uM,4.92,TPI_A_902,TPI_A_902
2,1c5z,BEN,4.01,[H]/N=C(\c1ccccc1)/N,[H]/N=C(/N)C1=CC=CC=C1,97.00,uM,4.01,BEN_B_251,BEN_B_251
3,1e66,HUX,9.89,CCC1=C[C@@H]2Cc3c(c(c4ccc(cc4n3)Cl)N)[C@@H](C2)C1,CCC1=C[C@@H]2CC3=C(C(N)=C4C=CC(Cl)=CC4=N3)[C@H...,0.13,nM,9.89,HUX_A_803,HUX_A_803
4,1eby,BEB,9.70,c1ccc(cc1)CO[C@H]([C@@H]([C@H]([C@H](C(=O)N[C@...,O=C(N[C@H]1C2=CC=CC=C2C[C@H]1O)[C@H](OCC1=CC=C...,0.20,nM,9.70,BEB_B_501,BEB_B_501
...,...,...,...,...,...,...,...,...,...,...
274,5aba,UL7,2.98,c1c(cc(c(c1CN2CCC(CC2)N3CCC(CC3)O)O)Br)Cl,OC1=C(CN2CCC(N3CCC(O)CC3)CC2)C=C(Cl)C=C1Br,1040.00,uM,2.98,"UL7_A_1291,UL7_B_1291","UL7_A_1291,UL7_B_1291"
275,5c28,4XV,5.66,Cc1c(nc(nc1Cl)C2CC2)N,CC1=C(N)N=C(C2CC2)N=C1Cl,2.20,uM,5.66,4XV_A_803,4XV_A_803
276,5c2h,4XU,11.09,Cc1c(nc(nc1Cl)OCCCc2ccc3ccccc3n2)NCc4c(nc(s4)C)C,CC1=NC(C)=C(CNC2=C(C)C(Cl)=NC(OCCCC3=CC=C4C=CC...,8.20,pM,11.09,4XU_B_803,4XU_B_803
277,5dwr,5H7,11.22,C[C@H]1C[C@H](C[C@H](C1)N)c2ccncc2NC(=O)c3ccc(...,C[C@@H]1C[C@H](N)C[C@H](C2=CC=NC=C2NC(=O)C2=CC...,6.00,pM,11.22,5H7_A_401,5H7_A_401


#### 2.4 CASF2016 dataset get protein info

In [22]:
src_path = "./data/PDB/protein"
protein_des_path = "./data/PDB/structure/CASF2016"

In [23]:
_ = parallelize_dataframe(CASF2016_df, remove_HEATM_bulk, 10)

#### 2.5 CASF2013 dataset get final ligand info

In [24]:
src_path = "./data/PDB/protein"
ideal_ligand_path = "./data/PDB/ligand"
des_path = "./data/PDB/structure/CASF2013"

In [25]:
CASF2013_ligand_results = parallelize_dataframe(CASF2013_df, preprocessing_PDB_file_bulk, 10)

In [26]:
CASF2013_ligand_results = pd.concat(CASF2013_ligand_results)

In [27]:
CASF2013_df["Check_Lig_info"] = CASF2013_ligand_results.map(lambda a:a if a is not None else None)

In [28]:
CASF2013_df = CASF2013_df.dropna(axis = 0).reset_index(drop=True)
CASF2013_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info,Check_Lig_info
0,10gs,VWW,6.40,c1ccc(cc1)CSC[C@@H](C(=O)N[C@H](c2ccccc2)C(=O)...,N[C@@H](CCC(=O)N[C@@H](CSCC1=CC=CC=C1)C(=O)N[C...,0.400,uM,6.40,"VWW_A_210,VWW_B_210","VWW_A_210,VWW_B_210"
1,1bcu,PRL,3.28,c1cc(cc2c1cc3ccc(cc3n2)N)N,NC1=CC2=NC3=C(C=CC(N)=C3)C=C2C=C1,0.530,mM,3.28,PRL_H_280,PRL_H_280
2,1e66,HUX,9.89,CCC1=C[C@@H]2Cc3c(c(c4ccc(cc4n3)Cl)N)[C@@H](C2)C1,CCC1=C[C@@H]2CC3=C(C(N)=C4C=CC(Cl)=CC4=N3)[C@H...,0.130,nM,9.89,HUX_A_803,HUX_A_803
3,1f8b,DAN,5.40,CC(=O)N[C@@H]1[C@H](C=C(O[C@H]1[C@@H]([C@@H](C...,CC(=O)N[C@@H]1[C@@H](O)C=C(C(=O)O)O[C@H]1[C@H]...,4.000,uM,5.40,DAN_A_0,DAN_A_0
4,1f8c,4AM,7.40,CC(=O)N[C@@H]1[C@H](C=C(O[C@H]1[C@@H]([C@@H](C...,CC(=O)N[C@@H]1[C@@H](N)C=C(C(=O)O)O[C@H]1[C@H]...,0.040,uM,7.40,4AM_A_4,4AM_A_4
...,...,...,...,...,...,...,...,...,...,...
171,4djv,0KM,6.72,[H]/N=C/1\N[C@](C(=O)N1C)(c2ccccc2)c3cccc(c3)c...,[H]/N=C1\N[C@](C2=CC=CC=C2)(C2=CC=CC(C3=CC=CC(...,0.190,uM,6.72,"0KM_A_501,0KM_B_501","0KM_A_501,0KM_B_501"
172,4g8m,G8M,7.89,C1C[C@@H]([C@@H]1[C@@H](C(=O)O)N)C(=O)O,N[C@H](C(=O)O)[C@@H]1CC[C@@H]1C(=O)O,12.800,nM,7.89,"G8M_A_301,G8M_B_901","G8M_A_301,G8M_B_901"
173,4gid,0GH,10.77,C[C@H](c1ccccc1)NC(=O)c2cc(cc(c2)N(C)S(=O)(=O)...,CC(C)CNC(=O)[C@@H](NC[C@H](CC1=CC=CC=C1)NC(=O)...,0.017,nM,10.77,"0GH_A_501,0GH_B_501,0GH_C_501,0GH_D_501","0GH_A_501,0GH_B_501,0GH_C_501,0GH_D_501"
174,4gqq,0XR,2.89,CCOC(=O)/C=C/c1ccc(c(c1)O)O,CCOC(=O)/C=C/C1=CC=C(O)C(O)=C1,1.300,mM,2.89,"0XR_A_502,0XR_A_503,0XR_A_504",0XR_A_502


#### 2.6 CASF2013 dataset get protein info

In [29]:
src_path = "./data/PDB/protein"
protein_des_path = "./data/PDB/structure/CASF2013"

In [30]:
_ = parallelize_dataframe(CASF2013_df, remove_HEATM_bulk, 10)

#### 2.7 CSAR2014 dataset get final ligand info

In [31]:
src_path = "./data/PDB/protein"
ideal_ligand_path = "./data/PDB/ligand"
des_path = "./data/PDB/structure/CSAR2014"

In [32]:
CSAR2014_ligand_results = parallelize_dataframe(CSAR2014_df, preprocessing_PDB_file_bulk, 10)

In [33]:
CSAR2014_ligand_results = pd.concat(CSAR2014_ligand_results)

In [34]:
CSAR2014_df["Check_Lig_info"] = CSAR2014_ligand_results.map(lambda a:a if a is not None else None)

In [35]:
CSAR2014_df = CSAR2014_df.dropna(axis = 0).reset_index(drop=True)
CSAR2014_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info,Check_Lig_info
0,4ypw,4FD,5.5,c1ccc(cc1)CNc2ccc(cn2)C(=O)N,NC(=O)C1=CC=C(NCC2=CC=CC=C2)N=C1,IC50,3162.0,nM,4FD_A_301,4FD_A_301
1,4ypx,4FG,4.6,c1cc(ncc1C(=O)N)N,NC(=O)C1=CN=C(N)C=C1,IC50,25119.0,nM,4FG_A_301,4FG_A_301
2,4ypy,4F9,3.8,c1cnccc1c2cnc[nH]2,C1=CC(C2=CN=CN2)=CC=N1,IC50,158489.0,nM,4F9_A_301,4F9_A_301
3,4ypz,4FL,3.5,c1cnccc1c2[nH]ccn2,C1=CC(C2=NC=CN2)=CC=N1,IC50,316228.0,nM,4FL_A_301,4FL_A_301
4,4yq0,4FM,4.4,c1cc(ccc1CNC(=O)c2c(non2)N)Cl,NC1=NON=C1C(=O)NCC1=CC=C(Cl)C=C1,IC50,39811.0,nM,4FM_A_301,4FM_A_301
5,4yq1,4FN,8.3,CC(C)(C)c1cccc(c1)C(=O)NCC2(CCCCC2)NC(=O)c3ccc...,CC(C)(C)C1=CC=CC(C(=O)NCC2(NC(=O)C3=CC=CC4=NOC...,IC50,5.01,nM,4FN_A_301,4FN_A_301
6,4yq2,EFY,8.3,CC(C)(C)c1cccc(c1)C(=O)NCC2(CCCC2)NC(=O)c3cccc...,CC(C)(C)C1=CC=CC(C(=O)NCC2(NC(=O)C3=CC=CC4=NOC...,IC50,5.01,nM,EFY_A_301,EFY_A_301
7,4yq3,4G1,6.8,Cc1cccc(c1)C(=O)NCC2(CCCC2)NC(=O)c3cccc4c3con4,CC1=CC=CC(C(=O)NCC2(NC(=O)C3=CC=CC4=NOC=C34)CC...,IC50,158.0,nM,4G1_A_301,4G1_A_301
8,4yq4,4G3,5.7,c1cc(cc(c1)O)CNc2ccc(cn2)C(=O)N,NC(=O)C1=CC=C(NCC2=CC(O)=CC=C2)N=C1,IC50,1995.0,nM,4G3_A_301,4G3_A_301
9,4yq5,4G0,5.4,CN(C)c1cccc(c1)CNc2ccc(cn2)C(=O)N,CN(C)C1=CC=CC(CNC2=CC=C(C(N)=O)C=N2)=C1,IC50,3981.0,nM,4G0_A_301,4G0_A_301


#### 2.8 CSAR2014 dataset get protein info

In [36]:
src_path = "./data/PDB/protein"
protein_des_path = "./data/PDB/structure/CSAR2014"

In [37]:
_ = parallelize_dataframe(CSAR2014_df, remove_HEATM_bulk, 10)

#### 2.9 CSAR2012 dataset get final ligand info

In [38]:
src_path = "./data/PDB/protein"
ideal_ligand_path = "./data/PDB/ligand"
des_path = "./data/PDB/structure/CSAR2012"

In [39]:
CSAR2012_ligand_results = parallelize_dataframe(CSAR2012_df, preprocessing_PDB_file_bulk, 10)

In [40]:
CSAR2012_ligand_results = pd.concat(CSAR2012_ligand_results)

In [41]:
CSAR2012_df["Check_Lig_info"] = CSAR2012_ligand_results.map(lambda a:a if a is not None else None)

In [42]:
CSAR2012_df = CSAR2012_df.dropna(axis = 0).reset_index(drop=True)
CSAR2012_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info,Check_Lig_info
0,4fud,6UP,6.35,[H]/N=C(\c1ccc2cccc(c2c1)N)/N,[H]/N=C(/N)C1=CC=C2C=CC=C(N)C2=C1,Ki,450.0,nM,6UP_A_301,6UP_A_301
1,4fue,7UP,7.23,[H]/N=C(/c1ccc2cc(ccc2c1)C#Cc3ccc4c(c3)CCNC4)\N,[H]/N=C(\N)C1=CC=C2C=C(C#CC3=CC=C4CNCCC4=C3)C=...,Ki,58.8,nM,7UP_A_301,7UP_A_301
2,4fu7,1UP,6.2,[H]/N=C(/c1ccc2ccc(c(c2c1)OCC(=O)N)OC)\N,[H]/N=C(\N)C1=CC=C2C=CC(OC)=C(OCC(N)=O)C2=C1,Ki,637.0,nM,1UP_A_305,1UP_A_305
3,4fu8,2UP,5.23,[H]/N=C(\c1ccc2ccccc2c1)/N,[H]/N=C(/N)C1=CC=C2C=CC=CC2=C1,Ki,5910.0,nM,2UP_A_301,2UP_A_301
4,4fu9,675,6.2,[H]/N=C(/c1ccc2cc(ccc2c1)C(=O)Nc3ccccc3)\N,[H]/N=C(\N)C1=CC=C2C=C(C(=O)NC3=CC=CC=C3)C=CC2=C1,Ki,628.0,nM,675_A_313,675_A_313
5,4fub,4UP,6.21,[H]/N=C(/c1ccc2cc(ccc2c1)[C@H]3[C@@H](O3)c4ccc...,[H]/N=C(\N)C1=CC=C2C=C([C@@H]3O[C@H]3C3=CC=CC=...,Ki,610.0,nM,4UP_A_301,4UP_A_301
6,4fuc,239,7.4,[H]/N=C(/c1ccc2cc(ccc2c1)C(=O)Nc3ccc(cc3)CN)\N,[H]/N=C(\N)C1=CC=C2C=C(C(=O)NC3=CC=C(CN)C=C3)C...,Ki,40.0,nM,239_A_301,239_A_301
7,4fsm,HK1,7.62,COc1cc2c(cc1OC)-c3c(c([nH]n3)c4ccc(cc4)O)C2,COC1=CC2=C(C=C1OC)C1=NNC(C3=CC=C(O)C=C3)=C1C2,IC50,23.9,nM,HK1_A_301,HK1_A_301
8,4fsw,HK6,4.76,c1ccc2c(c1)C(=O)Nc3cc(ccc3N2)Cl,O=C1NC2=CC(Cl)=CC=C2NC2=CC=CC=C12,IC50,17200.0,nM,HK6_A_301,HK6_A_301
9,4ft5,H2K,7.56,c1cc(c(cc1Cl)NC(=O)Nc2cnc(cn2)C#N)O[C@@H]3CCNC3,N#CC1=NC=C(NC(=O)NC2=C(O[C@@H]3CCNC3)C=CC(Cl)=...,IC50,27.4,nM,H2K_A_300,H2K_A_300


#### 2.10 CSAR2012 dataset get protein info

In [43]:
src_path = "./data/PDB/protein"
protein_des_path = "./data/PDB/structure/CSAR2012"

In [44]:
_ = parallelize_dataframe(CSAR2012_df, remove_HEATM_bulk, 10)

#### 2.11 CSARset1 dataset get final ligand info

In [45]:
src_path = "./data/PDB/protein"
ideal_ligand_path = "./data/PDB/ligand"
des_path = "./data/PDB/structure/CSARset1"

In [46]:
CSARset1_ligand_results = parallelize_dataframe(CSARset1_df, preprocessing_PDB_file_bulk, 10)

In [47]:
CSARset1_ligand_results = pd.concat(CSARset1_ligand_results)

In [48]:
CSARset1_df["Check_Lig_info"] = CSARset1_ligand_results.map(lambda a:a if a is not None else None)

In [49]:
CSARset1_df = CSARset1_df.dropna(axis = 0).reset_index(drop=True)
CSARset1_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Total_Lig_info,Check_Lig_info
0,2are,MAN,3.28,C([C@@H]1[C@H]([C@@H]([C@@H]([C@H](O1)O)O)O)O)O,OC[C@H]1O[C@H](O)[C@@H](O)[C@@H](O)[C@@H]1O,"MAN_A_253,MAN_B_253","MAN_A_253,MAN_B_253"
1,2oag,DLI,8.47,CS(=O)(=O)c1cccc(c1)c2cc(ncn2)N3C[C@@H]([C@H](...,CS(=O)(=O)C1=CC=CC(C2=CC(N3C[C@H](C4=CC(F)=C(F...,DLI_B_4000,DLI_B_4000
2,2jbj,G88,9.70,C(CC(=O)O)[C@H](CP(=O)(O)O)C(=O)O,O=C(O)CC[C@H](CP(=O)(O)O)C(=O)O,G88_A_1768,G88_A_1768
3,2pwd,NOJ,4.40,C1[C@@H]([C@H]([C@@H]([C@H](N1)CO)O)O)O,OC[C@H]1NC[C@H](O)[C@@H](O)[C@@H]1O,"NOJ_A_8000,NOJ_B_8001","NOJ_A_8000,NOJ_B_8001"
4,2pwg,CTS,4.82,C1C[N@]2C[C@@H]([C@H]([C@@H]([C@H]2[C@H]1O)O)O)O,O[C@H]1CN2CC[C@H](O)[C@@H]2[C@@H](O)[C@@H]1O,"CTS_A_8000,CTS_B_8001","CTS_A_8000,CTS_B_8001"
...,...,...,...,...,...,...,...
151,3ekr,PY9,5.55,Cc1ccccc1[C@H]2CCCN2C(=O)c3ccc(cc3O)O,CC1=CC=CC=C1[C@H]1CCCN1C(=O)C1=CC=C(O)C=C1O,"PY9_A_901,PY9_B_901","PY9_A_901,PY9_B_901"
152,3ene,NPZ,6.24,Cn1c2c(c(n1)c3ccc4ccccc4c3)c(ncn2)N,CN1N=C(C2=CC=C3C=CC=CC3=C2)C2=C1N=CN=C2N,NPZ_A_1,NPZ_A_1
153,3eqr,T74,8.70,Cc1cccc(c1Nc2c3cnc(nc3n(n2)CCC(C)(C)OC)Nc4ccc(...,COC(C)(C)CCN1N=C(NC2=C(C)C=CC=C2C)C2=CN=C(NC3=...,"T74_A_1,T74_B_1","T74_A_1,T74_B_1"
154,3f8c,HT1,7.68,CCOc1ccc(cc1)c2[nH]c3cc(ccc3n2)c4[nH]c5cc(ccc5...,CCOC1=CC=C(C2=NC3=CC=C(C4=NC5=CC=C(N6CCN(C)CC6...,HT1_A_127,HT1_A_127


#### 2.12 CSARset1 dataset get protein info

In [50]:
src_path = "./data/PDB/protein"
protein_des_path = "./data/PDB/structure/CSARset1"

In [51]:
_ = parallelize_dataframe(CSARset1_df, remove_HEATM_bulk, 10)

#### 2.13 CSARset2 dataset get final ligand info

In [52]:
src_path = "./data/PDB/protein"
ideal_ligand_path = "./data/PDB/ligand"
des_path = "./data/PDB/structure/CSARset2"

In [53]:
CSARset2_ligand_results = parallelize_dataframe(CSARset2_df, preprocessing_PDB_file_bulk, 10)

In [54]:
CSARset2_ligand_results = pd.concat(CSARset2_ligand_results)

In [55]:
CSARset2_df["Check_Lig_info"] = CSARset2_ligand_results.map(lambda a:a if a is not None else None)

In [56]:
CSARset2_df = CSARset2_df.dropna(axis = 0).reset_index(drop=True)
CSARset2_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Total_Lig_info,Check_Lig_info
0,1a8i,GLS,5.52,C([C@@H]1[C@H]([C@@H]([C@H]([C@]2(O1)C(=O)NC(=...,O=C1NC(=O)[C@@]2(N1)O[C@H](CO)[C@@H](O)[C@H](O...,GLS_A_998,GLS_A_998
1,1a99,PUT,5.70,C(CCN)CN,NCCCCN,"PUT_A_371,PUT_B_371,PUT_C_371,PUT_D_371","PUT_A_371,PUT_B_371,PUT_C_371,PUT_D_371"
2,1ax0,A2G,3.13,CC(=O)N[C@@H]1[C@H]([C@H]([C@H](O[C@@H]1O)CO)O)O,CC(=O)N[C@H]1[C@@H](O)O[C@H](CO)[C@H](O)[C@@H]1O,A2G_A_401,A2G_A_401
3,1b6l,PI4,8.30,CC(C)(C)NC(=O)[C@@H]1CCCC[N@]1C[C@H]([C@@H]2Cc...,CC(C)(C)NC(=O)[C@@H]1CCCCN1C[C@@H](O)[C@@H]1CC...,PI4_A_201,PI4_A_201
4,1b6m,PI6,8.40,CC[C@H](C)[C@H]1C(=O)NCCCOc2ccc(cc2)C[C@@H](C(...,CC[C@H](C)[C@@H]1NC(=O)[C@@H](NC[C@@H](O)[C@H]...,PI6_B_201,PI6_B_201
...,...,...,...,...,...,...,...
137,2qrk,AMP,4.26,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,NC1=C2N=CN([C@@H]3O[C@H](COP(=O)(O)O)[C@@H](O)...,AMP_A_500,AMP_A_500
138,2nn1,M28,5.82,c1cc(ccc1CCC(=O)O)S(=O)(=O)N,NS(=O)(=O)C1=CC=C(CCC(=O)O)C=C1,"M28_A_311,M28_B_312","M28_A_311,M28_B_312"
139,2pou,I7A,7.42,c1c(cc(c(c1S(=O)(=O)N)Cl)Cl)S(=O)(=O)N,NS(=O)(=O)C1=CC(S(N)(=O)=O)=C(Cl)C(Cl)=C1,I7A_A_1000,I7A_A_1000
140,3cd0,6HI,7.57,CC(C)n1c(c(nc1C(=O)NCc2ccc(cc2)F)c3ccc(cc3)F)C...,CC(C)N1C(C(=O)NCC2=CC=C(F)C=C2)=NC(C2=CC=C(F)C...,"6HI_B_1,6HI_B_2,6HI_C_4,6HI_D_3","6HI_B_1,6HI_C_4,6HI_D_3"


#### 2.14 CSARset2 dataset get protein info

In [57]:
src_path = "./data/PDB/protein"
protein_des_path = "./data/PDB/structure/CSARset2"

In [58]:
_ = parallelize_dataframe(CSARset2_df, remove_HEATM_bulk, 10)

#### 2.15 Astex dataset get final ligand info

In [59]:
src_path = "./data/PDB/protein"
ideal_ligand_path = "./data/PDB/ligand"
des_path = "./data/PDB/structure/Astex"

In [60]:
Astex_ligand_results = parallelize_dataframe(Astex_df, preprocessing_PDB_file_bulk, 10)

In [61]:
Astex_ligand_results = pd.concat(Astex_ligand_results)

In [62]:
Astex_df["Check_Lig_info"] = Astex_ligand_results.map(lambda a:a if a is not None else None)

In [63]:
Astex_df = Astex_df.dropna(axis = 0).reset_index(drop=True)
Astex_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info,Check_Lig_info
0,1gm8,SOX,4.80,CC1([C@@H](N2[C@H]([S@H]1O)[C@@H](C2=O)NC(=O)C...,CC1(C)[C@H](C(=O)O)N2C(=O)[C@@H](NC(=O)CC3=CC=...,Km,16.000,uM,SOX_B_1559,SOX_B_1559
1,1gpk,HUP,5.37,C/C=C/1\[C@@H]2CC3=C([C@]1(CC(=C2)C)N)C=CC(=O)N3,C/C=C1\[C@H]2C=C(C)C[C@]1(N)C1=C(C2)NC(=O)C=C1,Ki,4.300,uM,HUP_A_1540,HUP_A_1540
2,1hnn,SKF,6.24,c1cc2c(cc1S(=O)(=O)N)CNCC2,NS(=O)(=O)C1=CC2=C(C=C1)CCNC2,Ki,0.580,uM,"SKF_A_3001,SKF_B_3002","SKF_A_3001,SKF_B_3002"
3,1hp0,AD3,6.70,c1cnc(c2c1n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3)C...,NC1=NC=CC2=C1N=CN2[C@@H]1O[C@H](CO)[C@@H](O)[C...,Ki,0.200,uM,"AD3_A_1315,AD3_B_1316","AD3_A_1315,AD3_B_1316"
4,1hq2,PH2,6.77,C1C(=NC2=C(N1)N=C(NC2=O)N)CO,NC1=NC2=C(N=C(CO)CN2)C(=O)N1,Kd,0.170,uM,PH2_A_181,PH2_A_181
...,...,...,...,...,...,...,...,...,...,...
70,1ywr,LI9,7.49,C[C@@H](c1ccccc1)Nc2nccc(n2)C3=C(C(=O)N(N3C)C4...,C[C@H](NC1=NC=CC(C2=C(C3=CC=C(F)C=C3)C(=O)N(C3...,IC50,32.000,nM,LI9_A_361,LI9_A_361
71,1z95,198,7.12,C[C@](CS(=O)(=O)c1ccc(cc1)F)(C(=O)Nc2ccc(c(c2)...,C[C@](O)(CS(=O)(=O)C1=CC=C(F)C=C1)C(=O)NC1=CC=...,Ki,0.076,uM,198_A_501,198_A_501
72,2bm2,PM2,7.82,c1ccc(cc1)CCc2cc(cnc2)C(=O)N3CCC(CC3)c4cccc(c4)CN,NCC1=CC(C2CCN(C(=O)C3=CN=CC(CCC4=CC=CC=C4)=C3)...,Ki,0.015,uM,"PM2_A_3211,PM2_B_3211,PM2_C_3211,PM2_D_3211","PM2_A_3211,PM2_B_3211,PM2_C_3211,PM2_D_3211"
73,2br1,PFP,5.14,COc1ccc(cc1)c2c3c(ncnc3oc2c4ccc(cc4)OC)NCCO,COC1=CC=C(C2=C(C3=CC=C(OC)C=C3)C3=C(NCCO)N=CN=...,Ki,7200.000,nM,PFP_A_1277,PFP_A_1277


#### 2.16 Astex dataset get protein info

In [64]:
src_path = "./data/PDB/protein"
protein_des_path = "./data/PDB/structure/Astex"

In [65]:
_ = parallelize_dataframe(Astex_df, remove_HEATM_bulk, 10)

#### 2.17 COACH420 dataset get final ligand info

In [66]:
src_path = "./data/PDB/protein"
ideal_ligand_path = "./data/PDB/ligand"
des_path = "./data/PDB/structure/COACH420"

In [67]:
COACH420_ligand_results = parallelize_dataframe(COACH420_df, preprocessing_PDB_file_bulk, 10)

In [68]:
COACH420_ligand_results = pd.concat(COACH420_ligand_results)

In [69]:
COACH420_df["Check_Lig_info"] = COACH420_ligand_results.map(lambda a:a if a is not None else None)

In [70]:
COACH420_df = COACH420_df.dropna(axis = 0).reset_index(drop=True)
COACH420_df

Unnamed: 0,PDB,Lig_code,COACH420_chain,OE_stereo_SMILES,RDKit_iso_SMILES,Total_Lig_info,Check_Lig_info
0,1a2k,GDP,C,c1nc2c(n1[C@H]3[C@@H]([C@@H]([C@H](O3)CO[P@](=...,NC1=NC2=C(N=CN2[C@@H]2O[C@H](CO[P@](=O)(O)OP(=...,"GDP_C_220,GDP_D_220,GDP_E_220","GDP_C_220,GDP_D_220,GDP_E_220"
1,1a4k,FRA,H,CC(=O)Nc1ccc(cc1)N2C(=O)[C@@H]3C4CCC([C@@H]3C2...,CC(=O)NC1=CC=C(N2C(=O)[C@@H]3C4CCC(NC(=O)OCC(=...,"FRA_H_3083,FRA_B_3083","FRA_H_3083,FRA_B_3083"
2,1a7x,FKA,A,C[C@@H]1C[C@@H]([C@@H]2[C@H](C[C@H]([C@@](O2)(...,CO[C@H]1C[C@@H](C)C/C(C)=C/[C@@H](CCOC(=O)NCC2...,FKA_B_201,FKA_B_201
3,1afk,PAP,A,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,NC1=C2N=CN([C@@H]3O[C@H](CO[P@@](=O)(O)OP(=O)(...,"PAP_A_125,PAP_B_125","PAP_A_125,PAP_B_125"
4,1atl,0QI,A,CC(C)C[C@H](CS)C(=O)N[C@@H](Cc1ccc(cc1)OC)C(=O)O,COC1=CC=C(C[C@H](NC(=O)[C@@H](CS)CC(C)C)C(=O)O...,"0QI_A_301,0QI_B_311","0QI_A_301,0QI_B_311"
...,...,...,...,...,...,...,...
345,7dfr,FOL,A,c1cc(ccc1C(=O)N[C@@H](CCC(=O)O)C(=O)O)NCc2cnc3...,NC1=NC(=O)C2=NC(CNC3=CC=C(C(=O)N[C@@H](CCC(=O)...,FOL_A_161,FOL_A_161
346,7dfr,NAP,A,c1cc(c[n+](c1)[C@H]2[C@@H]([C@@H]([C@H](O2)CO[...,NC(=O)C1=C[N+]([C@@H]2O[C@H](CO[P@@](=O)([O-])...,NAP_A_164,NAP_A_164
347,7est,0Z2,E,CC(C)C[C@@H](C(=O)N[C@@H](C)C(=O)Nc1ccc(cc1)C(...,CC(C)C[C@H](NC(=O)C(F)(F)F)C(=O)N[C@@H](C)C(=O...,0Z2_E_1,0Z2_E_1
348,830c,RS1,A,c1cc(ccc1Oc2ccc(cc2)Cl)S(=O)(=O)CC3(CCOCC3)C(=...,O=C(NO)C1(CS(=O)(=O)C2=CC=C(OC3=CC=C(Cl)C=C3)C...,"RS1_A_1,RS1_B_1","RS1_A_1,RS1_B_1"


#### 2.18 COACH420 dataset get protein info

In [71]:
src_path = "./data/PDB/protein"
protein_des_path = "./data/PDB/structure/COACH420"

In [72]:
_ = parallelize_dataframe(COACH420_df, remove_HEATM_bulk, 10)

#### 2.19 HOLO4K dataset get final ligand info

In [73]:
src_path = "./data/PDB/protein"
ideal_ligand_path = "./data/PDB/ligand"
des_path = "./data/PDB/structure/HOLO4K"

In [74]:
HOLO4K_ligand_results = parallelize_dataframe(HOLO4K_df, preprocessing_PDB_file_bulk, 10)

In [75]:
HOLO4K_ligand_results = pd.concat(HOLO4K_ligand_results)

In [76]:
HOLO4K_df["Check_Lig_info"] = HOLO4K_ligand_results.map(lambda a:a if a is not None else None)

In [77]:
HOLO4K_df = HOLO4K_df.dropna(axis = 0).reset_index(drop=True)
HOLO4K_df

Unnamed: 0,PDB,Lig_code,OE_stereo_SMILES,RDKit_iso_SMILES,Total_Lig_info,Check_Lig_info
0,121p,GCP,c1nc2c(n1[C@H]3[C@@H]([C@@H]([C@H](O3)CO[P@](=...,NC1=NC2=C(N=CN2[C@@H]2O[C@H](CO[P@](=O)(O)O[P@...,GCP_A_167,GCP_A_167
1,12as,AMP,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,NC1=C2N=CN([C@@H]3O[C@H](COP(=O)(O)O)[C@@H](O)...,"AMP_A_332,AMP_B_332","AMP_A_332,AMP_B_332"
2,13pk,3PG,C([C@H](C(=O)O)O)OP(=O)(O)O,O=C(O)[C@H](O)COP(=O)(O)O,"3PG_A_423,3PG_B_423","3PG_A_423,3PG_B_423"
3,13pk,ADP,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,NC1=C2N=CN([C@@H]3O[C@H](CO[P@](=O)(O)OP(=O)(O...,"ADP_A_421,ADP_B_421,ADP_C_421,ADP_D_421","ADP_A_421,ADP_B_421,ADP_C_421,ADP_D_421"
4,16pk,BIS,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,NC1=C2N=CN([C@@H]3O[C@H](CO[P@@](=O)(O)O[P@](=...,BIS_A_499,BIS_A_499
...,...,...,...,...,...,...
4171,9gss,GTX,CCCCCCSC[C@@H](C(=O)NCC(=O)O)NC(=O)CC[C@@H](C(...,CCCCCCSC[C@H](NC(=O)CC[C@H]([NH3+])C(=O)O)C(=O...,"GTX_A_211,GTX_B_210","GTX_A_211,GTX_B_210"
4172,9ldb,NAD,c1cc(c[n+](c1)[C@H]2[C@@H]([C@@H]([C@H](O2)CO[...,NC(=O)C1=C[N+]([C@@H]2O[C@H](CO[P@@](=O)([O-])...,"NAD_A_401,NAD_B_401","NAD_A_401,NAD_B_401"
4173,9ldb,OXM,C(=O)(C(=O)O)N,NC(=O)C(=O)O,OXM_B_402,OXM_B_402
4174,9ldt,NAD,c1cc(c[n+](c1)[C@H]2[C@@H]([C@@H]([C@H](O2)CO[...,NC(=O)C1=C[N+]([C@@H]2O[C@H](CO[P@@](=O)([O-])...,"NAD_A_401,NAD_B_401","NAD_A_401,NAD_B_401"


#### 2.20 HOLO4K dataset get protein info

In [78]:
src_path = "./data/PDB/protein"
protein_des_path = "./data/PDB/structure/HOLO4K"

In [79]:
_ = parallelize_dataframe(HOLO4K_df, remove_HEATM_bulk, 10)

### 3. Check protein chain

In [80]:
def get_binding_sites(protein_coords, ligand_coords, chain_list, threshold):
    P_L_distance_matrix = distance_matrix(protein_coords, ligand_coords)
    return list(set(chain_list[np.where(P_L_distance_matrix<=threshold[0])[0]]))

In [81]:
def get_protein_info(row):
    pdbid, ligand_info = row[0], row[-1]

    """ Load protein info """
    structure = pdb_parser.get_structure(pdbid, f"{path}/{pdbid}/{pdbid}_protein.pdb")
    protein_atom_coords, chain_results, atom_chain_name_list, pdb_sequence_list, residue_index_list = list(), list(), list(), list(), list()

    """ Read protein info """
    for chain_name in list(structure[0].child_dict.keys()):
        chain = structure[0][chain_name]
        pdb_sequence, chain_residue_index_list = "", list()
        
        for residue in chain.get_residues():
            if residue.get_id()[0] != ' ' or residue.get_id()[2] != ' ':   # remove HETATM
                continue
                
            if residue.resname in amino_acids_short.keys():
                pdb_sequence += amino_acids_short[residue.resname]
                chain_residue_index_list.append(str(residue.get_id()[1]) + residue.get_id()[2].strip())
                
                for atom in residue:
                    protein_atom_coords.append(atom.get_coord())
                    atom_chain_name_list.append(chain_name)
                
        if len(pdb_sequence) != 0:
            pdb_sequence_list.append(pdb_sequence)
            chain_results.append(chain_name)
            residue_index_list.append(",".join(chain_residue_index_list))

    if ",".join(chain_results) == '':
        return None   
    
    pocket_chain_list = list()
    
    """ Load ligand info """
    for ligand_code_info in ligand_info.split(","):
        ligand_atom_coords = list()
        ligand_structure = pdb_parser.get_structure(pdbid, f"{path}/{pdbid}/{ligand_code_info}.pdb") 
        
        """ Read ligand info """
        for chain_name in list(ligand_structure[0].child_dict.keys()):
            chain = ligand_structure[0][chain_name]
        
            for residue in chain.get_residues():
                for atom in residue:
                    ligand_atom_coords.append(atom.get_coord())
                  
        pocket_chain = get_binding_sites(protein_atom_coords, ligand_atom_coords, np.array(atom_chain_name_list), [8.]) 
        
        if len(pocket_chain) != 0:
            pocket_chain_list.append(",".join(pocket_chain))
        else:
            pocket_chain_list.append("")

    if "".join(pocket_chain_list) == '':
        return None

    else:
        return ",".join(chain_results), ",".join(pdb_sequence_list), ";".join(residue_index_list), ";".join(pocket_chain_list)

In [82]:
def get_pocket_chain_info_bulk(df):
    return df.apply(get_protein_info, axis = 1)

In [83]:
pdb_parser = PDB.PDBParser(QUIET = True)

In [84]:
amino_acids_short = {"ALA":"A", "ARG":"R", "ASN":"N", "ASP":"D", "CYS":"C", "GLU":"E", "GLN":"Q", "GLY":"G", "HIS":"H", "ILE":"I", "LEU":"L", "LYS":"K", "MET":"M", "PHE":"F", "PRO":"P", "SER":"S", "THR":"T", "TRP":"W", "TYR":"Y", "VAL":"V", "SEC":"U", "PYL":"O"}

#### 3.1 PDBbind dataset

In [85]:
path = "./data/PDB/structure/PDBbind"

In [86]:
PDBbind_info_results = parallelize_dataframe(PDBbind_df, get_pocket_chain_info_bulk, 10)

In [87]:
PDBbind_info_results = pd.concat(PDBbind_info_results)

In [88]:
PDBbind_df["Chain"] = PDBbind_info_results.map(lambda a: a[0] if a is not None else None)
PDBbind_df["PDB_seqs"] = PDBbind_info_results.map(lambda a: a[1] if a is not None else None)
PDBbind_df["Residue_index"] = PDBbind_info_results.map(lambda a: a[2] if a is not None else None)
PDBbind_df["Pocket_chain"] = PDBbind_info_results.map(lambda a: a[3] if a is not None else None)

In [89]:
PDBbind_df = PDBbind_df.dropna(axis = 0).reset_index(drop=True)
PDBbind_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info,Check_Lig_info,Chain,PDB_seqs,Residue_index,Pocket_chain
0,3wka,S0G,4.54,CN1C(=C(C(=O)NC1=O)N2CCCCC2)N,CN1C(N)=C(N2CCCCC2)C(=O)NC1=O,29.00,uM,4.54,S0G_A_603,S0G_A_603,A,TLRAAVFDLDGVLALPAVFGVLGRTEEALALPRGLLNDAFQKGGPE...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",A
1,5kgx,7SK,4.80,CC(C)(C)O[C@@H](c1c(c2ccccc2n1C)c3ccc4c(c3)CCC...,CN1C([C@H](OC(C)(C)C)C(=O)O)=C(C2=CC=C3OCCCC3=...,16.00,uM,4.80,7SK_A_301,7SK_A_301,A,CSPGIWQLDTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFLL...,"56,57,58,59,60,61,62,63,64,66,67,68,69,70,71,7...",A
2,5wqc,7MA,8.96,CCN(Cc1cccnc1)C(=O)CN(c2ccc(nc2)OC)S(=O)(=O)c3...,CCN(CC1=CC=CN=C1)C(=O)CN(C1=CC=C(OC)N=C1)S(=O)...,1.10,nM,8.96,7MA_A_2001,7MA_A_2001,A,FLRYLWREYLHPKEYEWVLIAGYIIVFVVALIGNVLVCVAVWKNHH...,"39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,5...",A
3,5aen,DP8,7.22,CN(C)CCOc1ccc(cc1)Oc2ccccc2,CN(C)CCOC1=CC=C(OC2=CC=CC=C2)C=C1,0.06,uM,7.22,DP8_A_611,DP8_A_611,A,IVDTCSLASPASVCRTKHLHLRCSVDFTRRTLTGTAALTVQSQEDN...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...",A
4,1utp,PBN,1.44,c1ccc(cc1)CCCCN,NCCCCC1=CC=CC=C1,36.00,mM,1.44,PBN_A_1246,PBN_A_1246,A,IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGI...,"16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,3...",A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14398,3fci,3FI,5.89,c1cc(cc(c1)C(=O)O)\C=N\OCCCNCC2=CC(=O)NC(=O)N2,O=C(O)C1=CC=CC(/C=N/OCCCNCC2=CC(=O)NC(=O)N2)=C1,1.30,uM,5.89,3FI_A_1,3FI_A_1,A,MEFFGESWKKHLSGEFGKPYFIKLMGFVAEERKHYTVYPPPHQVFT...,"82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,9...",A
14399,5j9l,6HF,7.47,CN1CCN(CC1)c2ccc(cc2)Nc3nc4c(cc[nH]4)c(n3)Oc5c...,C=CC(=O)NC1=CC=C(OC2=NC(NC3=CC=C(N4CCN(C)CC4)C...,34.00,nM,7.47,6HF_A_501,6HF_A_501,A,SLHMIDYKEIEVEEVVGRGGVVCKAWRADVAIKQIESESERKAFIV...,"27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,4...",A
14400,2q9n,LK5,6.37,CCCCO[C@@H]1CCC[C@@H]2C1=C(N[C@@H]2[C@@H](CC)C...,CCCCO[C@@H]1CCC[C@@H]2C1=C(C(=O)O)N[C@@H]2[C@H...,430.00,nM,6.37,LK5_A_0,LK5_A_0,A,PVSEKQLAEVVANTITPLMKAQSVPGMAVAVIYQGKPHYYTFGKAD...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",A
14401,3np7,Z15,4.02,c1c(c(cc(c1O)Cl)O)[C@H]2[C@@H]([C@H]([C@@H]([C...,OC[C@H]1O[C@@H](C2=CC(O)=C(Cl)C=C2O)[C@H](O)[C...,95.00,uM,4.02,"Z15_A_997,Z15_A_998",Z15_A_997,A,QISVRGLAGVENVTELKKNFNRHLHFTLVKDRNVATPRDYYFALAH...,"12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,2...",A


#### 3.2 CASF2016 dataset

In [90]:
path = "./data/PDB/structure/CASF2016"

In [91]:
CASF2016_info_results = parallelize_dataframe(CASF2016_df, get_pocket_chain_info_bulk, 10)

In [92]:
CASF2016_info_results = pd.concat(CASF2016_info_results)

In [93]:
CASF2016_df["Chain"] = CASF2016_info_results.map(lambda a: a[0] if a is not None else None)
CASF2016_df["PDB_seqs"] = CASF2016_info_results.map(lambda a: a[1] if a is not None else None)
CASF2016_df["Residue_index"] = CASF2016_info_results.map(lambda a: a[2] if a is not None else None)
CASF2016_df["Pocket_chain"] = CASF2016_info_results.map(lambda a: a[3] if a is not None else None)

In [94]:
CASF2016_df = CASF2016_df.dropna(axis = 0).reset_index(drop=True)
CASF2016_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info,Check_Lig_info,Chain,PDB_seqs,Residue_index,Pocket_chain
0,1bcu,PRL,3.28,c1cc(cc2c1cc3ccc(cc3n2)N)N,NC1=CC2=NC3=C(C=CC(N)=C3)C=C2C=C1,0.53,mM,3.28,PRL_H_280,PRL_H_280,"L,H,I","CGLRPLFEKKSLED,IVEGSDAEIGMSPWQVMLFRKPQELLCGASL...","1,2,3,4,5,6,7,8,9,10,11,12,13,14;16,17,18,19,2...",H
1,1bzc,TPI,4.92,c1cc(cc2c1cc(cc2)C(F)(F)P(=O)(O)O)C(=O)N[C@@H]...,NC(=O)[C@H](CCC(=O)O)NC(=O)C1=CC2=C(C=C1)C=C(C...,12.00,uM,4.92,TPI_A_902,TPI_A_902,A,EMEKEFEQIDKSGSWAAIYQDIRHEASDFPCRVAKLPKNKNRNRYR...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",A
2,1c5z,BEN,4.01,[H]/N=C(\c1ccccc1)/N,[H]/N=C(/N)C1=CC=CC=C1,97.00,uM,4.01,BEN_B_251,BEN_B_251,"A,B","LKFQCGQKT,IIGGEFTTIENQPWFAAIYRRHVTYVCGGSLMSPCW...","9,10,11,12,13,14,15,16,17;16,17,18,19,20,21,22...",B
3,1e66,HUX,9.89,CCC1=C[C@@H]2Cc3c(c(c4ccc(cc4n3)Cl)N)[C@@H](C2)C1,CCC1=C[C@@H]2CC3=C(C(N)=C4C=CC(Cl)=CC4=N3)[C@H...,0.13,nM,9.89,HUX_A_803,HUX_A_803,A,SELLVNTKSGKVMGTRVPVLSSHISAFLGIPFAEPPVGNMRFRRPE...,"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,2...",A
4,1eby,BEB,9.70,c1ccc(cc1)CO[C@H]([C@@H]([C@H]([C@H](C(=O)N[C@...,O=C(N[C@H]1C2=CC=CC=C2C[C@H]1O)[C@H](OCC1=CC=C...,0.20,nM,9.70,BEB_B_501,BEB_B_501,"A,B",PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","B,A"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274,5aba,UL7,2.98,c1c(cc(c(c1CN2CCC(CC2)N3CCC(CC3)O)O)Br)Cl,OC1=C(CN2CCC(N3CCC(O)CC3)CC2)C=C(Cl)C=C1Br,1040.00,uM,2.98,"UL7_A_1291,UL7_B_1291","UL7_A_1291,UL7_B_1291","A,B",SVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKLFCQLAKTC...,"96,97,98,99,100,101,102,103,104,105,106,107,10...",A;B
275,5c28,4XV,5.66,Cc1c(nc(nc1Cl)C2CC2)N,CC1=C(N)N=C(C2CC2)N=C1Cl,2.20,uM,5.66,4XV_A_803,4XV_A_803,"A,B",HMSICTSEEWQGLMQFTLPVRLCKEIELFHFDIGPFENMWPGIFVY...,"437,438,439,440,441,442,443,444,445,446,447,44...","B,A"
276,5c2h,4XU,11.09,Cc1c(nc(nc1Cl)OCCCc2ccc3ccccc3n2)NCc4c(nc(s4)C)C,CC1=NC(C)=C(CNC2=C(C)C(Cl)=NC(OCCCC3=CC=C4C=CC...,8.20,pM,11.09,4XU_B_803,4XU_B_803,"A,B",QFTLPVRLCKEIELFHFDIGPFENMWPGIFVYMVHRSCGTSCFELE...,"451,452,453,454,455,456,457,458,459,460,461,46...","B,A"
277,5dwr,5H7,11.22,C[C@H]1C[C@H](C[C@H](C1)N)c2ccncc2NC(=O)c3ccc(...,C[C@@H]1C[C@H](N)C[C@H](C2=CC=NC=C2NC(=O)C2=CC...,6.00,pM,11.22,5H7_A_401,5H7_A_401,A,PLESQYQVGPLLGSGGFGSVYSGIRVSDNLPVAIKHVEKDRISDWG...,"33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,4...",A


#### 3.3 CASF2013 dataset

In [95]:
path = "./data/PDB/structure/CASF2013"

In [96]:
CASF2013_info_results = parallelize_dataframe(CASF2013_df, get_pocket_chain_info_bulk, 10)

In [97]:
CASF2013_info_results = pd.concat(CASF2013_info_results)

In [98]:
CASF2013_df["Chain"] = CASF2013_info_results.map(lambda a: a[0] if a is not None else None)
CASF2013_df["PDB_seqs"] = CASF2013_info_results.map(lambda a: a[1] if a is not None else None)
CASF2013_df["Residue_index"] = CASF2013_info_results.map(lambda a: a[2] if a is not None else None)
CASF2013_df["Pocket_chain"] = CASF2013_info_results.map(lambda a: a[3] if a is not None else None)

In [99]:
CASF2013_df = CASF2013_df.dropna(axis = 0).reset_index(drop=True)
CASF2013_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info,Check_Lig_info,Chain,PDB_seqs,Residue_index,Pocket_chain
0,10gs,VWW,6.40,c1ccc(cc1)CSC[C@@H](C(=O)N[C@H](c2ccccc2)C(=O)...,N[C@@H](CCC(=O)N[C@@H](CSCC1=CC=CC=C1)C(=O)N[C...,0.400,uM,6.40,"VWW_A_210,VWW_B_210","VWW_A_210,VWW_B_210","A,B",PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...","B,A;B,A"
1,1bcu,PRL,3.28,c1cc(cc2c1cc3ccc(cc3n2)N)N,NC1=CC2=NC3=C(C=CC(N)=C3)C=C2C=C1,0.530,mM,3.28,PRL_H_280,PRL_H_280,"L,H,I","CGLRPLFEKKSLED,IVEGSDAEIGMSPWQVMLFRKPQELLCGASL...","1,2,3,4,5,6,7,8,9,10,11,12,13,14;16,17,18,19,2...",H
2,1e66,HUX,9.89,CCC1=C[C@@H]2Cc3c(c(c4ccc(cc4n3)Cl)N)[C@@H](C2)C1,CCC1=C[C@@H]2CC3=C(C(N)=C4C=CC(Cl)=CC4=N3)[C@H...,0.130,nM,9.89,HUX_A_803,HUX_A_803,A,SELLVNTKSGKVMGTRVPVLSSHISAFLGIPFAEPPVGNMRFRRPE...,"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,2...",A
3,1f8b,DAN,5.40,CC(=O)N[C@@H]1[C@H](C=C(O[C@H]1[C@@H]([C@@H](C...,CC(=O)N[C@@H]1[C@@H](O)C=C(C(=O)O)O[C@H]1[C@H]...,4.000,uM,5.40,DAN_A_0,DAN_A_0,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,"82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,9...",A
4,1f8c,4AM,7.40,CC(=O)N[C@@H]1[C@H](C=C(O[C@H]1[C@@H]([C@@H](C...,CC(=O)N[C@@H]1[C@@H](N)C=C(C(=O)O)O[C@H]1[C@H]...,0.040,uM,7.40,4AM_A_4,4AM_A_4,A,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...,"82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,9...",A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,4djv,0KM,6.72,[H]/N=C/1\N[C@](C(=O)N1C)(c2ccccc2)c3cccc(c3)c...,[H]/N=C1\N[C@](C2=CC=CC=C2)(C2=CC=CC(C3=CC=CC(...,0.190,uM,6.72,"0KM_A_501,0KM_B_501","0KM_A_501,0KM_B_501","A,B",GSFVEMVDNLRGKSGQGYYVEMTVGSPPQTLNILVDTGSSNFAVGA...,"58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,7...",A;B
172,4g8m,G8M,7.89,C1C[C@@H]([C@@H]1[C@@H](C(=O)O)N)C(=O)O,N[C@H](C(=O)O)[C@@H]1CC[C@@H]1C(=O)O,12.800,nM,7.89,"G8M_A_301,G8M_B_901","G8M_A_301,G8M_B_901","A,B",ANKTVVVTTILESPYVMMKKNHEMLEGNERYEGYCVDLAAEIAKHC...,"-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17...",A;B
173,4gid,0GH,10.77,C[C@H](c1ccccc1)NC(=O)c2cc(cc(c2)N(C)S(=O)(=O)...,CC(C)CNC(=O)[C@@H](NC[C@H](CC1=CC=CC=C1)NC(=O)...,0.017,nM,10.77,"0GH_A_501,0GH_B_501,0GH_C_501,0GH_D_501","0GH_A_501,0GH_B_501,0GH_C_501,0GH_D_501","A,B,C,D",FVEMVDNLRGKSGQGYYVEMTVGSPPQTLNILVDTGSSNFAVGAAP...,"47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,6...",A;B;C;D
174,4gqq,0XR,2.89,CCOC(=O)/C=C/c1ccc(c(c1)O)O,CCOC(=O)/C=C/C1=CC=C(O)C(O)=C1,1.300,mM,2.89,"0XR_A_502,0XR_A_503,0XR_A_504",0XR_A_502,A,YSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNE...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",A


#### 3.4 CSAR2014 dataset

In [100]:
path = "./data/PDB/structure/CSAR2014"

In [101]:
CSAR2014_info_results = parallelize_dataframe(CSAR2014_df, get_pocket_chain_info_bulk, 10)

In [102]:
CSAR2014_info_results = pd.concat(CSAR2014_info_results)

In [103]:
CSAR2014_df["Chain"] = CSAR2014_info_results.map(lambda a: a[0] if a is not None else None)
CSAR2014_df["PDB_seqs"] = CSAR2014_info_results.map(lambda a: a[1] if a is not None else None)
CSAR2014_df["Residue_index"] = CSAR2014_info_results.map(lambda a: a[2] if a is not None else None)
CSAR2014_df["Pocket_chain"] = CSAR2014_info_results.map(lambda a: a[3] if a is not None else None)

In [104]:
CSAR2014_df = CSAR2014_df.dropna(axis = 0).reset_index(drop=True)
CSAR2014_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info,Check_Lig_info,Chain,PDB_seqs,Residue_index,Pocket_chain
0,4ypw,4FD,5.5,c1ccc(cc1)CNc2ccc(cn2)C(=O)N,NC(=O)C1=CC=C(NCC2=CC=CC=C2)N=C1,IC50,3162.0,nM,4FD_A_301,4FD_A_301,A,GSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTF...,"-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16...",A
1,4ypx,4FG,4.6,c1cc(ncc1C(=O)N)N,NC(=O)C1=CN=C(N)C=C1,IC50,25119.0,nM,4FG_A_301,4FG_A_301,A,RGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFT...,"-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15...",A
2,4ypy,4F9,3.8,c1cnccc1c2cnc[nH]2,C1=CC(C2=CN=CN2)=CC=N1,IC50,158489.0,nM,4F9_A_301,4F9_A_301,A,SHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFD...,"-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17...",A
3,4ypz,4FL,3.5,c1cnccc1c2[nH]ccn2,C1=CC(C2=NC=CN2)=CC=N1,IC50,316228.0,nM,4FL_A_301,4FL_A_301,A,SHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTFD...,"-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17...",A
4,4yq0,4FM,4.4,c1cc(ccc1CNC(=O)c2c(non2)N)Cl,NC1=NON=C1C(=O)NCC1=CC=C(Cl)C=C1,IC50,39811.0,nM,4FM_A_301,4FM_A_301,A,GSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTF...,"-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16...",A
5,4yq1,4FN,8.3,CC(C)(C)c1cccc(c1)C(=O)NCC2(CCCCC2)NC(=O)c3ccc...,CC(C)(C)C1=CC=CC(C(=O)NCC2(NC(=O)C3=CC=CC4=NOC...,IC50,5.01,nM,4FN_A_301,4FN_A_301,A,GLVPRGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNP...,"-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11...",A
6,4yq2,EFY,8.3,CC(C)(C)c1cccc(c1)C(=O)NCC2(CCCC2)NC(=O)c3cccc...,CC(C)(C)C1=CC=CC(C(=O)NCC2(NC(=O)C3=CC=CC4=NOC...,IC50,5.01,nM,EFY_A_301,EFY_A_301,A,RGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFT...,"-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15...",A
7,4yq3,4G1,6.8,Cc1cccc(c1)C(=O)NCC2(CCCC2)NC(=O)c3cccc4c3con4,CC1=CC=CC(C(=O)NCC2(NC(=O)C3=CC=CC4=NOC=C34)CC...,IC50,158.0,nM,4G1_A_301,4G1_A_301,A,GLVPRGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNP...,"-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11...",A
8,4yq4,4G3,5.7,c1cc(cc(c1)O)CNc2ccc(cn2)C(=O)N,NC(=O)C1=CC=C(NCC2=CC(O)=CC=C2)N=C1,IC50,1995.0,nM,4G3_A_301,4G3_A_301,A,GSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPRDFTF...,"-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16...",A
9,4yq5,4G0,5.4,CN(C)c1cccc(c1)CNc2ccc(cn2)C(=O)N,CN(C)C1=CC=CC(CNC2=CC=C(C(N)=O)C=N2)=C1,IC50,3981.0,nM,4G0_A_301,4G0_A_301,A,LVPRGSHMWIGVISLFPEMFKAITEFGVTGRAVKHNLLKVECWNPR...,"-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12...",A


#### 3.5 CSAR2012 dataset

In [105]:
path = "./data/PDB/structure/CSAR2012"

In [106]:
CSAR2012_info_results = parallelize_dataframe(CSAR2012_df, get_pocket_chain_info_bulk, 10)

In [107]:
CSAR2012_info_results = pd.concat(CSAR2012_info_results)

In [108]:
CSAR2012_df["Chain"] = CSAR2012_info_results.map(lambda a: a[0] if a is not None else None)
CSAR2012_df["PDB_seqs"] = CSAR2012_info_results.map(lambda a: a[1] if a is not None else None)
CSAR2012_df["Residue_index"] = CSAR2012_info_results.map(lambda a: a[2] if a is not None else None)
CSAR2012_df["Pocket_chain"] = CSAR2012_info_results.map(lambda a: a[3] if a is not None else None)

In [109]:
CSAR2012_df = CSAR2012_df.dropna(axis = 0).reset_index(drop=True)
CSAR2012_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info,Check_Lig_info,Chain,PDB_seqs,Residue_index,Pocket_chain
0,4fud,6UP,6.35,[H]/N=C(\c1ccc2cccc(c2c1)N)/N,[H]/N=C(/N)C1=CC=C2C=CC=C(N)C2=C1,Ki,450.0,nM,6UP_A_301,6UP_A_301,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A
1,4fue,7UP,7.23,[H]/N=C(/c1ccc2cc(ccc2c1)C#Cc3ccc4c(c3)CCNC4)\N,[H]/N=C(\N)C1=CC=C2C=C(C#CC3=CC=C4CNCCC4=C3)C=...,Ki,58.8,nM,7UP_A_301,7UP_A_301,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A
2,4fu7,1UP,6.2,[H]/N=C(/c1ccc2ccc(c(c2c1)OCC(=O)N)OC)\N,[H]/N=C(\N)C1=CC=C2C=CC(OC)=C(OCC(N)=O)C2=C1,Ki,637.0,nM,1UP_A_305,1UP_A_305,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A
3,4fu8,2UP,5.23,[H]/N=C(\c1ccc2ccccc2c1)/N,[H]/N=C(/N)C1=CC=C2C=CC=CC2=C1,Ki,5910.0,nM,2UP_A_301,2UP_A_301,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A
4,4fu9,675,6.2,[H]/N=C(/c1ccc2cc(ccc2c1)C(=O)Nc3ccccc3)\N,[H]/N=C(\N)C1=CC=C2C=C(C(=O)NC3=CC=CC=C3)C=CC2=C1,Ki,628.0,nM,675_A_313,675_A_313,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A
5,4fub,4UP,6.21,[H]/N=C(/c1ccc2cc(ccc2c1)[C@H]3[C@@H](O3)c4ccc...,[H]/N=C(\N)C1=CC=C2C=C([C@@H]3O[C@H]3C3=CC=CC=...,Ki,610.0,nM,4UP_A_301,4UP_A_301,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A
6,4fuc,239,7.4,[H]/N=C(/c1ccc2cc(ccc2c1)C(=O)Nc3ccc(cc3)CN)\N,[H]/N=C(\N)C1=CC=C2C=C(C(=O)NC3=CC=C(CN)C=C3)C...,Ki,40.0,nM,239_A_301,239_A_301,A,IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATH...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A
7,4fsm,HK1,7.62,COc1cc2c(cc1OC)-c3c(c([nH]n3)c4ccc(cc4)O)C2,COC1=CC2=C(C=C1OC)C1=NNC(C3=CC=C(O)C=C3)=C1C2,IC50,23.9,nM,HK1_A_301,HK1_A_301,A,VPFVEDWDLVQTLGEGEVQLAVNRVTEEAVAVKIVNIKKEICINKM...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,21,22,23...",A
8,4fsw,HK6,4.76,c1ccc2c(c1)C(=O)Nc3cc(ccc3N2)Cl,O=C1NC2=CC(Cl)=CC=C2NC2=CC=CC=C12,IC50,17200.0,nM,HK6_A_301,HK6_A_301,A,VPFVEDWDLVQTLGEGGEVQLAVNRVTEEAVAVKIVNIKKEICINK...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,21,22...",A
9,4ft5,H2K,7.56,c1cc(c(cc1Cl)NC(=O)Nc2cnc(cn2)C#N)O[C@@H]3CCNC3,N#CC1=NC=C(NC(=O)NC2=C(O[C@@H]3CCNC3)C=CC(Cl)=...,IC50,27.4,nM,H2K_A_300,H2K_A_300,A,VPFVEDWDLVQTLGEVQLAVNRVTEEAVAVKIVDMNIKKEICINKM...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,22,23,24,25...",A


#### 3.6 CSARset1 dataset

In [110]:
path = "./data/PDB/structure/CSARset1"

In [111]:
CSARset1_info_results = parallelize_dataframe(CSARset1_df, get_pocket_chain_info_bulk, 10)

In [112]:
CSARset1_info_results = pd.concat(CSARset1_info_results)

In [113]:
CSARset1_df["Chain"] = CSARset1_info_results.map(lambda a: a[0] if a is not None else None)
CSARset1_df["PDB_seqs"] = CSARset1_info_results.map(lambda a: a[1] if a is not None else None)
CSARset1_df["Residue_index"] = CSARset1_info_results.map(lambda a: a[2] if a is not None else None)
CSARset1_df["Pocket_chain"] = CSARset1_info_results.map(lambda a: a[3] if a is not None else None)

In [114]:
CSARset1_df = CSARset1_df.dropna(axis = 0).reset_index(drop=True)
CSARset1_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Total_Lig_info,Check_Lig_info,Chain,PDB_seqs,Residue_index,Pocket_chain
0,2are,MAN,3.28,C([C@@H]1[C@H]([C@@H]([C@@H]([C@H](O1)O)O)O)O)O,OC[C@H]1O[C@H](O)[C@@H](O)[C@@H](O)[C@@H]1O,"MAN_A_253,MAN_B_253","MAN_A_253,MAN_B_253","A,B",QDSLSFGFPTFPSDQKNLIFQGDAQIKNNAVQLTKTDSNGNPVAST...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A;B
1,2oag,DLI,8.47,CS(=O)(=O)c1cccc(c1)c2cc(ncn2)N3C[C@@H]([C@H](...,CS(=O)(=O)C1=CC=CC(C2=CC(N3C[C@H](C4=CC(F)=C(F...,DLI_B_4000,DLI_B_4000,"A,B,C,D",SRKTYTLTDYLKNTYRLKLYSLRWISDHEYLYKQENNILVFNAEYG...,"39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,5...",B
2,2jbj,G88,9.70,C(CC(=O)O)[C@H](CP(=O)(O)O)C(=O)O,O=C(O)CC[C@H](CP(=O)(O)O)C(=O)O,G88_A_1768,G88_A_1768,A,NMKAFLDELKAENIKKFLYNFTQIPHLAGTEQNFQLAKQIQSQWKE...,"57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,7...",A
3,2pwd,NOJ,4.40,C1[C@@H]([C@H]([C@@H]([C@H](N1)CO)O)O)O,OC[C@H]1NC[C@H](O)[C@@H](O)[C@@H]1O,"NOJ_A_8000,NOJ_B_8001","NOJ_A_8000,NOJ_B_8001","A,B",KPGAPWWKSAVFYQVYPRSFKDTNGDGIGDFKGLTEKLDYLKGLGI...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A;B
4,2pwg,CTS,4.82,C1C[N@]2C[C@@H]([C@H]([C@@H]([C@H]2[C@H]1O)O)O)O,O[C@H]1CN2CC[C@H](O)[C@@H]2[C@@H](O)[C@@H]1O,"CTS_A_8000,CTS_B_8001","CTS_A_8000,CTS_B_8001","A,B",PGAPWWKSAVFYQVYPRSFKDTNGDGIGDFKGLTEKLDYLKGLGID...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",A;B
...,...,...,...,...,...,...,...,...,...,...,...
151,3ekr,PY9,5.55,Cc1ccccc1[C@H]2CCCN2C(=O)c3ccc(cc3O)O,CC1=CC=CC=C1[C@H]1CCCN1C(=O)C1=CC=C(O)C=C1O,"PY9_A_901,PY9_B_901","PY9_A_901,PY9_B_901","A,B",QPMEEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNSSDA...,"10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,2...",A;B
152,3ene,NPZ,6.24,Cn1c2c(c(n1)c3ccc4ccccc4c3)c(ncn2)N,CN1N=C(C2=CC=C3C=CC=CC3=C2)C2=C1N=CN=C2N,NPZ_A_1,NPZ_A_1,A,SEESQAFQRQLTALIGYDVTDVSNVHDDELEFTRRGLVTPRMAEVA...,"144,145,146,147,148,149,150,151,152,153,154,15...",A
153,3eqr,T74,8.70,Cc1cccc(c1Nc2c3cnc(nc3n(n2)CCC(C)(C)OC)Nc4ccc(...,COC(C)(C)CCN1N=C(NC2=C(C)C=CC=C2C)C2=CN=C(NC3=...,"T74_A_1,T74_B_1","T74_A_1,T74_B_1","A,B",LTCLIGEKDLRLLEKLGDGSFGVVRRGEWDAPSGKTVSVAVKCLKP...,"117,118,119,120,121,122,123,124,125,126,127,12...","A;B,A"
154,3f8c,HT1,7.68,CCOc1ccc(cc1)c2[nH]c3cc(ccc3n2)c4[nH]c5cc(ccc5...,CCOC1=CC=C(C2=NC3=CC=C(C4=NC5=CC=C(N6CCN(C)CC6...,HT1_A_127,HT1_A_127,A,EIPKEMLRAQTNVILLNVLKQGDNYVYGIIKQVKEASNGEMELNEA...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...",A


#### 3.7 CSARset2 dataset

In [115]:
path = "./data/PDB/structure/CSARset2"

In [116]:
CSARset2_info_results = parallelize_dataframe(CSARset2_df, get_pocket_chain_info_bulk, 10)

In [117]:
CSARset2_info_results = pd.concat(CSARset2_info_results)

In [118]:
CSARset2_df["Chain"] = CSARset2_info_results.map(lambda a: a[0] if a is not None else None)
CSARset2_df["PDB_seqs"] = CSARset2_info_results.map(lambda a: a[1] if a is not None else None)
CSARset2_df["Residue_index"] = CSARset2_info_results.map(lambda a: a[2] if a is not None else None)
CSARset2_df["Pocket_chain"] = CSARset2_info_results.map(lambda a: a[3] if a is not None else None)

In [119]:
CSARset2_df = CSARset2_df.dropna(axis = 0).reset_index(drop=True)
CSARset2_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Total_Lig_info,Check_Lig_info,Chain,PDB_seqs,Residue_index,Pocket_chain
0,1a8i,GLS,5.52,C([C@@H]1[C@H]([C@@H]([C@H]([C@]2(O1)C(=O)NC(=...,O=C1NC(=O)[C@@]2(N1)O[C@H](CO)[C@@H](O)[C@H](O...,GLS_A_998,GLS_A_998,A,QEKRKQISVRGLAGVENVTELKKNFNRHLHFTLVKDRNVATPRDYY...,"7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,2...",A
1,1a99,PUT,5.70,C(CCN)CN,NCCCCN,"PUT_A_371,PUT_B_371,PUT_C_371,PUT_D_371","PUT_A_371,PUT_B_371,PUT_C_371,PUT_D_371","A,B,C,D",QKTLHIYNWSDYIAPDTVANFEKETGIKVVYDVFDSNEVLEGKLMA...,"29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,4...",A;B;C;D
2,1ax0,A2G,3.13,CC(=O)N[C@@H]1[C@H]([C@H]([C@H](O[C@@H]1O)CO)O)O,CC(=O)N[C@H]1[C@@H](O)O[C@H](CO)[C@H](O)[C@@H]1O,A2G_A_401,A2G_A_401,A,VETISFSFSEFEPGNDNLTLQGAALITQSGVLQLTKINQNGMPAWD...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A
3,1b6l,PI4,8.30,CC(C)(C)NC(=O)[C@@H]1CCCC[N@]1C[C@H]([C@@H]2Cc...,CC(C)(C)NC(=O)[C@@H]1CCCCN1C[C@@H](O)[C@@H]1CC...,PI4_A_201,PI4_A_201,"A,B",PQITLWKRPLVTIRIGGQLKEALLDTGADDTVIEEMNLPGKWKPKM...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","B,A"
4,1b6m,PI6,8.40,CC[C@H](C)[C@H]1C(=O)NCCCOc2ccc(cc2)C[C@@H](C(...,CC[C@H](C)[C@@H]1NC(=O)[C@@H](NC[C@@H](O)[C@H]...,PI6_B_201,PI6_B_201,"A,B",PQITLWKRPLVTIRIGGQLKEALLDTGADDTVIEEMNLPGKWKPKM...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","B,A"
...,...,...,...,...,...,...,...,...,...,...,...
137,2qrk,AMP,4.26,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,NC1=C2N=CN([C@@H]3O[C@H](COP(=O)(O)O)[C@@H](O)...,AMP_A_500,AMP_A_500,A,AVTLHLRAETKPLEARAALTPTTVKKLIAKGFKIYVEDSPQSTFNI...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...",A
138,2nn1,M28,5.82,c1cc(ccc1CCC(=O)O)S(=O)(=O)N,NS(=O)(=O)C1=CC=C(CCC(=O)O)C=C1,"M28_A_311,M28_B_312","M28_A_311,M28_B_312","A,B",PDWGYDDKNGPEQWSKLYPIANGNNQSPVDIKTSETKHDTSLKPIS...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...","B,A;B,A"
139,2pou,I7A,7.42,c1c(cc(c(c1S(=O)(=O)N)Cl)Cl)S(=O)(=O)N,NS(=O)(=O)C1=CC(S(N)(=O)=O)=C(Cl)C(Cl)=C1,I7A_A_1000,I7A_A_1000,A,HHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLS...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...",A
140,3cd0,6HI,7.57,CC(C)n1c(c(nc1C(=O)NCc2ccc(cc2)F)c3ccc(cc3)F)C...,CC(C)N1C(C(=O)NCC2=CC=C(F)C=C2)=NC(C2=CC=C(F)C...,"6HI_B_1,6HI_B_2,6HI_C_4,6HI_D_3","6HI_B_1,6HI_C_4,6HI_D_3","A,B,C,D",EPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETLIE...,"441,442,443,444,445,446,447,448,449,450,451,45...","B,A;D,C;D,C"


#### 3.8 Astex dataset

In [120]:
path = "./data/PDB/structure/Astex"

In [121]:
Astex_info_results = parallelize_dataframe(Astex_df, get_pocket_chain_info_bulk, 10)

In [122]:
Astex_info_results = pd.concat(Astex_info_results)

In [123]:
Astex_df["Chain"] = Astex_info_results.map(lambda a: a[0] if a is not None else None)
Astex_df["PDB_seqs"] = Astex_info_results.map(lambda a: a[1] if a is not None else None)
Astex_df["Residue_index"] = Astex_info_results.map(lambda a: a[2] if a is not None else None)
Astex_df["Pocket_chain"] = Astex_info_results.map(lambda a: a[3] if a is not None else None)

In [124]:
Astex_df = Astex_df.dropna(axis = 0).reset_index(drop=True)
Astex_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info,Check_Lig_info,Chain,PDB_seqs,Residue_index,Pocket_chain
0,1gm8,SOX,4.80,CC1([C@@H](N2[C@H]([S@H]1O)[C@@H](C2=O)NC(=O)C...,CC1(C)[C@H](C(=O)O)N2C(=O)[C@@H](NC(=O)CC3=CC=...,Km,16.000,uM,SOX_B_1559,SOX_B_1559,"A,B",SSSEIKIVRDEYGMPHIYANDTWHLFYGYGYVVAQDRLFQMEMARR...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...","B,A"
1,1gpk,HUP,5.37,C/C=C/1\[C@@H]2CC3=C([C@]1(CC(=C2)C)N)C=CC(=O)N3,C/C=C1\[C@H]2C=C(C)C[C@]1(N)C1=C(C2)NC(=O)C=C1,Ki,4.300,uM,HUP_A_1540,HUP_A_1540,A,SELLVNTKSGKVMGTRVPVLSSHISAFLGIPFAEPPVGNMRFRRPE...,"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,2...",A
2,1hnn,SKF,6.24,c1cc2c(cc1S(=O)(=O)N)CNCC2,NS(=O)(=O)C1=CC2=C(C=C1)CCNC2,Ki,0.580,uM,"SKF_A_3001,SKF_B_3002","SKF_A_3001,SKF_B_3002","A,B",AVASAYQRFEPRAYLRNNYAPPRGDLCNPNGVGPWKLRCLAQTFAT...,"22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,3...",A;B
3,1hp0,AD3,6.70,c1cnc(c2c1n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3)C...,NC1=NC=CC2=C1N=CN2[C@@H]1O[C@H](CO)[C@@H](O)[C...,Ki,0.200,uM,"AD3_A_1315,AD3_B_1316","AD3_A_1315,AD3_B_1316","A,B",SAKNVVLDHDGNLDDFVAMVLLASNTEKVRLIGALCTDADCFVENG...,"0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A;B
4,1hq2,PH2,6.77,C1C(=NC2=C(N1)N=C(NC2=O)N)CO,NC1=NC2=C(N=C(CO)CN2)C(=O)N1,Kd,0.170,uM,PH2_A_181,PH2_A_181,A,TVAYIAIGSNLASPLEQVNAALKALGDIPESHILTVSSFYRTPPLG...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,1ywr,LI9,7.49,C[C@@H](c1ccccc1)Nc2nccc(n2)C3=C(C(=O)N(N3C)C4...,C[C@H](NC1=NC=CC(C2=C(C3=CC=C(F)C=C3)C(=O)N(C3...,IC50,32.000,nM,LI9_A_361,LI9_A_361,A,RPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGHRV...,"5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,...",A
71,1z95,198,7.12,C[C@](CS(=O)(=O)c1ccc(cc1)F)(C(=O)Nc2ccc(c(c2)...,C[C@](O)(CS(=O)(=O)C1=CC=C(F)C=C1)C(=O)NC1=CC=...,Ki,0.076,uM,198_A_501,198_A_501,A,IFLNVLEAIEPGVVCAGHDNNQPDSFAALLSSLNELGERQLVHVVK...,"672,673,674,675,676,677,678,679,680,681,682,68...",A
72,2bm2,PM2,7.82,c1ccc(cc1)CCc2cc(cnc2)C(=O)N3CCC(CC3)c4cccc(c4)CN,NCC1=CC(C2CCN(C(=O)C3=CN=CC(CCC4=CC=CC=C4)=C3)...,Ki,0.015,uM,"PM2_A_3211,PM2_B_3211,PM2_C_3211,PM2_D_3211","PM2_A_3211,PM2_B_3211,PM2_C_3211,PM2_D_3211","A,B,C,D",IVGGQEAPRSKWPWQVSLRVHGWMHFCGGSLIHPQWVLTAAHCVGL...,"16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,3...","A,C;D,B;A,C;D,B"
73,2br1,PFP,5.14,COc1ccc(cc1)c2c3c(ncnc3oc2c4ccc(cc4)OC)NCCO,COC1=CC=C(C2=C(C3=CC=C(OC)C=C3)C3=C(NCCO)N=CN=...,Ki,7200.000,nM,PFP_A_1277,PFP_A_1277,A,AVPFVEDWDLVQTLGEGAYGEVQLAVNRVTEEAVAVKIVDMKRCPE...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...",A


#### 3.9 COACH420 dataset 

In [125]:
path = "./data/PDB/structure/COACH420"

In [126]:
COACH420_info_results = parallelize_dataframe(COACH420_df, get_pocket_chain_info_bulk, 10)

In [127]:
COACH420_info_results = pd.concat(COACH420_info_results)

In [128]:
COACH420_df["Chain"] = COACH420_info_results.map(lambda a: a[0] if a is not None else None)
COACH420_df["PDB_seqs"] = COACH420_info_results.map(lambda a: a[1] if a is not None else None)
COACH420_df["Residue_index"] = COACH420_info_results.map(lambda a: a[2] if a is not None else None)
COACH420_df["Pocket_chain"] = COACH420_info_results.map(lambda a: a[3] if a is not None else None)

In [129]:
COACH420_df = COACH420_df.dropna(axis = 0).reset_index(drop=True)
COACH420_df

Unnamed: 0,PDB,Lig_code,COACH420_chain,OE_stereo_SMILES,RDKit_iso_SMILES,Total_Lig_info,Check_Lig_info,Chain,PDB_seqs,Residue_index,Pocket_chain
0,1a2k,GDP,C,c1nc2c(n1[C@H]3[C@@H]([C@@H]([C@H](O3)CO[P@](=...,NC1=NC2=C(N=CN2[C@@H]2O[C@H](CO[P@](=O)(O)OP(=...,"GDP_C_220,GDP_D_220,GDP_E_220","GDP_C_220,GDP_D_220,GDP_E_220","A,B,C,D,E",KPIWEQIGSSFIQHYYQLFDNDRTQLGAIYIDASCLTWEGQQFQGK...,"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,2...","B,C;D,A;E"
1,1a4k,FRA,H,CC(=O)Nc1ccc(cc1)N2C(=O)[C@@H]3C4CCC([C@@H]3C2...,CC(=O)NC1=CC=C(N2C(=O)[C@@H]3C4CCC(NC(=O)OCC(=...,"FRA_H_3083,FRA_B_3083","FRA_H_3083,FRA_B_3083","L,H,A,B",ELVMTQTPLSLPVSLGDQASISCRSSQSLLHSNGNTYLHWYLQKPG...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","H,L;B,A"
2,1a7x,FKA,A,C[C@@H]1C[C@@H]([C@@H]2[C@H](C[C@H]([C@@](O2)(...,CO[C@H]1C[C@@H](C)C/C(C)=C/[C@@H](CCOC(=O)NCC2...,FKA_B_201,FKA_B_201,"A,B",GVQVETISPGDGRTFPKRGQTCVVHYTGMLEDGKKFDSSRDRNKPF...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","B,A"
3,1afk,PAP,A,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,NC1=C2N=CN([C@@H]3O[C@H](CO[P@@](=O)(O)OP(=O)(...,"PAP_A_125,PAP_B_125","PAP_A_125,PAP_B_125","A,B",KETAAAKFERQHMDSSTSAASSSNYCNQMMKSRNLTKDRCKPVNTF...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A;B
4,1atl,0QI,A,CC(C)C[C@H](CS)C(=O)N[C@@H](Cc1ccc(cc1)OC)C(=O)O,COC1=CC=C(C[C@H](NC(=O)[C@@H](CS)CC(C)C)C(=O)O...,"0QI_A_301,0QI_B_311","0QI_A_301,0QI_B_311","A,B",LPQRYIELVVVADHRVFMKYNSDLNTIRTRVHEIVNFINGFYRSLN...,"3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20...",A;B
...,...,...,...,...,...,...,...,...,...,...,...
344,7dfr,FOL,A,c1cc(ccc1C(=O)N[C@@H](CCC(=O)O)C(=O)O)NCc2cnc3...,NC1=NC(=O)C2=NC(CNC3=CC=C(C(=O)N[C@@H](CCC(=O)...,FOL_A_161,FOL_A_161,A,MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLDKPVIMGRHT...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A
345,7dfr,NAP,A,c1cc(c[n+](c1)[C@H]2[C@@H]([C@@H]([C@H](O2)CO[...,NC(=O)C1=C[N+]([C@@H]2O[C@H](CO[P@@](=O)([O-])...,NAP_A_164,NAP_A_164,A,MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLDKPVIMGRHT...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A
346,7est,0Z2,E,CC(C)C[C@@H](C(=O)N[C@@H](C)C(=O)Nc1ccc(cc1)C(...,CC(C)C[C@H](NC(=O)C(F)(F)F)C(=O)N[C@@H](C)C(=O...,0Z2_E_1,0Z2_E_1,E,VVGGTEAQRNSWPSQISLQYRSWAHTCGGTLIRQNWVMTAAHCVDR...,"16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,3...",E
347,830c,RS1,A,c1cc(ccc1Oc2ccc(cc2)Cl)S(=O)(=O)CC3(CCOCC3)C(=...,O=C(NO)C1(CS(=O)(=O)C2=CC=C(OC3=CC=C(Cl)C=C3)C...,"RS1_A_1,RS1_B_1","RS1_A_1,RS1_B_1","A,B",YNVFPRTLKWSKMNLTYRIVNYTPDMTHSEVEKAFKKAFKVWSDVT...,"104,105,106,107,108,109,110,111,112,113,114,11...","A;B,A"


#### 3.10 HOLO4K dataset

In [130]:
path = "./data/PDB/structure/HOLO4K"

In [131]:
HOLO4K_info_results = parallelize_dataframe(HOLO4K_df, get_pocket_chain_info_bulk, 10)

In [132]:
HOLO4K_info_results = pd.concat(HOLO4K_info_results)

In [133]:
HOLO4K_df["Chain"] = HOLO4K_info_results.map(lambda a: a[0] if a is not None else None)
HOLO4K_df["PDB_seqs"] = HOLO4K_info_results.map(lambda a: a[1] if a is not None else None)
HOLO4K_df["Residue_index"] = HOLO4K_info_results.map(lambda a: a[2] if a is not None else None)
HOLO4K_df["Pocket_chain"] = HOLO4K_info_results.map(lambda a: a[3] if a is not None else None)

In [134]:
HOLO4K_df = HOLO4K_df.dropna(axis = 0).reset_index(drop=True)
HOLO4K_df

Unnamed: 0,PDB,Lig_code,OE_stereo_SMILES,RDKit_iso_SMILES,Total_Lig_info,Check_Lig_info,Chain,PDB_seqs,Residue_index,Pocket_chain
0,121p,GCP,c1nc2c(n1[C@H]3[C@@H]([C@@H]([C@H](O3)CO[P@](=...,NC1=NC2=C(N=CN2[C@@H]2O[C@H](CO[P@](=O)(O)O[P@...,GCP_A_167,GCP_A_167,A,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",A
1,12as,AMP,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,NC1=C2N=CN([C@@H]3O[C@H](COP(=O)(O)O)[C@@H](O)...,"AMP_A_332,AMP_B_332","AMP_A_332,AMP_B_332","A,B",AYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLS...,"4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,2...",A;B
2,13pk,3PG,C([C@H](C(=O)O)O)OP(=O)(O)O,O=C(O)[C@H](O)COP(=O)(O)O,"3PG_A_423,3PG_B_423","3PG_A_423,3PG_B_423","A,B,C,D",EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKV...,"5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,...",A;B
3,13pk,ADP,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,NC1=C2N=CN([C@@H]3O[C@H](CO[P@](=O)(O)OP(=O)(O...,"ADP_A_421,ADP_B_421,ADP_C_421,ADP_D_421","ADP_A_421,ADP_B_421,ADP_C_421,ADP_D_421","A,B,C,D",EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKV...,"5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,...",A;B;C;D
4,16pk,BIS,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,NC1=C2N=CN([C@@H]3O[C@H](CO[P@@](=O)(O)O[P@](=...,BIS_A_499,BIS_A_499,A,EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKV...,"5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,...",A
...,...,...,...,...,...,...,...,...,...,...
4171,9gss,GTX,CCCCCCSC[C@@H](C(=O)NCC(=O)O)NC(=O)CC[C@@H](C(...,CCCCCCSC[C@H](NC(=O)CC[C@H]([NH3+])C(=O)O)C(=O...,"GTX_A_211,GTX_B_210","GTX_A_211,GTX_B_210","A,B",PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,"2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,...","B,A;B,A"
4172,9ldb,NAD,c1cc(c[n+](c1)[C@H]2[C@@H]([C@@H]([C@H](O2)CO[...,NC(=O)C1=C[N+]([C@@H]2O[C@H](CO[P@@](=O)([O-])...,"NAD_A_401,NAD_B_401","NAD_A_401,NAD_B_401","A,B",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,19,2...","B,A;B,A"
4173,9ldb,OXM,C(=O)(C(=O)O)N,NC(=O)C(=O)O,OXM_B_402,OXM_B_402,"A,B",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,19,2...","B,A"
4174,9ldt,NAD,c1cc(c[n+](c1)[C@H]2[C@@H]([C@@H]([C@H](O2)CO[...,NC(=O)C1=C[N+]([C@@H]2O[C@H](CO[P@@](=O)([O-])...,"NAD_A_401,NAD_B_401","NAD_A_401,NAD_B_401","A,B",ATLKDQLIHNLLKEEHVPHNKITVVGVGAVGMACAISILMKELADE...,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,19,2...","B,A;B,A"


In [135]:
PDBbind_df = PDBbind_df[["PDB", "Lig_code", "Labels", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit", "Total_Lig_info", "Check_Lig_info"]]
CASF2016_df = CASF2016_df[["PDB", "Lig_code", "Labels", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit", "Total_Lig_info", "Check_Lig_info"]]
CASF2013_df = CASF2013_df[["PDB", "Lig_code", "Labels", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit", "Total_Lig_info", "Check_Lig_info"]]
CSAR2014_df = CSAR2014_df[["PDB", "Lig_code", "Labels", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit", "Total_Lig_info", "Check_Lig_info"]]
CSAR2012_df = CSAR2012_df[["PDB", "Lig_code", "Labels", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit", "Total_Lig_info", "Check_Lig_info"]]
CSARset1_df = CSARset1_df[["PDB", "Lig_code", "Labels", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Total_Lig_info", "Check_Lig_info"]]
CSARset2_df = CSARset2_df[["PDB", "Lig_code", "Labels", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Total_Lig_info", "Check_Lig_info"]]
Astex_df = Astex_df[["PDB", "Lig_code", "Labels", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit", "Total_Lig_info", "Check_Lig_info"]]
COACH420_df = COACH420_df[["PDB", "Lig_code", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Total_Lig_info", "Check_Lig_info", "COACH420_chain"]]
HOLO4K_df = HOLO4K_df[["PDB", "Lig_code", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Total_Lig_info", "Check_Lig_info"]]

In [136]:
PDBbind_df.to_csv("./preprocessed_data/step3_PDBbind_data.tsv", sep = "\t", index = False)
CASF2016_df.to_csv("./preprocessed_data/step3_CASF2016_data.tsv", sep = "\t", index = False)
CASF2013_df.to_csv("./preprocessed_data/step3_CASF2013_data.tsv", sep = "\t", index = False)
CSAR2014_df.to_csv("./preprocessed_data/step3_CSAR2014_data.tsv", sep = "\t", index = False)
CSAR2012_df.to_csv("./preprocessed_data/step3_CSAR2012_data.tsv", sep = "\t", index = False)
CSARset1_df.to_csv("./preprocessed_data/step3_CSARset1_data.tsv", sep = "\t", index = False)
CSARset2_df.to_csv("./preprocessed_data/step3_CSARset2_data.tsv", sep = "\t", index = False)
Astex_df.to_csv("./preprocessed_data/step3_Astex_data.tsv", sep = "\t", index = False)
COACH420_df.to_csv("./preprocessed_data/step3_COACH420_data.tsv", sep = "\t", index = False)
HOLO4K_df.to_csv("./preprocessed_data/step3_HOLO4K_data.tsv", sep = "\t", index = False)

### 4. Add uniprot sequences

In [137]:
def preprocessing_PDBSWS(lines):
    data = dict()
    
    for line in lines:
        line_list = line.strip().split(" ")
        if len(line_list) == 3:
            data[f"{line_list[0]}_{line_list[1]}"] = line_list[-1]
            
    return data

In [138]:
def preprocessing_SIFTS(lines):
    data = dict()
    
    for line in lines[2:]:
        line_list = line.strip().split("\t")
        data[f"{line_list[0]}_{line_list[1]}"] = line_list[2]
    return data

In [139]:
# This file is form SIFTS (https://www.ebi.ac.uk/pdbe/docs/sifts/quick.html).
SIFTS_mapping_table = preprocessing_SIFTS(read_file(open("./data/supplementary/SIFTS_chain_mapping_table.txt", "r")))
len(SIFTS_mapping_table)

593192

In [140]:
# This file is from PDBSWS (http://bioinf.org.uk/servers/pdbsws/).
PDBSWS_mapping_table = preprocessing_PDBSWS(read_file(open("./data/supplementary/PDBSWS_chain_mapping_table.txt", "r")))
len(PDBSWS_mapping_table)

354753

In [141]:
def get_uniprot_id(df):
    uniprot_ids, fail_mapping = list(), 0
    PDBIDs, chain_indexes = df.PDB.values, df.Chain.values
    
    for pdbid, chain_index in zip(PDBIDs, chain_indexes):
        chain_index_list, mapping_uniprot_list = chain_index.split(","), list()
        
        for chain in chain_index_list:
            name = f"{pdbid}_{chain}"
            name = f"{pdbid}_{chain}"
            if name in SIFTS_mapping_table:
                mapping_uniprot_list.append(SIFTS_mapping_table[name])
            
            elif name in PDBSWS_mapping_table:
                mapping_uniprot_list.append(PDBSWS_mapping_table[name])
            
            else:
                mapping_uniprot_list.append('None')
                fail_mapping += 1
                
        uniprot_ids.append(",".join(mapping_uniprot_list))
        
    print(f"\tMapping Fail: {fail_mapping}")
    return uniprot_ids

def get_unique_uniprot_ids(total_uniprot):
    total_list = list()
    
    for uniprots in total_uniprot:
        for uni in uniprots.split(","):
            if uni != 'None':
                total_list.append(uni)
    
    print(f"Total Uniprot IDs: {len(set(total_list))}")
    print()

In [142]:
print(f"[PDBbind] Get Uniprot IDs")
PDBbind_uniprot_ids = get_uniprot_id(PDBbind_df)
get_unique_uniprot_ids(PDBbind_uniprot_ids)

print(f"[CASF2016] Get Uniprot IDs")
CASF2016_uniprot_ids = get_uniprot_id(CASF2016_df)
get_unique_uniprot_ids(CASF2016_uniprot_ids)

print(f"[CASF2013] Get Uniprot IDs")
CASF2013_uniprot_ids = get_uniprot_id(CASF2013_df)
get_unique_uniprot_ids(CASF2013_uniprot_ids)

print(f"[CSAR2014] Get Uniprot IDs")
CSAR2014_uniprot_ids = get_uniprot_id(CSAR2014_df)
get_unique_uniprot_ids(CSAR2014_uniprot_ids)

print(f"[CSAR2012] Get Uniprot IDs")
CSAR2012_uniprot_ids = get_uniprot_id(CSAR2012_df)
get_unique_uniprot_ids(CSAR2012_uniprot_ids)

print(f"[CSARset1] Get Uniprot IDs")
CSARset1_uniprot_ids = get_uniprot_id(CSARset1_df)
get_unique_uniprot_ids(CSARset1_uniprot_ids)

print(f"[CSARset2] Get Uniprot IDs")
CSARset2_uniprot_ids = get_uniprot_id(CSARset2_df)
get_unique_uniprot_ids(CSARset2_uniprot_ids)

print(f"[Astex] Get Uniprot IDs")
Astex_uniprot_ids = get_uniprot_id(Astex_df)
get_unique_uniprot_ids(Astex_uniprot_ids)

print(f"[COACH420] Get Uniprot IDs")
COACH420_uniprot_ids = get_uniprot_id(COACH420_df)
get_unique_uniprot_ids(COACH420_uniprot_ids)

print(f"[HOLO4K] Get Uniprot IDs")
HOLO4K_uniprot_ids = get_uniprot_id(HOLO4K_df)
get_unique_uniprot_ids(HOLO4K_uniprot_ids)

[PDBbind] Get Uniprot IDs
	Mapping Fail: 436
Total Uniprot IDs: 3040

[CASF2016] Get Uniprot IDs
	Mapping Fail: 2
Total Uniprot IDs: 77

[CASF2013] Get Uniprot IDs
	Mapping Fail: 1
Total Uniprot IDs: 75

[CSAR2014] Get Uniprot IDs
	Mapping Fail: 0
Total Uniprot IDs: 3

[CSAR2012] Get Uniprot IDs
	Mapping Fail: 0
Total Uniprot IDs: 8

[CSARset1] Get Uniprot IDs
	Mapping Fail: 3
Total Uniprot IDs: 116

[CSARset2] Get Uniprot IDs
	Mapping Fail: 8
Total Uniprot IDs: 104

[Astex] Get Uniprot IDs
	Mapping Fail: 0
Total Uniprot IDs: 79

[COACH420] Get Uniprot IDs
	Mapping Fail: 19
Total Uniprot IDs: 263

[HOLO4K] Get Uniprot IDs
	Mapping Fail: 109
Total Uniprot IDs: 1228



In [143]:
PDBbind_df["Uniprot_IDs"] = PDBbind_uniprot_ids
CASF2016_df["Uniprot_IDs"] = CASF2016_uniprot_ids
CASF2013_df["Uniprot_IDs"] = CASF2013_uniprot_ids
CSAR2014_df["Uniprot_IDs"] = CSAR2014_uniprot_ids
CSAR2012_df["Uniprot_IDs"] = CSAR2012_uniprot_ids
CSARset1_df["Uniprot_IDs"] = CSARset1_uniprot_ids
CSARset2_df["Uniprot_IDs"] = CSARset2_uniprot_ids
Astex_df["Uniprot_IDs"] = Astex_uniprot_ids
COACH420_df["Uniprot_IDs"] = COACH420_uniprot_ids
HOLO4K_df["Uniprot_IDs"] = HOLO4K_uniprot_ids

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [144]:
def get_uniprot_ids(df_list):
    total_uniprot_ids = list()
    
    for df in df_list:
        uniprot_ids = df.Uniprot_IDs.values
        for i in uniprot_ids:
            for j in i.split(","):
                if j != 'None':
                    total_uniprot_ids.append(j)
    
    for uniprot_ids in list(set(total_uniprot_ids)):
        print(uniprot_ids, end = ",")
    
    return set(total_uniprot_ids)

In [145]:
total_uniprot_ids = get_uniprot_ids([PDBbind_df, CASF2016_df, CASF2013_df, CSAR2014_df, CSAR2012_df, CSARset1_df, CSARset2_df, Astex_df, COACH420_df, HOLO4K_df])

Q8SR45,Q944H0,Q96T66,Q93009,P08179,O43741,P36683,P16446,A0A140NFI1,Q6XT21,Q8WTS6,P06737,P09874,P01011,P0A7Z4,P07332,A0A0H3JRN6,P78540,Q26000,P37353,O94925,P10868,P62942,Q9NYL2,P16469,G8UY02,P19493,P39621,P22498,P61006,O04197,O09460,P01869,Q16658,Q969R5,P9WPA7,Q7BJM5,A5MTN0,P48650,P04391,P47227,P71278,Q47592,Q29437,F0UY39,Q836J0,P81186,Q9AJS0,P07901,P9WJM7,Q8TBN0,Q9FCE4,Q61166,Q43088,P35558,P41021,P83686,P20581,P0ACT8,P40763,P01801,Q00441,P56965,Q91754,A0A5B9,Q93UV0,P55859,B1MDI3,P0DTH5,G3MM57,Q9GPQ4,Q6P179,G3HK48,P22188,K4CN56,Q3J1R2,Q60187,P29274,Q03111,Q8RPA0,P15555,P01375,P07140,O43175,Q9X4K7,Q93EK7,O02372,P0AE78,Q38CA1,Q8ZKF6,Q8U3I1,P25451,P05090,P77918,P26639,P24182,P52756,Q6BB74,Q10587,Q57573,Q54331,P11562,P0AEK4,Q05599,P13726,Q9Q288,Q4QDU3,P56868,P24173,Q9BW91,B9A0T7,Q9Y478,Q5HGZ3,P39593,Q08881,O92972,E0PTS8,O07347,P39748,P9WGY7,P49789,Q9WY43,P9WK17,A0R618,Q2N0S5,P31550,Q10471,P04626,P0C1S8,P29476,P14775,Q9Y530,Q6JP77,O14757,Q9NZQ7,Q7SID3,I6Y9J2,O06842,P69834,A6QHR2,P60709,A0A2H

In [146]:
len(total_uniprot_ids)

4053

### 5. Add uniprot seqs

In [147]:
def preprocessing_fasta(lines):
    
    data, seqs = dict(), ""
    
    for line in lines:
        line = line.strip()
        
        if ">" in line:
            if len(seqs) == 0:
                uniprot_id = line.split("|")[1]
            else:
                if uniprot_id not in data:
                    data[uniprot_id] = seqs
                seqs = ""
                uniprot_id = line.split("|")[1]
        else:
            seqs += line
    
    data[uniprot_id] = seqs
    
    return data

In [148]:
'''
This file is from Uniprot retrieve service (https://www.uniprot.org/id-mapping)
- Mapping 4059 Uniprot IDs.
'''
uniprot_fasta_lines = read_file(open("./data/supplementary/Uniprot_mapping_results.fasta", "r"))

In [149]:
uniprot_fasta_dict = preprocessing_fasta(uniprot_fasta_lines)
print(f"Uniprot fasta: {len(uniprot_fasta_dict)}")

Uniprot fasta: 4011


In [150]:
def mapping_to_uniprot_seqs(df, uniprot_fasta_dict):
    total_uniprot_results = list()
    total_uniprot_list = df.Uniprot_IDs.values
    
    for uniprot_id in total_uniprot_list:
        tmp = list()
        for i in uniprot_id.split(","):
            if i in uniprot_fasta_dict:
                tmp.append(uniprot_fasta_dict[i])
            else:
                tmp.append('None')
        total_uniprot_results.append(",".join(tmp))
       
    return total_uniprot_results

In [151]:
PDBbind_uniprot_seqs = mapping_to_uniprot_seqs(PDBbind_df, uniprot_fasta_dict)
CASF2016_uniprot_seqs = mapping_to_uniprot_seqs(CASF2016_df, uniprot_fasta_dict)
CASF2013_uniprot_seqs = mapping_to_uniprot_seqs(CASF2013_df, uniprot_fasta_dict)
CSAR2014_uniprot_useqs = mapping_to_uniprot_seqs(CSAR2014_df, uniprot_fasta_dict)
CSAR2012_uniprot_seqs = mapping_to_uniprot_seqs(CSAR2012_df, uniprot_fasta_dict)
CSARset1_uniprot_seqs = mapping_to_uniprot_seqs(CSARset1_df, uniprot_fasta_dict)
CSARset2_uniprot_seqs = mapping_to_uniprot_seqs(CSARset2_df, uniprot_fasta_dict)
Astex_uniprot_seqs = mapping_to_uniprot_seqs(Astex_df, uniprot_fasta_dict)
COACH420_uniprot_seqs = mapping_to_uniprot_seqs(COACH420_df, uniprot_fasta_dict)
HOLO4K_uniprot_seqs = mapping_to_uniprot_seqs(HOLO4K_df, uniprot_fasta_dict)

In [152]:
PDBbind_df["Uniprot_seqs"] = PDBbind_uniprot_seqs

CASF2016_df["Uniprot_seqs"] = CASF2016_uniprot_seqs

CASF2013_df["Uniprot_seqs"] = CASF2013_uniprot_seqs

CSAR2014_df["Uniprot_seqs"] = CSAR2014_uniprot_useqs

CSAR2012_df["Uniprot_seqs"] = CSAR2012_uniprot_seqs

CSARset1_df["Uniprot_seqs"] = CSARset1_uniprot_seqs

CSARset2_df["Uniprot_seqs"] = CSARset2_uniprot_seqs

Astex_df["Uniprot_seqs"] = Astex_uniprot_seqs

COACH420_df["Uniprot_seqs"] = COACH420_uniprot_seqs

HOLO4K_df["Uniprot_seqs"] = HOLO4K_uniprot_seqs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

### 6. Extract unique pocket chain protein

In [153]:
def check_pocket_chain(df):
    check_index = list()
    pdb, chain_list, pocket_list, uniprot_seqs_list = df.PDB.values, df.Chain.values, df.Pocket_chain.values, df.Uniprot_seqs.values
    
    def make_dict(chain, seqs):
        results = dict()
        
        for c, s in zip(chain, seqs):
            results[c] = s
        
        return results
    
    for p, chain_info, pocket_info, uniprot_seqs in zip(pdb, chain_list, pocket_list, uniprot_seqs_list):

        chain_seqs_dict = make_dict(chain_info.split(","), uniprot_seqs.split(","))
        flag, tmp_list = 0, list()

        if len("".join(pocket_info.split(";")))!= 0:

            for pockets in pocket_info.split(";"):
                for pocket in pockets.split(","):
                    seqs = chain_seqs_dict[pocket]

                    if seqs == "None":
                        flag = 1

                    else:
                        tmp_list.append(seqs)

            if flag == 0:
                if len(set(tmp_list)) == 1:
                    check_index.append(True)
                else:
                    check_index.append(False)

            elif flag == 1:
                check_index.append(False)
        else:
            check_index.append(False)
            
    return check_index

In [154]:
PDBbind_df = PDBbind_df.loc[check_pocket_chain(PDBbind_df), :]
print(f"[PDBbind]: {len(PDBbind_df)} complexes")

CASF2016_df = CASF2016_df.loc[check_pocket_chain(CASF2016_df), :]
print(f"[CASF2016]: {len(CASF2016_df)} complexes")

CASF2013_df = CASF2013_df.loc[check_pocket_chain(CASF2013_df), :]
print(f"[CASF2013]: {len(CASF2013_df)} complexes")

CSAR2014_df = CSAR2014_df.loc[check_pocket_chain(CSAR2014_df), :]
print(f"[CSAR2014]: {len(CSAR2014_df)} complexes")

CSAR2012_df = CSAR2012_df.loc[check_pocket_chain(CSAR2012_df), :]
print(f"[CSAR2012]: {len(CSAR2012_df)} complexes")

CSARset1_df = CSARset1_df.loc[check_pocket_chain(CSARset1_df), :]
print(f"[CSARset1]: {len(CSARset1_df)} complexes")

CSARset2_df = CSARset2_df.loc[check_pocket_chain(CSARset2_df), :]
print(f"[CSARset2]: {len(CSARset2_df)} complexes")

Astex_df = Astex_df.loc[check_pocket_chain(Astex_df), :]
print(f"[Astex]: {len(Astex_df)} complexes")

COACH420_pocket_df = COACH420_df.loc[check_pocket_chain(COACH420_df), :]
print(f"[COACH420]: {len(COACH420_pocket_df)} complexes")

HOLO4K_pocket_df = HOLO4K_df.loc[check_pocket_chain(HOLO4K_df), :]
print(f"[HOLO4K]: {len(HOLO4K_pocket_df)} complexes")

[PDBbind]: 13716 complexes
[CASF2016]: 269 complexes
[CASF2013]: 172 complexes
[CSAR2014]: 46 complexes
[CSAR2012]: 55 complexes
[CSARset1]: 152 complexes
[CSARset2]: 135 complexes
[Astex]: 74 complexes
[COACH420]: 325 complexes
[HOLO4K]: 3958 complexes


### 7. Save data

In [155]:
PDBbind_df = PDBbind_df[["PDB", "Lig_code", "Labels", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "Uniprot_IDs", "Uniprot_seqs", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit", "Total_Lig_info", "Check_Lig_info"]]
CASF2016_df = CASF2016_df[["PDB", "Lig_code", "Labels", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "Uniprot_IDs", "Uniprot_seqs", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit", "Total_Lig_info", "Check_Lig_info"]]
CASF2013_df = CASF2013_df[["PDB", "Lig_code", "Labels", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "Uniprot_IDs", "Uniprot_seqs", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit", "Total_Lig_info", "Check_Lig_info"]]
CSAR2014_df = CSAR2014_df[["PDB", "Lig_code", "Labels", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "Uniprot_IDs", "Uniprot_seqs", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit", "Total_Lig_info", "Check_Lig_info"]]
CSAR2012_df = CSAR2012_df[["PDB", "Lig_code", "Labels", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "Uniprot_IDs", "Uniprot_seqs",  "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit", "Total_Lig_info", "Check_Lig_info"]]
CSARset1_df = CSARset1_df[["PDB", "Lig_code", "Labels", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "Uniprot_IDs", "Uniprot_seqs", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Total_Lig_info", "Check_Lig_info"]]
CSARset2_df = CSARset2_df[["PDB", "Lig_code", "Labels", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index",  "Uniprot_IDs", "Uniprot_seqs", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Total_Lig_info", "Check_Lig_info"]]
Astex_df = Astex_df[["PDB", "Lig_code", "Labels", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "Uniprot_IDs",  "Uniprot_seqs", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit", "Total_Lig_info", "Check_Lig_info"]]
COACH420_pocket_df = COACH420_pocket_df[["PDB", "Lig_code", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index", "Uniprot_IDs", "Uniprot_seqs", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Total_Lig_info", "Check_Lig_info", "COACH420_chain"]]
HOLO4K_pocket_df = HOLO4K_pocket_df[["PDB", "Lig_code", "Chain", "Pocket_chain", "PDB_seqs", "Residue_index","Uniprot_IDs", "Uniprot_seqs","OE_stereo_SMILES", "RDKit_iso_SMILES", "Total_Lig_info", "Check_Lig_info"]]

In [156]:
PDBbind_df.to_csv("./preprocessed_data/step4_PDBbind_data.tsv", sep = "\t", index = False)
CASF2016_df.to_csv("./preprocessed_data/step4_CASF2016_data.tsv", sep = "\t", index = False)
CASF2013_df.to_csv("./preprocessed_data/step4_CASF2013_data.tsv", sep = "\t", index = False)
CSAR2014_df.to_csv("./preprocessed_data/step4_CSAR2014_data.tsv", sep = "\t", index = False)
CSAR2012_df.to_csv("./preprocessed_data/step4_CSAR2012_data.tsv", sep = "\t", index = False)
CSARset1_df.to_csv("./preprocessed_data/step4_CSARset1_data.tsv", sep = "\t", index = False)
CSARset2_df.to_csv("./preprocessed_data/step4_CSARset2_data.tsv", sep = "\t", index = False)
Astex_df.to_csv("./preprocessed_data/step4_Astex_data.tsv", sep = "\t", index = False)
COACH420_pocket_df.to_csv("./preprocessed_data/step4_COACH420_data.tsv", sep = "\t", index = False)
HOLO4K_pocket_df.to_csv("./preprocessed_data/step4_HOLO4K_data.tsv", sep = "\t", index = False)