In [1]:
import os
import numpy as np
import pandas as pd
from biopandas.mol2 import PandasMol2
from multiprocessing import Process, Queue, Pool
from rdkit.Chem.rdmolfiles import MolFromSmiles, MolToSmiles

### 1. Load SMILES info

In [2]:
def read_file(file):
    return file.readlines()

def preprocessing_PDB_ligand(lines, sup_dict):

    results = dict()
    
    for line in lines:
        line_list = line.split("\t")
        
        if line_list[1] in sup_dict:
            results[line_list[1]] = sup_dict[line_list[1]]
            
        elif line_list[0] == "":
            continue
        
        else:
            results[line_list[1]] = line_list[0]

    return results

In [3]:
SMILES_stereo_correct = {"0IU":"C[C@@H](C1=CC=CC=C1)N(CC(=O)N(C)C)C(=O)C[C@@H](CC2=CSC(=N2)N)C(=O)N[C@@H](CC3CCCCC3)[C@H]([C@H](CC(C)C)O)O", 
                     "0QN":"C[C@@H](C(=O)NC1=CC=C(C=C1)C(F)(F)F)NC(=O)[C@H](C(C)C)NC(=O)C(F)(F)F", 
                     "2Z3":"CCCC[C@@H](C(=O)N[C@@H](CC1CCCCC1)C(C(C(=O)NC)(F)F)(O)O)NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)N3CCOCC3", 
                     "0DB":"C1CCC(C1)(C[C@@H](CCC2=CC=CC=C2)C(=O)O)C(=O)N[C@@H](CC3=CNC4=CC=CC=C43)C(=O)O", 
                     "0EM":"CC[C@H](C)CNC(=O)CP(=O)([C@H](CC(C)C)NC(=O)[C@H](CC1=CN=CN1)NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)OC(C)(C)C)O", 
                     "0DY":"CC(C)C[C@@H](C(=O)N[C@@H](CC(=O)N)C(=O)NCC1=CC(=CC=C1)N)C(=O)NO", 
                     "0QB":"C[C@@H](C1=CC=CC=C1)N(CC(=O)N(C)C)C(=O)C[C@@H](CC2=CSC(=N2)N)C(=O)N[C@@H](CC3CCCCC3)[C@H]([C@@H]4CC(N(C4=O)C)(C)C)O", 
                     "0ED":"CC(C)C[C@@H](C(=O)N[C@@H](CC1=CC=CC=C1)C(=O)NCCN2CCOCC2)N[C@H](CCNC(=O)OCC3=CC=CC=C3)C(=O)O", 
                     "0QH":"CC(C)C[C@@H](C(=O)N[N@+](C)(CC1=CC=CC=C1)CC(=O)NC2=CC=C(C=C2)C(C)C)NC(=O)C(F)(F)F", 
                     "0Z0":"C[C@@H](C(=O)NC1=CC=C(C=C1)C(F)(F)F)NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)C(F)(F)F", 
                     "0ZB":"CC(C)C[C@@H](C(=O)N[C@@H](CC(=O)O)C(=O)NCC1=CC=CC=C1)C(=O)NO", 
                     "0D3":"C[C@@H](C(=O)NCC(=O)N)NC(=O)[C@H](CC1=CC=CC=C1)CS", 
                     "0ZC":"CC(C)C[C@@H](C(=O)N1CC2=C(C[C@H]1C(=O)O)C3=CC=CC=C3N2)NC(=O)C4=CC=CO4", 
                     "01S":"C[C@@H](C(=O)NCC(=O)N)NC(=O)[C@H](CC(C)C)C(=O)NO", 
                     "0E9":"CC(C)[C@@H](C(=O)N[C@@H](CC1=CC=CC=C1)C([C@H](CC2=CC=CC=C2)NC(=O)[C@H](C(C)C)NC(=O)OCC3=CC=CC=C3)O)NC(=O)OCC4=CC=CC=C4",
                     "0PK":"C[C@@H](C(=O)O)NC(=O)[C@H](CC(C)C)NP(=O)([C@H](CC1=CC=CC=C1)NC(=O)OCC2=CC=CC=C2)O",
                     "0PJ":"CC(C)C[C@@H](C(=O)N[C@@H](CC(C)C)C(=O)O)NP(=O)(CNC(=O)OCC1=CC=CC=C1)O"
                    }      

In [4]:
# This file is from the Ligand expo database (http://ligand-expo.rcsb.org/ld-download.html).
PDB_ligand_isomeric_dict = preprocessing_PDB_ligand(read_file(open("./data/supplementary/Components-smiles-stereo-oe.smi", "r")), 
                                                  SMILES_stereo_correct)

In [5]:
print(f"[Ligand Isomeric] SMILES: {len(PDB_ligand_isomeric_dict)}")

[Ligand Isomeric] SMILES: 40464


### 2. Preprocessing datasets

In [6]:
def read_file(file):
    return file.readlines()

def get_ligand_code(line):
    return line.split(" ")[-1].rstrip(")").lstrip("(").strip("_")

def convert(value, unit):
    if unit == 'nM':
        log_value = -np.log10(value)+9
    elif unit == 'uM':
        log_value = -np.log10(value)+6
    elif unit == 'mM':
        log_value = -np.log10(value)+3
    elif unit == 'pM':
        log_value = -np.log10(value)+12
    elif unit == 'fM':
        log_value = -np.log10(value)+15
    else:
        print(unit)    
    
    return str(np.round(log_value,2))

def get_values(line):
    measure, value, unit = line.split('=')[0], line.split('=')[1][:-2], line.split('=')[1][-2:]
    return measure, value, unit, convert(float(value), unit)

In [7]:
def preprocessing_PDBbind(lines, ligand_stereo_dict, amino_acids_short):
    
    results = dict()

    correct_ligand_code = {"6hzp":"FVT", "6mli":"HIS", "6mmq":"CMP", "6mlp":"HIS", "6nfg":"KKP", 
                           "3adu":"MYI", "6imd":"AH9", "6jad":"GLC", "6jag":"SUC", "1czo":"FMN", 
                           "1d04":"FMN", "6hbm":"AHR", "6agp":"GDP", "6nfo":"KKM", "6hop":"FER", 
                           "1aku":"FMN", "6jki":"SFG", "6a73":"IHP", "6hmb":"3NG", "6mdb":"JE4", 
                           "6mhd":"JRD", "6jz0":"CKO", "6myn":"K6Y", "6ocu":"M5D", "6hub":"GRW", 
                           "5xff":"6LF", "5xfj":"6LF", "1akt":"FMN", "1akw":"FMN", "1azl":"FMN", 
                           "6n3n":"KA4", "1czk":"FMN", "1czr":"FMN", "1akr":"FMN", "1akq":"FMN",
                           "1czl":"FMN", "6moo":"A5F", "1c7f":"FMN", "1akv":"FMN", "1i7m":"CG",
                           "6cqt":"VX", "6cqz":"VX", "1xzx":"T3", "1a5v":"Y3", "1af2":"U", 
                           "1phw":"N", "2gfj":"VI", "1kll":"MC", "6o4x":"AA",
                            "5moq":"BEN", "1tyr":"9CR", "3t83":"MG5", "4elh":"53I", "5w0l":"9UD", 
                            "5mnh":"BEN", "1h00":"FCP", "4p0n":"1IR", "4a50":"DQ6", "5n9n":"KC5", 
                            "5lvx":"7E4", "1p28":"HBS", "1bap":"ARA", "5x72":"P59", "5mo0":"BEN", 
                            "3lk1":"JKE", "6eu6":"ZDM", "2cht":"TSA", "1apb":"FCA", "1oko":"GAL", 
                            "1f5k":"BEN", "7abp":"FCA", "5g6u":"SGN", "5ijj":"IHP", "4q1e":"2Y7", 
                            "5mng":"BEN", "3rv6":"RVA", "5mw3":"5JJ", "5wkm":"N02", "1c5p":"BEN", 
                            "1abf":"FCA", "6qs5":"JGB", "2srt":"8MI", "9abp":"GLA", "4d1d":"5ND", 
                            "4elg":"52I", "1c5o":"BEN", "1h07":"MFP", "3rv8":"RVC", "6eiz":"FOF", 
                            "1c5z":"BEN", "4qhp":"32Q", "5ijp":"IHP", "6abp":"ARA", "5abp":"GLA", 
                            "1ws5":"MMA", "5wkl":"AVY", "8abp":"GLA", "5t52":"A2G", "3uwl":"FFO",
                            "2qtr":"DND", "2x4z":"7KC", "1rdn":"NDG", "1szm":"BI4", "2qwe":"ZMR", 
                            "3dla":"DND", "4elb":"34R"}   
    for line in lines[6:]:
        
        line_list, ligand_code = line.strip().split("//")[0].strip().split('  '), get_ligand_code(line.strip().split("//")[-1].strip())

        if line_list[0] in correct_ligand_code:
            ligand_code = correct_ligand_code[line_list[0]]

        if 'mer' in ligand_code:
            continue

        elif '<' in line_list[4] or '>' in line_list[4] or '~' in line_list[4]:
            continue

        elif ligand_code not in ligand_stereo_dict:
            continue

        elif ligand_code in amino_acids_short:
            continue

        else:
            measure, value, unit, aff = get_values(line_list[4])
            results[line_list[0]] = [measure, value, unit, aff, line_list[3], ligand_code] 

    return results        

In [8]:
amino_acids_short = {"ALA":"A", "ARG":"R", "ASN":"N", "ASP":"D", "CYS":"C", "GLU":"E", "GLN":"Q", "GLY":"G", "HIS":"H", "ILE":"I", "LEU":"L", "LYS":"K", "MET":"M", "PHE":"F", "PRO":"P", "SER":"S", "THR":"T", "TRP":"W", "TYR":"Y", "VAL":"V", "SEC":"U", "PYL":"O"}

In [9]:
# This file is from the PDBbind database (http://www.pdbbind.org.cn/).
lines = read_file(open("./data/supplementary/INDEX_general_PL_data.2020", "r"))
PDBbind_labels_dict = preprocessing_PDBbind(lines, PDB_ligand_isomeric_dict, amino_acids_short)
print(f"PDBbind labels: {len(PDBbind_labels_dict)}")

PDBbind labels: 16148


#### 2.1 CSAR2014 and CSAR2012 dataset

In [10]:
#This file is from the CSAR database (www.csardock.org).
CSAR2014_info = pd.read_csv("./data/supplementary/CSAR2014_labels.csv", sep = ",")
CSAR2012_info = pd.read_csv("./data/supplementary/CSAR2012_labels.csv", sep = ",")

In [11]:
def make_df(df, ligand_stereo_dict, amino_acids_short):
    results = {"PDB":[], "Lig_code":[], "Labels":[], "OE_stereo_SMILES":[], "Measure":[], "Potency":[], "Unit":[]}
    
    for line in df.values:
        pdb, ligand_code, measure, potency, unit, label = line[2], line[3], line[5], line[6], line[7], line[8]  
        
        if ligand_code not in ligand_stereo_dict:
            continue
        
        elif ligand_code in amino_acids_short:
            continue
        
        else:
            results["PDB"].append(pdb.lower())
            results["Lig_code"].append(ligand_code)
            results["Labels"].append(label)
            results["OE_stereo_SMILES"].append(ligand_stereo_dict[ligand_code])
            results["Measure"].append(measure)
            results["Potency"].append(potency)
            results["Unit"].append(unit)
            
    return pd.DataFrame(results)    

In [12]:
CSAR2014_df = make_df(CSAR2014_info,  PDB_ligand_isomeric_dict, amino_acids_short)
CSAR2014_list = CSAR2014_df.PDB.values
print(f"CSAR2014 set: {len(CSAR2014_list)}")

CSAR2014 set: 46


In [13]:
CSAR2012_df = make_df(CSAR2012_info, PDB_ligand_isomeric_dict, amino_acids_short)
CSAR2012_list = CSAR2012_df.PDB.values
print(f"CSAR2012 set: {len(CSAR2012_list)}")

CSAR2012 set: 55


#### 2.2 CSARset1 and CSARset2 dataset

In [14]:
#This file is from the CSAR database (www.csardock.org). 
CSARset1_raw_info = read_file(open("./data/supplementary/set1.csv", "r"))
CSARset2_raw_info = read_file(open("./data/supplementary/set2.csv", "r"))

In [15]:
def preprocessing_HiQ(lines):
    data = dict()
    
    for idx, val in enumerate(lines[1:]):
        line_list = val.strip().split(",")
        data[line_list[1].strip()] = [line_list[0].strip(), line_list[2].strip(), line_list[3].strip()]
    return data

In [16]:
CSARset1_dict = preprocessing_HiQ(CSARset1_raw_info)
CSARset2_dict = preprocessing_HiQ(CSARset2_raw_info)

In [17]:
def make_df(info_dict, ligand_stereo_dict, amino_acids_short):
    results = {"PDB":[], "Lig_code":[], "Labels":[], "OE_stereo_SMILES":[]}
    
    for pdbid in list(info_dict.keys()):
        label, ligand_code = info_dict[pdbid][1], info_dict[pdbid][2] 
        
        if ligand_code not in ligand_stereo_dict:
            continue
            
        elif ligand_code in amino_acids_short:
            continue

        else:
            results["PDB"].append(pdbid)
            results["Lig_code"].append(ligand_code)
            results["Labels"].append(label)
            results["OE_stereo_SMILES"].append(ligand_stereo_dict[ligand_code])            

    return pd.DataFrame(results)   

In [18]:
CSARset1_df = make_df(CSARset1_dict, PDB_ligand_isomeric_dict, amino_acids_short)
CSARset1_list = CSARset1_df.PDB.values
print(f"CSARset1 set: {len(CSARset1_list)}")

CSARset1 set: 161


In [19]:
CSARset2_df = make_df(CSARset2_dict, PDB_ligand_isomeric_dict, amino_acids_short)
CSARset2_list = CSARset2_df.PDB.values
print(f"CSARset2 set: {len(CSARset2_list)}")

CSARset2 set: 149


#### 2.3 Astex dataset

In [20]:
# This file is from the original paper (https://doi.org/10.1021/jm061277y) and we have modified the data by cross-check it with the PDB database.
Astex_info = pd.read_csv("./data/supplementary/Astex_labels.csv", sep = "\t")

In [21]:
def make_df(df, ligand_stereo_dict, amino_acids_short):
    results = {"PDB":[], "Lig_code":[], "Labels":[], "OE_stereo_SMILES":[], "Measure":[], "Potency":[], "Unit":[]}
    
    for line in df.values:
        pdb, ligand_code, measure, potency, unit, label = line[0], line[3], line[4], line[5], line[6], line[8]

        if ligand_code not in ligand_stereo_dict:
            continue
        
        elif ligand_code in amino_acids_short:
            continue
            
        else:
            results["PDB"].append(pdb)
            results["Lig_code"].append(ligand_code)
            results["Labels"].append(label)
            results["OE_stereo_SMILES"].append(ligand_stereo_dict[ligand_code])
            results["Measure"].append(measure)
            results["Potency"].append(potency)
            results["Unit"].append(unit)
            
    return pd.DataFrame(results)

In [22]:
Astex_df = make_df(Astex_info, PDB_ligand_isomeric_dict, amino_acids_short)
Astex_list = Astex_df.PDB.values
print(f"Astex set: {len(Astex_list)}")

Astex set: 75


#### 2.4 COACH420 dataset

In [23]:
def load_ligand_info(path):
    
    results = dict()
    
    lines = open(path, "r").readlines()
    
    for line in lines:
        if "#" not in line:
            line = line.strip()
            if line != "":
                line_list = line.split("  ")
                pdb = line_list[0].split("/")[1][:-4]
                lig_list = line_list[1].split(",")

                tmp = list()
                for i in lig_list:
                    if i not in amino_acids_short:
                        tmp.append(i.strip())

                if len(tmp) != 0:
                    results[pdb] = tmp            
    
    return results

In [24]:
#This file is from the P2Rank github (https://github.com/rdk/p2rank).
COACH420_ligand_dict = load_ligand_info("./data/supplementary/COACH420_ligand_code.txt")

In [25]:
def make_df(input_dict, ligand_stereo_dict, amino_acids_short):
    results = {"PDB":[], "Lig_code":[], "COACH420_chain":[], "OE_stereo_SMILES":[]}
    
    for pdbid_chain in list(input_dict.keys()):
        pdbid, chain, ligand_code_list = pdbid_chain[:-1], pdbid_chain[-1:], input_dict[pdbid_chain]
        
        for lig_code in ligand_code_list:
            
            if f"{pdbid}_{lig_code}" not in ["2vu9_GAL",  # problem in graph preprocessing
                                            "1awf_TYS", "1c5o_BAM", "1c5p_BAM", "1ett_0ZG", "1f5k_BAM", "1i72_PUT", "1j37_MCO", "1qhr_TYS", "1qj1_TYS", "1qj6_TYS", "1qj7_TYS", "2qwe_GNA"]: 
                if lig_code not in ligand_stereo_dict:
                    continue

                elif lig_code in amino_acids_short:
                    continue            

                else:
                    results["PDB"].append(pdbid)
                    results["Lig_code"].append(lig_code)
                    results["COACH420_chain"].append(chain)
                    results["OE_stereo_SMILES"].append(ligand_stereo_dict[lig_code])

    return pd.DataFrame(results)    

In [26]:
COACH420_df = make_df(COACH420_ligand_dict, PDB_ligand_isomeric_dict, amino_acids_short)
COACH420_list = COACH420_df.PDB.values
print(f"COACH420 set: {len(COACH420_list)}")

COACH420 set: 367


#### 2.5 HOLO4K dataset

In [27]:
# This file is from the P2Rank github (https://github.com/rdk/p2rank).
HOLO4K_ligand_dict = load_ligand_info("./data/supplementary/HOLO4K_ligand_code.txt")

In [28]:
def make_df(input_dict, ligand_stereo_dict, amino_acids_short):
    results = {"PDB":[], "Lig_code":[], "OE_stereo_SMILES":[]}

    for pdbid in list(input_dict.keys()):
        ligand_code_list = input_dict[pdbid]    
        for lig_code in ligand_code_list:
            
            if f"{pdbid}_{lig_code}" not in ['1nx8_N7P', # problem in graph preprocessing
                                            "1awf_TYS", "1c5o_BAM", "1c5p_BAM", "1ett_0ZG", "1f5k_BAM", "1i72_PUT", "1j37_MCO", "1qhr_TYS", "1qj1_TYS", "1qj6_TYS", "1qj7_TYS", "2qwe_GNA"]:  
                if lig_code not in ligand_stereo_dict:
                    continue

                elif lig_code in amino_acids_short:
                    continue    

                else:
                    results["PDB"].append(pdbid)
                    results["Lig_code"].append(lig_code)
                    results["OE_stereo_SMILES"].append(ligand_stereo_dict[lig_code])

    return pd.DataFrame(results)                

In [29]:
HOLO4K_df = make_df(HOLO4K_ligand_dict, PDB_ligand_isomeric_dict, amino_acids_short)
HOLO4K_list = HOLO4K_df.PDB.values
print(f"HOLO4K set: {len(HOLO4K_list)}")

HOLO4K set: 4314


#### 2.6 CASF2016 and CASF2013 dataset

In [30]:
def make_df(labels_dict, complex_list, ligand_stereo_dict):
    results = {"PDB":[], "Lig_code":[], "Labels":[], "OE_stereo_SMILES":[], "Measure":[], "Potency":[], "Unit":[]}
    
    for pdbid in complex_list:
        
        if pdbid in labels_dict:
            data = labels_dict[pdbid]
            
            if f"{pdbid}_{data[-1]}" not in ['1gbt_GBS', '5g0q_IF6', '3rxj_GBS', '3rme_RME', '2gss_EAA']: # problem in graph preprocessing
                results["PDB"].append(pdbid)
                results["Lig_code"].append(data[-1])
                results["Labels"].append(data[-2])
                results["OE_stereo_SMILES"].append(ligand_stereo_dict[data[-1]])          
                results["Measure"].append(data[1])
                results["Potency"].append(data[2])
                results["Unit"].append(data[3])
        
    return pd.DataFrame(results)

In [31]:
CASF2016_list = [i.strip() for i in read_file(open("./data/supplementary/CASF2016_list.txt", "r"))]
CASF2016_df = make_df(PDBbind_labels_dict, CASF2016_list, PDB_ligand_isomeric_dict)
CASF2016_list = CASF2016_df.PDB.values
print(f"CASF2016 set: {len(CASF2016_list)}")

CASF2016 set: 279


In [32]:
CASF2013_list = [i.strip() for i in read_file(open("./data/supplementary/CASF2013_list.txt", "r"))]
CASF2013_df = make_df(PDBbind_labels_dict, CASF2013_list, PDB_ligand_isomeric_dict)
CASF2013_list = CASF2013_df.PDB.values
print(f"CASF2013 set: {len(CASF2013_list)}")

CASF2013 set: 180


#### 2.7 PDBbind dataset

In [33]:
# This file is from the PDBbind database (http://www.pdbbind.org.cn/).
general_set_except_refined_set_list = [i.strip() for i in read_file(open("./data/supplementary/PDBbind_2020_general_list.txt", "r"))]
refiend_set_list = [i.strip() for i in read_file(open("./data/supplementary/PDBbind_2020_refined_list.txt", "r"))]

In [34]:
total_set_list = list(set(general_set_except_refined_set_list) | set(refiend_set_list))
print(f"total set: {len(total_set_list)}")

total set: 19443


In [35]:
PDBbind_list = list(set(total_set_list) - set(CASF2016_list) - set(CASF2013_list) - set(CSAR2014_list) - set(CSAR2012_list) - set(CSARset1_list) - set(CSARset2_list) - set(Astex_list) - set(COACH420_list) - set(HOLO4K_list))
PDBbind_df = make_df(PDBbind_labels_dict, PDBbind_list, PDB_ligand_isomeric_dict)
print(f"PDBbind training set: {len(PDBbind_df)}")

PDBbind training set: 14725


### 3. Convert SMILES

#### 3.1  PDBbind dataset

In [36]:
def convert_rdkit_smiles(df, isomeric = False):
    
    results = list()

    for line in df.values:
        try:
            smiles = MolToSmiles(MolFromSmiles(line[3]),isomericSmiles = isomeric, kekuleSmiles = True)
            results.append(smiles)
        except:
            results.append(None)
    return results

In [37]:
rdkit_smiles_isomeric = convert_rdkit_smiles(PDBbind_df, isomeric = True)

[21:49:51] Explicit valence for atom # 24 N, 4, is greater than permitted
[21:49:51] Explicit valence for atom # 6 N, 4, is greater than permitted
[21:49:51] Explicit valence for atom # 25 N, 4, is greater than permitted
[21:49:51] Explicit valence for atom # 2 O, 3, is greater than permitted
[21:49:51] Explicit valence for atom # 10 N, 4, is greater than permitted
[21:49:51] Explicit valence for atom # 0 B, 4, is greater than permitted
[21:49:51] Explicit valence for atom # 10 N, 4, is greater than permitted
[21:49:51] Explicit valence for atom # 0 B, 5, is greater than permitted
[21:49:51] Explicit valence for atom # 28 N, 4, is greater than permitted
[21:49:51] Explicit valence for atom # 6 N, 4, is greater than permitted
[21:49:51] Explicit valence for atom # 0 B, 6, is greater than permitted
[21:49:52] Explicit valence for atom # 11 N, 4, is greater than permitted
[21:49:52] Explicit valence for atom # 3 N, 4, is greater than permitted
[21:49:52] Explicit valence for atom # 0 B, 6

In [38]:
PDBbind_df["RDKit_iso_SMILES"] = rdkit_smiles_isomeric

In [39]:
PDBbind_df = PDBbind_df.dropna(axis = 0).reset_index(drop=True)

In [40]:
print(f"[PDBbind] complexes: {len(PDBbind_df)}")

[PDBbind] complexes: 14673


#### 3.2  CASF2016 dataset

In [41]:
rdkit_smiles_isomeric = convert_rdkit_smiles(CASF2016_df, isomeric = True)

In [42]:
CASF2016_df["RDKit_iso_SMILES"] = rdkit_smiles_isomeric

In [43]:
CASF2016_df = CASF2016_df.dropna(axis = 0).reset_index(drop=True)

In [44]:
print(f"[CASF2016] complexes: {len(CASF2016_df)}")

[CASF2016] complexes: 279


#### 3.3  CASF2013 dataset

In [45]:
rdkit_smiles_isomeric = convert_rdkit_smiles(CASF2013_df, isomeric = True)

In [46]:
CASF2013_df["RDKit_iso_SMILES"] = rdkit_smiles_isomeric

In [47]:
CASF2013_df = CASF2013_df.dropna(axis = 0).reset_index(drop=True)

In [48]:
print(f"[CASF2013] complexes: {len(CASF2013_df)}")

[CASF2013] complexes: 180


#### 3.4  CSAR2014 dataset

In [49]:
rdkit_smiles_isomeric = convert_rdkit_smiles(CSAR2014_df, isomeric = True)

In [50]:
CSAR2014_df["RDKit_iso_SMILES"] = rdkit_smiles_isomeric

In [51]:
CSAR2014_df = CSAR2014_df.dropna(axis = 0).reset_index(drop=True)

In [52]:
print(f"[CSAR2014] complexes: {len(CSAR2014_df)}")

[CSAR2014] complexes: 46


#### 3.5  CSAR2012 dataset

In [53]:
rdkit_smiles_isomeric = convert_rdkit_smiles(CSAR2012_df, isomeric = True)

In [54]:
CSAR2012_df["RDKit_iso_SMILES"] = rdkit_smiles_isomeric

In [55]:
CSAR2012_df = CSAR2012_df.dropna(axis = 0).reset_index(drop=True)

In [56]:
print(f"[CSAR2012] complexes: {len(CSAR2012_df)}")

[CSAR2012] complexes: 55


#### 3.6  CSARset1 dataset

In [57]:
rdkit_smiles_isomeric = convert_rdkit_smiles(CSARset1_df, isomeric = True)

In [58]:
CSARset1_df["RDKit_iso_SMILES"] = rdkit_smiles_isomeric

In [59]:
CSARset1_df = CSARset1_df.dropna(axis = 0).reset_index(drop=True)

In [60]:
print(f"[CSARset1] complexes: {len(CSARset1_df)}")

[CSARset1] complexes: 161


#### 3.7  CSARset2 dataset

In [61]:
rdkit_smiles_isomeric = convert_rdkit_smiles(CSARset2_df, isomeric = True)

In [62]:
CSARset2_df["RDKit_iso_SMILES"] = rdkit_smiles_isomeric

In [63]:
CSARset2_df = CSARset2_df.dropna(axis = 0).reset_index(drop=True)

In [64]:
print(f"[CSARset2] complexes: {len(CSARset2_df)}")

[CSARset2] complexes: 149


#### 3.8  Astex dataset

In [65]:
rdkit_smiles_isomeric = convert_rdkit_smiles(Astex_df, isomeric = True)

In [66]:
Astex_df["RDKit_iso_SMILES"] = rdkit_smiles_isomeric

In [67]:
Astex_df = Astex_df.dropna(axis = 0).reset_index(drop=True)

In [68]:
print(f"[Astex] complexes: {len(Astex_df)}")

[Astex] complexes: 75


#### 3.9 COACH420 dataset

In [69]:
rdkit_smiles_isomeric = convert_rdkit_smiles(COACH420_df, isomeric = True)

In [70]:
COACH420_df["RDKit_iso_SMILES"] = rdkit_smiles_isomeric

In [71]:
COACH420_df = COACH420_df.dropna(axis = 0).reset_index(drop=True)

In [72]:
print(f"[COACH420] complexes: {len(COACH420_df)}")

[COACH420] complexes: 367


#### 3.10 HOLO4K dataset

In [73]:
def convert_rdkit_smiles(df, isomeric = False):
    
    results = list()

    for line in df.values:
        try:
            smiles = MolToSmiles(MolFromSmiles(line[2]),isomericSmiles = isomeric, kekuleSmiles = True)
            results.append(smiles)
        except:
            results.append(None)
    return results

In [74]:
rdkit_smiles_isomeric = convert_rdkit_smiles(HOLO4K_df, isomeric = True)

[21:50:00] Explicit valence for atom # 0 Be, 3, is greater than permitted
[21:50:00] Explicit valence for atom # 0 Be, 4, is greater than permitted
[21:50:00] Explicit valence for atom # 0 Be, 3, is greater than permitted
[21:50:00] Explicit valence for atom # 0 Be, 4, is greater than permitted
[21:50:00] Explicit valence for atom # 0 Be, 3, is greater than permitted
[21:50:00] Explicit valence for atom # 4 O, 3, is greater than permitted
[21:50:00] Explicit valence for atom # 3 O, 3, is greater than permitted
[21:50:00] Explicit valence for atom # 7 Cl, 5, is greater than permitted


In [75]:
HOLO4K_df["RDKit_iso_SMILES"] = rdkit_smiles_isomeric

In [76]:
HOLO4K_df = HOLO4K_df.dropna(axis = 0).reset_index(drop=True)

In [77]:
print(f"[HOLO4K] complexes: {len(HOLO4K_df)}")

[HOLO4K] complexes: 4306


### 4. Save data

In [78]:
PDBbind_df = PDBbind_df[["PDB", "Lig_code", "Labels", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit"]]
CASF2016_df = CASF2016_df[["PDB", "Lig_code", "Labels", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit"]]
CASF2013_df = CASF2013_df[["PDB", "Lig_code", "Labels", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit"]]
CSAR2014_df = CSAR2014_df[["PDB", "Lig_code", "Labels", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit"]]
CSAR2012_df = CSAR2012_df[["PDB", "Lig_code", "Labels", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit"]]
Astex_df = Astex_df[["PDB", "Lig_code", "Labels", "OE_stereo_SMILES", "RDKit_iso_SMILES", "Measure", "Potency", "Unit"]]

In [79]:
PDBbind_df.to_csv("./preprocessed_data/step1_PDBbind_data.tsv", sep = "\t", index = False)
CASF2016_df.to_csv("./preprocessed_data/step1_CASF2016_data.tsv", sep = "\t", index = False)
CASF2013_df.to_csv("./preprocessed_data/step1_CASF2013_data.tsv", sep = "\t", index = False)
CSAR2014_df.to_csv("./preprocessed_data/step1_CSAR2014_data.tsv", sep = "\t", index = False)
CSAR2012_df.to_csv("./preprocessed_data/step1_CSAR2012_data.tsv", sep = "\t", index = False)
CSARset1_df.to_csv("./preprocessed_data/step1_CSARset1_data.tsv", sep = "\t", index = False)
CSARset2_df.to_csv("./preprocessed_data/step1_CSARset2_data.tsv", sep = "\t", index = False)
Astex_df.to_csv("./preprocessed_data/step1_Astex_data.tsv", sep = "\t", index = False)
COACH420_df.to_csv("./preprocessed_data/step1_COACH420_data.tsv", sep = "\t", index = False)
HOLO4K_df.to_csv("./preprocessed_data/step1_HOLO4K_data.tsv", sep = "\t", index = False)

### 5. Download PDB and Ideal ligand structure

In [80]:
PDBbind_PDB, CASF2016_PDB, CASF2013_PDB, CSAR2014_PDB, CASR2012_PDB, CSARset1_PDB, CSARset2_PDB, Astex_PDB, COACH420_PDB, HOLO4K_PDB = PDBbind_df.iloc[:,0].values, CASF2016_df.iloc[:,0].values, CASF2013_df.iloc[:,0].values, CSAR2014_df.iloc[:,0].values, CSAR2012_df.iloc[:,0].values, CSARset1_df.iloc[:,0].values, CSARset2_df.iloc[:,0].values, Astex_df.iloc[:,0].values, COACH420_df.iloc[:,0].values, HOLO4K_df.iloc[:,0].values
PDBbind_LIG, CASF2016_LIG, CASF2013_LIG, CSAR2014_LIG, CASR2012_LIG, CSARset1_LIG, CSARset2_LIG, Astex_LIG, COACH420_LIG, HOLO4K_LIG = PDBbind_df.iloc[:,1].values, CASF2016_df.iloc[:,1].values, CASF2013_df.iloc[:,1].values, CSAR2014_df.iloc[:,1].values, CSAR2012_df.iloc[:,1].values, CSARset1_df.iloc[:,1].values, CSARset2_df.iloc[:,1].values, Astex_df.iloc[:,1].values, COACH420_df.iloc[:,1].values, HOLO4K_df.iloc[:,1].values

In [81]:
PDB_list = np.concatenate((PDBbind_PDB, CASF2016_PDB, CASF2013_PDB, CSAR2014_PDB, CASR2012_PDB, CSARset1_PDB, CSARset2_PDB, Astex_PDB, COACH420_PDB, HOLO4K_PDB))
LIG_list = np.concatenate((PDBbind_LIG, CASF2016_LIG, CASF2013_LIG, CSAR2014_LIG, CASR2012_LIG, CSARset1_LIG, CSARset2_LIG, Astex_LIG, COACH420_LIG, HOLO4K_LIG))

In [82]:
print(f"Unique PDB: {len(np.unique(PDB_list))}, Unique Ligand: {len(np.unique(LIG_list))}")

Unique PDB: 19013, Unique Ligand: 13352


In [83]:
PDB_df = pd.DataFrame({"PDB":[i for i in np.unique(PDB_list)]})
LIG_df = pd.DataFrame({"LIG":[i for i in np.unique(LIG_list)]})

In [84]:
PDB_df

Unnamed: 0,PDB
0,10gs
1,121p
2,12as
3,13gs
4,13pk
...,...
19008,9gss
19009,9hvp
19010,9icd
19011,9ldb


In [85]:
LIG_df

Unnamed: 0,LIG
0,001
1,002
2,003
3,006
4,007
...,...
13347,ZZO
13348,ZZP
13349,ZZQ
13350,ZZX


In [86]:
def parallelize_dataframe(df, func, num_partitions=5):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_partitions)
    results = pool.map(func, df_split)
    pool.close()
    pool.join()
    return results

In [87]:
def download_PDB_structure(pdb):
    if not os.path.isfile(f"./data/PDB/protein/{pdb}.pdb"):
        command = f"wget https://files.rcsb.org/download/{pdb}.pdb --directory-prefix={path}"
        os.system(command)

In [88]:
def get_pdb_info_bulk(df):
    return df.PDB.map(download_PDB_structure)

In [89]:
path = "./data/PDB/protein"

In [90]:
_ = parallelize_dataframe(PDB_df, get_pdb_info_bulk, 10)

--2024-09-04 21:50:04--  https://files.rcsb.org/download/10gs.pdb
--2024-09-04 21:50:04--  https://files.rcsb.org/download/182l.pdb
--2024-09-04 21:50:04--  https://files.rcsb.org/download/17gs.pdb
--2024-09-04 21:50:04--  https://files.rcsb.org/download/13pk.pdb
--2024-09-04 21:50:04--  https://files.rcsb.org/download/183l.pdb
--2024-09-04 21:50:04--  https://files.rcsb.org/download/16pk.pdb
Resolving files.rcsb.org (files.rcsb.org)... Resolving files.rcsb.org (files.rcsb.org)... Resolving files.rcsb.org (files.rcsb.org)... --2024-09-04 21:50:04--  https://files.rcsb.org/download/12as.pdb
Resolving files.rcsb.org (files.rcsb.org)... Resolving files.rcsb.org (files.rcsb.org)... --2024-09-04 21:50:04--  https://files.rcsb.org/download/121p.pdb
--2024-09-04 21:50:04--  https://files.rcsb.org/download/13gs.pdb
--2024-09-04 21:50:04--  https://files.rcsb.org/download/184l.pdb
Resolving files.rcsb.org (files.rcsb.org)... 128.6.159.100
128.6.159.100
Connecting to files.rcsb.org (files.rcsb.o

   400K .......... .......... .......... .......... ..........  263K
   450K .......... .......... .......... .......... .......... 12.3M
   500K .......... .......... .......... .......... .......... 12.2M
   550K .......... .......... .......... .......... .......... 10.0M
   600K .......... .......... .......... .......... .......... 12.2M
   650K .......... .......... .......... .......... .......... 9.91M
   700K .......... .......... .......... .......... .......... 11.9M
   750K .......... .......... .......... .......... .......... 12.0M
   800K .......... .......... .......... .......... ..........  291K
   850K .......... .......... .......... .......... .......... 12.1M
   900K .......... .......... .......... .......... .......... 12.2M
   950K .......... .......... .......... .......... .......... 10.1M
  1000K .......... .......... .......... .......... .......... 11.9M
  1050K .......... .......... .......... .......... .......... 10.2M
  1100K .......... .......... ....

In [91]:
def download_ligand_structure(lig):
    if not os.path.isfile(f"./data/PDB/ligand/{lig}_ideal.pdb"):
        #command = f"wget https://files.rcsb.org/ligands/download/{lig}_ideal.pdb --directory-prefix={path}"
        command = f"wget https://files.rcsb.org/ligands/{lig[0]}/{lig}/{lig}_ideal.pdb --directory-prefix={path}"
        os.system(command)
    
    if not os.path.isfile(f"./data/PDB/ligand/{lig}_ideal.sdf"):
        command = f"wget https://files.rcsb.org/ligands/download/{lig}_ideal.sdf --directory-prefix={path}"
        os.system(command)

In [92]:
def get_ligand_info_bulk(df):
    return df.LIG.map(download_ligand_structure)

In [93]:
path = "./data/PDB/ligand/"

In [94]:
_ = parallelize_dataframe(LIG_df, get_ligand_info_bulk, 10)

--2024-09-04 21:50:10--  https://files.rcsb.org/ligands/0/002/002_ideal.pdb
--2024-09-04 21:50:10--  https://files.rcsb.org/ligands/0/001/001_ideal.pdb
--2024-09-04 21:50:10--  https://files.rcsb.org/ligands/0/003/003_ideal.pdb
--2024-09-04 21:50:10--  https://files.rcsb.org/ligands/0/006/006_ideal.pdb
Resolving files.rcsb.org (files.rcsb.org)... Resolving files.rcsb.org (files.rcsb.org)... Resolving files.rcsb.org (files.rcsb.org)... Resolving files.rcsb.org (files.rcsb.org)... --2024-09-04 21:50:10--  https://files.rcsb.org/ligands/0/007/007_ideal.pdb
--2024-09-04 21:50:10--  https://files.rcsb.org/ligands/0/00G/00G_ideal.pdb
--2024-09-04 21:50:10--  https://files.rcsb.org/ligands/0/009/009_ideal.pdb
--2024-09-04 21:50:10--  https://files.rcsb.org/ligands/0/00L/00L_ideal.pdb
--2024-09-04 21:50:10--  https://files.rcsb.org/ligands/0/00N/00N_ideal.pdb
128.6.159.100
Connecting to files.rcsb.org (files.rcsb.org)|128.6.159.100|:443... --2024-09-04 21:50:10--  https://files.rcsb.org/ligand

In [95]:
def read_file(file):
    return file.readlines()

def additional_download(df):
    path = "./data/PDB/ligand/"

    for lig in df.values:
        if not os.path.isfile(f"{path}{lig[0]}_ideal.pdb"):
            lines = read_file(open(f"{path}{lig[0]}_ideal.pdb", "r"))

            command = f"wget https://files.rcsb.org/ligands/download/{lig[0]}_ideal.mol2 --directory-prefix={path}"
            os.system(command)

            command = f"obabel -imol2 {path}{lig[0]}_ideal.mol2 -opdb -O {path}{lig[0]}_ideal.pdb"
            os.system(command)

            command = f"obabel -imol2 {path}{lig[0]}_ideal.mol2 -osdf -O {path}{lig[0]}_ideal.sdf"
            os.system(command)    

In [96]:
additional_download(LIG_df)

### 6. Check ligand info

In [97]:
path = "./data/PDB/protein"

In [98]:
def extract_ligand_info(row):
    pdbid, lig_code, ligand_info = row[0], row[1], list()
    lines = read_file(open(f"{path}/{pdbid}.pdb", "r"))
    
    model_flag = 0
    
    """ Extract ligand info """
    for line in lines:
        if "ANISOU" not in line:
            if line[:5].strip() == "MODEL":
                if model_flag == 1:
                    break
                model_falg = 1
            
            if "HETATM" in line:
                residue, chain, residue_seq_number = line[17:20].strip(), line[21:22].strip(), line[22:26].strip()

                if residue == lig_code:
                    if f"{residue}_{chain}_{residue_seq_number}" not in ligand_info:
                        ligand_info.append(f"{residue}_{chain}_{residue_seq_number}")

    if len(ligand_info) == 0:
        return None
    else:
        return ",".join(ligand_info)

In [99]:
def extract_ligand_info_bulk(df):
    return df.apply(extract_ligand_info, axis = 1)

In [100]:
def parallelize_dataframe(df, func, num_partitions=5):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_partitions)
    results = pool.map(func, df_split)
    pool.close()
    pool.join()
    return results

#### 6.1 PDBbind dataset

In [101]:
PDBbind_ligand_info_results = parallelize_dataframe(PDBbind_df, extract_ligand_info_bulk, 10)

In [102]:
PDBbind_ligand_info_results = pd.concat(PDBbind_ligand_info_results)

In [103]:
PDBbind_df["Total_Lig_info"] = PDBbind_ligand_info_results.map(lambda a: a if a is not None else None)

In [104]:
PDBbind_df = PDBbind_df.dropna(axis = 0).reset_index(drop=True)
PDBbind_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info
0,5u8a,82D,7.89,Cn1cc(c2c1cccc2)[C@H]3CN(C[C@@H]3N(C)C)Cc4c(cc...,CN(C)[C@H]1CN(CC2=C(F)C=CC=C2Br)C[C@@H]1C1=CN(...,13,nM,7.89,82D_A_501
1,6e1y,HLM,7.66,C[C@@H](c1cccc(c1)Cl)NC(=O)c2cccc(c2)CNC3=NCCN3,C[C@H](NC(=O)C1=CC=CC(CNC2=NCCN2)=C1)C1=CC=CC(...,22,nM,7.66,"HLM_A_401,HLM_B_401"
2,6i61,H3W,6.24,CCC(C)(c1ccc(cc1)O)c2ccc(cc2)O,CCC(C)(C1=CC=C(O)C=C1)C1=CC=C(O)C=C1,569.0,nM,6.24,H3W_A_501
3,6hub,GRW,8.10,C[C@H]([C@@H](C(=O)N[C@H](CCS(=O)(=O)C)Cc1ccc(...,CC(C)C[C@H](NC(=O)[C@@H](CC1=CC=CC=C1)N=[N+]=[...,8,nM,8.1,"GRW_H_301,GRW_K_301,GRW_V_301,GRW_Y_301"
4,4hhy,15R,8.72,c1ccc2c(c1)C(=NNC2=O)Cc3ccc(c(c3)C(=O)N4CCC(CC...,O=C1NC[C@H]2C3=C1C=CC=C3CCN2C(=O)C1CCN(C(=O)C2...,1.92,nM,8.72,"15R_A_401,15R_B_401,15R_C_401,15R_D_401"
...,...,...,...,...,...,...,...,...,...
14538,4bty,JWF,7.70,CN1CCN(CC1)c2ccc(cc2)NC3=CC(=O)N(N=C3c4[nH]ncn...,CN1CCN(C2=CC=C(NC3=CC(=O)N(C4=CC=C(Cl)C=C4)N=C...,0.020,uM,7.7,"JWF_A_2000,JWF_B_2000"
14539,5csh,54E,3.57,c1ccc(cc1)c2ccc(cc2Cl)CN,NCC1=CC=C(C2=CC=CC=C2)C(Cl)=C1,270,uM,3.57,"54E_A_401,54E_A_402,54E_B_403,54E_B_404"
14540,2pj2,864,7.92,CC(C)[C@H](NC(=O)OCc1ccccc1)[P@@](=O)(C[C@@H](...,CC(C)[C@H](NC(=O)OCC1=CC=CC=C1)[P@](=O)(O)C[C@...,12,nM,7.92,"864_A_401,864_B_501,864_C_601"
14541,4xmo,46G,8.70,Cc1cc(on1)c2cc(c3nnc(n3c2)[C@@](C)(c4ccc5c(c4)...,COC1=CC2=CC([C@@](C)(F)C3=NN=C4C(F)=CC(C5=CC(C...,2,nM,8.7,46G_A_1401


#### 6.2 CASF2016 dataset

In [105]:
CASF2016_ligand_info_results = parallelize_dataframe(CASF2016_df, extract_ligand_info_bulk, 10)

In [106]:
CASF2016_ligand_info_results = pd.concat(CASF2016_ligand_info_results)

In [107]:
CASF2016_df["Total_Lig_info"] = CASF2016_ligand_info_results.map(lambda a: a if a is not None else None)

In [108]:
CASF2016_df = CASF2016_df.dropna(axis = 0).reset_index(drop=True)
CASF2016_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info
0,1bcu,PRL,3.28,c1cc(cc2c1cc3ccc(cc3n2)N)N,NC1=CC2=NC3=C(C=CC(N)=C3)C=C2C=C1,0.53,mM,3.28,PRL_H_280
1,1bzc,TPI,4.92,c1cc(cc2c1cc(cc2)C(F)(F)P(=O)(O)O)C(=O)N[C@@H]...,NC(=O)[C@H](CCC(=O)O)NC(=O)C1=CC2=C(C=C1)C=C(C...,12,uM,4.92,TPI_A_902
2,1c5z,BEN,4.01,[H]/N=C(\c1ccccc1)/N,[H]/N=C(/N)C1=CC=CC=C1,97,uM,4.01,BEN_B_251
3,1e66,HUX,9.89,CCC1=C[C@@H]2Cc3c(c(c4ccc(cc4n3)Cl)N)[C@@H](C2)C1,CCC1=C[C@@H]2CC3=C(C(N)=C4C=CC(Cl)=CC4=N3)[C@H...,0.13,nM,9.89,HUX_A_803
4,1eby,BEB,9.70,c1ccc(cc1)CO[C@H]([C@@H]([C@H]([C@H](C(=O)N[C@...,O=C(N[C@H]1C2=CC=CC=C2C[C@H]1O)[C@H](OCC1=CC=C...,0.20,nM,9.7,BEB_B_501
...,...,...,...,...,...,...,...,...,...
274,5aba,UL7,2.98,c1c(cc(c(c1CN2CCC(CC2)N3CCC(CC3)O)O)Br)Cl,OC1=C(CN2CCC(N3CCC(O)CC3)CC2)C=C(Cl)C=C1Br,1040,uM,2.98,"UL7_A_1291,UL7_B_1291"
275,5c28,4XV,5.66,Cc1c(nc(nc1Cl)C2CC2)N,CC1=C(N)N=C(C2CC2)N=C1Cl,2.2,uM,5.66,4XV_A_803
276,5c2h,4XU,11.09,Cc1c(nc(nc1Cl)OCCCc2ccc3ccccc3n2)NCc4c(nc(s4)C)C,CC1=NC(C)=C(CNC2=C(C)C(Cl)=NC(OCCCC3=CC=C4C=CC...,8.2,pM,11.09,4XU_B_803
277,5dwr,5H7,11.22,C[C@H]1C[C@H](C[C@H](C1)N)c2ccncc2NC(=O)c3ccc(...,C[C@@H]1C[C@H](N)C[C@H](C2=CC=NC=C2NC(=O)C2=CC...,6,pM,11.22,5H7_A_401


#### 6.3 CASF2013 dataset

In [109]:
CASF2013_ligand_info_results = parallelize_dataframe(CASF2013_df, extract_ligand_info_bulk, 10)

In [110]:
CASF2013_ligand_info_results = pd.concat(CASF2013_ligand_info_results)

In [111]:
CASF2013_df["Total_Lig_info"] = CASF2013_ligand_info_results.map(lambda a: a if a is not None else None)

In [112]:
CASF2013_df = CASF2013_df.dropna(axis = 0).reset_index(drop=True)
CASF2013_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info
0,10gs,VWW,6.40,c1ccc(cc1)CSC[C@@H](C(=O)N[C@H](c2ccccc2)C(=O)...,N[C@@H](CCC(=O)N[C@@H](CSCC1=CC=CC=C1)C(=O)N[C...,0.4,uM,6.4,"VWW_A_210,VWW_B_210"
1,1bcu,PRL,3.28,c1cc(cc2c1cc3ccc(cc3n2)N)N,NC1=CC2=NC3=C(C=CC(N)=C3)C=C2C=C1,0.53,mM,3.28,PRL_H_280
2,1e66,HUX,9.89,CCC1=C[C@@H]2Cc3c(c(c4ccc(cc4n3)Cl)N)[C@@H](C2)C1,CCC1=C[C@@H]2CC3=C(C(N)=C4C=CC(Cl)=CC4=N3)[C@H...,0.13,nM,9.89,HUX_A_803
3,1f8b,DAN,5.40,CC(=O)N[C@@H]1[C@H](C=C(O[C@H]1[C@@H]([C@@H](C...,CC(=O)N[C@H]1[C@H]([C@H](O)[C@H](O)CO)OC(C(=O)...,4.0,uM,5.4,DAN_A_0
4,1f8c,4AM,7.40,CC(=O)N[C@@H]1[C@H](C=C(O[C@H]1[C@@H]([C@@H](C...,CC(=O)N[C@H]1[C@H]([C@H](O)[C@H](O)CO)OC(C(=O)...,0.04,uM,7.4,4AM_A_4
...,...,...,...,...,...,...,...,...,...
172,4djv,0KM,6.72,[H]/N=C/1\N[C@](C(=O)N1C)(c2ccccc2)c3cccc(c3)c...,[H]/N=C1\N[C@](C2=CC=CC=C2)(C2=CC=CC(C3=CC=CC(...,0.19,uM,6.72,"0KM_A_501,0KM_B_501"
173,4g8m,G8M,7.89,C1C[C@@H]([C@@H]1[C@@H](C(=O)O)N)C(=O)O,N[C@H](C(=O)O)[C@@H]1CC[C@@H]1C(=O)O,12.8,nM,7.89,"G8M_A_301,G8M_B_901"
174,4gid,0GH,10.77,C[C@H](c1ccccc1)NC(=O)c2cc(cc(c2)N(C)S(=O)(=O)...,CC(C)CNC(=O)[C@@H](NC[C@H](CC1=CC=CC=C1)NC(=O)...,0.017,nM,10.77,"0GH_A_501,0GH_B_501,0GH_C_501,0GH_D_501"
175,4gqq,0XR,2.89,CCOC(=O)/C=C/c1ccc(c(c1)O)O,CCOC(=O)/C=C/C1=CC=C(O)C(O)=C1,1.3,mM,2.89,"0XR_A_502,0XR_A_503,0XR_A_504"


#### 6.4 CSAR2014 dataset

In [113]:
CSAR2014_ligand_info_results = parallelize_dataframe(CSAR2014_df, extract_ligand_info_bulk, 10)

In [114]:
CSAR2014_ligand_info_results = pd.concat(CSAR2014_ligand_info_results)

In [115]:
CSAR2014_df["Total_Lig_info"] = CSAR2014_ligand_info_results.map(lambda a: a if a is not None else None)

In [116]:
CSAR2014_df = CSAR2014_df.dropna(axis = 0).reset_index(drop=True)
CSAR2014_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info
0,4ypw,4FD,5.5,c1ccc(cc1)CNc2ccc(cn2)C(=O)N,NC(=O)C1=CC=C(NCC2=CC=CC=C2)N=C1,IC50,3162.0,nM,4FD_A_301
1,4ypx,4FG,4.6,c1cc(ncc1C(=O)N)N,NC(=O)C1=CN=C(N)C=C1,IC50,25119.0,nM,4FG_A_301
2,4ypy,4F9,3.8,c1cnccc1c2cnc[nH]2,C1=CC(C2=CN=CN2)=CC=N1,IC50,158489.0,nM,4F9_A_301
3,4ypz,4FL,3.5,c1cnccc1c2[nH]ccn2,C1=CC(C2=NC=CN2)=CC=N1,IC50,316228.0,nM,4FL_A_301
4,4yq0,4FM,4.4,c1cc(ccc1CNC(=O)c2c(non2)N)Cl,NC1=NON=C1C(=O)NCC1=CC=C(Cl)C=C1,IC50,39811.0,nM,4FM_A_301
5,4yq1,4FN,8.3,CC(C)(C)c1cccc(c1)C(=O)NCC2(CCCCC2)NC(=O)c3ccc...,CC(C)(C)C1=CC=CC(C(=O)NCC2(NC(=O)C3=CC=CC4=NOC...,IC50,5.01,nM,4FN_A_301
6,4yq2,EFY,8.3,CC(C)(C)c1cccc(c1)C(=O)NCC2(CCCC2)NC(=O)c3cccc...,CC(C)(C)C1=CC=CC(C(=O)NCC2(NC(=O)C3=CC=CC4=NOC...,IC50,5.01,nM,EFY_A_301
7,4yq3,4G1,6.8,Cc1cccc(c1)C(=O)NCC2(CCCC2)NC(=O)c3cccc4c3con4,CC1=CC=CC(C(=O)NCC2(NC(=O)C3=CC=CC4=NOC=C34)CC...,IC50,158.0,nM,4G1_A_301
8,4yq4,4G3,5.7,c1cc(cc(c1)O)CNc2ccc(cn2)C(=O)N,NC(=O)C1=CC=C(NCC2=CC(O)=CC=C2)N=C1,IC50,1995.0,nM,4G3_A_301
9,4yq5,4G0,5.4,CN(C)c1cccc(c1)CNc2ccc(cn2)C(=O)N,CN(C)C1=CC=CC(CNC2=CC=C(C(N)=O)C=N2)=C1,IC50,3981.0,nM,4G0_A_301


#### 6.5 CSAR2012 dataset

In [117]:
CSAR2012_ligand_info_results = parallelize_dataframe(CSAR2012_df, extract_ligand_info_bulk, 10)

In [118]:
CSAR2012_ligand_info_results = pd.concat(CSAR2012_ligand_info_results)

In [119]:
CSAR2012_df["Total_Lig_info"] = CSAR2012_ligand_info_results.map(lambda a: a if a is not None else None)

In [120]:
CSAR2012_df = CSAR2012_df.dropna(axis = 0).reset_index(drop=True)
CSAR2012_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info
0,4fud,6UP,6.35,[H]/N=C(\c1ccc2cccc(c2c1)N)/N,[H]/N=C(/N)C1=CC=C2C=CC=C(N)C2=C1,Ki,450.0,nM,6UP_A_301
1,4fue,7UP,7.23,[H]/N=C(/c1ccc2cc(ccc2c1)C#Cc3ccc4c(c3)CCNC4)\N,[H]/N=C(\N)C1=CC=C2C=C(C#CC3=CC=C4CNCCC4=C3)C=...,Ki,58.8,nM,7UP_A_301
2,4fu7,1UP,6.2,[H]/N=C(/c1ccc2ccc(c(c2c1)OCC(=O)N)OC)\N,[H]/N=C(\N)C1=CC=C2C=CC(OC)=C(OCC(N)=O)C2=C1,Ki,637.0,nM,1UP_A_305
3,4fu8,2UP,5.23,[H]/N=C(\c1ccc2ccccc2c1)/N,[H]/N=C(/N)C1=CC=C2C=CC=CC2=C1,Ki,5910.0,nM,2UP_A_301
4,4fu9,675,6.2,[H]/N=C(/c1ccc2cc(ccc2c1)C(=O)Nc3ccccc3)\N,[H]/N=C(\N)C1=CC=C2C=C(C(=O)NC3=CC=CC=C3)C=CC2=C1,Ki,628.0,nM,675_A_313
5,4fub,4UP,6.21,[H]/N=C(/c1ccc2cc(ccc2c1)[C@H]3[C@@H](O3)c4ccc...,[H]/N=C(\N)C1=CC=C2C=C([C@@H]3O[C@H]3C3=CC=CC=...,Ki,610.0,nM,4UP_A_301
6,4fuc,239,7.4,[H]/N=C(/c1ccc2cc(ccc2c1)C(=O)Nc3ccc(cc3)CN)\N,[H]/N=C(\N)C1=CC=C2C=C(C(=O)NC3=CC=C(CN)C=C3)C...,Ki,40.0,nM,239_A_301
7,4fsm,HK1,7.62,COc1cc2c(cc1OC)-c3c(c([nH]n3)c4ccc(cc4)O)C2,COC1=CC2=C(C=C1OC)C1=NNC(C3=CC=C(O)C=C3)=C1C2,IC50,23.9,nM,HK1_A_301
8,4fsw,HK6,4.76,c1ccc2c(c1)C(=O)Nc3cc(ccc3N2)Cl,O=C1NC2=CC(Cl)=CC=C2NC2=CC=CC=C12,IC50,17200.0,nM,HK6_A_301
9,4ft5,H2K,7.56,c1cc(c(cc1Cl)NC(=O)Nc2cnc(cn2)C#N)O[C@@H]3CCNC3,N#CC1=NC=C(NC(=O)NC2=C(O[C@@H]3CCNC3)C=CC(Cl)=...,IC50,27.4,nM,H2K_A_300


#### 6.6 CSARset1 dataset

In [121]:
CSARset1_ligand_info_results = parallelize_dataframe(CSARset1_df, extract_ligand_info_bulk, 10)

In [122]:
CSARset1_ligand_info_results = pd.concat(CSARset1_ligand_info_results)

In [123]:
CSARset1_df["Total_Lig_info"] = CSARset1_ligand_info_results.map(lambda a: a if a is not None else None)

In [124]:
CSARset1_df = CSARset1_df.dropna(axis = 0).reset_index(drop=True)
CSARset1_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Total_Lig_info
0,2are,MAN,3.28,C([C@@H]1[C@H]([C@@H]([C@@H]([C@H](O1)O)O)O)O)O,OC[C@H]1O[C@H](O)[C@@H](O)[C@@H](O)[C@@H]1O,"MAN_A_253,MAN_B_253"
1,2oag,DLI,8.47,CS(=O)(=O)c1cccc(c1)c2cc(ncn2)N3C[C@@H]([C@H](...,CS(=O)(=O)C1=CC=CC(C2=CC(N3C[C@H](C4=CC(F)=C(F...,DLI_B_4000
2,2jbj,G88,9.7,C(CC(=O)O)[C@H](CP(=O)(O)O)C(=O)O,O=C(O)CC[C@H](CP(=O)(O)O)C(=O)O,G88_A_1768
3,2pwd,NOJ,4.4,C1[C@@H]([C@H]([C@@H]([C@H](N1)CO)O)O)O,OC[C@H]1NC[C@H](O)[C@@H](O)[C@@H]1O,"NOJ_A_8000,NOJ_B_8001"
4,2pwg,CTS,4.82,C1C[N@]2C[C@@H]([C@H]([C@@H]([C@H]2[C@H]1O)O)O)O,O[C@H]1[C@H](O)[C@@H](O)CN2CC[C@H](O)[C@H]12,"CTS_A_8000,CTS_B_8001"
...,...,...,...,...,...,...
151,3ekr,PY9,5.55,Cc1ccccc1[C@H]2CCCN2C(=O)c3ccc(cc3O)O,CC1=CC=CC=C1[C@H]1CCCN1C(=O)C1=CC=C(O)C=C1O,"PY9_A_901,PY9_B_901"
152,3ene,NPZ,6.24,Cn1c2c(c(n1)c3ccc4ccccc4c3)c(ncn2)N,CN1N=C(C2=CC=C3C=CC=CC3=C2)C2=C1N=CN=C2N,NPZ_A_1
153,3eqr,T74,8.7,Cc1cccc(c1Nc2c3cnc(nc3n(n2)CCC(C)(C)OC)Nc4ccc(...,COC(C)(C)CCN1N=C(NC2=C(C)C=CC=C2C)C2=CN=C(NC3=...,"T74_A_1,T74_B_1"
154,3f8c,HT1,7.68,CCOc1ccc(cc1)c2[nH]c3cc(ccc3n2)c4[nH]c5cc(ccc5...,CCOC1=CC=C(C2=NC3=CC=C(C4=NC5=CC=C(N6CCN(C)CC6...,HT1_A_127


#### 6.7 CSARset2 dataset

In [125]:
CSARset2_ligand_info_results = parallelize_dataframe(CSARset2_df, extract_ligand_info_bulk, 10)

In [126]:
CSARset2_ligand_info_results = pd.concat(CSARset2_ligand_info_results)

In [127]:
CSARset2_df["Total_Lig_info"] = CSARset2_ligand_info_results.map(lambda a: a if a is not None else None)

In [128]:
CSARset2_df = CSARset2_df.dropna(axis = 0).reset_index(drop=True)
CSARset2_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Total_Lig_info
0,1a8i,GLS,5.52,C([C@@H]1[C@H]([C@@H]([C@H]([C@]2(O1)C(=O)NC(=...,O=C1NC(=O)[C@@]2(N1)O[C@H](CO)[C@@H](O)[C@H](O...,GLS_A_998
1,1a99,PUT,5.7,C(CCN)CN,NCCCCN,"PUT_A_371,PUT_B_371,PUT_C_371,PUT_D_371"
2,1ax0,A2G,3.13,CC(=O)N[C@@H]1[C@H]([C@H]([C@H](O[C@@H]1O)CO)O)O,CC(=O)N[C@@H]1[C@@H](O)[C@@H](O)[C@@H](CO)O[C@...,A2G_A_401
3,1b6l,PI4,8.3,CC(C)(C)NC(=O)[C@@H]1CCCC[N@]1C[C@H]([C@@H]2Cc...,CC(C)(C)NC(=O)[C@@H]1CCCCN1C[C@@H](O)[C@@H]1CC...,PI4_A_201
4,1b6m,PI6,8.4,CC[C@H](C)[C@H]1C(=O)NCCCOc2ccc(cc2)C[C@@H](C(...,CC[C@H](C)[C@@H]1NC(=O)[C@@H](NC[C@@H](O)[C@H]...,PI6_B_201
...,...,...,...,...,...,...
137,2qrk,AMP,4.26,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,NC1=C2N=CN([C@@H]3O[C@H](COP(=O)(O)O)[C@@H](O)...,AMP_A_500
138,2nn1,M28,5.82,c1cc(ccc1CCC(=O)O)S(=O)(=O)N,NS(=O)(=O)C1=CC=C(CCC(=O)O)C=C1,"M28_A_311,M28_B_312"
139,2pou,I7A,7.42,c1c(cc(c(c1S(=O)(=O)N)Cl)Cl)S(=O)(=O)N,NS(=O)(=O)C1=CC(S(N)(=O)=O)=C(Cl)C(Cl)=C1,I7A_A_1000
140,3cd0,6HI,7.57,CC(C)n1c(c(nc1C(=O)NCc2ccc(cc2)F)c3ccc(cc3)F)C...,CC(C)N1C(C(=O)NCC2=CC=C(F)C=C2)=NC(C2=CC=C(F)C...,"6HI_B_1,6HI_B_2,6HI_C_4,6HI_D_3"


#### 6.8 Astex dataset

In [129]:
Astex_ligand_info_results = parallelize_dataframe(Astex_df, extract_ligand_info_bulk, 10)

In [130]:
Astex_ligand_info_results = pd.concat(Astex_ligand_info_results)

In [131]:
Astex_df["Total_Lig_info"] = Astex_ligand_info_results.map(lambda a: a if a is not None else None)

In [132]:
Astex_df = Astex_df.dropna(axis = 0).reset_index(drop=True)
Astex_df

Unnamed: 0,PDB,Lig_code,Labels,OE_stereo_SMILES,RDKit_iso_SMILES,Measure,Potency,Unit,Total_Lig_info
0,1gm8,SOX,4.80,CC1([C@@H](N2[C@H]([S@H]1O)[C@@H](C2=O)NC(=O)C...,CC1(C)[C@H](C(=O)O)N2C(=O)[C@@H](NC(=O)CC3=CC=...,Km,16.000,uM,SOX_B_1559
1,1gpk,HUP,5.37,C/C=C/1\[C@@H]2CC3=C([C@]1(CC(=C2)C)N)C=CC(=O)N3,C/C=C1\[C@H]2C=C(C)C[C@]1(N)C1=C(C2)NC(=O)C=C1,Ki,4.300,uM,HUP_A_1540
2,1hnn,SKF,6.24,c1cc2c(cc1S(=O)(=O)N)CNCC2,NS(=O)(=O)C1=CC2=C(C=C1)CCNC2,Ki,0.580,uM,"SKF_A_3001,SKF_B_3002"
3,1hp0,AD3,6.70,c1cnc(c2c1n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3)C...,NC1=NC=CC2=C1N=CN2[C@@H]1O[C@H](CO)[C@@H](O)[C...,Ki,0.200,uM,"AD3_A_1315,AD3_B_1316"
4,1hq2,PH2,6.77,C1C(=NC2=C(N1)N=C(NC2=O)N)CO,NC1=NC2=C(N=C(CO)CN2)C(=O)N1,Kd,0.170,uM,PH2_A_181
...,...,...,...,...,...,...,...,...,...
70,1ywr,LI9,7.49,C[C@@H](c1ccccc1)Nc2nccc(n2)C3=C(C(=O)N(N3C)C4...,C[C@H](NC1=NC=CC(C2=C(C3=CC=C(F)C=C3)C(=O)N(C3...,IC50,32.000,nM,LI9_A_361
71,1z95,198,7.12,C[C@](CS(=O)(=O)c1ccc(cc1)F)(C(=O)Nc2ccc(c(c2)...,C[C@](O)(CS(=O)(=O)C1=CC=C(F)C=C1)C(=O)NC1=CC=...,Ki,0.076,uM,198_A_501
72,2bm2,PM2,7.82,c1ccc(cc1)CCc2cc(cnc2)C(=O)N3CCC(CC3)c4cccc(c4)CN,NCC1=CC(C2CCN(C(=O)C3=CN=CC(CCC4=CC=CC=C4)=C3)...,Ki,0.015,uM,"PM2_A_3211,PM2_B_3211,PM2_C_3211,PM2_D_3211"
73,2br1,PFP,5.14,COc1ccc(cc1)c2c3c(ncnc3oc2c4ccc(cc4)OC)NCCO,COC1=CC=C(C2=C(C3=CC=C(OC)C=C3)C3=C(NCCO)N=CN=...,Ki,7200.000,nM,PFP_A_1277


#### 6.9 COACH420 dataset

In [133]:
COACH420_ligand_info_results = parallelize_dataframe(COACH420_df, extract_ligand_info_bulk, 10)

In [134]:
COACH420_ligand_info_results = pd.concat(COACH420_ligand_info_results)

In [135]:
COACH420_df["Total_Lig_info"] = COACH420_ligand_info_results.map(lambda a: a if a is not None else None)

In [136]:
COACH420_df = COACH420_df.dropna(axis = 0).reset_index(drop=True)
COACH420_df

Unnamed: 0,PDB,Lig_code,COACH420_chain,OE_stereo_SMILES,RDKit_iso_SMILES,Total_Lig_info
0,1a26,CNA,A,c1cc(c[n+](c1)[C@@H]2C[C@@H]([C@H]([C@H]2O)O)C...,NC(=O)C1=C[N+]([C@@H]2C[C@H](COP(=O)(O)OP(=O)(...,CNA_A_200
1,1a2k,GDP,C,c1nc2c(n1[C@H]3[C@@H]([C@@H]([C@H](O3)CO[P@](=...,NC1=NC2=C(N=CN2[C@@H]2O[C@H](CO[P@](=O)(O)OP(=...,"GDP_C_220,GDP_D_220,GDP_E_220"
2,1a4k,FRA,H,CC(=O)Nc1ccc(cc1)N2C(=O)[C@@H]3C4CCC([C@@H]3C2...,CC(=O)NC1=CC=C(N2C(=O)[C@@H]3C4CCC(NC(=O)OCC(=...,"FRA_H_3083,FRA_B_3083"
3,1a7x,FKA,A,C[C@@H]1C[C@@H]([C@@H]2[C@H](C[C@H]([C@@](O2)(...,CO[C@H]1C[C@@H](C)C/C(C)=C/[C@@H](CCOC(=O)NCC2...,FKA_B_201
4,1afk,PAP,A,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,NC1=C2N=CN([C@@H]3O[C@H](CO[P@@](=O)(O)OP(=O)(...,"PAP_A_125,PAP_B_125"
...,...,...,...,...,...,...
353,7dfr,FOL,A,c1cc(ccc1C(=O)N[C@@H](CCC(=O)O)C(=O)O)NCc2cnc3...,NC1=NC(=O)C2=NC(CNC3=CC=C(C(=O)N[C@@H](CCC(=O)...,FOL_A_161
354,7dfr,NAP,A,c1cc(c[n+](c1)[C@H]2[C@@H]([C@@H]([C@H](O2)CO[...,NC(=O)C1=C[N+]([C@@H]2O[C@H](CO[P@@](=O)([O-])...,NAP_A_164
355,7est,0Z2,E,CC(C)C[C@@H](C(=O)N[C@@H](C)C(=O)Nc1ccc(cc1)C(...,CC(C)C[C@H](NC(=O)C(F)(F)F)C(=O)N[C@@H](C)C(=O...,0Z2_E_1
356,830c,RS1,A,c1cc(ccc1Oc2ccc(cc2)Cl)S(=O)(=O)CC3(CCOCC3)C(=...,O=C(NO)C1(CS(=O)(=O)C2=CC=C(OC3=CC=C(Cl)C=C3)C...,"RS1_A_1,RS1_B_1"


#### 6.10 HOLO4K dataset

In [137]:
HOLO4K_ligand_info_results = parallelize_dataframe(HOLO4K_df, extract_ligand_info_bulk, 10)

In [138]:
HOLO4K_ligand_info_results = pd.concat(HOLO4K_ligand_info_results)

In [139]:
HOLO4K_df["Total_Lig_info"] = HOLO4K_ligand_info_results.map(lambda a: a if a is not None else None)

In [140]:
HOLO4K_df = HOLO4K_df.dropna(axis = 0).reset_index(drop=True)
HOLO4K_df

Unnamed: 0,PDB,Lig_code,OE_stereo_SMILES,RDKit_iso_SMILES,Total_Lig_info
0,121p,GCP,c1nc2c(n1[C@H]3[C@@H]([C@@H]([C@H](O3)CO[P@](=...,NC1=NC2=C(N=CN2[C@@H]2O[C@H](CO[P@](=O)(O)O[P@...,GCP_A_167
1,12as,AMP,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,NC1=C2N=CN([C@@H]3O[C@H](COP(=O)(O)O)[C@@H](O)...,"AMP_A_332,AMP_B_332"
2,13pk,3PG,C([C@H](C(=O)O)O)OP(=O)(O)O,O=C(O)[C@H](O)COP(=O)(O)O,"3PG_A_423,3PG_B_423"
3,13pk,ADP,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,NC1=C2N=CN([C@@H]3O[C@H](CO[P@](=O)(O)OP(=O)(O...,"ADP_A_421,ADP_B_421,ADP_C_421,ADP_D_421"
4,16pk,BIS,c1nc(c2c(n1)n(cn2)[C@H]3[C@@H]([C@@H]([C@H](O3...,NC1=C2N=CN([C@@H]3O[C@H](CO[P@@](=O)(O)O[P@](=...,BIS_A_499
...,...,...,...,...,...
4214,9gss,GTX,CCCCCCSC[C@@H](C(=O)NCC(=O)O)NC(=O)CC[C@@H](C(...,CCCCCCSC[C@H](NC(=O)CC[C@H]([NH3+])C(=O)O)C(=O...,"GTX_A_211,GTX_B_210"
4215,9ldb,NAD,c1cc(c[n+](c1)[C@H]2[C@@H]([C@@H]([C@H](O2)CO[...,NC(=O)C1=C[N+]([C@@H]2O[C@H](CO[P@@](=O)([O-])...,"NAD_A_401,NAD_B_401"
4216,9ldb,OXM,C(=O)(C(=O)O)N,NC(=O)C(=O)O,OXM_B_402
4217,9ldt,NAD,c1cc(c[n+](c1)[C@H]2[C@@H]([C@@H]([C@H](O2)CO[...,NC(=O)C1=C[N+]([C@@H]2O[C@H](CO[P@@](=O)([O-])...,"NAD_A_401,NAD_B_401"


### 7. Save data

In [141]:
PDBbind_df.to_csv("./preprocessed_data/step2_PDBbind_data.tsv", sep = "\t", index = False)
CASF2016_df.to_csv("./preprocessed_data/step2_CASF2016_data.tsv", sep = "\t", index = False)
CASF2013_df.to_csv("./preprocessed_data/step2_CASF2013_data.tsv", sep = "\t", index = False)
CSAR2014_df.to_csv("./preprocessed_data/step2_CSAR2014_data.tsv", sep = "\t", index = False)
CSAR2012_df.to_csv("./preprocessed_data/step2_CSAR2012_data.tsv", sep = "\t", index = False)
CSARset1_df.to_csv("./preprocessed_data/step2_CSARset1_data.tsv", sep = "\t", index = False)
CSARset2_df.to_csv("./preprocessed_data/step2_CSARset2_data.tsv", sep = "\t", index = False)
Astex_df.to_csv("./preprocessed_data/step2_Astex_data.tsv", sep = "\t", index = False)
COACH420_df.to_csv("./preprocessed_data/step2_COACH420_data.tsv", sep = "\t", index = False)
HOLO4K_df.to_csv("./preprocessed_data/step2_HOLO4K_data.tsv", sep = "\t", index = False)