In [1]:
from glob import glob
import pandas as pd
import numpy as np
from Bio import PDB
import os
from rdkit.Chem.rdmolfiles import MolFromSmiles, MolToSmiles
from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser

### 1. Load complex list

In [2]:
pdb_path = os.path.abspath("../datasets/examples/PDBbind/")
info_path = os.path.abspath("../datasets/examples/")

In [3]:
complex_list = os.listdir(pdb_path)

### 2. Remove HEATM

In [4]:
def remove_HEATM_PDBbind(input_list, path):

    class NonHetSelect(Select):
        def accept_residue(self, residue):
            return 1 if residue.id[0] == " " else 0
    
    for pdb in input_list:

        src_file = f"{pdb_path}/{pdb}/{pdb}_protein.pdb"
        des_file = f"{pdb_path}/{pdb}/{pdb}_remove_HEATM_protein.pdb"
        
        pdb = PDBParser().get_structure(pdb, src_file)
        io = PDBIO()
        io.set_structure(pdb)
        io.save(des_file, NonHetSelect()) 

In [5]:
remove_HEATM_PDBbind(complex_list, pdb_path)



### 3. Load Binding sites info

In [6]:
pdb_parser = PDB.PDBParser(QUIET = True)

In [7]:
amino_acids_short = {"ALA":"A", "ARG":"R", "ASN":"N", "ASP":"D", "CYS":"C", "GLU":"E", "GLN":"Q", "GLY":"G", "HIS":"H", "ILE":"I", "LEU":"L", "LYS":"K", "MET":"M", "PHE":"F", "PRO":"P", "SER":"S", "THR":"T", "TRP":"W", "TYR":"Y", "VAL":"V", "SEC":"U", "PYL":"O"}

In [8]:
data_df = pd.DataFrame({"PDB":complex_list})

In [9]:
def get_info(pdb):
    try:
        """ Load protein info """
        structure = pdb_parser.get_structure(pdb, f"{pdb_path}/{pdb}/{pdb}_remove_HEATM_protein.pdb")
        chain_name_list, pdb_sequence_list, seq_lengths_list, protein_atom_coords, protein_atom_residue_list, reindex = list(), list(), list(), list(), list(), 0
        
        """ Extract protein info """
        for chain_name in list(structure[0].child_dict.keys()):
            chain = structure[0][chain_name]

            pdb_sequence = ""
            for residue in chain.get_residues():
                if residue.resname in amino_acids_short.keys():
                    pdb_sequence += amino_acids_short[residue.resname]

                    for atom in residue:
                        protein_atom_coords.append(atom.get_coord())
                        protein_atom_residue_list.append(reindex)
                    reindex += 1     

            if len(pdb_sequence) != 0:
                chain_name_list.append(chain_name)
                pdb_sequence_list.append(pdb_sequence)
                seq_lengths_list.append(len(pdb_sequence))

        """ Load pocket info """      
        protein_atom_coords, binding_index = np.array(protein_atom_coords), list()
        pocket_structure = pdb_parser.get_structure(pdb, f"{pdb_path}/{pdb}/{pdb}_pocket.pdb")

        pocket_coordi = list()

        for chain_name in list(pocket_structure[0].child_dict.keys()):
            chain = pocket_structure[0][chain_name]
            for residue in chain.get_residues():
                if residue.resname in amino_acids_short.keys():
                    for atom in residue:
                        pocket_coordi.append(atom.get_coord())
        
        """ Matchin pocket info """
        pocket_coordi = np.array(pocket_coordi)
        bi_x, bi_y, bi_z = pocket_coordi[:, 0], pocket_coordi[:, 1], pocket_coordi[:, 2]

        for i, j, k in zip(bi_x, bi_y, bi_z):
            tmp_coordi = np.array([i, j, k], dtype = np.float32)
            ind = np.where((protein_atom_coords == tmp_coordi).all(axis = 1))[0][0]
            binding_index.append(protein_atom_residue_list[ind])

        binding_index = sorted(list(set(binding_index)))
        binding_index = list(map(str, binding_index))

        total_seq_lengths = np.sum(np.array(seq_lengths_list))
        seq_lengths_list = list(map(str, seq_lengths_list))

        return ",".join(chain_name_list), ",".join(pdb_sequence_list), total_seq_lengths, ",".join(seq_lengths_list), ",".join(binding_index)

    except Exception as e:
        print(pdb, e)
        return None

In [10]:
from multiprocessing import Process, Queue, Pool

def parallelize_dataframe(df, func, num_partitions=5):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_partitions)
    results = pool.map(func, df_split)
    pool.close()
    pool.join()
    return results

In [11]:
def get_pdb_info_bulk(df):
    return df.PDB.map(get_info)

In [12]:
info_results = parallelize_dataframe(data_df, get_pdb_info_bulk, 5)

In [13]:
info_results = pd.concat(info_results)

In [14]:
data_df["Chain"] = info_results.map(lambda a: a[0] if a is not None else None)

In [15]:
data_df["Sequence"] = info_results.map(lambda a: a[1] if a is not None else None)

In [16]:
data_df["Total_seq_lengths"] = info_results.map(lambda a: a[2] if a is not None else None)

In [17]:
data_df["Chain_seq_lengths"] = info_results.map(lambda a: a[3] if a is not None else None)

In [18]:
data_df["BS"] = info_results.map(lambda a: a[4] if a is not None else None)

In [19]:
data_df = data_df.loc[data_df.Sequence.isna()==False].reset_index(drop=True)
data_df = data_df.loc[data_df.Chain != " "].reset_index(drop=True)

In [20]:
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS
0,4x6o,A,IVGGTASVRGEWPWQVTLHTTSPTQRHLCGGSIIGNQWILTAAHCF...,238,238,"0,1,16,17,18,19,23,24,25,26,27,28,29,41,42,43,..."
1,3miy,A,GSVIDPSELTFVQEIGSGGLVHLGYWLNKDKVAIKTISEEDFIEEA...,239,239,"12,13,14,15,16,19,20,21,22,23,30,31,32,33,34,4..."
2,2gsu,A,TPHALLLISIDGLRADMLDRGITPNLSHLAREGVRARWMAPSYPSL...,382,382,"10,11,43,44,45,46,47,48,49,50,66,67,68,77,78,7..."
3,3hmo,A,ANECISVKGRIYSILKQIGSGGSSKVFQVLNEKKQIYAIKYVNLEE...,257,257,"15,16,17,18,19,20,23,24,25,26,27,28,35,36,37,3..."
4,2xm1,A,SLQPPPQQLIVQNKTIDLPAVYQLNGGEEANPHAVKVLKELLSGKQ...,645,645,"129,130,131,132,133,134,160,161,162,163,167,20..."
5,3zxz,A,HIDLSALNPELVQAVQHVVIGPSSLIVHFNEVIGRGHFGCVYHGTL...,290,290,"31,32,33,34,35,39,40,41,42,43,50,51,52,53,54,5..."
6,3eko,A,QPMEEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNSSDA...,217,217,"34,37,38,39,40,41,42,43,44,45,46,47,48,49,68,8..."
7,1c1u,"L,H","TFGSGEADCGLRPLFEKKSLEDKTERELLESYIDGR,IVEGSDAEI...",288,36252,"36,37,53,60,61,62,63,64,76,78,79,80,82,85,87,8..."
8,10gs,"A,B",PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,416,208208,"5,6,7,8,9,10,11,12,15,31,32,33,34,36,37,41,42,..."
9,2y5h,"A,L",IVGGQECKDGECPWQALLINEENEGFCGGTILSEFYILTAAHCLYQ...,288,23454,"0,1,25,26,41,42,45,79,80,81,82,83,84,85,87,126..."


### 4. Remove complex over than 1,500 protein seq length

In [21]:
lengths = data_df.Total_seq_lengths.values

In [22]:
data_df = data_df[lengths <= 1500].reset_index(drop=True)
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS
0,4x6o,A,IVGGTASVRGEWPWQVTLHTTSPTQRHLCGGSIIGNQWILTAAHCF...,238,238,"0,1,16,17,18,19,23,24,25,26,27,28,29,41,42,43,..."
1,3miy,A,GSVIDPSELTFVQEIGSGGLVHLGYWLNKDKVAIKTISEEDFIEEA...,239,239,"12,13,14,15,16,19,20,21,22,23,30,31,32,33,34,4..."
2,2gsu,A,TPHALLLISIDGLRADMLDRGITPNLSHLAREGVRARWMAPSYPSL...,382,382,"10,11,43,44,45,46,47,48,49,50,66,67,68,77,78,7..."
3,3hmo,A,ANECISVKGRIYSILKQIGSGGSSKVFQVLNEKKQIYAIKYVNLEE...,257,257,"15,16,17,18,19,20,23,24,25,26,27,28,35,36,37,3..."
4,2xm1,A,SLQPPPQQLIVQNKTIDLPAVYQLNGGEEANPHAVKVLKELLSGKQ...,645,645,"129,130,131,132,133,134,160,161,162,163,167,20..."
5,3zxz,A,HIDLSALNPELVQAVQHVVIGPSSLIVHFNEVIGRGHFGCVYHGTL...,290,290,"31,32,33,34,35,39,40,41,42,43,50,51,52,53,54,5..."
6,3eko,A,QPMEEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNSSDA...,217,217,"34,37,38,39,40,41,42,43,44,45,46,47,48,49,68,8..."
7,1c1u,"L,H","TFGSGEADCGLRPLFEKKSLEDKTERELLESYIDGR,IVEGSDAEI...",288,36252,"36,37,53,60,61,62,63,64,76,78,79,80,82,85,87,8..."
8,10gs,"A,B",PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,416,208208,"5,6,7,8,9,10,11,12,15,31,32,33,34,36,37,41,42,..."
9,2y5h,"A,L",IVGGQECKDGECPWQALLINEENEGFCGGTILSEFYILTAAHCLYQ...,288,23454,"0,1,25,26,41,42,45,79,80,81,82,83,84,85,87,126..."


### 5. Remove complex over than 160 SMILES length

In [23]:
def convert_smiles(row):
    pdb = row.PDB
    
    mol = f"{pdb_path}/{pdb}/{pdb}_ligand.mol2"
    command = f"obabel -imol2 {mol} -osmi -xC | obabel -ismi -osmi -xk -O tmp.smi"
    os.system(command)
    
    smiles = read_file(open("tmp.smi"))[0].split('\t')[0].strip()
    
    try:
        smiles = MolToSmiles(MolFromSmiles(smiles),isomericSmiles = False, kekuleSmiles = True)
        return smiles
    
    except Exception as e:
        print(pdb, e)
        return None

In [24]:
def read_file(file):
    return [i.strip() for i in file.readlines()]

In [25]:
SMILES = data_df.apply(convert_smiles, axis = 1)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted


In [26]:
data_df["SMILES"] = SMILES

In [27]:
data_df = data_df.loc[data_df.SMILES.isna()==False].reset_index(drop=True)
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,SMILES
0,4x6o,A,IVGGTASVRGEWPWQVTLHTTSPTQRHLCGGSIIGNQWILTAAHCF...,238,238,"0,1,16,17,18,19,23,24,25,26,27,28,29,41,42,43,...",COC(=O)NC1=CC=C(C2=C(Cl)NC(C(CC3=CC=CC=C3)NC(=...
1,3miy,A,GSVIDPSELTFVQEIGSGGLVHLGYWLNKDKVAIKTISEEDFIEEA...,239,239,"12,13,14,15,16,19,20,21,22,23,30,31,32,33,34,4...",CC[NH+](CC)CCNC(=O)C1=C(C)NC(C=C2C(=O)NC3=C2C=...
2,2gsu,A,TPHALLLISIDGLRADMLDRGITPNLSHLAREGVRARWMAPSYPSL...,382,382,"10,11,43,44,45,46,47,48,49,50,66,67,68,77,78,7...",NC1=NC=NC2=C1N=CN2C1OC(CO[PH](O)(O)O)C(O)C1O
3,3hmo,A,ANECISVKGRIYSILKQIGSGGSSKVFQVLNEKKQIYAIKYVNLEE...,257,257,"15,16,17,18,19,20,23,24,25,26,27,28,35,36,37,3...",C[NH2+]C1CC2OC(C)(C1OC)N1C3=CC=CC=C3C3=C1C1=C(...
4,2xm1,A,SLQPPPQQLIVQNKTIDLPAVYQLNGGEEANPHAVKVLKELLSGKQ...,645,645,"129,130,131,132,133,134,160,161,162,163,167,20...",CC(=O)NC1C(=O)NC(CO)C(O)C1O
5,3zxz,A,HIDLSALNPELVQAVQHVVIGPSSLIVHFNEVIGRGHFGCVYHGTL...,290,290,"31,32,33,34,35,39,40,41,42,43,50,51,52,53,54,5...",OCCN1C=C(C2=NC3=C(N=C2)N=NN3CC2=CC=C3N=CC=CC3=...
6,3eko,A,QPMEEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNSSDA...,217,217,"34,37,38,39,40,41,42,43,44,45,46,47,48,49,68,8...",O=C(C1=C(O)C=C(O)C=C1O)N1C=CC=C1
7,1c1u,"L,H","TFGSGEADCGLRPLFEKKSLEDKTERELLESYIDGR,IVEGSDAEI...",288,36252,"36,37,53,60,61,62,63,64,76,78,79,80,82,85,87,8...",NC(N)C1=CC=C2N=C(CC3=NC4=CC=CC=C4N3)NC2=C1
8,10gs,"A,B",PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,416,208208,"5,6,7,8,9,10,11,12,15,31,32,33,34,36,37,41,42,...",[NH3+]C(CCC(=O)NC(CSCC1=CC=CC=C1)C(=O)NC(C(=O)...
9,2y5h,"A,L",IVGGQECKDGECPWQALLINEENEGFCGGTILSEFYILTAAHCLYQ...,288,23454,"0,1,25,26,41,42,45,79,80,81,82,83,84,85,87,126...",C[N+](C)(C)CCCN1C(=O)C2C(C1=O)C(C1=COC(C3=CC=C...


In [28]:
def get_SMILES_length(df):
    index = [True if len(smi) <= 160 else False for smi in df.SMILES.values]
    return index

In [29]:
smiles_index = get_SMILES_length(data_df)

In [30]:
data_df = data_df.loc[smiles_index].reset_index(drop=True)

In [31]:
data_df

Unnamed: 0,PDB,Chain,Sequence,Total_seq_lengths,Chain_seq_lengths,BS,SMILES
0,4x6o,A,IVGGTASVRGEWPWQVTLHTTSPTQRHLCGGSIIGNQWILTAAHCF...,238,238,"0,1,16,17,18,19,23,24,25,26,27,28,29,41,42,43,...",COC(=O)NC1=CC=C(C2=C(Cl)NC(C(CC3=CC=CC=C3)NC(=...
1,3miy,A,GSVIDPSELTFVQEIGSGGLVHLGYWLNKDKVAIKTISEEDFIEEA...,239,239,"12,13,14,15,16,19,20,21,22,23,30,31,32,33,34,4...",CC[NH+](CC)CCNC(=O)C1=C(C)NC(C=C2C(=O)NC3=C2C=...
2,2gsu,A,TPHALLLISIDGLRADMLDRGITPNLSHLAREGVRARWMAPSYPSL...,382,382,"10,11,43,44,45,46,47,48,49,50,66,67,68,77,78,7...",NC1=NC=NC2=C1N=CN2C1OC(CO[PH](O)(O)O)C(O)C1O
3,3hmo,A,ANECISVKGRIYSILKQIGSGGSSKVFQVLNEKKQIYAIKYVNLEE...,257,257,"15,16,17,18,19,20,23,24,25,26,27,28,35,36,37,3...",C[NH2+]C1CC2OC(C)(C1OC)N1C3=CC=CC=C3C3=C1C1=C(...
4,2xm1,A,SLQPPPQQLIVQNKTIDLPAVYQLNGGEEANPHAVKVLKELLSGKQ...,645,645,"129,130,131,132,133,134,160,161,162,163,167,20...",CC(=O)NC1C(=O)NC(CO)C(O)C1O
5,3zxz,A,HIDLSALNPELVQAVQHVVIGPSSLIVHFNEVIGRGHFGCVYHGTL...,290,290,"31,32,33,34,35,39,40,41,42,43,50,51,52,53,54,5...",OCCN1C=C(C2=NC3=C(N=C2)N=NN3CC2=CC=C3N=CC=CC3=...
6,3eko,A,QPMEEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNSSDA...,217,217,"34,37,38,39,40,41,42,43,44,45,46,47,48,49,68,8...",O=C(C1=C(O)C=C(O)C=C1O)N1C=CC=C1
7,1c1u,"L,H","TFGSGEADCGLRPLFEKKSLEDKTERELLESYIDGR,IVEGSDAEI...",288,36252,"36,37,53,60,61,62,63,64,76,78,79,80,82,85,87,8...",NC(N)C1=CC=C2N=C(CC3=NC4=CC=CC=C4N3)NC2=C1
8,10gs,"A,B",PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,416,208208,"5,6,7,8,9,10,11,12,15,31,32,33,34,36,37,41,42,...",[NH3+]C(CCC(=O)NC(CSCC1=CC=CC=C1)C(=O)NC(C(=O)...
9,2y5h,"A,L",IVGGQECKDGECPWQALLINEENEGFCGGTILSEFYILTAAHCLYQ...,288,23454,"0,1,25,26,41,42,45,79,80,81,82,83,84,85,87,126...",C[N+](C)(C)CCCN1C(=O)C2C(C1=O)C(C1=COC(C3=CC=C...


### 5. Save data

In [32]:
data_df = data_df.iloc[:, [0, 2, 5]]

In [33]:
data_df.to_csv("../datasets/examples/PDBbind_data.tsv", sep = "\t", index = False)