In [1]:
import pandas as pd
import rdkit
from rdkit.Chem import Draw
from rdkit import Chem
from rdkit import rdBase
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools
import os
from rdkit import RDConfig
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem.PandasTools import SaveXlsxFromFrame
from rdkit import RDLogger

In [2]:
df1= pd.read_csv('5k_pdbbind_db.csv', sep=',', dtype={"pdb_id": str})


In [3]:
df1['mol']=df1['isomeric_smiles'].map(Chem.MolFromSmiles)

In [4]:
df1['mol'].isna().sum()

0

In [5]:
def standardize(mol): 
    
    rdBase.DisableLog('rdApp.info')
    # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
    clean_mol = rdMolStandardize.Cleanup(mol) 
     
    # if many fragments, get the "parent" (the actual mol we are interested in) 
    parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)
         
    # try to neutralize molecule
    uncharger = rdMolStandardize.Uncharger() # annoying, but necessary as no convenience method exists
    uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)
    
    
    return uncharged_parent_clean_mol
 


In [6]:
df1['mol']=df1['mol'].apply(standardize)

In [7]:
csmi=[]
for m in df1['mol']:
    
    m=Chem.MolToSmiles(m, isomericSmiles=False)
    csmi.append(m)
df1['canonical_smiles']=csmi

In [8]:
df1['canonical_smiles'].value_counts()

Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)O)C(O)C1O                      31
CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1COC2OCCC12)S(=O)(=O)c1ccc(N)cc1    23
N=c1[nH]c(=O)c2ncn(C3OC(COP(=O)(O)OP(=O)(O)O)C(O)C3O)c2[nH]1         20
Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O)O)C(O)C1O             20
Nc1ncnc2c1ncn2C1OC(CSCCC(N)C(=O)O)C(O)C1O                            17
                                                                     ..
CC(C)OP(=O)(O)OC1C(O)OCC(O)C1O                                        1
Cc1c(N)nc(C2CC2)nc1Cl                                                 1
OCC1OC(OC2(COC3(CO)OC(CO)C(O)C3O)OC(CO)C(O)C2O)C(O)C(O)C1O            1
OCC1OC(OC2(CCl)OC(CCl)C(O)C2O)C(O)C(O)C1Cl                            1
Cc1nc(F)ccc1-c1cc(C[n+]2ccn(C)c2N)cc(C(=O)NCc2ccc(Cl)c(Cl)c2)c1       1
Name: canonical_smiles, Length: 4131, dtype: int64

In [9]:
df1['target'].value_counts()

HIV-1 PROTEASE                           303
CARBONIC ANHYDRASE 2                     291
HEAT SHOCK PROTEIN HSP90-ALPHA            94
BROMODOMAIN-CONTAINING PROTEIN 4          69
TRYPSIN                                   64
                                        ... 
METHIONINE GAMMA-LYASE                     1
FREQUENIN 2                                1
FUCOSE-BINDING LECTIN PROTEIN              1
CG5907-PA, ISOFORM A                       1
ANTI-CIGUATOXIN ANTIBODY, LIGHT CHAIN      1
Name: target, Length: 1412, dtype: int64

In [141]:
df2=pd.read_csv('5k_pdbbind_finaldb.csv', sep=',', dtype={"pdb_id": str})

In [142]:
df2.shape

(5298, 14)

In [143]:
df2= df2.drop(columns='index')

In [144]:
df2.columns

Index(['pdb_id', 'resolution', 'year_x', 'log_ki_kd', 'activity_type',
       'activity_units', 'activity_data', 'ligand_name', 'uniprot_id',
       'target', 'isomeric_smiles', 'ligand', 'canonical_smiles '],
      dtype='object')

df2=df2.drop(columns=['canonical_smiles '])

In [145]:
df2['mol']=df2['canonical_smiles '].map(Chem.MolFromSmiles)

csmi=[]
for m in df2['mol']:
    m=Chem.MolToSmiles(m, isomericSmiles=False)
    csmi.append(m)
df2['canonical_smiles']=csmi

In [147]:
df2['canonical_smiles '].value_counts()

CC(CN(S(=O)(=O)c1ccc(cc1)N)C[C@H]([C@H](Cc1ccccc1)NC(=O)O[C@H]1CO[C@@H]2[C@H]1CCO2)O)C           23
OC(=O)[C@H](CCSC[C@H]1O[C@H]([C@@H]([C@@H]1O)O)n1cnc2c1ncnc2N)[NH3+]                             17
NC(=N)NCCC[C@@H](C(=O)O)[NH3+]                                                                   16
O[C@@H]1[C@@H](CO[P@](=O)(OP(=O)(O)O)O)O[C@H]([C@@H]1O)n1cnc2c1ncnc2N                            16
O[C@@H]1[C@@H](CO[P@@](=O)(OP(=O)(O)O)O)O[C@H]([C@@H]1O)n1cnc2c1ncnc2N                           15
                                                                                                 ..
O[C@H]1C[NH2+]CC[C@@H]1CNc1cc(NCc2ccccc2)n2c(n1)c(cn2)C(C)C                                       1
OC(=O)[C@H]([NH3+])CS                                                                             1
SC[C@@H](Cc1ccccc1)NC(=O)CC(=O)O                                                                  1
CN(C(=O)C1=CC(C=N1)c1n[nH]cc1c1ccccc1)C                                                           1


In [148]:
import VS_filters
from VS_filters import *
import importlib
importlib.reload(VS_filters)

<module 'VS_filters' from 'C:\\Users\\Akhila\\Documents\\Dataprocessing on PDBbind_v2020\\VS_filters.py'>

In [149]:
df3=P_containing_molecules(df2)

In [150]:
df3.shape

(5298, 15)

In [151]:
df3_no_P=df3[df3['P_containing']==0]

In [152]:
df3_no_P.shape

(4631, 15)

In [153]:
df3_with_P=df3[df3['P_containing']>0]
df3_with_P.shape

(667, 15)

In [154]:
df4_ro5=df3_no_P['mol'].apply(ro5_property_estimation)

In [155]:
df4_ro5.shape

(4631, 6)

In [156]:
df5_merge=pd.concat([df3_no_P, df4_ro5], axis=1)
df5_merge.shape

(4631, 21)

In [157]:
df5_merge.reset_index(inplace=True)

In [158]:
df6_ro5_fulfilled=df5_merge[df5_merge['ro5_fulfilled']==True]

In [159]:
df6_ro5_fulfilled.shape

(1708, 22)

In [160]:
4631-1708

2923

In [161]:
df6_ro5_fulfilled.shape

(1708, 22)

In [162]:
df6_ro5_fulfilled=aliphatic_atom_count(df6_ro5_fulfilled)

df6_ro5_fulfilled['ali_N']=df6_ro5_fulfilled['mol'].apply(aliphatic_amino_count) 

df6_ro5_fulfilled=car_acids(df6_ro5_fulfilled)

df6_ro5_fulfilled=chiral_center_and_ringcount(df6_ro5_fulfilled)

df6_ro5_fulfilled=four_fusedring_count(df6_ro5_fulfilled)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Aliphatic_chain_len']=ali_c
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6_ro5_fulfilled['ali_N']=df6_ro5_fulfilled['mol'].apply(aliphatic_amino_count)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['carboxyl_group_count']=y
A value is trying to be set on a copy of a slice from a DataFra

In [163]:
df7= df6_ro5_fulfilled[(df6_ro5_fulfilled['ring_c'] > 0)& (df6_ro5_fulfilled['Aliphatic_chain_len']<6)& (df6_ro5_fulfilled['chiral_c'] <3)& (df6_ro5_fulfilled['count_four_fused_rings']<1)]

In [164]:
df7.shape #345  {-4, -24, -275, -14}

(1391, 28)

In [165]:
1708-1391

317

In [166]:
df8= df7[(df7['ali_N']<=2) & (df7['carboxyl_group_count']<=1)]

df8=df8.drop(df8.loc[(df7['carboxyl_group_count']==1)& (df8['ali_N'].isin([1,2]))].index)

In [167]:
df8.shape #56

(1334, 28)

In [168]:
df8.to_csv('PDB_bind_after_VS_filters.csv')

In [169]:
df8.columns

Index(['index', 'pdb_id', 'resolution', 'year_x', 'log_ki_kd', 'activity_type',
       'activity_units', 'activity_data', 'ligand_name', 'uniprot_id',
       'target', 'isomeric_smiles', 'ligand', 'canonical_smiles ', 'mol',
       'P_containing', 'Molecular_Weight', 'n_HBA', 'n_HBD', 'LogP',
       'Rot_bonds', 'ro5_fulfilled', 'Aliphatic_chain_len', 'ali_N',
       'carboxyl_group_count', 'chiral_c', 'ring_c', 'count_four_fused_rings'],
      dtype='object')

In [170]:
df8.groupby('uniprot_id')['canonical_smiles '].value_counts() 

uniprot_id  canonical_smiles                                                  
------      OC(=O)CCN(C(=O)c1ccc2c(c1)nc(n2C)CNc1ccc(cc1)C(=N)N)c1ccccn1          3
            O[C@@H]1CCC/C=C/c2cc(O)cc(c2C(=O)O[C@H](CCC1)C)O                      2
            C=CCNc1nc(SCc2ccc(cc2)Cl)nc2c1c1CC[N@@H+](Cc1s2)C                     1
            CC(=O)Nc1ccc2c(c1S(=O)(=O)O)cccc2S(=O)(=O)O                           1
            CCN1C(c2ccccc2)c2cc(N)ccc2-c2c1cc(N)cc2                               1
                                                                                 ..
Q9Z2X8      OC(=O)C[C@H](c1ccc2c(c1)nnn2C)c1ccc(c(c1)CN(S(=O)(=O)C)C)Cl           1
            OC(=O)C[C@H](c1ccc2c(c1)nnn2C)c1ccc(c(c1)CN(S(=O)(=O)c1ccccc1)C)Cl    1
            OC(=O)C[C@H](c1ccc2c(c1)nnn2C)c1ccc(cc1)Cl                            1
Q9ZLT0      CC(Cn1c2nn(c(c2c(=O)n(c1=O)C)c1ccncc1)Cc1cccc2c1cccc2)C               1
U6NCW5      NC(=O)Cn1c2C[C@@H](CCc2c2c1ccc(c2)Cl)C(=O)O                          

In [178]:
df9=df8.drop_duplicates(subset=['target', 'canonical_smiles '], keep='first') #final dataset********

In [179]:
df9.shape

(1299, 28)

In [180]:
df9.groupby('target')['canonical_smiles '].value_counts()  #unique target and ligand #1299

target                                         canonical_smiles                                                           
(3R)-HYDROXYACYL-ACP DEHYDRATASE SUBUNIT HADA  Oc1ccc(cc1)/C=C/C(=O)c1ccc(cc1O)O                                              1
                                               Oc1ccc2c(c1)O[C@@H]([C@H](C2=O)O)c1ccc(c(c1)O)O                                1
(3R)-HYDROXYMYRISTOYL-ACYL CARRIER PROTEIN     Cc1cc(O)c2c(c1)C(=O)c1c(C2=O)c(O)cc(c1)O                                       1
                                               O=C(c1cccnc1)N/N=C/c1cc(Br)c(c(c1O)Br)O                                        1
2',5'-PHOSPHODIESTERASE 12                     OC[C@H]1[C@@H](OCCN1C(=O)c1ccc2c(c1)n(C)c(n2)c1c[nH]c2c1ccc(c2)C#N)c1ccccc1    1
                                                                                                                             ..
WD REPEAT-CONTAINING PROTEIN 5                 OCc1c[nH]c(n1)CCCCNC(=O)OCc1ccccc1                            

In [182]:
df10=df9.loc[df9['uniprot_id'] != '------']

In [183]:
df10.shape #1286

(1286, 28)

In [None]:
PandasTools.SaveXlsxFromFrame(df10,'vs_fil_pdbbind_db_1286.xlsx', molCol='mol', size=(250,250))