In [1]:
import pandas as pd
import rdkit
from rdkit.Chem import Draw
from rdkit import Chem
from rdkit import rdBase
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools
import os
from rdkit import RDConfig
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem.PandasTools import SaveXlsxFromFrame

In [2]:
df= pd.read_csv('5k_pdbbind_finaldb2.csv', sep=',', dtype={"pdb_id": str})


In [3]:
df['mol']=df['isomeric_smiles'].map(Chem.MolFromSmiles)

In [4]:
def standardize(mol): 
    
     
    # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
    clean_mol = rdMolStandardize.Cleanup(mol) 
     
    # if many fragments, get the "parent" (the actual mol we are interested in) 
    parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)
         
    # try to neutralize molecule
    uncharger = rdMolStandardize.Uncharger() # annoying, but necessary as no convenience method exists
    uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)
    
    
    return uncharged_parent_clean_mol
 


In [8]:
def neutralize_atoms(mol):
    pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
    at_matches = mol.GetSubstructMatches(pattern)
    at_matches_list = [y[0] for y in at_matches]
    if len(at_matches_list) > 0:
        for at_idx in at_matches_list:
            atom = mol.GetAtomWithIdx(at_idx)
            chg = atom.GetFormalCharge()
            hcount = atom.GetTotalNumHs()
            atom.SetFormalCharge(0)
            atom.SetNumExplicitHs(hcount - chg)
            atom.UpdatePropertyCache()
    return mol



In [5]:
df['mol']=df['mol'].map(standardize)

[11:39:22] Initializing MetalDisconnector
[11:39:22] Running MetalDisconnector
[11:39:22] Initializing Normalizer
[11:39:22] Running Normalizer
[11:39:22] Initializing MetalDisconnector
[11:39:22] Running MetalDisconnector
[11:39:22] Initializing Normalizer
[11:39:22] Running Normalizer
[11:39:22] Running LargestFragmentChooser
[11:39:22] Fragment: C[NH+](C)CCCC[C@H]([NH3+])C(=O)O
[11:39:22] New largest fragment: C[NH+](C)CCCC[C@H]([NH3+])C(=O)O (32)
[11:39:22] Running Uncharger
[11:39:22] Removed positive charge.
[11:39:22] Initializing MetalDisconnector
[11:39:22] Running MetalDisconnector
[11:39:22] Initializing Normalizer
[11:39:22] Running Normalizer
[11:39:22] Initializing MetalDisconnector
[11:39:22] Running MetalDisconnector
[11:39:22] Initializing Normalizer
[11:39:22] Running Normalizer
[11:39:22] Running LargestFragmentChooser
[11:39:22] Fragment: O=P(O)(O)OC[C@H]1O[C@H](O[P@](=O)(O)OP(=O)(O)O)[C@H](O)[C@@H]1O
[11:39:22] New largest fragment: O=P(O)(O)OC[C@H]1O[C@H](O[P@](=O

AtomValenceException: Explicit valence for atom # 0 C, 5, is greater than permitted

In [10]:
df['mol']=df['mol'].map(neutralize_atoms)

In [11]:
csmi=[]
for m in df['mol']:
    
    m=Chem.MolToSmiles(m, isomericSmiles=False)
    csmi.append(m)
df['canonical_smiles']=csmi

In [12]:
df['canonical_smiles'].value_counts()

Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)O)C(O)C1O                      31
CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1COC2OCCC12)S(=O)(=O)c1ccc(N)cc1    23
N=c1[nH]c(=O)c2ncn(C3OC(COP(=O)(O)OP(=O)(O)O)C(O)C3O)c2[nH]1         20
Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O)O)C(O)C1O             20
Nc1ncnc2c1ncn2C1OC(CSCCC(N)C(=O)O)C(O)C1O                            17
                                                                     ..
CC(C)OP(=O)(O)OC1C(O)OCC(O)C1O                                        1
Cc1c(N)nc(C2CC2)nc1Cl                                                 1
OCC1OC(OC2(COC3(CO)OC(CO)C(O)C3O)OC(CO)C(O)C2O)C(O)C(O)C1O            1
OCC1OC(OC2(CCl)OC(CCl)C(O)C2O)C(O)C(O)C1Cl                            1
Cc1nc(F)ccc1-c1cc(C[n+]2ccn(C)c2N)cc(C(=O)NCc2ccc(Cl)c(Cl)c2)c1       1
Name: canonical_smiles, Length: 4131, dtype: int64

In [27]:
PandasTools.SaveXlsxFromFrame(df1,'pdbbind_db_after_data_prep.xlsx', molCol='mol', size=(250,250))

In [None]:
count.shape

In [13]:
df['target'].value_counts()

HIV-1 PROTEASE                           303
CARBONIC ANHYDRASE 2                     291
HEAT SHOCK PROTEIN HSP90-ALPHA            94
BROMODOMAIN-CONTAINING PROTEIN 4          69
TRYPSIN                                   64
                                        ... 
METHIONINE GAMMA-LYASE                     1
FREQUENIN 2                                1
FUCOSE-BINDING LECTIN PROTEIN              1
CG5907-PA, ISOFORM A                       1
ANTI-CIGUATOXIN ANTIBODY, LIGHT CHAIN      1
Name: target, Length: 1412, dtype: int64

In [14]:
df1= pd.read_csv('5k_pdbbind_finaldb.csv', sep=',')

In [15]:
df1.columns

Index(['index', 'pdb_id', 'resolution', 'year_x', 'log_ki_kd', 'activity_type',
       'activity_units', 'activity_data', 'ligand_name', 'uniprot_id',
       'target', 'isomeric_smiles', 'ligand', 'canonical_smiles '],
      dtype='object')

In [16]:
df1= df1.drop(columns='index')

In [17]:
df1.shape

(5298, 13)

In [18]:
df1.columns

Index(['pdb_id', 'resolution', 'year_x', 'log_ki_kd', 'activity_type',
       'activity_units', 'activity_data', 'ligand_name', 'uniprot_id',
       'target', 'isomeric_smiles', 'ligand', 'canonical_smiles '],
      dtype='object')

In [22]:
df1=df1.drop(columns=['canonical_smiles '])

In [23]:
df1['mol']=df1['isomeric_smiles'].map(Chem.MolFromSmiles)

In [24]:
df1['mol']=df1['mol'].map(standardize)

[11:48:54] Initializing MetalDisconnector
[11:48:54] Running MetalDisconnector
[11:48:54] Initializing Normalizer
[11:48:54] Running Normalizer
[11:48:54] Initializing MetalDisconnector
[11:48:54] Running MetalDisconnector
[11:48:54] Initializing Normalizer
[11:48:54] Running Normalizer
[11:48:54] Running LargestFragmentChooser
[11:48:54] Fragment: C[NH+](C)CCCC[C@H]([NH3+])C(=O)O
[11:48:54] New largest fragment: C[NH+](C)CCCC[C@H]([NH3+])C(=O)O (32)
[11:48:54] Running Uncharger
[11:48:54] Removed positive charge.
[11:48:54] Initializing MetalDisconnector
[11:48:54] Running MetalDisconnector
[11:48:54] Initializing Normalizer
[11:48:54] Running Normalizer
[11:48:54] Initializing MetalDisconnector
[11:48:54] Running MetalDisconnector
[11:48:54] Initializing Normalizer
[11:48:54] Running Normalizer
[11:48:54] Running LargestFragmentChooser
[11:48:54] Fragment: O=P(O)(O)OC[C@H]1O[C@H](O[P@](=O)(O)OP(=O)(O)O)[C@H](O)[C@@H]1O
[11:48:54] New largest fragment: O=P(O)(O)OC[C@H]1O[C@H](O[P@](=O

AtomValenceException: Explicit valence for atom # 0 C, 5, is greater than permitted

In [25]:
csmi=[]
for m in df1['mol']:
    
    m=Chem.MolToSmiles(m, isomericSmiles=False)
    csmi.append(m)
df1['canonical_smiles']=csmi

In [26]:
df1['canonical_smiles'].value_counts()

Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)O)C(O)C1O                                                                                         31
CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1COC2OCCC12)S(=O)(=O)c1ccc(N)cc1                                                                       23
N=c1[nH]c(=O)c2ncn(C3OC(COP(=O)(O)OP(=O)(O)O)C(O)C3O)c2[nH]1                                                                            20
Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O)O)C(O)C1O                                                                                20
Nc1ncnc2c1ncn2C1OC(CSCCC([NH3+])C(=O)O)C(O)C1O                                                                                          17
                                                                                                                                        ..
CC[N+](C)(CC)CCCN1c2cc(N)ccc2-c2ccc(N)cc2C1c1ccccc1                                                                                      1
O=C(O)C(Cc1ccccc1)N(Cc1cccc

In [59]:
df1['mol'].isna().sum()

0

In [60]:
df1.head()

Unnamed: 0,pdb_id,resolution,year_x,log_ki_kd,activity_type,activity_units,activity_data,ligand_name,uniprot_id,target,isomeric_smiles,ligand,canonical_smiles,mol
0,2r58,2.0,2007,2.0,kd,mM,10.0,MLY,Q9VHA0,POLYCOMB PROTEIN SCM,[NH3+][C@@H](CCCC[NH+](C)C)C(=O)O,2r58_ligand,C[NH+](CCCC[C@@H](C(=O)O)[NH3+])C,<rdkit.Chem.rdchem.Mol object at 0x00000176A1A...
1,3c2f,2.35,2008,2.0,kd,mM,10.1,PRP,P43619,NICOTINATE-NUCLEOTIDE PYROPHOSPHORYLASE,[C@H]1([C@@H]([C@@H]([C@@H](COP(=O)(O)O)O1)O)O...,3c2f_ligand,O[C@H]1[C@H](O[C@@H]([C@H]1O)COP(=O)(O)O)O[P@@...,<rdkit.Chem.rdchem.Mol object at 0x00000176A1A...
2,3g2y,1.31,2009,2.0,ki,mM,10.0,GF4,Q9L5C8,BETA-LACTAMASE CTX-M-9A,O=c1c(CC)c(C)[nH]n1c1n[nH]nn1,3g2y_ligand,CCc1c(C)[nH]n(c1=O)c1n[nH]nn1,<rdkit.Chem.rdchem.Mol object at 0x00000176A1A...
3,3pce,2.06,1998,2.0,ki,mM,10.0,3HP,P00436,"PROTOCATECHUATE 3,4-DIOXYGENASE",C(=O)(O)Cc1cc(ccc1)O,3pce_ligand,OC(=O)Cc1cccc(c1)O,<rdkit.Chem.rdchem.Mol object at 0x00000176A1A...
4,4qsu,1.9,2014,2.0,kd,mM,10.0,TDR,Q6PL18,ATPASE FAMILY AAA DOMAIN-CONTAINING PROTEIN 2,N1=CC(C(=O)NC1=O)C,4qsu_ligand,CC1C=NC(=O)NC1=O,<rdkit.Chem.rdchem.Mol object at 0x00000176A1A...


In [61]:
df1['uniprot_id'].value_counts()

P00918    291
------    131
P00760    100
P07900     93
P00734     83
         ... 
P0A794      1
Q9BX68      1
Q9UUB1      1
Q9WAF5      1
P04391      1
Name: uniprot_id, Length: 1502, dtype: int64

In [62]:
df1['target'].value_counts()

HIV-1 PROTEASE                                     303
CARBONIC ANHYDRASE 2                               290
HEAT SHOCK PROTEIN HSP90-ALPHA                      94
BROMODOMAIN-CONTAINING PROTEIN 4                    69
TRYPSIN                                             64
                                                  ... 
16S RRNA (ADENINE(1408)-N(1))-METHYLTRANSFERASE      1
METHIONINE GAMMA-LYASE                               1
FREQUENIN 2                                          1
CG5907-PA, ISOFORM A                                 1
ORNITHINE TRANSCARBAMOYLASE                          1
Name: target, Length: 1409, dtype: int64

In [63]:
df1.groupby('target')['uniprot_id'].value_counts()

target                                         uniprot_id
(+)-LIMONENE SYNTHASE                          ------        1
(3R)-HYDROXYACYL-ACP DEHYDRATASE SUBUNIT HADA  I6Y8B9        3
(3R)-HYDROXYMYRISTOYL-ACYL CARRIER PROTEIN     Q5G940        4
0PV-C.01 ANTIBODY FAB HEAVY CHAIN              ------        1
1,3-BETA-GLUCANOSYLTRANSFERASE GAS2            Q06135        7
                                                            ..
XYLULOSE KINASE                                O75191        1
YROSINE-PROTEIN KINASE BTK                     Q06187        1
YTH DOMAIN-CONTAINING FAMILY PROTEIN 2         Q9Y5A9        1
YUAA PROTEIN                                   O32080        2
ZEBAVIDIN                                      E7F650        1
Name: uniprot_id, Length: 1693, dtype: int64

In [64]:
df1.groupby('target')['canonical_smiles '].value_counts()

target                                         canonical_smiles                                                                      
(+)-LIMONENE SYNTHASE                          F/C(=C(/CCC=C(C)C)\C)/CO[P@](=O)(OP(=O)(O)O)O                                             1
(3R)-HYDROXYACYL-ACP DEHYDRATASE SUBUNIT HADA  Oc1ccc(c(c1)O)[C@@H](/C=C/c1ccc(c(c1)O)O)O                                                1
                                               Oc1ccc(cc1)/C=C/C(=O)c1ccc(cc1O)O                                                         1
                                               Oc1ccc2c(c1)O[C@@H]([C@H](C2=O)O)c1ccc(c(c1)O)O                                           1
(3R)-HYDROXYMYRISTOYL-ACYL CARRIER PROTEIN     Cc1cc(O)c2c(c1)C(=O)c1c(C2=O)c(O)cc(c1)O                                                  1
                                                                                                                                        ..
YROSINE-PROTEIN KINASE BTK      

In [65]:
df1['canonical_smiles '].value_counts()

CC(CN(S(=O)(=O)c1ccc(cc1)N)C[C@H]([C@H](Cc1ccccc1)NC(=O)O[C@H]1CO[C@@H]2[C@H]1CCO2)O)C           23
OC(=O)[C@H](CCSC[C@H]1O[C@H]([C@@H]([C@@H]1O)O)n1cnc2c1ncnc2N)[NH3+]                             17
NC(=N)NCCC[C@@H](C(=O)O)[NH3+]                                                                   16
O[C@@H]1[C@@H](CO[P@](=O)(OP(=O)(O)O)O)O[C@H]([C@@H]1O)n1cnc2c1ncnc2N                            16
O[C@@H]1[C@@H](CO[P@@](=O)(OP(=O)(O)O)O)O[C@H]([C@@H]1O)n1cnc2c1ncnc2N                           15
                                                                                                 ..
O[C@H]1C[NH2+]CC[C@@H]1CNc1cc(NCc2ccccc2)n2c(n1)c(cn2)C(C)C                                       1
OC(=O)[C@H]([NH3+])CS                                                                             1
SC[C@@H](Cc1ccccc1)NC(=O)CC(=O)O                                                                  1
CN(C(=O)C1=CC(C=N1)c1n[nH]cc1c1ccccc1)C                                                           1


In [66]:
from rdkit.Chem.PandasTools import SaveXlsxFromFrame

In [89]:
import VS_filters
from VS_filters import *

In [90]:
import importlib
importlib.reload(VS_filters)

<module 'VS_filters' from 'C:\\Users\\Akhila\\Desktop\\PDBbind\\pdbbind_article\\VS_filters.py'>

In [91]:
df2=P_containing_molecules(df1)

In [92]:
df2.shape

(5298, 15)

In [93]:
df3_no_P=df2[df2['P_containing']==0]

In [94]:
df3_no_P.shape

(4631, 15)

In [95]:
df4=df2[df2['P_containing']>0]
df4.shape

(667, 15)

In [96]:
df5_ro5=df3_no_P['mol'].apply(ro5_property_estimation)

In [97]:
df5_ro5.shape

(4631, 6)

In [98]:
df6_merge=pd.concat([df3_no_P, df5_ro5], axis=1)
df6_merge.shape

(4631, 21)

In [99]:
df6_merge.reset_index(inplace=True)

In [100]:
df7_ro5_fulfilled=df6_merge[df6_merge['ro5_fulfilled']==True]

In [101]:
4631-1708

2923

In [102]:
df7_ro5_fulfilled.shape

(1708, 22)

In [103]:
df8_ro5_not_fulfilled=df6_merge[df6_merge['ro5_fulfilled']==False]
df8_ro5_not_fulfilled.shape

(2923, 22)

In [104]:
df7_ro5_fulfilled['ali_N']=df7_ro5_fulfilled['mol'].apply(aliphatic_amino_count) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df7_ro5_fulfilled['ali_N']=df7_ro5_fulfilled['mol'].apply(aliphatic_amino_count)


In [105]:
df7_ro5_fulfilled=car_acids(df7_ro5_fulfilled)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for i in df['mol']:


In [106]:
df7_ro5_fulfilled=aliphatic_atom_count(df7_ro5_fulfilled)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result=connect_aa(df)


In [107]:
df7_ro5_fulfilled=chiral_center_and_ringcount(df7_ro5_fulfilled)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  z.append(num_rings)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [108]:
df7_ro5_fulfilled=four_fusedring_count(df7_ro5_fulfilled)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for mol in df['mol']:


In [109]:
df8= df7_ro5_fulfilled[(df7_ro5_fulfilled['ring_c'] > 0)& (df7_ro5_fulfilled['Aliphatic_chain_len']<6)& (df7_ro5_fulfilled['chiral_c'] <3)& (df7_ro5_fulfilled['count_four_fused_rings']<1)]

In [110]:
df8.shape #345  {-4, -24, -275, -14}

(1391, 28)

In [111]:
1708-1391

317

In [112]:
df9= df8[(df8['ali_N']<=2) & (df8['carboxyl_group_count']<=1)]

In [113]:
df9=df9.drop(df9.loc[(df9['carboxyl_group_count']==1)& (df9['ali_N'].isin([1,2]))].index)

In [114]:
df9.shape #56

(1334, 28)

PandasTools.SaveXlsxFromFrame(df9,'/Users/amfmf/Downloads/vs_fil_pdbbind_db.xlsx', molCol='mol', size=(250,250))

df9.to_csv('df9.csv')#1334 is the final count without dropping duplicates

PandasTools.SaveXlsxFromFrame(df9,'/Users/amfmf/Downloads/pdbbind_vsfilters1.xlsx', molCol='mol', size=(250,250))

In [115]:
imat=df9[df9['pdb_id']=='3gvu']

In [116]:
imat

Unnamed: 0,index,pdb_id,resolution,year_x,log_ki_kd,activity_type,activity_units,activity_data,ligand_name,uniprot_id,...,n_HBD,LogP,Rot_bonds,ro5_fulfilled,ali_N,carboxyl_group_count,Aliphatic_chain_len,chiral_c,ring_c,count_four_fused_rings
3580,4194,3gvu,2.05,2009,8.0,kd,nM,10.0,STI,P42684,...,4,1.75612,7,True,2,0,0,0,5,0


In [117]:
df9['target'].value_counts() #unique targets #362

CARBONIC ANHYDRASE 2                109
HEAT SHOCK PROTEIN HSP90-ALPHA       77
BROMODOMAIN-CONTAINING PROTEIN 4     57
COAGULATION FACTOR XA                32
TRANSTHYRETIN                        27
                                   ... 
PIRIN                                 1
ATROLYSIN C                           1
BIFUNCTIONAL PROTEIN GLMU             1
TOLL-LIKE RECEPTOR 7                  1
ANTI DABIGATRAN FAB                   1
Name: target, Length: 362, dtype: int64

In [118]:
df9['uniprot_id'].value_counts() #unique uniprot; few targets had multiple uniprots therefore uniprot count is more than target count.

P00918    109
P07900     76
O60885     56
P00742     44
P00760     41
         ... 
P49336      1
Q84MC7      1
O00625      1
Q2VPJ6      1
P10828      1
Name: uniprot_id, Length: 367, dtype: int64

In [119]:
df10=df9['canonical_smiles '].drop_duplicates().reset_index()

In [120]:
df10.shape #unique canonical smiles #1229

(1229, 2)

In [121]:
1334-1229 #105 duplicate smiles

105

In [122]:
df9.groupby('uniprot_id')['canonical_smiles '].value_counts() 

uniprot_id  canonical_smiles                                                  
------      OC(=O)CCN(C(=O)c1ccc2c(c1)nc(n2C)CNc1ccc(cc1)C(=N)N)c1ccccn1          3
            O[C@@H]1CCC/C=C/c2cc(O)cc(c2C(=O)O[C@H](CCC1)C)O                      2
            C=CCNc1nc(SCc2ccc(cc2)Cl)nc2c1c1CC[N@@H+](Cc1s2)C                     1
            CC(=O)Nc1ccc2c(c1S(=O)(=O)O)cccc2S(=O)(=O)O                           1
            CCN1C(c2ccccc2)c2cc(N)ccc2-c2c1cc(N)cc2                               1
                                                                                 ..
Q9Z2X8      OC(=O)C[C@H](c1ccc2c(c1)nnn2C)c1ccc(c(c1)CN(S(=O)(=O)C)C)Cl           1
            OC(=O)C[C@H](c1ccc2c(c1)nnn2C)c1ccc(c(c1)CN(S(=O)(=O)c1ccccc1)C)Cl    1
            OC(=O)C[C@H](c1ccc2c(c1)nnn2C)c1ccc(cc1)Cl                            1
Q9ZLT0      CC(Cn1c2nn(c(c2c(=O)n(c1=O)C)c1ccncc1)Cc1cccc2c1cccc2)C               1
U6NCW5      NC(=O)Cn1c2C[C@@H](CCc2c2c1ccc(c2)Cl)C(=O)O                          

In [123]:
df11=df9.drop_duplicates(subset=['uniprot_id', 'canonical_smiles '], keep='first')

In [124]:
df11.shape

(1300, 28)

In [125]:
df12=df9.drop_duplicates(subset=['target', 'canonical_smiles '], keep='first') #final dataset********

In [126]:
df12.shape

(1299, 28)

PandasTools.SaveXlsxFromFrame(df12,'/Users/amfmf/Downloads/vs_fil_pdbbind_db_1299.xlsx', molCol='mol', size=(250,250))

In [127]:
df9.groupby('target')['canonical_smiles '].value_counts()  #unique target and ligand #1299

target                                         canonical_smiles                                                           
(3R)-HYDROXYACYL-ACP DEHYDRATASE SUBUNIT HADA  Oc1ccc(cc1)/C=C/C(=O)c1ccc(cc1O)O                                              1
                                               Oc1ccc2c(c1)O[C@@H]([C@H](C2=O)O)c1ccc(c(c1)O)O                                1
(3R)-HYDROXYMYRISTOYL-ACYL CARRIER PROTEIN     Cc1cc(O)c2c(c1)C(=O)c1c(C2=O)c(O)cc(c1)O                                       1
                                               O=C(c1cccnc1)N/N=C/c1cc(Br)c(c(c1O)Br)O                                        1
2',5'-PHOSPHODIESTERASE 12                     OC[C@H]1[C@@H](OCCN1C(=O)c1ccc2c(c1)n(C)c(n2)c1c[nH]c2c1ccc(c2)C#N)c1ccccc1    1
                                                                                                                             ..
WD REPEAT-CONTAINING PROTEIN 5                 OCc1c[nH]c(n1)CCCCNC(=O)OCc1ccccc1                            

In [128]:
x=df12.loc[df12['uniprot_id']=='------']

In [129]:
df12['target'].value_counts()

CARBONIC ANHYDRASE 2                101
HEAT SHOCK PROTEIN HSP90-ALPHA       72
BROMODOMAIN-CONTAINING PROTEIN 4     56
COAGULATION FACTOR XA                32
CASEIN KINASE II, ALPHA SUBUNIT      26
                                   ... 
ATROLYSIN C                           1
BIFUNCTIONAL PROTEIN GLMU             1
TOLL-LIKE RECEPTOR 7                  1
LACTONASE FOR PROTEIN                 1
ANTI DABIGATRAN FAB                   1
Name: target, Length: 362, dtype: int64

In [130]:
df13=df12.loc[df12['uniprot_id'] != '------']

In [131]:
df13.shape

(1286, 28)

PandasTools.SaveXlsxFromFrame(df13,'vs_fil_pdbbind_db_1286.xlsx', molCol='mol', size=(250,250))

In [132]:
df13.reset_index(inplace=True)

In [133]:
df13.shape

(1286, 29)

In [134]:
df13.groupby('uniprot_id')['target'].value_counts() 

uniprot_id  target                                     
A2RI36      TRANSCRIPTIONAL REGULATOR, PADR-LIKE FAMILY    1
A4GRE3      MACROPHAGE MIGRATION INHIBITORY FACTOR         1
A5GZX3      GLYOXALASE I                                   1
A5H660      HISTONE DEACETYLASE                            4
A5K1A2      GLYCYLPEPTIDE N-TETRADECANOYLTRANSFERASE       2
                                                          ..
Q9Y5S2      SERINE/THREONINE-PROTEIN KINASE MRCK BETA      1
Q9Y657      SPINDLIN-1                                     2
Q9Z2X8      KELCH-LIKE ECH-ASSOCIATED PROTEIN 1            4
Q9ZLT0      GLUTAMATE RACEMASE                             1
U6NCW5      DNA POLYMERASE III SUBUNIT BETA                1
Name: target, Length: 389, dtype: int64

In [135]:
df13.head()

Unnamed: 0,level_0,index,pdb_id,resolution,year_x,log_ki_kd,activity_type,activity_units,activity_data,ligand_name,...,n_HBD,LogP,Rot_bonds,ro5_fulfilled,ali_N,carboxyl_group_count,Aliphatic_chain_len,chiral_c,ring_c,count_four_fused_rings
0,26,32,3ao5,1.8,2011,2.23,kd,mM,5.9,BBY,...,1,2.1605,1,True,0,0,0,0,3,0
1,40,52,5aol,1.5,2015,2.32,kd,uM,4800.0,UFV,...,2,2.6323,0,True,0,0,0,0,1,0
2,102,124,5er4,1.81,2016,2.76,kd,uM,1752.0,5RL,...,2,3.3965,0,True,0,0,0,0,4,0
3,109,132,5fbi,1.47,2016,2.8,kd,uM,1600.0,5WD,...,3,2.544,5,True,0,1,2,0,3,0
4,117,141,3zt2,1.7,2012,2.84,kd,uM,1435.0,ZT2,...,1,2.7793,2,True,0,1,0,0,4,0


In [136]:
df13['target'].value_counts()            #350

CARBONIC ANHYDRASE 2                       101
HEAT SHOCK PROTEIN HSP90-ALPHA              72
BROMODOMAIN-CONTAINING PROTEIN 4            56
COAGULATION FACTOR XA                       32
CASEIN KINASE II, ALPHA SUBUNIT             26
                                          ... 
ABSCISIC ACID RECEPTOR PYL9                  1
PIRIN                                        1
N-METHYL-D-ASPARTATE RECEPTOR SUBUNIT 1      1
ATROLYSIN C                                  1
THYROID HORMONE RECEPTOR BETA-1              1
Name: target, Length: 350, dtype: int64

In [137]:
df13['canonical_smiles '].value_counts() #1286-1219=67 smiles repeated 

OS(=O)(=O)c1cccc2c1c(ccc2)Nc1ccccc1                                      4
NC(=N)c1sc2c(c1)c(I)ccc2                                                 3
CCOC(=O)CO/N=C(\C1(CC1)c1ccc2c(c1)nc(n2C)CNc1ccc(cc1)C(=N)N)/c1ccccn1    3
O=C(c1ccc(c(c1)Cl)S(=O)(=O)N)CSc1ncccn1                                  3
Oc1ccccc1c1nc2c([nH]1)cc(cc2)C(=N)N                                      3
                                                                        ..
CCc1cc(c(cc1O)O)C1=C(C(N=N1)C)c1ccccc1F                                  1
N#Cc1cc(OCCN2C[C@@](CC2=O)(C)c2ccccc2)ccc1F                              1
N=C1N[C@](C(=O)N1C)(c1ccccc1)c1cccc(c1)c1cccnc1                          1
SC[C@H](C(=O)N[C@H](C(=O)O)Cc1ccc(cc1)OC)CC(C)C                          1
FC([C@]([C@@H](C(=O)N=O)NC(=O)c1ccc(cc1)C#CC#Cc1ccc(cc1)N)(O)C)F         1
Name: canonical_smiles , Length: 1219, dtype: int64

In [138]:
df12[df12['canonical_smiles ']=='O=C(c1ccc(c(c1)Cl)S(=O)(=O)N)CSc1ncccn1']['target']

1909    CARBONIC ANHYDRASE 12
3421    CARBONIC ANHYDRASE 13
3533     CARBONIC ANHYDRASE 2
Name: target, dtype: object

In [139]:
df31=pd.read_excel('frase_id.xlsx')

In [140]:
df31.shape

(1139, 1)

In [141]:
df14=df13[df13['pdb_id'].isin(df31['FRASE_ID'])]

In [142]:
df14.shape

(1139, 29)

In [143]:
df14.head()

Unnamed: 0,level_0,index,pdb_id,resolution,year_x,log_ki_kd,activity_type,activity_units,activity_data,ligand_name,...,n_HBD,LogP,Rot_bonds,ro5_fulfilled,ali_N,carboxyl_group_count,Aliphatic_chain_len,chiral_c,ring_c,count_four_fused_rings
0,26,32,3ao5,1.8,2011,2.23,kd,mM,5.9,BBY,...,1,2.1605,1,True,0,0,0,0,3,0
1,40,52,5aol,1.5,2015,2.32,kd,uM,4800.0,UFV,...,2,2.6323,0,True,0,0,0,0,1,0
2,102,124,5er4,1.81,2016,2.76,kd,uM,1752.0,5RL,...,2,3.3965,0,True,0,0,0,0,4,0
3,109,132,5fbi,1.47,2016,2.8,kd,uM,1600.0,5WD,...,3,2.544,5,True,0,1,2,0,3,0
5,120,144,3zt3,1.95,2012,2.86,kd,uM,1375.0,ZT4,...,2,2.5711,2,True,0,1,0,1,4,0
