# AllBinging.ipynb

In this notebook we load the file "BindingDB_All.tsv", which is a Ligand-Target-Affinity dataset with 2,713,006 measurements, 1,165,900 compounds, and 9,028 targets, downloaded from the BindingDB database (https://www.bindingdb.org/rwd/bind/index.jsp).

The main objective is to filter the ligands for NRAS and to obtain a dataset for training the ML model with ChemProp.

In [5]:
from rdkit import Chem
from rdkit.Chem import PandasTools
import pandas as pd


# Load the full BindingDB export (tab-separated) into a DataFrame.
# The location is relative to the notebook's working directory; adjust it
# here if the file lives somewhere else.
BINDINGDB_TSV = '../../../../../Downloads/BindingDB_All.tsv'

# on_bad_lines='warn' replaces the deprecated error_bad_lines=False
# (deprecated in pandas 1.3, removed in 2.0): rows whose field count does
# not match the header are skipped and reported instead of aborting the load.
df = pd.read_csv(BINDINGDB_TSV, sep='\t', on_bad_lines='warn')



  df = pd.read_csv('../../../../../Downloads/BindingDB_All.tsv', sep='\t', error_bad_lines=False)
b'Skipping line 1325152: expected 243 fields, saw 266\n'
b'Skipping line 2301014: expected 243 fields, saw 266\n'
b'Skipping line 2388235: expected 243 fields, saw 266\nSkipping line 2388236: expected 243 fields, saw 266\nSkipping line 2388237: expected 243 fields, saw 266\nSkipping line 2388238: expected 243 fields, saw 266\nSkipping line 2388239: expected 243 fields, saw 266\nSkipping line 2388240: expected 243 fields, saw 266\nSkipping line 2388241: expected 243 fields, saw 266\nSkipping line 2388242: expected 243 fields, saw 266\nSkipping line 2388243: expected 243 fields, saw 266\nSkipping line 2388244: expected 243 fields, saw 266\nSkipping line 2388245: expected 243 fields, saw 266\nSkipping line 2388246: expected 243 fields, saw 266\nSkipping line 2388247: expected 243 fields, saw 266\nSkipping line 2388248: expected 243 fields, saw 266\nSkipping line 2388249: expected 243 fields

In [4]:
# Number of rows that were read from the file
print(df.shape[0])

# Names of the fields available for every row
print(df.columns)

# Preview of the first ten rows
print(df.iloc[:10])

2695816
Index(['BindingDB Reactant_set_id', 'Ligand SMILES', 'Ligand InChI',
       'Ligand InChI Key', 'BindingDB MonomerID', 'BindingDB Ligand Name',
       'Target Name',
       'Target Source Organism According to Curator or DataSource', 'Ki (nM)',
       'IC50 (nM)',
       ...
       'Unnamed: 233', 'Unnamed: 234', 'Unnamed: 235', 'Unnamed: 236',
       'Unnamed: 237', 'Unnamed: 238', 'Unnamed: 239', 'Unnamed: 240',
       'Unnamed: 241', 'Unnamed: 242'],
      dtype='object', length=243)
   BindingDB Reactant_set_id  \
0                          2   
1                          3   
2                          4   
3                          5   
4                          6   
5                          7   
6                          8   
7                          9   
8                         10   
9                         11   

                                       Ligand SMILES  \
0  O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...   
1  O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc

## Filter NRAS ligands

The dataset is filtered by keeping only the rows whose "Target Name" contains the keyword "GTPase NRas".

In [18]:
# Literal text that identifies NRAS entries in the "Target Name" column
keyword = "GTPase NRas"

# regex=False: the keyword is plain text, so match it as a substring rather
# than compiling it as a regular expression.
# na=False: a row with a missing target name counts as "no match"; without
# it the mask would contain NaN and the boolean indexing below would raise
# a ValueError.
is_nras = df["Target Name"].str.contains(keyword, regex=False, na=False)
NRAS_df = df[is_nras]

# how many rows matched the keyword (1835 in the recorded run)
print(len(NRAS_df))

1835


In [24]:
# Preview the first ten rows of the NRAS subset
print(NRAS_df.iloc[:10])


         BindingDB Reactant_set_id  \
1119442                    1141734   
1119444                    1141736   
1119446                    1141738   
1119449                    1141741   
1119451                    1141743   
1119453                    1141745   
1119455                    1141747   
1119457                    1141749   
1119459                    1141751   
1119461                    1141753   

                                             Ligand SMILES  \
1119442    COc1cc2ncc(-c3cccc(NC4CCNC4)n3)n2cc1-c1cn[nH]c1   
1119444      COc1cc2ncc(-c3cccc(NC4CCNC4)n3)n2cc1-c1cccnc1   
1119446      COc1cc2ncc(-c3cccc(NC4CCNC4)n3)n2cc1-c1ccncc1   
1119449  Cc1n[nH]c(C)c1-c1cn2c(cnc2cc1CO)-c1cccc(NC2CCN...   
1119451  COc1cc2ncc(-c3cccc(NC4CCNC4)n3)n2cc1-c1cnn(CCN...   
1119453    COc1cc2ncc(-c3cccc(NC4CCNC4)n3)n2cc1-c1cc[nH]c1   
1119455   Cc1noc(C)c1-c1cn2c(cnc2cc1CO)-c1cccc(NC2CCNC2)n1   
1119457   COc1cc2ncc(-c3cccc(NC4CCCNC4)n3)n2cc1-c1cn[nH]c1   
1119459  CN(C1CCNC1)c1c

In [21]:
# Keep only the columns listed below and drop every other column
col_data = [
    'BindingDB Reactant_set_id',
    'Ligand SMILES',
    'Ligand InChI',
    'Ligand InChI Key',
    'BindingDB MonomerID',
    'BindingDB Ligand Name',
    'Target Name',
    'Target Source Organism According to Curator or DataSource',
    'IC50 (nM)',
]

# Column-only selection; the row index of NRAS_df is carried over unchanged
NRAS_2_df = NRAS_df.loc[:, col_data]
print(NRAS_2_df.head())

         BindingDB Reactant_set_id  \
1119442                    1141734   
1119444                    1141736   
1119446                    1141738   
1119449                    1141741   
1119451                    1141743   

                                             Ligand SMILES  \
1119442    COc1cc2ncc(-c3cccc(NC4CCNC4)n3)n2cc1-c1cn[nH]c1   
1119444      COc1cc2ncc(-c3cccc(NC4CCNC4)n3)n2cc1-c1cccnc1   
1119446      COc1cc2ncc(-c3cccc(NC4CCNC4)n3)n2cc1-c1ccncc1   
1119449  Cc1n[nH]c(C)c1-c1cn2c(cnc2cc1CO)-c1cccc(NC2CCN...   
1119451  COc1cc2ncc(-c3cccc(NC4CCNC4)n3)n2cc1-c1cnn(CCN...   

                                              Ligand InChI  \
1119442                                                NaN   
1119444  InChI=1S/C22H22N6O/c1-29-20-10-22-25-13-19(28(...   
1119446  InChI=1S/C22H22N6O/c1-29-20-11-22-25-13-19(28(...   
1119449                                                NaN   
1119451  InChI=1S/C26H32N8O2/c1-35-24-13-26-28-16-23(22...   

                    Ligan

In [23]:
# Write the reduced NRAS table to disk as CSV, without the row index column
NRAS_2_df.to_csv(
    path_or_buf='../Data/NRAS_ligands.csv',
    index=False,
)

In [7]:
# Build a dataset from the first 1000 rows of the full table, intended as
# the negative training set.
# NOTE(review): the rows are taken purely by position; nothing in this cell
# checks that they are inactive against NRAS -- confirm this is acceptable.
col_data = ['BindingDB Reactant_set_id', 'Ligand SMILES', 'Ligand InChI',
       'Ligand InChI Key', 'BindingDB MonomerID', 'BindingDB Ligand Name',
       'Target Name',
       'Target Source Organism According to Curator or DataSource',
       'IC50 (nM)']

# The sample gets its own name so that `df` keeps the complete table.
# Reassigning `df` here would make this cell non-repeatable and would make
# any cell re-run afterwards (e.g. the NRAS filter) see only 1000 rows.
negative_df = df.head(1000)[col_data]
print(negative_df.head(5))

negative_df.to_csv('../Data/BindingDB_1000.csv', index=False)

   BindingDB Reactant_set_id  \
0                          2   
1                          3   
2                          4   
3                          5   
4                          6   

                                       Ligand SMILES  \
0  O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...   
1  O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...   
2  OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...   
3  OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...   
4  CCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H](...   

                                        Ligand InChI  \
0  InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...   
1  InChI=1S/C29H34N4O3/c34-27-25(16-21-8-3-1-4-9-...   
2  InChI=1S/C29H40N2O4/c32-18-10-2-1-9-17-30-25(1...   
3  InChI=1S/C28H38N2O4/c31-17-9-3-8-16-29-24(18-2...   
4  InChI=1S/C27H36N2O3/c1-2-3-16-28-23(17-20-10-6...   

              Ligand InChI Key  BindingDB MonomerID  \
0  UZLMEAPBHYEHAC-UNTBESQGSA-N                   22   
1  HYNYUFZPPJMPOB-UTWJF