<a href="https://colab.research.google.com/github/ENSB-Projects/DrugDiscovery/blob/main/PFE_2022/Retrieving_bioactivities_for_DUDE_BACE_Inhibitors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1 : Install the ChEMBL Webresource Client

In [None]:
! pip install chembl_webresource_client

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.8-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 3.9 MB/s 
Collecting requests-cache~=0.7.0
  Downloading requests_cache-0.7.5-py3-none-any.whl (39 kB)
Collecting attrs<22.0,>=21.2
  Downloading attrs-21.4.0-py2.py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 8.8 MB/s 
[?25hCollecting url-normalize<2.0,>=1.4
  Downloading url_normalize-1.4.3-py2.py3-none-any.whl (6.8 kB)
Collecting pyyaml>=5.4
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 57.5 MB/s 
[?25hCollecting itsdangerous>=2.0.1
  Downloading itsdangerous-2.1.2-py3-none-any.whl (15 kB)
Installing collected packages: url-normalize, pyyaml, itsdangerous, attrs, reques

# Step 2 : Read the CSV file

In [None]:
# Third-party libraries used throughout the notebook
from chembl_webresource_client.new_client import new_client
import pandas as pd

# Switch on google.colab's data_table formatter for DataFrame output
from google.colab import data_table
data_table.enable_dataframe_formatter()

# Load the input CSV into a DataFrame and show it
df = pd.read_csv(filepath_or_buffer='Active_DUDE_BACE_inhibitors.csv')
df

Unnamed: 0.1,Unnamed: 0,ID,smiles
0,0,CHEMBL253799,C=CC[NH2+]C[C@@H](O)[C@H](Cc1ccccc1)NC(=O)c1cc...
1,1,CHEMBL272718,CCCOc1ccc(-c2ccc(-c3ccccc3Cl)n2CC(=O)/N=C(/N)N...
2,2,CHEMBL272718,CCCOc1ccc(-c2ccc(-c3ccccc3Cl)n2CC(=O)/N=C(\N)N...
3,3,CHEMBL272719,CCCOc1ccc(-c2ccc(-c3ccccc3Cl)n2CC(=O)/N=C(/N)N...
4,4,CHEMBL272719,CCCOc1ccc(-c2ccc(-c3ccccc3Cl)n2CC(=O)/N=C(\N)N...
...,...,...,...
480,480,CHEMBL190010,CSCC[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)C(=O)N[C@...
481,481,CHEMBL582817,CC(=O)N[C@@H](Cc1cc(F)cc(F)c1)[C@H](O)C[NH2+][...
482,482,CHEMBL582817,CC(=O)N[C@@H](Cc1cc(F)cc(F)c1)[C@H](O)CN[C@@]1...
483,483,CHEMBL1209106,CC(=O)N[C@@H](Cc1cc(F)cc(F)c1)[C@H](O)C[NH2+]C...


## Checking whether all SMILES are unique

In [None]:
# Number of distinct values in the `smiles` column (a missing value, if any, counts as one)
df['smiles'].nunique(dropna=False)

485

# Step 3 : Retrieve all bioactivities for Active_DUDE_BACE_inhibitors from ChEMBL

In [None]:
# Query the ChEMBL activity endpoint once per entry of df['ID'] and stack the answers.
# Each query keeps only records with standard_type 'IC50', target 'CHEMBL4822'
# and a non-null pchembl_value.
activities = new_client.activity  # loop-invariant, so it is looked up once

frames = []
for chembl_id in df['ID']:  # named chembl_id so the built-in id() is not shadowed
  result = activities.filter(molecule_chembl_id = chembl_id, standard_type = 'IC50', target_chembl_id ='CHEMBL4822', pchembl_value__isnull=False).only(['molecule_chembl_id','canonical_smiles','pchembl_value'])
  df_result = pd.DataFrame.from_dict(result)
  # Molecules without a matching record contribute no rows; skip their empty frames
  if not df_result.empty:
    frames.append(df_result)

# Concatenate once at the end: DataFrame.append re-copied the whole frame on every
# iteration and no longer exists in pandas >= 2.0. Per-query row labels are kept,
# exactly as append did.
df_from_DUDE = pd.concat(frames) if frames else pd.DataFrame()
df_from_DUDE

Unnamed: 0,canonical_smiles,molecule_chembl_id,pchembl_value,value
0,C=CCNC[C@@H](O)[C@H](Cc1ccccc1)NC(=O)c1cc(C(=O...,CHEMBL253799,6.65,223.0
0,CCCOc1ccc(-c2ccc(-c3ccccc3Cl)n2CC(=O)/N=C(/N)N...,CHEMBL272718,6.57,0.269
0,CCCOc1ccc(-c2ccc(-c3ccccc3Cl)n2CC(=O)/N=C(/N)N...,CHEMBL272718,6.57,0.269
0,CCCOc1ccc(-c2ccc(-c3ccccc3Cl)n2CC(=O)/N=C(/N)N...,CHEMBL272719,6.62,0.241
0,CCCOc1ccc(-c2ccc(-c3ccccc3Cl)n2CC(=O)/N=C(/N)N...,CHEMBL272719,6.62,0.241
...,...,...,...,...
0,CC(=O)N[C@@H](Cc1cc(F)cc(F)c1)[C@H](O)CN[C@@]1...,CHEMBL582817,7.07,85.0
0,CC(=O)N[C@@H](Cc1cc(F)cc(F)c1)[C@H](O)CNC1(c2c...,CHEMBL1209106,7.06,88.0
1,CC(=O)N[C@@H](Cc1cc(F)cc(F)c1)[C@H](O)CNC1(c2c...,CHEMBL1209106,7.06,88.0
0,CC(=O)N[C@@H](Cc1cc(F)cc(F)c1)[C@H](O)CNC1(c2c...,CHEMBL1209106,7.06,88.0


# Step 4 : Drop duplicate rows in df_from_DUDE DataFrame
## 1. First, we drop rows that are exact duplicates (identical values in every column)

In [None]:
# Remove rows that repeat an earlier row in every column and renumber the
# index 0..n-1, both in place on df_from_DUDE
df_from_DUDE.drop_duplicates(inplace=True, ignore_index=True)
df_from_DUDE

Unnamed: 0,canonical_smiles,molecule_chembl_id,pchembl_value,value
0,C=CCNC[C@@H](O)[C@H](Cc1ccccc1)NC(=O)c1cc(C(=O...,CHEMBL253799,6.65,223.0
1,CCCOc1ccc(-c2ccc(-c3ccccc3Cl)n2CC(=O)/N=C(/N)N...,CHEMBL272718,6.57,0.269
2,CCCOc1ccc(-c2ccc(-c3ccccc3Cl)n2CC(=O)/N=C(/N)N...,CHEMBL272719,6.62,0.241
3,CC(=O)N[C@@H](Cc1cc(F)cc(F)c1)[C@H](O)CNC1(c2c...,CHEMBL1209054,6.43,370.0
4,CCn1cc2c3c(cc(C(=O)N[C@@H](Cc4ccccc4)[C@H](O)C...,CHEMBL564530,8.52,0.003
...,...,...,...,...
368,C[C@@H](NC(=O)c1cc(C(=O)N[C@@H](Cc2ccccc2)[C@H...,CHEMBL474057,7.25,56.0
369,Cc1cc2c(C(=O)N[C@@H](Cc3ccccc3)[C@H](O)CNC3CC3...,CHEMBL463865,6.00,998.0
370,CSCC[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)C(=O)N[C@...,CHEMBL190010,7.22,0.06
371,CC(=O)N[C@@H](Cc1cc(F)cc(F)c1)[C@H](O)CN[C@@]1...,CHEMBL582817,7.07,85.0


## 2. For molecules with several records, keep only the record with the best (highest) pIC50

In [None]:
# For every molecule_chembl_id keep exactly one row: the one with the highest pchembl_value.
# The previous pairwise loop collected the index of the HIGHER value of each adjacent
# pair and then dropped those rows, so the lowest value survived; it also only looked
# at neighbouring rows, so groups of three or more could keep duplicates.

# Work on a 0..n-1 index so row labels are unique regardless of the incoming index.
records = df_from_DUDE.reset_index(drop=True)

if records.empty:
  # Nothing to de-duplicate (and the expected columns may not exist)
  df_from_DUDE_cleaned = records
else:
  # Rank on a numeric copy: if pchembl_value is stored as text, '10.00' < '9.50'
  # under string comparison. The column itself is left untouched.
  pchembl = pd.to_numeric(records['pchembl_value'], errors='coerce')
  # Stable descending sort; values that cannot be parsed are ranked last.
  ranked = pchembl.sort_values(ascending=False, kind='mergesort', na_position='last').index
  # First occurrence per molecule in ranked order is its best record.
  best = records.loc[ranked].drop_duplicates(subset='molecule_chembl_id', keep='first')
  # Put the survivors back in their original order and renumber.
  df_from_DUDE_cleaned = best.sort_index().reset_index(drop=True)

df_from_DUDE_cleaned

Unnamed: 0,canonical_smiles,molecule_chembl_id,pchembl_value,value
0,C=CCNC[C@@H](O)[C@H](Cc1ccccc1)NC(=O)c1cc(C(=O...,CHEMBL253799,6.65,223.0
1,CCCOc1ccc(-c2ccc(-c3ccccc3Cl)n2CC(=O)/N=C(/N)N...,CHEMBL272718,6.57,0.269
2,CCCOc1ccc(-c2ccc(-c3ccccc3Cl)n2CC(=O)/N=C(/N)N...,CHEMBL272719,6.62,0.241
3,CC(=O)N[C@@H](Cc1cc(F)cc(F)c1)[C@H](O)CNC1(c2c...,CHEMBL1209054,6.43,370.0
4,CCn1cc2c3c(cc(C(=O)N[C@@H](Cc4ccccc4)[C@H](O)C...,CHEMBL564530,8.52,0.003
...,...,...,...,...
267,C[C@@H](NC(=O)c1cc(C(=O)N[C@@H](Cc2ccccc2)[C@H...,CHEMBL474057,7.25,56.0
268,Cc1cc2c(C(=O)N[C@@H](Cc3ccccc3)[C@H](O)CNC3CC3...,CHEMBL463865,6.00,998.0
269,CSCC[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)C(=O)N[C@...,CHEMBL190010,7.22,0.06
270,CC(=O)N[C@@H](Cc1cc(F)cc(F)c1)[C@H](O)CN[C@@]1...,CHEMBL582817,7.07,85.0


# Step 5 : Rename DataFrame columns

In [None]:
# Relabel two columns in place: pchembl_value -> pIC50, value -> IC50
df_from_DUDE_cleaned.rename(
  mapper={'pchembl_value':'pIC50', 'value':'IC50'},
  axis='columns',
  inplace=True,
)
df_from_DUDE_cleaned

Unnamed: 0,canonical_smiles,molecule_chembl_id,pIC50,IC50
0,C=CCNC[C@@H](O)[C@H](Cc1ccccc1)NC(=O)c1cc(C(=O...,CHEMBL253799,6.65,223.0
1,CCCOc1ccc(-c2ccc(-c3ccccc3Cl)n2CC(=O)/N=C(/N)N...,CHEMBL272718,6.57,0.269
2,CCCOc1ccc(-c2ccc(-c3ccccc3Cl)n2CC(=O)/N=C(/N)N...,CHEMBL272719,6.62,0.241
3,CC(=O)N[C@@H](Cc1cc(F)cc(F)c1)[C@H](O)CNC1(c2c...,CHEMBL1209054,6.43,370.0
4,CCn1cc2c3c(cc(C(=O)N[C@@H](Cc4ccccc4)[C@H](O)C...,CHEMBL564530,8.52,0.003
...,...,...,...,...
267,C[C@@H](NC(=O)c1cc(C(=O)N[C@@H](Cc2ccccc2)[C@H...,CHEMBL474057,7.25,56.0
268,Cc1cc2c(C(=O)N[C@@H](Cc3ccccc3)[C@H](O)CNC3CC3...,CHEMBL463865,6.00,998.0
269,CSCC[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)C(=O)N[C@...,CHEMBL190010,7.22,0.06
270,CC(=O)N[C@@H](Cc1cc(F)cc(F)c1)[C@H](O)CN[C@@]1...,CHEMBL582817,7.07,85.0


# Step 6 : Saving inhibitors in a new CSV file

In [None]:
# Write the cleaned table to disk without the row index
df_from_DUDE_cleaned.to_csv(path_or_buf='Cleaned_DUDE_BACE_inhibitors.csv', index=False)

In [None]:
 # Bundle every CSV in the working directory into one archive (the *.csv glob also picks up the input file)
 ! zip Cleaned_DUDE_BACE_inhibitors.zip *.csv

  adding: Active_DUDE_BACE_inhibitors.csv (deflated 84%)
  adding: Cleaned_DUDE_BACE_inhibitors.csv (deflated 78%)


In [None]:
# Long listing of the working directory, to check which files are present
! ls -l

total 76
-rw-r--r-- 1 root root 44517 Aug 16 09:38 Active_DUDE_BACE_inhibitors.csv
-rw-r--r-- 1 root root 26127 Aug 16 09:44 Cleaned_DUDE_BACE_inhibitors.csv
drwxr-xr-x 1 root root  4096 Aug  3 20:21 sample_data


# Draft Code

In [None]:
# Upload file(s) from the local machine through google.colab's files.upload();
# whatever it returns is kept in `uploaded`.
# NOTE(review): the recorded output shows Active_DUDE_BACE_inhibitors.csv being saved here,
# the same file that pd.read_csv loads near the top of the notebook — presumably this cell
# has to run before that read on a fresh runtime; confirm and consider moving it up.
from google.colab import files
uploaded = files.upload()

Saving Active_DUDE_BACE_inhibitors.csv to Active_DUDE_BACE_inhibitors.csv


In [None]:
# Exploration: fetch the complete activity records for one molecule and look at them
Acts = new_client.activity
results = Acts.filter(
  molecule_chembl_id='CHEMBL1096683',
  standard_type='IC50',
  target_chembl_id='CHEMBL4822',
  pchembl_value__isnull=False,
)

# Dump the query result as a whole, then one record per line
print(results)
for res in results:
  print(res)

# Field names carried by the first record
results[0].keys()

[{'activity_comment': None, 'activity_id': 3188336, 'activity_properties': [], 'assay_chembl_id': 'CHEMBL1107777', 'assay_description': 'Inhibition of human BACE1 expressed in mouse fibroblast cells assessed as inhibition of secreted APPbeta-NF production at pH 7.2', 'assay_type': 'B', 'assay_variant_accession': None, 'assay_variant_mutation': None, 'bao_endpoint': 'BAO_0000190', 'bao_format': 'BAO_0000219', 'bao_label': 'cell-based format', 'canonical_smiles': 'C[C@@H](NC(=O)c1cc(-c2ncc([C@](C)(N)Cc3ccccc3)o2)cc(N(C)S(C)(=O)=O)c1)c1ccc(F)cc1', 'data_validity_comment': None, 'data_validity_description': None, 'document_chembl_id': 'CHEMBL1155422', 'document_journal': 'Bioorg. Med. Chem. Lett.', 'document_year': 2010, 'ligand_efficiency': {'bei': '15.07', 'le': '0.29', 'lle': '3.52', 'sei': '7.00'}, 'molecule_chembl_id': 'CHEMBL1096683', 'molecule_pref_name': None, 'parent_molecule_chembl_id': 'CHEMBL1096683', 'pchembl_value': '8.30', 'potential_duplicate': 0, 'qudt_units': 'http://www.

dict_keys(['activity_comment', 'activity_id', 'activity_properties', 'assay_chembl_id', 'assay_description', 'assay_type', 'assay_variant_accession', 'assay_variant_mutation', 'bao_endpoint', 'bao_format', 'bao_label', 'canonical_smiles', 'data_validity_comment', 'data_validity_description', 'document_chembl_id', 'document_journal', 'document_year', 'ligand_efficiency', 'molecule_chembl_id', 'molecule_pref_name', 'parent_molecule_chembl_id', 'pchembl_value', 'potential_duplicate', 'qudt_units', 'record_id', 'relation', 'src_id', 'standard_flag', 'standard_relation', 'standard_text_value', 'standard_type', 'standard_units', 'standard_upper_value', 'standard_value', 'target_chembl_id', 'target_organism', 'target_pref_name', 'target_tax_id', 'text_value', 'toid', 'type', 'units', 'uo_units', 'upper_value', 'value'])

In [None]:
# Draft check: repeat the per-molecule ChEMBL query into a separate frame and report
# how many raw rows come back before any de-duplication.
# (The earlier `df_from_DUDE.copy()` assignment was overwritten on the very next line
# and has been removed.)
activities = new_client.activity  # loop-invariant, so it is looked up once

frames = []
for chembl_id in df['ID']:  # named chembl_id so the built-in id() is not shadowed
  result = activities.filter(molecule_chembl_id = chembl_id, standard_type = 'IC50', target_chembl_id ='CHEMBL4822', pchembl_value__isnull=False).only(['molecule_chembl_id','canonical_smiles','pchembl_value'])
  df_result = pd.DataFrame.from_dict(result)
  # Molecules without a matching record contribute no rows; skip their empty frames
  if not df_result.empty:
    frames.append(df_result)

# Single concat instead of DataFrame.append per iteration (quadratic, and removed in pandas >= 2.0)
df_from_DUDE_copy = pd.concat(frames) if frames else pd.DataFrame()
print(len(df_from_DUDE_copy.index))
print(f"DataFrame shape: {df_from_DUDE_copy.shape}")


675
DataFrame shape: (675, 4)


In [None]:
# Row count of the cleaned table, followed by its full (rows, columns) shape
print(df_from_DUDE_cleaned.shape[0])
print(f"DataFrame shape: {df_from_DUDE_cleaned.shape}")

272
DataFrame shape: (272, 4)


## Wrong Solution

In [None]:
# Draft kept for reference (the section heading marks it as the wrong approach):
# for each ID, query the ChEMBL molecule endpoint and print the canonical SMILES it returns.
from chembl_webresource_client.new_client import new_client
for id in df['ID']:  # NOTE(review): `id` shadows the built-in id() from here on
  molecule = new_client.molecule
  # Ask only for the ID and the structure fields of this molecule
  m1 = molecule.filter(chembl_id=id).only(['molecule_chembl_id', 'molecule_structures'])
  # m1[0] assumes the lookup returned at least one record with structures — TODO confirm
  print(m1[0]['molecule_structures']['canonical_smiles'])