# Data Collection and Pre-Processing from the ChEMBL Database.




## **Installing libraries**

Install the ChEMBL web service package so that we can retrieve bioactivity data from the ChEMBL Database.

In [4]:
! pip install chembl_webresource_client



## **Importing libraries**

In [5]:
# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

# Searching for Angiotensin-Converting Enzyme

In [6]:
target = new_client.target
target_query = target.search('ACE')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Caenorhabditis elegans,Acetylcholinesterase 1,18.0,False,CHEMBL3341583,"[{'accession': 'P38433', 'component_descriptio...",SINGLE PROTEIN,6239
1,"[{'xref_id': 'P12822', 'xref_name': None, 'xre...",Oryctolagus cuniculus,Angiotensin-converting enzyme,16.0,False,CHEMBL4074,"[{'accession': 'P12822', 'component_descriptio...",SINGLE PROTEIN,9986
2,"[{'xref_id': 'P47820', 'xref_name': None, 'xre...",Rattus norvegicus,Angiotensin-converting enzyme,16.0,False,CHEMBL2625,"[{'accession': 'P47820', 'component_descriptio...",SINGLE PROTEIN,10116
3,[],Sus scrofa,Angiotensin-converting enzyme,16.0,False,CHEMBL4523113,"[{'accession': 'F1RRW5', 'component_descriptio...",SINGLE PROTEIN,9823
4,"[{'xref_id': 'P12821', 'xref_name': None, 'xre...",Homo sapiens,Angiotensin-converting enzyme,15.0,False,CHEMBL1808,"[{'accession': 'P12821', 'component_descriptio...",SINGLE PROTEIN,9606
5,"[{'xref_id': 'P09470', 'xref_name': None, 'xre...",Mus musculus,Angiotensin-converting enzyme,15.0,False,CHEMBL2994,"[{'accession': 'P09470', 'component_descriptio...",SINGLE PROTEIN,10090
6,[],Anopheles gambiae,Acetylcholinesterase,15.0,False,CHEMBL2046266,"[{'accession': 'Q869C3', 'component_descriptio...",SINGLE PROTEIN,7165
7,[],Nephotettix cincticeps,Ace-orthologous acetylcholinesterase,15.0,False,CHEMBL2366514,"[{'accession': 'Q9NJH6', 'component_descriptio...",SINGLE PROTEIN,94400
8,[],Homo sapiens,Angiotensin-converting enzyme,14.0,False,CHEMBL2096989,"[{'accession': 'P12821', 'component_descriptio...",PROTEIN FAMILY,9606
9,[],Aedes aegypti,Acetylcholinesterase,14.0,False,CHEMBL4295607,"[{'accession': 'Q6A2E2', 'component_descriptio...",SINGLE PROTEIN,7159


In [7]:
selected_target = targets.target_chembl_id[4]
selected_target

'CHEMBL1808'

In [23]:
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [9]:
df = pd.DataFrame.from_dict(res)

In [10]:
df

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,72373,[],CHEMBL648529,Inhibition of Angiotensin I converting enzyme,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,uM,UO_0000065,,0.09
1,,,77815,[],CHEMBL648529,Inhibition of Angiotensin I converting enzyme,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,uM,UO_0000065,,50.0
2,,,77816,[],CHEMBL648529,Inhibition of Angiotensin I converting enzyme,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,uM,UO_0000065,,2.4
3,,,83360,[],CHEMBL648529,Inhibition of Angiotensin I converting enzyme,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,nM,UO_0000065,,1.2
4,,,87461,[],CHEMBL648529,Inhibition of Angiotensin I converting enzyme,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,uM,UO_0000065,,260.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868,"{'action_type': 'INHIBITOR', 'description': 'N...",,25090624,[],CHEMBL5258025,Inhibition of angiotensin-converting enzyme (u...,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,nM,UO_0000065,,33.0
869,"{'action_type': 'INHIBITOR', 'description': 'N...",,25090625,[],CHEMBL5258025,Inhibition of angiotensin-converting enzyme (u...,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,nM,UO_0000065,,12.0
870,"{'action_type': 'INHIBITOR', 'description': 'N...",,25090626,[],CHEMBL5258025,Inhibition of angiotensin-converting enzyme (u...,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,nM,UO_0000065,,30.0
871,"{'action_type': 'INHIBITOR', 'description': 'N...",,25090627,[],CHEMBL5258025,Inhibition of angiotensin-converting enzyme (u...,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,nM,UO_0000065,,14.0


 save the resulting bioactivity data to a CSV file **ACE_01_bioactivity_data.csv**.

In [11]:
df.to_csv('ACE_01_bioactivity_data_raw.csv', index=False)

## **Handling missing data**


In [12]:
df2 = df[df.standard_value.notna()]
df2 = df2[df.canonical_smiles.notna()]
df2

  df2 = df2[df.canonical_smiles.notna()]


Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,72373,[],CHEMBL648529,Inhibition of Angiotensin I converting enzyme,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,uM,UO_0000065,,0.09
1,,,77815,[],CHEMBL648529,Inhibition of Angiotensin I converting enzyme,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,uM,UO_0000065,,50.0
2,,,77816,[],CHEMBL648529,Inhibition of Angiotensin I converting enzyme,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,uM,UO_0000065,,2.4
3,,,83360,[],CHEMBL648529,Inhibition of Angiotensin I converting enzyme,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,nM,UO_0000065,,1.2
4,,,87461,[],CHEMBL648529,Inhibition of Angiotensin I converting enzyme,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,uM,UO_0000065,,260.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868,"{'action_type': 'INHIBITOR', 'description': 'N...",,25090624,[],CHEMBL5258025,Inhibition of angiotensin-converting enzyme (u...,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,nM,UO_0000065,,33.0
869,"{'action_type': 'INHIBITOR', 'description': 'N...",,25090625,[],CHEMBL5258025,Inhibition of angiotensin-converting enzyme (u...,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,nM,UO_0000065,,12.0
870,"{'action_type': 'INHIBITOR', 'description': 'N...",,25090626,[],CHEMBL5258025,Inhibition of angiotensin-converting enzyme (u...,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,nM,UO_0000065,,30.0
871,"{'action_type': 'INHIBITOR', 'description': 'N...",,25090627,[],CHEMBL5258025,Inhibition of angiotensin-converting enzyme (u...,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,nM,UO_0000065,,14.0


In [13]:
len(df2.canonical_smiles.unique())

708

In [14]:
df2_nr = df2.drop_duplicates(['canonical_smiles'])
df2_nr

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,72373,[],CHEMBL648529,Inhibition of Angiotensin I converting enzyme,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,uM,UO_0000065,,0.09
1,,,77815,[],CHEMBL648529,Inhibition of Angiotensin I converting enzyme,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,uM,UO_0000065,,50.0
2,,,77816,[],CHEMBL648529,Inhibition of Angiotensin I converting enzyme,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,uM,UO_0000065,,2.4
3,,,83360,[],CHEMBL648529,Inhibition of Angiotensin I converting enzyme,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,nM,UO_0000065,,1.2
4,,,87461,[],CHEMBL648529,Inhibition of Angiotensin I converting enzyme,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,uM,UO_0000065,,260.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863,"{'action_type': 'INHIBITOR', 'description': 'N...",,25090619,[],CHEMBL5258025,Inhibition of angiotensin-converting enzyme (u...,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,nM,UO_0000065,,3.0
864,"{'action_type': 'INHIBITOR', 'description': 'N...",,25090620,[],CHEMBL5258025,Inhibition of angiotensin-converting enzyme (u...,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,nM,UO_0000065,,5.8
867,"{'action_type': 'INHIBITOR', 'description': 'N...",,25090623,[],CHEMBL5258025,Inhibition of angiotensin-converting enzyme (u...,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,nM,UO_0000065,,1.7
871,"{'action_type': 'INHIBITOR', 'description': 'N...",,25090627,[],CHEMBL5258025,Inhibition of angiotensin-converting enzyme (u...,B,,,BAO_0000190,...,Homo sapiens,Angiotensin-converting enzyme,9606,,,IC50,nM,UO_0000065,,14.0


**Data pre-processing of the bioactivity data**


In [15]:

selection = ['molecule_chembl_id','canonical_smiles','standard_value']
df3 = df2_nr[selection]
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL152758,CC(N[C@H](C)C(=O)N1CCC[C@H]1C(=O)O)C(=O)O,90.0
1,CHEMBL291381,O=C(CCC(=O)N1CCCC1C(=O)O)NO,50000.0
2,CHEMBL358439,C[C@@H](NCC(=O)O)C(=O)N1CCC[C@H]1C(=O)O,2400.0
3,CHEMBL1237,NCCCC[C@H](N[C@@H](CCc1ccccc1)C(=O)O)C(=O)N1CC...,1.2
4,CHEMBL293213,CC(CCC(=O)N1CCCC1C(=O)O)C(=O)O,260000.0
...,...,...,...
863,CHEMBL1201405,COc1cc2c(cc1OC)CN(C(=O)[C@H](C)N[C@@H](CCc1ccc...,3.0
864,CHEMBL5287967,C[C@H](N[C@@H](CCc1ccccc1)C(=O)O)C(=O)N1CCc2cc...,5.8
867,CHEMBL1192519,O=C(O)CN1C(=O)[C@@H](N[C@@H](CCc2ccccc2)C(=O)O...,1.7
871,CHEMBL49920,C[C@H](NC(=O)[C@@H](CS)Cc1ccccc1)C(=O)N1Cc2ccc...,14.0


In [16]:
df3.to_csv('ACE_02_bioactivity_data_preprocessed.csv', index=False)

In [17]:
df4 = pd.read_csv('ACE_02_bioactivity_data_preprocessed.csv')

In [18]:
bioactivity_threshold = []
for i in df4.standard_value:
  if float(i) >= 10000:
    bioactivity_threshold.append("inactive")
  elif float(i) <= 1000:
    bioactivity_threshold.append("active")
  else:
    bioactivity_threshold.append("intermediate")

In [19]:
bioactivity_class = pd.Series(bioactivity_threshold, name='class')
df5 = pd.concat([df4, bioactivity_class], axis=1)
df5

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
0,CHEMBL152758,CC(N[C@H](C)C(=O)N1CCC[C@H]1C(=O)O)C(=O)O,90.0,active
1,CHEMBL291381,O=C(CCC(=O)N1CCCC1C(=O)O)NO,50000.0,inactive
2,CHEMBL358439,C[C@@H](NCC(=O)O)C(=O)N1CCC[C@H]1C(=O)O,2400.0,intermediate
3,CHEMBL1237,NCCCC[C@H](N[C@@H](CCc1ccccc1)C(=O)O)C(=O)N1CC...,1.2,active
4,CHEMBL293213,CC(CCC(=O)N1CCCC1C(=O)O)C(=O)O,260000.0,inactive
...,...,...,...,...
703,CHEMBL1201405,COc1cc2c(cc1OC)CN(C(=O)[C@H](C)N[C@@H](CCc1ccc...,3.0,active
704,CHEMBL5287967,C[C@H](N[C@@H](CCc1ccccc1)C(=O)O)C(=O)N1CCc2cc...,5.8,active
705,CHEMBL1192519,O=C(O)CN1C(=O)[C@@H](N[C@@H](CCc2ccccc2)C(=O)O...,1.7,active
706,CHEMBL49920,C[C@H](NC(=O)[C@@H](CS)Cc1ccccc1)C(=O)N1Cc2ccc...,14.0,active


In [20]:
df5.to_csv('ACE_03_bioactivity_data_curated.csv', index=False)


In [21]:

! zip acetylcholinesterase.zip *.csv

  adding: ACE_01_bioactivity_data_raw.csv (deflated 90%)
  adding: ACE_02_bioactivity_data_preprocessed.csv (deflated 80%)
  adding: ACE_03_bioactivity_data_curated.csv (deflated 81%)


---