In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from chembl_webresource_client.new_client import new_client

In [2]:
# Handle for the ChEMBL "target" resource exposed by the web-service client.
target = new_client.target
# Free-text search for targets matching the given term.
target_query = target.search('acetylcholinesterase')
# Turn the search result into a DataFrame so the hits can be inspected and
# one of them picked by row.
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P22303', 'xref_name': None, 'xre...",Homo sapiens,Acetylcholinesterase,27.0,False,CHEMBL220,"[{'accession': 'P22303', 'component_descriptio...",SINGLE PROTEIN,9606
1,[],Homo sapiens,Cholinesterases; ACHE & BCHE,27.0,False,CHEMBL2095233,"[{'accession': 'P06276', 'component_descriptio...",SELECTIVITY GROUP,9606
2,[],Drosophila melanogaster,Acetylcholinesterase,18.0,False,CHEMBL2242744,"[{'accession': 'P07140', 'component_descriptio...",SINGLE PROTEIN,7227
3,[],Bemisia tabaci,AChE2,16.0,False,CHEMBL2366409,"[{'accession': 'B3SST5', 'component_descriptio...",SINGLE PROTEIN,7038
4,[],Leptinotarsa decemlineata,Acetylcholinesterase,16.0,False,CHEMBL2366490,"[{'accession': 'Q27677', 'component_descriptio...",SINGLE PROTEIN,7539
5,"[{'xref_id': 'P04058', 'xref_name': None, 'xre...",Torpedo californica,Acetylcholinesterase,15.0,False,CHEMBL4780,"[{'accession': 'P04058', 'component_descriptio...",SINGLE PROTEIN,7787
6,"[{'xref_id': 'P21836', 'xref_name': None, 'xre...",Mus musculus,Acetylcholinesterase,15.0,False,CHEMBL3198,"[{'accession': 'P21836', 'component_descriptio...",SINGLE PROTEIN,10090
7,"[{'xref_id': 'P37136', 'xref_name': None, 'xre...",Rattus norvegicus,Acetylcholinesterase,15.0,False,CHEMBL3199,"[{'accession': 'P37136', 'component_descriptio...",SINGLE PROTEIN,10116
8,"[{'xref_id': 'O42275', 'xref_name': None, 'xre...",Electrophorus electricus,Acetylcholinesterase,15.0,False,CHEMBL4078,"[{'accession': 'O42275', 'component_descriptio...",SINGLE PROTEIN,8005
9,"[{'xref_id': 'P23795', 'xref_name': None, 'xre...",Bos taurus,Acetylcholinesterase,15.0,False,CHEMBL4768,"[{'accession': 'P23795', 'component_descriptio...",SINGLE PROTEIN,9913


### Next we select the Homo sapiens acetylcholinesterase target, which is the first entry.

In [3]:
# Pick the ChEMBL ID stored in the row labelled 0 of the search results.
selected_target = targets.loc[0, 'target_chembl_id']
selected_target

'CHEMBL220'

### Next we extract the bioactivity data for human acetylcholinesterase (CHEMBL220) that are reported as IC50 values.

In [4]:
# Handle for the ChEMBL "activity" resource.
activity = new_client.activity
# Restrict to records for the selected target whose standard_type is IC50.
res = activity.filter(target_chembl_id=selected_target).filter(standard_type='IC50')
# Turn the filtered query result into a DataFrame.
df = pd.DataFrame.from_dict(res)
df

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,33969,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.75
1,,37563,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.1
2,,37565,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,50.0
3,,38902,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.3
4,,41170,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8497,,24510947,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5055911,Inhibition of recombinant human AChE using ace...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,nM,UO_0000065,,4.19
8498,,24510948,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5055911,Inhibition of recombinant human AChE using ace...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,nM,UO_0000065,,230.0
8499,,24510949,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5055911,Inhibition of recombinant human AChE using ace...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,nM,UO_0000065,,14.5
8500,,24510950,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5055911,Inhibition of recombinant human AChE using ace...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,nM,UO_0000065,,8000.0


### We now save the bioactivity data.

In [5]:
# Write the unfiltered activity table to disk; index=False leaves the row
# index out of the file.
df.to_csv('acetylcholinesterase_raw.csv', index=False)

### Handling Missing Data.

In [6]:
# Keep only records that have BOTH a potency value and a structure.
# The two conditions must be applied together: filtering `df` a second time
# (instead of the already-filtered frame) would silently discard the
# standard_value condition and let rows with missing potency through.
df1 = df.dropna(subset=['standard_value', 'canonical_smiles'])
# Transposed view so that the many columns are listed as rows.
df1.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8492,8493,8494,8495,8496,8497,8498,8499,8500,8501
activity_comment,,,,,,,,,,,...,,,,,,,,,,
activity_id,33969,37563,37565,38902,41170,42363,43561,44825,45908,45910,...,24510942,24510943,24510944,24510945,24510946,24510947,24510948,24510949,24510950,24647137
activity_properties,[],[],[],[],[],[],[],[],[],[],...,"[{'comments': None, 'relation': '=', 'result_f...","[{'comments': None, 'relation': '=', 'result_f...","[{'comments': None, 'relation': '=', 'result_f...","[{'comments': None, 'relation': '=', 'result_f...","[{'comments': None, 'relation': '=', 'result_f...","[{'comments': None, 'relation': '=', 'result_f...","[{'comments': None, 'relation': '=', 'result_f...","[{'comments': None, 'relation': '=', 'result_f...","[{'comments': None, 'relation': '=', 'result_f...","[{'comments': None, 'relation': '=', 'result_f..."
assay_chembl_id,CHEMBL643384,CHEMBL643384,CHEMBL643384,CHEMBL643384,CHEMBL643384,CHEMBL643384,CHEMBL643384,CHEMBL643384,CHEMBL643384,CHEMBL643384,...,CHEMBL5055911,CHEMBL5055911,CHEMBL5055911,CHEMBL5055911,CHEMBL5055911,CHEMBL5055911,CHEMBL5055911,CHEMBL5055911,CHEMBL5055911,CHEMBL5058677
assay_description,Inhibitory concentration against acetylcholine...,Inhibitory concentration against acetylcholine...,Inhibitory concentration against acetylcholine...,Inhibitory concentration against acetylcholine...,Inhibitory concentration against acetylcholine...,Inhibitory concentration against acetylcholine...,Inhibitory concentration against acetylcholine...,Inhibitory concentration against acetylcholine...,Inhibitory concentration against acetylcholine...,Inhibitory concentration against acetylcholine...,...,Inhibition of recombinant human AChE using ace...,Inhibition of recombinant human AChE using ace...,Inhibition of recombinant human AChE using ace...,Inhibition of recombinant human AChE using ace...,Inhibition of recombinant human AChE using ace...,Inhibition of recombinant human AChE using ace...,Inhibition of recombinant human AChE using ace...,Inhibition of recombinant human AChE using ace...,Inhibition of recombinant human AChE using ace...,AChE Eurofins SafetyScreen (Other)
assay_type,B,B,B,B,B,B,B,B,B,B,...,B,B,B,B,B,B,B,B,B,B
assay_variant_accession,,,,,,,,,,,...,,,,,,,,,,
assay_variant_mutation,,,,,,,,,,,...,,,,,,,,,,
bao_endpoint,BAO_0000190,BAO_0000190,BAO_0000190,BAO_0000190,BAO_0000190,BAO_0000190,BAO_0000190,BAO_0000190,BAO_0000190,BAO_0000190,...,BAO_0000190,BAO_0000190,BAO_0000190,BAO_0000190,BAO_0000190,BAO_0000190,BAO_0000190,BAO_0000190,BAO_0000190,BAO_0000190
bao_format,BAO_0000357,BAO_0000357,BAO_0000357,BAO_0000357,BAO_0000357,BAO_0000357,BAO_0000357,BAO_0000357,BAO_0000357,BAO_0000357,...,BAO_0000357,BAO_0000357,BAO_0000357,BAO_0000357,BAO_0000357,BAO_0000357,BAO_0000357,BAO_0000357,BAO_0000357,BAO_0000357


#### Let us find out the unique count of the canonical_smiles

In [7]:
# Number of distinct SMILES strings among the retained records.
# `len(df1.canonical_smiles.unique())` gives the same count as long as the
# column holds no missing values.
df1.canonical_smiles.nunique()

7059

In [8]:
# Count the rows whose SMILES string already appeared in an earlier row
# (the first occurrence of each string is not counted as a duplicate).
df1.canonical_smiles.duplicated(keep='first').sum()

1408

In [9]:
# Collapse repeated structures: for every SMILES string keep only the first
# record encountered and drop the later ones.
df1_1 = df1.drop_duplicates(subset='canonical_smiles', keep='first')
df1_1

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,33969,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.75
1,,37563,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.1
2,,37565,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,50.0
3,,38902,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.3
4,,41170,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8494,,24510944,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5055911,Inhibition of recombinant human AChE using ace...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,nM,UO_0000065,,184.0
8495,,24510945,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5055911,Inhibition of recombinant human AChE using ace...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,nM,UO_0000065,,13.4
8496,,24510946,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5055911,Inhibition of recombinant human AChE using ace...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,nM,UO_0000065,,19.2
8497,,24510947,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5055911,Inhibition of recombinant human AChE using ace...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,nM,UO_0000065,,4.19


In [10]:
# Columns carried forward: compound identifier, structure and potency.
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
# All rows, restricted to the selected columns.
df2 = df1_1.loc[:, selection]
df2

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,750.0
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,100.0
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,50000.0
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,300.0
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,800.0
...,...,...,...
8494,CHEMBL5087646,O=C(O)c1c(O)cccc1CCCCCCCCNc1c2c(nc3ccccc13)CCCC2,184.0
8495,CHEMBL5078914,O=C(O)c1c(O)cccc1CCCCCCCCNc1c2c(nc3cc(Cl)ccc13...,13.4
8496,CHEMBL5073819,Oc1cccc(CCCCCCCCNc2c3c(nc4ccccc24)CCCC3)c1,19.2
8497,CHEMBL5072428,Oc1cccc(CCCCCCCCNc2c3c(nc4cc(Cl)ccc24)CCCC3)c1,4.19


#### We save df2 as csv file

In [11]:
# Write the three selected columns to disk; index=False leaves the row index
# out of the file.
df2.to_csv('acetylchol_preprocessed.csv', index=False)

In [12]:
# Load the preprocessed table back from the CSV file written by the previous
# cell.
df3 = pd.read_csv('acetylchol_preprocessed.csv')
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,750.00
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,100.00
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,50000.00
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,300.00
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,800.00
...,...,...,...
7054,CHEMBL5087646,O=C(O)c1c(O)cccc1CCCCCCCCNc1c2c(nc3ccccc13)CCCC2,184.00
7055,CHEMBL5078914,O=C(O)c1c(O)cccc1CCCCCCCCNc1c2c(nc3cc(Cl)ccc13...,13.40
7056,CHEMBL5073819,Oc1cccc(CCCCCCCCNc2c3c(nc4ccccc24)CCCC3)c1,19.20
7057,CHEMBL5072428,Oc1cccc(CCCCCCCCNc2c3c(nc4cc(Cl)ccc24)CCCC3)c1,4.19


### Labeling compounds as Active, Intermediate or Inactive


The bioactivity data are IC50 values, with the standard value expressed in nM. Compounds with a standard value of at most <b>$1,000$ nM</b> will be labelled as <b>active</b>, compounds with a standard value of at least <b>$10,000$ nM</b> will be labelled as <b>inactive</b>, and compounds with a standard value strictly between $1,000$ nM and $10,000$ nM will be labelled as <b>intermediate</b>.

In [15]:
# IC50 cut-offs in nM (the unit of standard_value). Both bounds are inclusive.
ACTIVE_MAX_NM = 1000
INACTIVE_MIN_NM = 10000


def classify_ic50(ic50_nm):
    """Return the bioactivity label for a single IC50 value given in nM.

    Returns 'active' when the value is <= ACTIVE_MAX_NM, 'inactive' when it
    is >= INACTIVE_MIN_NM and 'intermediate' otherwise. A missing value
    (None/NaN) yields None: NaN fails every comparison, so without this
    guard it would fall through and be mislabelled 'intermediate'.

    Raises ValueError if the value is present but cannot be converted to
    float.
    """
    if pd.isna(ic50_nm):
        return None
    potency = float(ic50_nm)
    if potency <= ACTIVE_MAX_NM:
        return 'active'
    if potency >= INACTIVE_MIN_NM:
        return 'inactive'
    return 'intermediate'


# One label per row of df3, in row order; kept as a plain list.
bioactivity_class = [classify_ic50(value) for value in df3.standard_value]

### Next we concatenate df3 and bioactivity_class

In [17]:
# Wrap the labels in a Series named 'class' and attach it to df3 as an extra
# column; the two objects are lined up on their row index.
bioactivity_concat = pd.Series(data=bioactivity_class, name='class')
df4 = pd.concat(objs=[df3, bioactivity_concat], axis='columns')
df4

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,750.00,active
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,100.00,active
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,50000.00,inactive
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,300.00,active
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,800.00,active
...,...,...,...,...
7054,CHEMBL5087646,O=C(O)c1c(O)cccc1CCCCCCCCNc1c2c(nc3ccccc13)CCCC2,184.00,active
7055,CHEMBL5078914,O=C(O)c1c(O)cccc1CCCCCCCCNc1c2c(nc3cc(Cl)ccc13...,13.40,active
7056,CHEMBL5073819,Oc1cccc(CCCCCCCCNc2c3c(nc4ccccc24)CCCC3)c1,19.20,active
7057,CHEMBL5072428,Oc1cccc(CCCCCCCCNc2c3c(nc4cc(Cl)ccc24)CCCC3)c1,4.19,active


### We save our new dataframe

In [18]:
# Write the labelled table to disk; index=False leaves the row index out of
# the file.
df4.to_csv('acetylchol_curated.csv', index=False)