%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Computational drug discovery: part 1

Bioactivity data

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Data collection and pre-processing from the ChEMBL Database

ChEMBL database

In [3]:
# Install libraries
! pip install chembl_webresource_client



In [4]:
# Import libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

In [10]:
# Target search for diabetic dissease
target = new_client.target
target_query = target.search('Genital herpes') # search the keyword 'Genital herpes' and the result is a dict
targets = pd.DataFrame.from_dict(target_query) # a dataframe with the results of the search
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Homo sapiens,Tumor necrosis factor ligand superfamily membe...,11.0,False,CHEMBL3712914,"[{'accession': 'O43557', 'component_descriptio...",SINGLE PROTEIN,9606
1,"[{'xref_id': 'P16753', 'xref_name': None, 'xre...",Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10.0,False,CHEMBL3771,"[{'accession': 'P16753', 'component_descriptio...",SINGLE PROTEIN,10360
2,[],Herpes simplex virus (type 1 / strain F),Herpes simplex virus (type 1 / strain F),10.0,False,CHEMBL613200,[],ORGANISM,10304


In [44]:
# Find Chembl id of target type = single protein
row, col = targets.shape
target_chembl_id_valid = []

for i in range(row):
  if targets.target_type[i] == "SINGLE PROTEIN":
    target_chembl_id_valid.append(targets.target_chembl_id[i])

target_chembl_id_valid

['CHEMBL3712914', 'CHEMBL3771']

In [47]:
# As we have two options, we choose one of them by hand.
selected_target = target_chembl_id_valid[1]
selected_target

'CHEMBL3771'

In [51]:
# Bioactivity data of the selected target
activity = new_client.activity
activity_query = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
activities = pd.DataFrame.from_dict(activity_query)
activities

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,32116,[],CHEMBL884029,Concentration required to inhibit the mutant a...,B,,,BAO_0000190,BAO_0000357,single protein format,CC(=O)O[C@@H]1CC(=O)N1C(=O)NC(C)C,,,CHEMBL1130748,Bioorg. Med. Chem. Lett.,1998,,CHEMBL122071,,CHEMBL122071,,False,http://www.openphacts.org/units/Nanomolar,232631,>,1,True,>,,IC50,nM,,100000.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,100.0
1,,34560,[],CHEMBL884029,Concentration required to inhibit the mutant a...,B,,,BAO_0000190,BAO_0000357,single protein format,CCCCNC(=O)N1C(=O)C[C@H]1OC(C)=O,,,CHEMBL1130748,Bioorg. Med. Chem. Lett.,1998,"{'bei': '18.26', 'le': '0.36', 'lle': '3.55', ...",CHEMBL124107,,CHEMBL124107,4.17,False,http://www.openphacts.org/units/Nanomolar,232614,=,1,True,=,,IC50,nM,,68000.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,68.0
2,,35750,[],CHEMBL884029,Concentration required to inhibit the mutant a...,B,,,BAO_0000190,BAO_0000357,single protein format,C[C@@H]1C(=O)N(C(=O)NCc2ccccc2)[C@@H]1Oc1ccc(C...,,,CHEMBL1130748,Bioorg. Med. Chem. Lett.,1998,,CHEMBL120853,,CHEMBL120853,,False,http://www.openphacts.org/units/Nanomolar,232624,>,1,True,>,,IC50,nM,,100000.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,100.0
3,,36956,[],CHEMBL884029,Concentration required to inhibit the mutant a...,B,,,BAO_0000190,BAO_0000357,single protein format,CC(=O)O[C@@H]1[C@@H](C)C(=O)N1C(=O)NCc1ccccc1,,,CHEMBL1130748,Bioorg. Med. Chem. Lett.,1998,"{'bei': '18.77', 'le': '0.35', 'lle': '3.93', ...",CHEMBL122296,,CHEMBL122296,5.19,False,http://www.openphacts.org/units/Nanomolar,232628,=,1,True,=,,IC50,nM,,6500.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,6.5
4,,43015,[],CHEMBL884029,Concentration required to inhibit the mutant a...,B,,,BAO_0000190,BAO_0000357,single protein format,CC(=O)O[C@@H]1[C@H](C)C(=O)N1C(=O)NCc1ccccc1,,,CHEMBL1130748,Bioorg. Med. Chem. Lett.,1998,"{'bei': '16.54', 'le': '0.31', 'lle': '3.31', ...",CHEMBL333734,,CHEMBL333734,4.57,False,http://www.openphacts.org/units/Nanomolar,232616,=,1,True,=,,IC50,nM,,27000.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,,974875,[],CHEMBL873639,Inhibitory activity against Human Cytomegalovi...,B,,,BAO_0000190,BAO_0000357,single protein format,Cc1csc2nc(Cc3cccc(NC(=O)c4ccccc4)c3)oc(=O)c12,,,CHEMBL1131870,Bioorg. Med. Chem. Lett.,1999,"{'bei': '15.47', 'le': '0.29', 'lle': '1.42', ...",CHEMBL106921,,CHEMBL106921,5.82,False,http://www.openphacts.org/units/Nanomolar,199398,=,1,True,=,,IC50,nM,,1500.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,1.5
234,,974876,[],CHEMBL873639,Inhibitory activity against Human Cytomegalovi...,B,,,BAO_0000190,BAO_0000357,single protein format,Cc1csc2nc(Cc3cccc(NC(=O)c4ccc(NN)cc4)c3)oc(=O)c12,,,CHEMBL1131870,Bioorg. Med. Chem. Lett.,1999,"{'bei': '13.83', 'le': '0.26', 'lle': '1.93', ...",CHEMBL107963,,CHEMBL107963,5.62,False,http://www.openphacts.org/units/Nanomolar,199394,=,1,True,=,,IC50,nM,,2400.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,2.4
235,,979879,[],CHEMBL873639,Inhibitory activity against Human Cytomegalovi...,B,,,BAO_0000190,BAO_0000357,single protein format,CC(=O)c1cccc(NO)c1,,,CHEMBL1131870,Bioorg. Med. Chem. Lett.,1999,"{'bei': '32.80', 'le': '0.62', 'lle': '3.27', ...",CHEMBL106641,,CHEMBL106641,4.96,False,http://www.openphacts.org/units/Nanomolar,199402,=,1,True,=,,IC50,nM,,11000.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,11.0
236,,982487,[],CHEMBL873639,Inhibitory activity against Human Cytomegalovi...,B,,,BAO_0000190,BAO_0000357,single protein format,Cc1csc2nc(Cc3cccc(NC(=O)c4ccc(NO)cc4)c3)oc(=O)c12,,,CHEMBL1131870,Bioorg. Med. Chem. Lett.,1999,"{'bei': '17.71', 'le': '0.34', 'lle': '3.01', ...",CHEMBL108677,,CHEMBL108677,7.21,False,http://www.openphacts.org/units/Nanomolar,199396,=,1,True,=,,IC50,nM,,61.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,0.061


In [52]:
# Save all these data in a csv file
activities.to_csv('genital_herpes_01_bioactivity_data_raw.csv', index=False)

In [59]:
# If there is no standard_value or canonical_smiles then we remove it.
activities_clean = activities[activities.standard_value.notna()]
activities_clean = activities_clean[activities_clean.canonical_smiles.notna()]
activities_clean

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,32116,[],CHEMBL884029,Concentration required to inhibit the mutant a...,B,,,BAO_0000190,BAO_0000357,single protein format,CC(=O)O[C@@H]1CC(=O)N1C(=O)NC(C)C,,,CHEMBL1130748,Bioorg. Med. Chem. Lett.,1998,,CHEMBL122071,,CHEMBL122071,,False,http://www.openphacts.org/units/Nanomolar,232631,>,1,True,>,,IC50,nM,,100000.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,100.0
1,,34560,[],CHEMBL884029,Concentration required to inhibit the mutant a...,B,,,BAO_0000190,BAO_0000357,single protein format,CCCCNC(=O)N1C(=O)C[C@H]1OC(C)=O,,,CHEMBL1130748,Bioorg. Med. Chem. Lett.,1998,"{'bei': '18.26', 'le': '0.36', 'lle': '3.55', ...",CHEMBL124107,,CHEMBL124107,4.17,False,http://www.openphacts.org/units/Nanomolar,232614,=,1,True,=,,IC50,nM,,68000.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,68.0
2,,35750,[],CHEMBL884029,Concentration required to inhibit the mutant a...,B,,,BAO_0000190,BAO_0000357,single protein format,C[C@@H]1C(=O)N(C(=O)NCc2ccccc2)[C@@H]1Oc1ccc(C...,,,CHEMBL1130748,Bioorg. Med. Chem. Lett.,1998,,CHEMBL120853,,CHEMBL120853,,False,http://www.openphacts.org/units/Nanomolar,232624,>,1,True,>,,IC50,nM,,100000.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,100.0
3,,36956,[],CHEMBL884029,Concentration required to inhibit the mutant a...,B,,,BAO_0000190,BAO_0000357,single protein format,CC(=O)O[C@@H]1[C@@H](C)C(=O)N1C(=O)NCc1ccccc1,,,CHEMBL1130748,Bioorg. Med. Chem. Lett.,1998,"{'bei': '18.77', 'le': '0.35', 'lle': '3.93', ...",CHEMBL122296,,CHEMBL122296,5.19,False,http://www.openphacts.org/units/Nanomolar,232628,=,1,True,=,,IC50,nM,,6500.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,6.5
4,,43015,[],CHEMBL884029,Concentration required to inhibit the mutant a...,B,,,BAO_0000190,BAO_0000357,single protein format,CC(=O)O[C@@H]1[C@H](C)C(=O)N1C(=O)NCc1ccccc1,,,CHEMBL1130748,Bioorg. Med. Chem. Lett.,1998,"{'bei': '16.54', 'le': '0.31', 'lle': '3.31', ...",CHEMBL333734,,CHEMBL333734,4.57,False,http://www.openphacts.org/units/Nanomolar,232616,=,1,True,=,,IC50,nM,,27000.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,,974875,[],CHEMBL873639,Inhibitory activity against Human Cytomegalovi...,B,,,BAO_0000190,BAO_0000357,single protein format,Cc1csc2nc(Cc3cccc(NC(=O)c4ccccc4)c3)oc(=O)c12,,,CHEMBL1131870,Bioorg. Med. Chem. Lett.,1999,"{'bei': '15.47', 'le': '0.29', 'lle': '1.42', ...",CHEMBL106921,,CHEMBL106921,5.82,False,http://www.openphacts.org/units/Nanomolar,199398,=,1,True,=,,IC50,nM,,1500.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,1.5
234,,974876,[],CHEMBL873639,Inhibitory activity against Human Cytomegalovi...,B,,,BAO_0000190,BAO_0000357,single protein format,Cc1csc2nc(Cc3cccc(NC(=O)c4ccc(NN)cc4)c3)oc(=O)c12,,,CHEMBL1131870,Bioorg. Med. Chem. Lett.,1999,"{'bei': '13.83', 'le': '0.26', 'lle': '1.93', ...",CHEMBL107963,,CHEMBL107963,5.62,False,http://www.openphacts.org/units/Nanomolar,199394,=,1,True,=,,IC50,nM,,2400.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,2.4
235,,979879,[],CHEMBL873639,Inhibitory activity against Human Cytomegalovi...,B,,,BAO_0000190,BAO_0000357,single protein format,CC(=O)c1cccc(NO)c1,,,CHEMBL1131870,Bioorg. Med. Chem. Lett.,1999,"{'bei': '32.80', 'le': '0.62', 'lle': '3.27', ...",CHEMBL106641,,CHEMBL106641,4.96,False,http://www.openphacts.org/units/Nanomolar,199402,=,1,True,=,,IC50,nM,,11000.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,11.0
236,,982487,[],CHEMBL873639,Inhibitory activity against Human Cytomegalovi...,B,,,BAO_0000190,BAO_0000357,single protein format,Cc1csc2nc(Cc3cccc(NC(=O)c4ccc(NO)cc4)c3)oc(=O)c12,,,CHEMBL1131870,Bioorg. Med. Chem. Lett.,1999,"{'bei': '17.71', 'le': '0.34', 'lle': '3.01', ...",CHEMBL108677,,CHEMBL108677,7.21,False,http://www.openphacts.org/units/Nanomolar,199396,=,1,True,=,,IC50,nM,,61.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,0.061


In [91]:
# Are there duplicate smiles?
row1 = len(activities_clean.canonical_smiles)
row2 = len(activities_clean.canonical_smiles.unique())

if row1>row2:
  print('Hay smiles duplicados')
else:
  print('No hay smiles duplicados')

Hay smiles duplicados


In [92]:
# Delete duplicate smiles
activities_without_dupl = activities_clean.drop_duplicates(['canonical_smiles'])
activities_without_dupl

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,32116,[],CHEMBL884029,Concentration required to inhibit the mutant a...,B,,,BAO_0000190,BAO_0000357,single protein format,CC(=O)O[C@@H]1CC(=O)N1C(=O)NC(C)C,,,CHEMBL1130748,Bioorg. Med. Chem. Lett.,1998,,CHEMBL122071,,CHEMBL122071,,False,http://www.openphacts.org/units/Nanomolar,232631,>,1,True,>,,IC50,nM,,100000.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,100.0
1,,34560,[],CHEMBL884029,Concentration required to inhibit the mutant a...,B,,,BAO_0000190,BAO_0000357,single protein format,CCCCNC(=O)N1C(=O)C[C@H]1OC(C)=O,,,CHEMBL1130748,Bioorg. Med. Chem. Lett.,1998,"{'bei': '18.26', 'le': '0.36', 'lle': '3.55', ...",CHEMBL124107,,CHEMBL124107,4.17,False,http://www.openphacts.org/units/Nanomolar,232614,=,1,True,=,,IC50,nM,,68000.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,68.0
2,,35750,[],CHEMBL884029,Concentration required to inhibit the mutant a...,B,,,BAO_0000190,BAO_0000357,single protein format,C[C@@H]1C(=O)N(C(=O)NCc2ccccc2)[C@@H]1Oc1ccc(C...,,,CHEMBL1130748,Bioorg. Med. Chem. Lett.,1998,,CHEMBL120853,,CHEMBL120853,,False,http://www.openphacts.org/units/Nanomolar,232624,>,1,True,>,,IC50,nM,,100000.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,100.0
3,,36956,[],CHEMBL884029,Concentration required to inhibit the mutant a...,B,,,BAO_0000190,BAO_0000357,single protein format,CC(=O)O[C@@H]1[C@@H](C)C(=O)N1C(=O)NCc1ccccc1,,,CHEMBL1130748,Bioorg. Med. Chem. Lett.,1998,"{'bei': '18.77', 'le': '0.35', 'lle': '3.93', ...",CHEMBL122296,,CHEMBL122296,5.19,False,http://www.openphacts.org/units/Nanomolar,232628,=,1,True,=,,IC50,nM,,6500.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,6.5
4,,43015,[],CHEMBL884029,Concentration required to inhibit the mutant a...,B,,,BAO_0000190,BAO_0000357,single protein format,CC(=O)O[C@@H]1[C@H](C)C(=O)N1C(=O)NCc1ccccc1,,,CHEMBL1130748,Bioorg. Med. Chem. Lett.,1998,"{'bei': '16.54', 'le': '0.31', 'lle': '3.31', ...",CHEMBL333734,,CHEMBL333734,4.57,False,http://www.openphacts.org/units/Nanomolar,232616,=,1,True,=,,IC50,nM,,27000.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,,974875,[],CHEMBL873639,Inhibitory activity against Human Cytomegalovi...,B,,,BAO_0000190,BAO_0000357,single protein format,Cc1csc2nc(Cc3cccc(NC(=O)c4ccccc4)c3)oc(=O)c12,,,CHEMBL1131870,Bioorg. Med. Chem. Lett.,1999,"{'bei': '15.47', 'le': '0.29', 'lle': '1.42', ...",CHEMBL106921,,CHEMBL106921,5.82,False,http://www.openphacts.org/units/Nanomolar,199398,=,1,True,=,,IC50,nM,,1500.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,1.5
234,,974876,[],CHEMBL873639,Inhibitory activity against Human Cytomegalovi...,B,,,BAO_0000190,BAO_0000357,single protein format,Cc1csc2nc(Cc3cccc(NC(=O)c4ccc(NN)cc4)c3)oc(=O)c12,,,CHEMBL1131870,Bioorg. Med. Chem. Lett.,1999,"{'bei': '13.83', 'le': '0.26', 'lle': '1.93', ...",CHEMBL107963,,CHEMBL107963,5.62,False,http://www.openphacts.org/units/Nanomolar,199394,=,1,True,=,,IC50,nM,,2400.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,2.4
235,,979879,[],CHEMBL873639,Inhibitory activity against Human Cytomegalovi...,B,,,BAO_0000190,BAO_0000357,single protein format,CC(=O)c1cccc(NO)c1,,,CHEMBL1131870,Bioorg. Med. Chem. Lett.,1999,"{'bei': '32.80', 'le': '0.62', 'lle': '3.27', ...",CHEMBL106641,,CHEMBL106641,4.96,False,http://www.openphacts.org/units/Nanomolar,199402,=,1,True,=,,IC50,nM,,11000.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,11.0
236,,982487,[],CHEMBL873639,Inhibitory activity against Human Cytomegalovi...,B,,,BAO_0000190,BAO_0000357,single protein format,Cc1csc2nc(Cc3cccc(NC(=O)c4ccc(NO)cc4)c3)oc(=O)c12,,,CHEMBL1131870,Bioorg. Med. Chem. Lett.,1999,"{'bei': '17.71', 'le': '0.34', 'lle': '3.01', ...",CHEMBL108677,,CHEMBL108677,7.21,False,http://www.openphacts.org/units/Nanomolar,199396,=,1,True,=,,IC50,nM,,61.0,CHEMBL3771,Human cytomegalovirus (strain AD169) (HHV-5) (...,Human herpes virus 5 capsid protein P40,10360,,,IC50,uM,UO_0000065,,0.061


In [98]:
# There are many columns with data that are not important now and therefore only 3 columns are maintained: molecule_chembl_id,canonical_smiles,standard_value and bioactivity_class
selected_columns = ['molecule_chembl_id','canonical_smiles','standard_value']
activities_clean_incomplete = activities_without_dupl[selected_columns]
activities_clean_incomplete

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL122071,CC(=O)O[C@@H]1CC(=O)N1C(=O)NC(C)C,100000.0
1,CHEMBL124107,CCCCNC(=O)N1C(=O)C[C@H]1OC(C)=O,68000.0
2,CHEMBL120853,C[C@@H]1C(=O)N(C(=O)NCc2ccccc2)[C@@H]1Oc1ccc(C...,100000.0
3,CHEMBL122296,CC(=O)O[C@@H]1[C@@H](C)C(=O)N1C(=O)NCc1ccccc1,6500.0
4,CHEMBL333734,CC(=O)O[C@@H]1[C@H](C)C(=O)N1C(=O)NCc1ccccc1,27000.0
...,...,...,...
233,CHEMBL106921,Cc1csc2nc(Cc3cccc(NC(=O)c4ccccc4)c3)oc(=O)c12,1500.0
234,CHEMBL107963,Cc1csc2nc(Cc3cccc(NC(=O)c4ccc(NN)cc4)c3)oc(=O)c12,2400.0
235,CHEMBL106641,CC(=O)c1cccc(NO)c1,11000.0
236,CHEMBL108677,Cc1csc2nc(Cc3cccc(NC(=O)c4ccc(NO)cc4)c3)oc(=O)c12,61.0


In [99]:
# Save all these data in a csv file
activities_clean_incomplete.to_csv('genital_herpes_02_bioactivity_data_preprocessed.csv', index=False)

In [103]:
# Study the bioactivity if the compounds 

# Less than 1000 nM : active, greater than 10,000 nM : inactive, between 1,000 and 10,000 nM : intermediate.

bioactivity_values = []

for i in activities_clean_incomplete.standard_value:
  if float(i) >= 10000:
    bioactivity_values.append("inactive")
  elif float(i) <= 1000:
    bioactivity_values.append("active")
  else:
    bioactivity_values.append("intermediate")

bioactivity_values

['inactive',
 'inactive',
 'inactive',
 'intermediate',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'intermediate',
 'intermediate',
 'intermediate',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'intermediate',
 'inactive',
 'inactive',
 'intermediate',
 'active',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'active',
 'inactive',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'intermediate',
 'active',
 'intermediate',
 'inactive',
 'active',
 'active',
 'active',
 'active',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'inactive',
 'intermediate',
 'inactive',
 'inactive',
 'inactive',
 'intermediate',
 'intermediate',
 'inactive',
 'inactive',
 'inact

In [106]:
# So we can add the activities to activities_clean_incomplete so we will obtain activities_clean

bioactivity_class = pd.Series(bioactivity_values, name='bioactivity')
activities_clean = pd.concat([activities_clean_incomplete, bioactivity_class], axis=1)
activities_clean

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity
0,CHEMBL122071,CC(=O)O[C@@H]1CC(=O)N1C(=O)NC(C)C,100000.0,inactive
1,CHEMBL124107,CCCCNC(=O)N1C(=O)C[C@H]1OC(C)=O,68000.0,inactive
2,CHEMBL120853,C[C@@H]1C(=O)N(C(=O)NCc2ccccc2)[C@@H]1Oc1ccc(C...,100000.0,inactive
3,CHEMBL122296,CC(=O)O[C@@H]1[C@@H](C)C(=O)N1C(=O)NCc1ccccc1,6500.0,intermediate
4,CHEMBL333734,CC(=O)O[C@@H]1[C@H](C)C(=O)N1C(=O)NCc1ccccc1,27000.0,inactive
...,...,...,...,...
233,CHEMBL106921,Cc1csc2nc(Cc3cccc(NC(=O)c4ccccc4)c3)oc(=O)c12,1500.0,active
234,CHEMBL107963,Cc1csc2nc(Cc3cccc(NC(=O)c4ccc(NN)cc4)c3)oc(=O)c12,2400.0,inactive
235,CHEMBL106641,CC(=O)c1cccc(NO)c1,11000.0,
236,CHEMBL108677,Cc1csc2nc(Cc3cccc(NC(=O)c4ccc(NO)cc4)c3)oc(=O)c12,61.0,


In [115]:
# Remove bioactivity with NaN
activities_clean = activities_clean[activities_clean.bioactivity.notna()]
activities_clean

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity
0,CHEMBL122071,CC(=O)O[C@@H]1CC(=O)N1C(=O)NC(C)C,100000.0,inactive
1,CHEMBL124107,CCCCNC(=O)N1C(=O)C[C@H]1OC(C)=O,68000.0,inactive
2,CHEMBL120853,C[C@@H]1C(=O)N(C(=O)NCc2ccccc2)[C@@H]1Oc1ccc(C...,100000.0,inactive
3,CHEMBL122296,CC(=O)O[C@@H]1[C@@H](C)C(=O)N1C(=O)NCc1ccccc1,6500.0,intermediate
4,CHEMBL333734,CC(=O)O[C@@H]1[C@H](C)C(=O)N1C(=O)NCc1ccccc1,27000.0,inactive
...,...,...,...,...
230,CHEMBL107357,Cc1csc2nc(Cc3ccc(C(=O)c4ccc(CO)cc4)cc3)oc(=O)c12,940.0,intermediate
231,CHEMBL322676,Cc1csc2nc(Cc3cccc(NC(=O)c4cccc(NO)c4)c3)oc(=O)c12,45.0,intermediate
232,CHEMBL104662,Cc1csc2nc(Cc3ccc(C(=O)c4ccccc4)cc3)oc(=O)c12,800.0,inactive
233,CHEMBL106921,Cc1csc2nc(Cc3cccc(NC(=O)c4ccccc4)c3)oc(=O)c12,1500.0,active


In [108]:
# Save all these data in a csv file
activities_clean.to_csv('genital_herpes_03_bioactivity_data_curated.csv', index=False)

In [109]:
# Save all the csv files into one file
! zip acetylcholinesterase.zip *.csv

  adding: genital_herpes_01_bioactivity_data_raw.csv (deflated 92%)
  adding: genital_herpes_02_bioactivity_data_preprocessed.csv (deflated 82%)
  adding: genital_herpes_03_bioactivity_data_curated.csv (deflated 83%)


In [112]:
# Check if we have the zip file and the csv files
! ls -l

total 200
-rw-r--r-- 1 root root  18706 Aug 26 11:14 acetylcholinesterase.zip
-rw-r--r-- 1 root root 132782 Aug 26 10:00 genital_herpes_01_bioactivity_data_raw.csv
-rw-r--r-- 1 root root  19290 Aug 26 11:01 genital_herpes_02_bioactivity_data_preprocessed.csv
-rw-r--r-- 1 root root  21405 Aug 26 11:12 genital_herpes_03_bioactivity_data_curated.csv
drwxr-xr-x 1 root root   4096 Aug 13 13:35 sample_data
