# Data selection

In [1]:
import pandas as pd
from chembl_webresource_client.new_client import new_client
# new_client is the entry point for querying the ChEMBL database from a Jupyter notebook

In [2]:
# Search the ChEMBL target resource for the query string 'CHEMBL206' and
# tabulate the hits. The search returns every target that matches the query
# (see the table below), not only the exact ID, so the row of interest is
# picked explicitly in a later cell.
target = new_client.target
target_query = target.search('CHEMBL206')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P03372', 'xref_name': None, 'xre...",Homo sapiens,Estrogen receptor alpha,12.0,False,CHEMBL206,"[{'accession': 'P03372', 'component_descriptio...",SINGLE PROTEIN,9606
1,[],Homo sapiens,Estrogen receptor,7.0,False,CHEMBL2093866,"[{'accession': 'Q92731', 'component_descriptio...",PROTEIN FAMILY,9606
2,[],Homo sapiens,Estrogen receptor/E3 ubiquitin-protein ligase ...,5.0,False,CHEMBL4523721,"[{'accession': 'P03372', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
3,[],Homo sapiens,Baculoviral IAP repeat-containing protein 2/Es...,4.0,False,CHEMBL3885521,"[{'accession': 'P03372', 'component_descriptio...",PROTEIN COMPLEX,9606
4,[],Homo sapiens,VHL/Estrogen receptor,1.0,False,CHEMBL4523726,"[{'accession': 'P03372', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
5,[],Homo sapiens,VHL/Cullin-2/Estrogen receptor alpha,1.0,False,CHEMBL4523754,"[{'accession': 'P03372', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
6,[],Homo sapiens,Protein cereblon/Estrogen receptor,0.0,False,CHEMBL4523681,"[{'accession': 'P03372', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
7,[],Homo sapiens,Protein cereblon/Cullin-4A/Estrogen receptor,0.0,False,CHEMBL4523713,"[{'accession': 'P03372', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606


## Select bioactivity data for:
### *Estrogen receptor alpha* in humans

Cancer is at present one of the leading causes of death in the world. It accounts for 13% of deaths worldwide and the number is continuously rising, with deaths estimated in the millions by 2030. Because of the poor availability of prevention, diagnosis and treatment of breast cancer, its mortality rate is at an alarming level globally. In women, hormone-dependent estrogen receptor-positive (ER+) breast cancer makes up approximately 75% of all breast cancers. Hence, it has drawn extensive attention from researchers towards the development of effective drugs for the treatment of hormone-dependent breast cancer. Estrogen, a female sex hormone, has a vital role in the initiation and progression of breast malignancy. Therefore, the estrogen receptor is the central target for the treatment of breast cancer.

In [3]:
# ChEMBL ID of estrogen receptor alpha in humans: the first row (index 0)
# of the search results table.
selected_target = targets['target_chembl_id'][0]
selected_target

'CHEMBL206'

We are specifically choosing molecules for which the *standard_type* is indicated as **'IC50'**. **'IC50'** is the concentration of a drug or inhibitor needed to inhibit a biological process or response by *50%*. **'IC50'** is commonly used as a measure of drug potency in whole cell assays. **'IC50'** assays are also used for screening in target-based drug discovery campaigns.

In [4]:
# Bioactivity records for the selected target, restricted to IC50 measurements.
activity = new_client.activity
res = (
    activity
    .filter(target_chembl_id=selected_target)
    .filter(standard_type='IC50')
)

In [5]:
# Materialise the filtered activity query as a DataFrame (one row per record).
# NOTE(review): presumably this is the step that actually pulls the records
# from ChEMBL, so it may be slow — confirm against the client documentation.
df = pd.DataFrame.from_dict(res)

In [6]:
# Show every column whenever a frame is displayed (global pandas option).
pd.set_option('display.max_columns', None)
# Display the raw download. Only the last expression of a cell is rendered, so
# a mid-cell `df.head(3)` has no effect, and an unassigned
# `df.reset_index(drop=True)` changes nothing; showing `df` directly yields the
# same table without the dead statements.
df

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,72003,[],CHEMBL679321,Binding affinity towards human estrogen recept...,B,,,BAO_0000190,BAO_0000357,single protein format,Oc1ccc2c(c1)S[C@H](C1CCCC1)[C@H](c1ccc(OCCN3CC...,,,CHEMBL1146734,Bioorg Med Chem Lett,2004.0,"{'bei': '19.57', 'le': '0.38', 'lle': '2.56', ...",CHEMBL431611,,CHEMBL431611,8.60,0,http://www.openphacts.org/units/Nanomolar,154447,=,1,1,=,,IC50,nM,,2.5,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,2.5
1,,,74062,[],CHEMBL679321,Binding affinity towards human estrogen recept...,B,,,BAO_0000190,BAO_0000357,single protein format,Oc1ccc2c(c1)S[C@H](C1CCCCCC1)[C@H](c1ccc(OCCN3...,,,CHEMBL1146734,Bioorg Med Chem Lett,2004.0,"{'bei': '17.37', 'le': '0.34', 'lle': '1.30', ...",CHEMBL316132,,CHEMBL316132,8.12,0,http://www.openphacts.org/units/Nanomolar,154445,=,1,1,=,,IC50,nM,,7.5,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,7.5
2,,,76289,[],CHEMBL679321,Binding affinity towards human estrogen recept...,B,,,BAO_0000190,BAO_0000357,single protein format,Oc1ccc([C@H]2Sc3cc(O)ccc3O[C@H]2c2ccc(OCCN3CCC...,,,CHEMBL1146734,Bioorg Med Chem Lett,2004.0,"{'bei': '18.35', 'le': '0.35', 'lle': '2.58', ...",CHEMBL304552,,CHEMBL304552,8.51,0,http://www.openphacts.org/units/Nanomolar,154440,=,1,1,=,,IC50,nM,,3.1,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,3.1
3,,,77402,[],CHEMBL679321,Binding affinity towards human estrogen recept...,B,,,BAO_0000190,BAO_0000357,single protein format,Oc1ccc2c(c1)S[C@H](CC1CCCCC1)[C@H](c1ccc(OCCN3...,,,CHEMBL1146734,Bioorg Med Chem Lett,2004.0,"{'bei': '17.98', 'le': '0.35', 'lle': '1.59', ...",CHEMBL85881,,CHEMBL85881,8.41,0,http://www.openphacts.org/units/Nanomolar,154443,=,1,1,=,,IC50,nM,,3.9,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,3.9
4,,,78475,[],CHEMBL679321,Binding affinity towards human estrogen recept...,B,,,BAO_0000190,BAO_0000357,single protein format,Oc1ccc2c(c1)S[C@H](Cc1ccccc1)[C@H](c1ccc(OCCN3...,,,CHEMBL1146734,Bioorg Med Chem Lett,2004.0,"{'bei': '17.61', 'le': '0.34', 'lle': '2.04', ...",CHEMBL85536,,CHEMBL85536,8.13,0,http://www.openphacts.org/units/Nanomolar,154451,=,1,1,=,,IC50,nM,,7.4,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5175,"{'action_type': 'ANTAGONIST', 'description': '...",,24911036,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5163453,Antagonist activity at ERalpha D538G mutant de...,B,P03372,D538G,BAO_0000190,BAO_0000219,cell-based format,O=C(O)c1ccc2c(c1)CCCC(c1ccc(Cl)cc1Cl)=C2c1ccc(...,,,CHEMBL5154828,J Med Chem,2022.0,"{'bei': '14.00', 'le': '0.28', 'lle': '-0.04',...",CHEMBL4475463,AMCENESTRANT,CHEMBL4475463,7.76,0,http://www.openphacts.org/units/Nanomolar,3893223,=,1,1,=,,IC50,nM,,17.2,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,17.2
5176,"{'action_type': 'ANTAGONIST', 'description': '...",,24911037,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5163453,Antagonist activity at ERalpha D538G mutant de...,B,P03372,D538G,BAO_0000190,BAO_0000219,cell-based format,CC(F)(F)c1cc(Cl)ccc1-c1sc2c(ccc3[nH]ncc32)c1Oc...,,,CHEMBL5154828,J Med Chem,2022.0,"{'bei': '13.62', 'le': '0.27', 'lle': '-0.63',...",CHEMBL5185539,,CHEMBL5185539,7.79,0,http://www.openphacts.org/units/Nanomolar,3893185,=,1,1,=,,IC50,nM,,16.1,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,16.1
5177,"{'action_type': 'ANTAGONIST', 'description': '...",,24911038,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5163453,Antagonist activity at ERalpha D538G mutant de...,B,P03372,D538G,BAO_0000190,BAO_0000219,cell-based format,CC(F)(F)c1cc(Cl)ccc1-c1sc2c(ccc3[nH]ncc32)c1Oc...,,,CHEMBL5154828,J Med Chem,2022.0,"{'bei': '13.62', 'le': '0.27', 'lle': '-0.68',...",CHEMBL5197035,,CHEMBL5197035,7.78,0,http://www.openphacts.org/units/Nanomolar,3893226,=,1,1,=,,IC50,nM,,16.6,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,16.6
5178,"{'action_type': 'ANTAGONIST', 'description': '...",,24926126,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5167822,Antagonist activity at ERalpha expressed in hu...,B,,,BAO_0000190,BAO_0000219,cell-based format,O=S(=O)(Oc1cccc2ccccc12)C1CC2OC1C(c1ccc([Se]c3...,,,CHEMBL5154927,J Med Chem,2022.0,"{'bei': '10.40', 'le': None, 'lle': None, 'sei...",CHEMBL5196589,,CHEMBL5196589,6.51,0,http://www.openphacts.org/units/Nanomolar,3896298,=,1,1,=,,IC50,nM,,310.0,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,uM,UO_0000065,,0.31


In [7]:
# Sanity check: list the distinct standard_type values present in the download
# (the query above filtered on 'IC50').
df['standard_type'].unique()

array(['IC50'], dtype=object)

In [8]:
# Write the raw, unfiltered IC50 records to CSV without the index column.
# NOTE: to_csv does not create missing directories, so Data/ must already exist.
df.to_csv('Data/estrogen_receptor_alpha_bioactivity_data_raw.csv', index=False)

## Handling missing data

If a compound has a missing value in the `standard_value` or `canonical_smiles` column, drop it.


In [9]:
# Keep only records that have both a measured value and a structure.
# dropna(subset=...) derives the row mask from the frame being filtered, so the
# result does not depend on index alignment between two different frames.
df2 = df.dropna(subset=['standard_value', 'canonical_smiles'])
# Renumber rows 0..n-1 after the drop.
df2 = df2.reset_index(drop=True)
df2

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,72003,[],CHEMBL679321,Binding affinity towards human estrogen recept...,B,,,BAO_0000190,BAO_0000357,single protein format,Oc1ccc2c(c1)S[C@H](C1CCCC1)[C@H](c1ccc(OCCN3CC...,,,CHEMBL1146734,Bioorg Med Chem Lett,2004.0,"{'bei': '19.57', 'le': '0.38', 'lle': '2.56', ...",CHEMBL431611,,CHEMBL431611,8.60,0,http://www.openphacts.org/units/Nanomolar,154447,=,1,1,=,,IC50,nM,,2.5,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,2.5
1,,,74062,[],CHEMBL679321,Binding affinity towards human estrogen recept...,B,,,BAO_0000190,BAO_0000357,single protein format,Oc1ccc2c(c1)S[C@H](C1CCCCCC1)[C@H](c1ccc(OCCN3...,,,CHEMBL1146734,Bioorg Med Chem Lett,2004.0,"{'bei': '17.37', 'le': '0.34', 'lle': '1.30', ...",CHEMBL316132,,CHEMBL316132,8.12,0,http://www.openphacts.org/units/Nanomolar,154445,=,1,1,=,,IC50,nM,,7.5,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,7.5
2,,,76289,[],CHEMBL679321,Binding affinity towards human estrogen recept...,B,,,BAO_0000190,BAO_0000357,single protein format,Oc1ccc([C@H]2Sc3cc(O)ccc3O[C@H]2c2ccc(OCCN3CCC...,,,CHEMBL1146734,Bioorg Med Chem Lett,2004.0,"{'bei': '18.35', 'le': '0.35', 'lle': '2.58', ...",CHEMBL304552,,CHEMBL304552,8.51,0,http://www.openphacts.org/units/Nanomolar,154440,=,1,1,=,,IC50,nM,,3.1,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,3.1
3,,,77402,[],CHEMBL679321,Binding affinity towards human estrogen recept...,B,,,BAO_0000190,BAO_0000357,single protein format,Oc1ccc2c(c1)S[C@H](CC1CCCCC1)[C@H](c1ccc(OCCN3...,,,CHEMBL1146734,Bioorg Med Chem Lett,2004.0,"{'bei': '17.98', 'le': '0.35', 'lle': '1.59', ...",CHEMBL85881,,CHEMBL85881,8.41,0,http://www.openphacts.org/units/Nanomolar,154443,=,1,1,=,,IC50,nM,,3.9,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,3.9
4,,,78475,[],CHEMBL679321,Binding affinity towards human estrogen recept...,B,,,BAO_0000190,BAO_0000357,single protein format,Oc1ccc2c(c1)S[C@H](Cc1ccccc1)[C@H](c1ccc(OCCN3...,,,CHEMBL1146734,Bioorg Med Chem Lett,2004.0,"{'bei': '17.61', 'le': '0.34', 'lle': '2.04', ...",CHEMBL85536,,CHEMBL85536,8.13,0,http://www.openphacts.org/units/Nanomolar,154451,=,1,1,=,,IC50,nM,,7.4,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4135,"{'action_type': 'ANTAGONIST', 'description': '...",,24911036,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5163453,Antagonist activity at ERalpha D538G mutant de...,B,P03372,D538G,BAO_0000190,BAO_0000219,cell-based format,O=C(O)c1ccc2c(c1)CCCC(c1ccc(Cl)cc1Cl)=C2c1ccc(...,,,CHEMBL5154828,J Med Chem,2022.0,"{'bei': '14.00', 'le': '0.28', 'lle': '-0.04',...",CHEMBL4475463,AMCENESTRANT,CHEMBL4475463,7.76,0,http://www.openphacts.org/units/Nanomolar,3893223,=,1,1,=,,IC50,nM,,17.2,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,17.2
4136,"{'action_type': 'ANTAGONIST', 'description': '...",,24911037,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5163453,Antagonist activity at ERalpha D538G mutant de...,B,P03372,D538G,BAO_0000190,BAO_0000219,cell-based format,CC(F)(F)c1cc(Cl)ccc1-c1sc2c(ccc3[nH]ncc32)c1Oc...,,,CHEMBL5154828,J Med Chem,2022.0,"{'bei': '13.62', 'le': '0.27', 'lle': '-0.63',...",CHEMBL5185539,,CHEMBL5185539,7.79,0,http://www.openphacts.org/units/Nanomolar,3893185,=,1,1,=,,IC50,nM,,16.1,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,16.1
4137,"{'action_type': 'ANTAGONIST', 'description': '...",,24911038,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5163453,Antagonist activity at ERalpha D538G mutant de...,B,P03372,D538G,BAO_0000190,BAO_0000219,cell-based format,CC(F)(F)c1cc(Cl)ccc1-c1sc2c(ccc3[nH]ncc32)c1Oc...,,,CHEMBL5154828,J Med Chem,2022.0,"{'bei': '13.62', 'le': '0.27', 'lle': '-0.68',...",CHEMBL5197035,,CHEMBL5197035,7.78,0,http://www.openphacts.org/units/Nanomolar,3893226,=,1,1,=,,IC50,nM,,16.6,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,16.6
4138,"{'action_type': 'ANTAGONIST', 'description': '...",,24926126,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5167822,Antagonist activity at ERalpha expressed in hu...,B,,,BAO_0000190,BAO_0000219,cell-based format,O=S(=O)(Oc1cccc2ccccc12)C1CC2OC1C(c1ccc([Se]c3...,,,CHEMBL5154927,J Med Chem,2022.0,"{'bei': '10.40', 'le': None, 'lle': None, 'sei...",CHEMBL5196589,,CHEMBL5196589,6.51,0,http://www.openphacts.org/units/Nanomolar,3896298,=,1,1,=,,IC50,nM,,310.0,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,uM,UO_0000065,,0.31


In [10]:
# Number of distinct structures among the remaining records
# (dropna=False keeps the count identical to len(unique())).
df2['canonical_smiles'].nunique(dropna=False)

3078

In [11]:
# One row per structure: for repeated SMILES only the first record is kept
# (drop_duplicates default), then rows are renumbered 0..n-1.
df2_nr = (
    df2
    .drop_duplicates(subset=['canonical_smiles'])
    .reset_index(drop=True)
)
df2_nr

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,72003,[],CHEMBL679321,Binding affinity towards human estrogen recept...,B,,,BAO_0000190,BAO_0000357,single protein format,Oc1ccc2c(c1)S[C@H](C1CCCC1)[C@H](c1ccc(OCCN3CC...,,,CHEMBL1146734,Bioorg Med Chem Lett,2004.0,"{'bei': '19.57', 'le': '0.38', 'lle': '2.56', ...",CHEMBL431611,,CHEMBL431611,8.60,0,http://www.openphacts.org/units/Nanomolar,154447,=,1,1,=,,IC50,nM,,2.5,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,2.5
1,,,74062,[],CHEMBL679321,Binding affinity towards human estrogen recept...,B,,,BAO_0000190,BAO_0000357,single protein format,Oc1ccc2c(c1)S[C@H](C1CCCCCC1)[C@H](c1ccc(OCCN3...,,,CHEMBL1146734,Bioorg Med Chem Lett,2004.0,"{'bei': '17.37', 'le': '0.34', 'lle': '1.30', ...",CHEMBL316132,,CHEMBL316132,8.12,0,http://www.openphacts.org/units/Nanomolar,154445,=,1,1,=,,IC50,nM,,7.5,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,7.5
2,,,76289,[],CHEMBL679321,Binding affinity towards human estrogen recept...,B,,,BAO_0000190,BAO_0000357,single protein format,Oc1ccc([C@H]2Sc3cc(O)ccc3O[C@H]2c2ccc(OCCN3CCC...,,,CHEMBL1146734,Bioorg Med Chem Lett,2004.0,"{'bei': '18.35', 'le': '0.35', 'lle': '2.58', ...",CHEMBL304552,,CHEMBL304552,8.51,0,http://www.openphacts.org/units/Nanomolar,154440,=,1,1,=,,IC50,nM,,3.1,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,3.1
3,,,77402,[],CHEMBL679321,Binding affinity towards human estrogen recept...,B,,,BAO_0000190,BAO_0000357,single protein format,Oc1ccc2c(c1)S[C@H](CC1CCCCC1)[C@H](c1ccc(OCCN3...,,,CHEMBL1146734,Bioorg Med Chem Lett,2004.0,"{'bei': '17.98', 'le': '0.35', 'lle': '1.59', ...",CHEMBL85881,,CHEMBL85881,8.41,0,http://www.openphacts.org/units/Nanomolar,154443,=,1,1,=,,IC50,nM,,3.9,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,3.9
4,,,78475,[],CHEMBL679321,Binding affinity towards human estrogen recept...,B,,,BAO_0000190,BAO_0000357,single protein format,Oc1ccc2c(c1)S[C@H](Cc1ccccc1)[C@H](c1ccc(OCCN3...,,,CHEMBL1146734,Bioorg Med Chem Lett,2004.0,"{'bei': '17.61', 'le': '0.34', 'lle': '2.04', ...",CHEMBL85536,,CHEMBL85536,8.13,0,http://www.openphacts.org/units/Nanomolar,154451,=,1,1,=,,IC50,nM,,7.4,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3073,"{'action_type': 'ANTAGONIST', 'description': '...",,24911033,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5163452,Antagonist activity at ERalpha Y537S mutant de...,B,P03372,Y537S,BAO_0000190,BAO_0000219,cell-based format,O=C(O)c1ccc2c(c1)CCCC(c1ccc(Cl)cc1Cl)=C2c1ccc(...,,,CHEMBL5154828,J Med Chem,2022.0,"{'bei': '13.05', 'le': '0.26', 'lle': '-0.56',...",CHEMBL4475463,AMCENESTRANT,CHEMBL4475463,7.24,0,http://www.openphacts.org/units/Nanomolar,3893223,=,1,1,=,,IC50,nM,,57.7,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,57.7
3074,"{'action_type': 'ANTAGONIST', 'description': '...",,24911034,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5163452,Antagonist activity at ERalpha Y537S mutant de...,B,P03372,Y537S,BAO_0000190,BAO_0000219,cell-based format,CC(F)(F)c1cc(Cl)ccc1-c1sc2c(ccc3[nH]ncc32)c1Oc...,,,CHEMBL5154828,J Med Chem,2022.0,"{'bei': '13.96', 'le': '0.28', 'lle': '-0.43',...",CHEMBL5185539,,CHEMBL5185539,7.99,0,http://www.openphacts.org/units/Nanomolar,3893185,=,1,1,=,,IC50,nM,,10.3,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,10.3
3075,"{'action_type': 'ANTAGONIST', 'description': '...",,24911035,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5163452,Antagonist activity at ERalpha Y537S mutant de...,B,P03372,Y537S,BAO_0000190,BAO_0000219,cell-based format,CC(F)(F)c1cc(Cl)ccc1-c1sc2c(ccc3[nH]ncc32)c1Oc...,,,CHEMBL5154828,J Med Chem,2022.0,"{'bei': '13.80', 'le': '0.28', 'lle': '-0.58',...",CHEMBL5197035,,CHEMBL5197035,7.88,0,http://www.openphacts.org/units/Nanomolar,3893226,=,1,1,=,,IC50,nM,,13.1,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,nM,UO_0000065,,13.1
3076,"{'action_type': 'ANTAGONIST', 'description': '...",,24926126,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5167822,Antagonist activity at ERalpha expressed in hu...,B,,,BAO_0000190,BAO_0000219,cell-based format,O=S(=O)(Oc1cccc2ccccc12)C1CC2OC1C(c1ccc([Se]c3...,,,CHEMBL5154927,J Med Chem,2022.0,"{'bei': '10.40', 'le': None, 'lle': None, 'sei...",CHEMBL5196589,,CHEMBL5196589,6.51,0,http://www.openphacts.org/units/Nanomolar,3896298,=,1,1,=,,IC50,nM,,310.0,CHEMBL206,Homo sapiens,Estrogen receptor alpha,9606,,,IC50,uM,UO_0000065,,0.31


## Data pre-processing of the bioactivity data

Select the three columns needed downstream (`molecule_chembl_id`, `canonical_smiles`, `standard_value`) into a new DataFrame; the bioactivity class is added in the labeling step below.

In [12]:
# Columns carried forward: compound ID, structure and the standardised value.
selection = [
    'molecule_chembl_id',
    'canonical_smiles',
    'standard_value',
]
df3 = df2_nr.loc[:, selection]
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL431611,Oc1ccc2c(c1)S[C@H](C1CCCC1)[C@H](c1ccc(OCCN3CC...,2.5
1,CHEMBL316132,Oc1ccc2c(c1)S[C@H](C1CCCCCC1)[C@H](c1ccc(OCCN3...,7.5
2,CHEMBL304552,Oc1ccc([C@H]2Sc3cc(O)ccc3O[C@H]2c2ccc(OCCN3CCC...,3.1
3,CHEMBL85881,Oc1ccc2c(c1)S[C@H](CC1CCCCC1)[C@H](c1ccc(OCCN3...,3.9
4,CHEMBL85536,Oc1ccc2c(c1)S[C@H](Cc1ccccc1)[C@H](c1ccc(OCCN3...,7.4
...,...,...,...
3073,CHEMBL4475463,O=C(O)c1ccc2c(c1)CCCC(c1ccc(Cl)cc1Cl)=C2c1ccc(...,57.7
3074,CHEMBL5185539,CC(F)(F)c1cc(Cl)ccc1-c1sc2c(ccc3[nH]ncc32)c1Oc...,10.3
3075,CHEMBL5197035,CC(F)(F)c1cc(Cl)ccc1-c1sc2c(ccc3[nH]ncc32)c1Oc...,13.1
3076,CHEMBL5196589,O=S(=O)(Oc1cccc2ccccc12)C1CC2OC1C(c1ccc([Se]c3...,310.0


In [13]:
# Write the three-column, de-duplicated table to CSV without the index column.
# NOTE: to_csv does not create missing directories, so Data/ must already exist.
df3.to_csv('Data/estrogen_receptor_alpha_bioactivity_data_preprocessed.csv', index=False)

### Labeling compounds as either being active, inactive or intermediate
The bioactivity data are **'IC50'** values expressed in nM. Compounds with a value of 1,000 nM or less are considered *active*, while those with a value of 10,000 nM or more are considered *inactive*. Compounds with values strictly between 1,000 and 10,000 nM are referred to as *intermediate*.

In [14]:
def _classify_ic50(ic50_value):
    """Return the activity label for a single standard_value entry.

    The value is converted with float() before comparison: 10000 or more is
    "inactive", 1000 or less is "active", anything else is "intermediate".
    """
    if float(ic50_value) >= 10000:
        return "inactive"
    if float(ic50_value) <= 1000:
        return "active"
    return "intermediate"


# One label per row of df3, in row order.
bioactivity_threshold = [_classify_ic50(ic50) for ic50 in df3['standard_value']]

In [15]:
# Attach the labels as a 'class' column; concat pairs rows by index label, and
# both pieces are numbered 0..n-1.
bioactivity_class = pd.Series(bioactivity_threshold, name='class')
curated_columns = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df4 = pd.concat(
    [df3[curated_columns], bioactivity_class],
    axis='columns',
)
df4

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
0,CHEMBL431611,Oc1ccc2c(c1)S[C@H](C1CCCC1)[C@H](c1ccc(OCCN3CC...,2.5,active
1,CHEMBL316132,Oc1ccc2c(c1)S[C@H](C1CCCCCC1)[C@H](c1ccc(OCCN3...,7.5,active
2,CHEMBL304552,Oc1ccc([C@H]2Sc3cc(O)ccc3O[C@H]2c2ccc(OCCN3CCC...,3.1,active
3,CHEMBL85881,Oc1ccc2c(c1)S[C@H](CC1CCCCC1)[C@H](c1ccc(OCCN3...,3.9,active
4,CHEMBL85536,Oc1ccc2c(c1)S[C@H](Cc1ccccc1)[C@H](c1ccc(OCCN3...,7.4,active
...,...,...,...,...
3073,CHEMBL4475463,O=C(O)c1ccc2c(c1)CCCC(c1ccc(Cl)cc1Cl)=C2c1ccc(...,57.7,active
3074,CHEMBL5185539,CC(F)(F)c1cc(Cl)ccc1-c1sc2c(ccc3[nH]ncc32)c1Oc...,10.3,active
3075,CHEMBL5197035,CC(F)(F)c1cc(Cl)ccc1-c1sc2c(ccc3[nH]ncc32)c1Oc...,13.1,active
3076,CHEMBL5196589,O=S(=O)(Oc1cccc2ccccc12)C1CC2OC1C(c1ccc([Se]c3...,310.0,active


In [16]:
# Sanity check: list the distinct labels that were assigned.
df4['class'].unique()

array(['active', 'intermediate', 'inactive'], dtype=object)

In [17]:
# Write the labelled table to CSV without the index column.
# NOTE: to_csv does not create missing directories, so Data/ must already exist.
df4.to_csv('Data/estrogen_receptor_alpha_bioactivity_data_curated.csv', index=False)