In [133]:
from IPython.display import display, HTML

In [134]:
# @title
# Title banner markup: a bordered green bar with the notebook title,
# followed by the ChEMBL logo centred underneath it.
banner_html = """
<div style="padding:20px;
            color:#150d0a;
            margin:0px;
            font-size:220%;
            text-align:center;
            display:block;
            border-width: 5px;
            border-style: solid;
            border-color: #150d0a;
            background-color: #90EE90;
            overflow:hidden;
            font-weight:500;
            border-radius: 0;">
    ChEMBL Database Query
</div>
<center>
    <img src="https://upload.wikimedia.org/wikipedia/commons/a/a1/Chembl_logo.png" width="400" style="margin-top: 20px;">
</center>
"""

# Render the markup as rich HTML output instead of plain text
display(HTML(data=banner_html))

ChEMBL 34, released on 28/03/2024, includes a full update to drug and clinical candidate data. This version contains 2,431,025 compounds (with 2,409,270 having associated mol files), 3,106,257 compound records, 20,772,701 activity data points, 1,644,390 assays, 15,598 targets, and 89,892 documents.

In [135]:
# @title
# Section banner: bordered green title bar announcing the "Import Libraries" step.
# The markup is a single self-contained <div>; no trailing <center> wrapper is
# opened because nothing follows the title bar that would need centring.
banner_html = """
<div style="padding:20px;
            color:#150d0a;
            margin:0px;
            font-size:220%;
            text-align:center;
            display:block;
            border-width: 5px;
            border-style: solid;
            border-color: #150d0a;
            background-color: #90EE90;
            overflow:hidden;
            font-weight:500;
            border-radius: 0;">
    Import Libraries
</div>
"""

# Render the banner as rich HTML output
display(HTML(banner_html))

Install the ChEMBL web service package so that we can retrieve bioactivity data from the ChEMBL Database.

In [136]:
! pip install chembl_webresource_client



In [137]:
# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

In [138]:
# @title
# Section banner: bordered green title bar announcing the target-search step.
# The markup is a single self-contained <div>; no trailing <center> wrapper is
# opened because nothing follows the title bar that would need centring.
banner_html = """
<div style="padding:20px;
            color:#150d0a;
            margin:0px;
            font-size:220%;
            text-align:center;
            display:block;
            border-width: 5px;
            border-style: solid;
            border-color: #150d0a;
            background-color: #90EE90;
            overflow:hidden;
            font-weight:500;
            border-radius: 0;">
    Search for Target Protein
</div>
"""

# Render the banner as rich HTML output
display(HTML(banner_html))

In [139]:
# Free-text term sent to the ChEMBL target search
search_term = "RIPK1"

# Run the search and tabulate every matching target record
target = new_client.target
target_query = target.search(search_term)
targets = pd.DataFrame.from_dict(data=target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Rattus norvegicus,Receptor (TNFRSF)-interacting serine-threonine...,16.0,False,CHEMBL4523112,"[{'accession': 'D3ZYL0', 'component_descriptio...",SINGLE PROTEIN,10116
1,[],Mus musculus,Receptor-interacting serine/threonine-protein ...,13.0,False,CHEMBL3784911,"[{'accession': 'Q60855', 'component_descriptio...",SINGLE PROTEIN,10090
2,[],Homo sapiens,Receptor-interacting serine/threonine-protein ...,12.0,False,CHEMBL5464,"[{'accession': 'Q13546', 'component_descriptio...",SINGLE PROTEIN,9606
3,[],Homo sapiens,Cereblon/Receptor-interacting serine/threonine...,12.0,False,CHEMBL5465217,"[{'accession': 'Q13546', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
4,[],Homo sapiens,Mitogen-activated protein kinase 10/Receptor-i...,8.0,False,CHEMBL3885602,"[{'accession': 'P53779', 'component_descriptio...",PROTEIN FAMILY,9606


**Select and retrieve bioactivity data for desired protein**

We pick the row index of the desired target from the table above (here index 2, the human single-protein entry CHEMBL5464).

In [140]:
# Row label (left-most column of the search-result table) of the target to analyse
index_of_target = 2

# Look up the ChEMBL ID stored in that row
selected_target = targets.loc[index_of_target, "target_chembl_id"]
selected_target

'CHEMBL5464'

In [141]:
# @title
# Section banner: bordered green title bar announcing the IC50 filtering step.
# The markup is a single self-contained <div>; no trailing <center> wrapper is
# opened because nothing follows the title bar that would need centring.
banner_html = """
<div style="padding:20px;
            color:#150d0a;
            margin:0px;
            font-size:220%;
            text-align:center;
            display:block;
            border-width: 5px;
            border-style: solid;
            border-color: #150d0a;
            background-color: #90EE90;
            overflow:hidden;
            font-weight:500;
            border-radius: 0;">
    Filtering of Target Protein by IC50 Values
</div>
"""

# Render the banner as rich HTML output
display(HTML(banner_html))

Here, we will retrieve only the bioactivity data for the selected target, human *receptor-interacting serine/threonine-protein kinase 1* (RIPK1, CHEMBL5464), that are reported as IC$_{50}$ values in nM (nanomolar) units.

In [142]:
# Query the activity endpoint: restrict to the chosen target, then to IC50 records
activity = new_client.activity
res = (
    activity
    .filter(target_chembl_id=selected_target)
    .filter(standard_type="IC50")
)

In [143]:
# Materialise the IC50 query results as a DataFrame (one row per activity record)
df = pd.DataFrame.from_dict(data=res)

In [144]:
# Preview the first ten IC50 records
df.head(n=10)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,13890064,[],CHEMBL3110728,Inhibition of RIP1 in human U937 cells assesse...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,0.2
1,,,13890065,[],CHEMBL3110728,Inhibition of RIP1 in human U937 cells assesse...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,7.9
2,,,13890066,[],CHEMBL3110728,Inhibition of RIP1 in human U937 cells assesse...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,0.79
3,,,13890067,[],CHEMBL3110728,Inhibition of RIP1 in human U937 cells assesse...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,7.9
4,,,13890068,[],CHEMBL3110728,Inhibition of RIP1 in human U937 cells assesse...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,0.25
5,,,13890069,[],CHEMBL3110728,Inhibition of RIP1 in human U937 cells assesse...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,10.0
6,,,13890070,[],CHEMBL3110728,Inhibition of RIP1 in human U937 cells assesse...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,1.3
7,,,13890071,[],CHEMBL3110728,Inhibition of RIP1 in human U937 cells assesse...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,10.0
8,,,13890072,[],CHEMBL3110728,Inhibition of RIP1 in human U937 cells assesse...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,0.25
9,,,13890073,[],CHEMBL3110728,Inhibition of RIP1 in human U937 cells assesse...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,5.0


Finally, we will save the resulting raw bioactivity data to the CSV file **bioactivity_data_raw.csv**.

In [145]:
# Persist the unfiltered IC50 records; the row index is not written
df.to_csv(path_or_buf="bioactivity_data_raw.csv", index=False)

## **Handling missing data**
If any compound has a missing value in the **standard_value** column, drop it.

In [146]:
# Keep only the records that report a standard_value
df2 = df.loc[df["standard_value"].notna()]
df2

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,13890064,[],CHEMBL3110728,Inhibition of RIP1 in human U937 cells assesse...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,0.2
1,,,13890065,[],CHEMBL3110728,Inhibition of RIP1 in human U937 cells assesse...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,7.9
2,,,13890066,[],CHEMBL3110728,Inhibition of RIP1 in human U937 cells assesse...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,0.79
3,,,13890067,[],CHEMBL3110728,Inhibition of RIP1 in human U937 cells assesse...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,7.9
4,,,13890068,[],CHEMBL3110728,Inhibition of RIP1 in human U937 cells assesse...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
625,"{'action_type': 'INHIBITOR', 'description': 'N...",,25710194,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5388027,Inhibition of RIPK1 (unknown origin) using MBP...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,0.12
626,"{'action_type': 'INHIBITOR', 'description': 'N...",,25710195,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5388027,Inhibition of RIPK1 (unknown origin) using MBP...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,0.1
627,"{'action_type': 'INHIBITOR', 'description': 'N...",,25710196,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5388027,Inhibition of RIPK1 (unknown origin) using MBP...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,0.088
628,"{'action_type': 'INHIBITOR', 'description': 'N...",,25710197,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5388027,Inhibition of RIPK1 (unknown origin) using MBP...,B,,,BAO_0000190,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,IC50,uM,UO_0000065,,0.1


In [147]:
# @title
# Section banner: bordered green title bar announcing the IC50 labelling step.
# The markup is a single self-contained <div>; no trailing <center> wrapper is
# opened because nothing follows the title bar that would need centring.
banner_html = """
<div style="padding:20px;
            color:#150d0a;
            margin:0px;
            font-size:220%;
            text-align:center;
            display:block;
            border-width: 5px;
            border-style: solid;
            border-color: #150d0a;
            background-color: #90EE90;
            overflow:hidden;
            font-weight:500;
            border-radius: 0;">
    Labeling bioactivity based on IC50
</div>
"""

# Render the banner as rich HTML output
display(HTML(banner_html))


The bioactivity values are IC50 measurements expressed in nM. Compounds with values of 1,000 nM or less will be considered **active**, while those with values of 10,000 nM or more will be considered **inactive**. Values in between 1,000 and 10,000 nM will be referred to as **intermediate**.

In [148]:
# Label every IC50 measurement, in row order:
#   >= 10000 -> "inactive", <= 1000 -> "active", anything else -> "intermediate"
bioactivity_class = [
    "inactive" if float(value) >= 10000
    else "active" if float(value) <= 1000
    else "intermediate"
    for value in df2.standard_value
]


### **Combine the 3 columns (molecule_chembl_id,canonical_smiles,standard_value) and bioactivity_class into a DataFrame**

In [149]:
# Columns carried forward into the preprocessed IC50 table
selection = [
    "molecule_chembl_id",
    "canonical_smiles",
    "standard_value",
]
df3 = df2.loc[:, selection]
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL3109221,CC(C)(C)c1cc(NC(=O)Nc2ccc(-c3coc4ncnc(N)c34)cc...,200.0
1,CHEMBL396107,Nc1ncnc2occ(-c3ccc(NC(=O)Nc4cc(C(F)(F)F)cc(C(F...,7900.0
2,CHEMBL436806,Nc1ncnc2occ(-c3ccc(NC(=O)Nc4ccc5ccccc5c4)cc3)c12,790.0
3,CHEMBL3109220,Nc1ncnc2occ(-c3ccc(NC(=O)Nc4cccc(Cl)c4)cc3)c12,7900.0
4,CHEMBL3109219,CCc1cccc(NC(=O)Nc2ccc(-c3coc4ncnc(N)c34)cc2)c1,250.0
...,...,...,...
625,CHEMBL5434486,O=C(c1ccc(-c2cc3cccnc3[nH]2)cc1)N1CCCCC1,120.0
626,CHEMBL5404074,O=C(c1ccc(-c2cc3cccnc3[nH]2)cc1)N1CCNCC1,100.0
627,CHEMBL5427291,O=C(c1ccc(-c2cc3cccnc3[nH]2)cc1)N1CCOCC1,88.0
628,CHEMBL5434132,CN1CCN(C(=O)c2ccc(-c3cc4cccnc4[nH]3)cc2)CC1,100.0


In [150]:
# Turn the labels into a named Series with a fresh 0..n-1 index
bioactivity_class = pd.Series(data=bioactivity_class, name="bioactivity").reset_index(drop=True)

# Re-index the selected columns the same way so the two pieces line up row by row
df4 = pd.concat(
    objs=[df3.reset_index(drop=True), bioactivity_class],
    axis="columns",
)
df4

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity
0,CHEMBL3109221,CC(C)(C)c1cc(NC(=O)Nc2ccc(-c3coc4ncnc(N)c34)cc...,200.0,active
1,CHEMBL396107,Nc1ncnc2occ(-c3ccc(NC(=O)Nc4cc(C(F)(F)F)cc(C(F...,7900.0,intermediate
2,CHEMBL436806,Nc1ncnc2occ(-c3ccc(NC(=O)Nc4ccc5ccccc5c4)cc3)c12,790.0,active
3,CHEMBL3109220,Nc1ncnc2occ(-c3ccc(NC(=O)Nc4cccc(Cl)c4)cc3)c12,7900.0,intermediate
4,CHEMBL3109219,CCc1cccc(NC(=O)Nc2ccc(-c3coc4ncnc(N)c34)cc2)c1,250.0,active
...,...,...,...,...
616,CHEMBL5434486,O=C(c1ccc(-c2cc3cccnc3[nH]2)cc1)N1CCCCC1,120.0,active
617,CHEMBL5404074,O=C(c1ccc(-c2cc3cccnc3[nH]2)cc1)N1CCNCC1,100.0,active
618,CHEMBL5427291,O=C(c1ccc(-c2cc3cccnc3[nH]2)cc1)N1CCOCC1,88.0,active
619,CHEMBL5434132,CN1CCN(C(=O)c2ccc(-c3cc4cccnc4[nH]3)cc2)CC1,100.0,active


In [151]:
# @title
# Section banner: bordered green title bar announcing the IC50 CSV export step.
# The markup is a single self-contained <div>; no trailing <center> wrapper is
# opened because nothing follows the title bar that would need centring.
banner_html = """
<div style="padding:20px;
            color:#150d0a;
            margin:0px;
            font-size:220%;
            text-align:center;
            display:block;
            border-width: 5px;
            border-style: solid;
            border-color: #150d0a;
            background-color: #90EE90;
            overflow:hidden;
            font-weight:500;
            border-radius: 0;">
    Exporting CSV file of IC50 dataset
</div>
"""

# Render the banner as rich HTML output
display(HTML(banner_html))

In [152]:
# Persist the labelled IC50 table; the row index is not written
df4.to_csv(path_or_buf="IC50_bioactivity_data_preprocessed.csv", index=False)

In [166]:
# Offer the preprocessed IC50 table as a browser download when running on Google Colab.
try:
    from google.colab import files
except ImportError:
    # google.colab is only importable inside Colab; without it there is nothing
    # to trigger, and the CSV written earlier stays in the working directory.
    files = None

if files is None:
    print("google.colab is not available; 'IC50_bioactivity_data_preprocessed.csv' was left in the working directory.")
else:
    files.download('IC50_bioactivity_data_preprocessed.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

---

In [154]:
# @title
# Section banner: bordered green title bar announcing the EC50 filtering step.
# The markup is a single self-contained <div>; no trailing <center> wrapper is
# opened because nothing follows the title bar that would need centring.
banner_html = """
<div style="padding:20px;
            color:#150d0a;
            margin:0px;
            font-size:220%;
            text-align:center;
            display:block;
            border-width: 5px;
            border-style: solid;
            border-color: #150d0a;
            background-color: #90EE90;
            overflow:hidden;
            font-weight:500;
            border-radius: 0;">
    Filtering of Target Protein by EC50 Values
</div>
"""

# Render the banner as rich HTML output
display(HTML(banner_html))

Here, we will retrieve only the bioactivity data for the selected target, human *receptor-interacting serine/threonine-protein kinase 1* (RIPK1, CHEMBL5464), that are reported as EC$_{50}$ values in nM (nanomolar) units.

In [155]:
# Query the activity endpoint: restrict to the chosen target, then to EC50 records
activity = new_client.activity
res1 = (
    activity
    .filter(target_chembl_id=selected_target)
    .filter(standard_type="EC50")
)

In [156]:
# Materialise the EC50 query results as a DataFrame (one row per activity record)
dfe1 = pd.DataFrame.from_dict(data=res1)

In [157]:
# Preview the first ten EC50 records
dfe1.head(n=10)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,3395880,[],CHEMBL1221002,Inhibition of endogenous RIP1 autophosphorylat...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,182.0
1,,,3395881,[],CHEMBL1221002,Inhibition of endogenous RIP1 autophosphorylat...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,18.0
2,,,17953082,[],CHEMBL3993115,Inhibition of RIP1 in human HT-29 cells assess...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,74.0
3,,,17953083,[],CHEMBL3993115,Inhibition of RIP1 in human HT-29 cells assess...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,49.0
4,,,17953084,[],CHEMBL3993115,Inhibition of RIP1 in human HT-29 cells assess...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,262.0
5,,,17953085,[],CHEMBL3993115,Inhibition of RIP1 in human HT-29 cells assess...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,56.0
6,,,17953086,[],CHEMBL3993115,Inhibition of RIP1 in human HT-29 cells assess...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,198.0
7,,,17953087,[],CHEMBL3993115,Inhibition of RIP1 in human HT-29 cells assess...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,64.0
8,,,17953088,[],CHEMBL3993115,Inhibition of RIP1 in human HT-29 cells assess...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,28.0
9,,,17953089,[],CHEMBL3993115,Inhibition of RIP1 in human HT-29 cells assess...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,10000.0


Finally, we will save the resulting raw EC50 bioactivity data to the CSV file **ebioactivity_data_raw.csv**.

In [158]:
# Persist the unfiltered EC50 records; the row index is not written
dfe1.to_csv(path_or_buf="ebioactivity_data_raw.csv", index=False)

## **Handling missing data**
If any compound has a missing value in the **standard_value** column, drop it.

In [159]:
# Keep only the records that report a standard_value
dfe2 = dfe1.loc[dfe1["standard_value"].notna()]
dfe2

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,3395880,[],CHEMBL1221002,Inhibition of endogenous RIP1 autophosphorylat...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,182.0
1,,,3395881,[],CHEMBL1221002,Inhibition of endogenous RIP1 autophosphorylat...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,18.0
2,,,17953082,[],CHEMBL3993115,Inhibition of RIP1 in human HT-29 cells assess...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,74.0
3,,,17953083,[],CHEMBL3993115,Inhibition of RIP1 in human HT-29 cells assess...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,49.0
4,,,17953084,[],CHEMBL3993115,Inhibition of RIP1 in human HT-29 cells assess...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,262.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,"{'action_type': 'INHIBITOR', 'description': 'N...",,25609917,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5360422,Inhibition of recombinant GST tagged RIPK1 (1 ...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,16.7
120,"{'action_type': 'INHIBITOR', 'description': 'N...",,25609918,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5360422,Inhibition of recombinant GST tagged RIPK1 (1 ...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,17.1
121,"{'action_type': 'INHIBITOR', 'description': 'N...",,25609919,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5360422,Inhibition of recombinant GST tagged RIPK1 (1 ...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,15.75
122,"{'action_type': 'INHIBITOR', 'description': 'N...",,25609920,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5360422,Inhibition of recombinant GST tagged RIPK1 (1 ...,B,,,BAO_0000188,...,Homo sapiens,Receptor-interacting serine/threonine-protein ...,9606,,,EC50,nM,UO_0000065,,16.1


In [160]:
# @title
# Section banner: bordered green title bar announcing the EC50 labelling step.
# The markup is a single self-contained <div>; no trailing <center> wrapper is
# opened because nothing follows the title bar that would need centring.
banner_html = """
<div style="padding:20px;
            color:#150d0a;
            margin:0px;
            font-size:220%;
            text-align:center;
            display:block;
            border-width: 5px;
            border-style: solid;
            border-color: #150d0a;
            background-color: #90EE90;
            overflow:hidden;
            font-weight:500;
            border-radius: 0;">
    Labeling bioactivity based on EC50
</div>
"""

# Render the banner as rich HTML output
display(HTML(banner_html))


The bioactivity values are EC50 measurements expressed in nM. Compounds with values of 1,000 nM or less will be considered **active**, while those with values of 10,000 nM or more will be considered **inactive**. Values in between 1,000 and 10,000 nM will be referred to as **intermediate**.

In [161]:
# Label every EC50 measurement, in row order:
#   >= 10000 -> "inactive", <= 1000 -> "active", anything else -> "intermediate"
bioactivity_class_e = [
    "inactive" if float(value) >= 10000
    else "active" if float(value) <= 1000
    else "intermediate"
    for value in dfe2.standard_value
]

### **Combine the 3 columns (molecule_chembl_id,canonical_smiles,standard_value) and bioactivity_class into a DataFrame**

In [162]:
# Columns carried forward into the preprocessed EC50 table
selection = [
    "molecule_chembl_id",
    "canonical_smiles",
    "standard_value",
]
dfe3 = dfe2.loc[:, selection]
dfe3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL195008,CN1C(=O)C(Cc2c[nH]c3ccccc23)NC1=S,182.0
1,CHEMBL370438,CN1C(=O)N[C@H](Cc2c[nH]c3c(Cl)cccc23)C1=O,18.0
2,CHEMBL4061192,CCC(C)(C)C(=O)N(O)Cc1cc(F)cc(F)c1F,74.0
3,CHEMBL4083065,CCC(C)(C)C(=O)N(O)Cc1cc(F)c(F)c(F)c1,49.0
4,CHEMBL4099502,CCC(C)(C)C(=O)N(O)Cc1ccc(F)c(F)c1F,262.0
...,...,...,...
119,CHEMBL5411784,O=c1n(C23CC(F)(C2)C3)nc2n1[C@H](c1cc(F)c(Cl)c(...,16.7
120,CHEMBL5418714,O=c1n(C23CC(F)(C2)C3)nc2n1[C@H](c1ccc(Cl)c(F)c...,17.1
121,CHEMBL5428474,Cc1c(F)cc([C@@H]2CCc3nn(C45CC(F)(C4)C5)c(=O)n3...,15.75
122,CHEMBL5413896,Cc1c(F)cc([C@@H]2CCc3nn(C45CC(C#N)(C4)C5)c(=O)...,16.1


In [163]:
# Turn the labels into a named Series with a fresh 0..n-1 index
bioactivity_class_e = pd.Series(data=bioactivity_class_e, name="bioactivity").reset_index(drop=True)

# Re-index the selected columns the same way so the two pieces line up row by row
dfe4 = pd.concat(
    objs=[dfe3.reset_index(drop=True), bioactivity_class_e],
    axis="columns",
)
dfe4

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity
0,CHEMBL195008,CN1C(=O)C(Cc2c[nH]c3ccccc23)NC1=S,182.0,active
1,CHEMBL370438,CN1C(=O)N[C@H](Cc2c[nH]c3c(Cl)cccc23)C1=O,18.0,active
2,CHEMBL4061192,CCC(C)(C)C(=O)N(O)Cc1cc(F)cc(F)c1F,74.0,active
3,CHEMBL4083065,CCC(C)(C)C(=O)N(O)Cc1cc(F)c(F)c(F)c1,49.0,active
4,CHEMBL4099502,CCC(C)(C)C(=O)N(O)Cc1ccc(F)c(F)c1F,262.0,active
...,...,...,...,...
77,CHEMBL5411784,O=c1n(C23CC(F)(C2)C3)nc2n1[C@H](c1cc(F)c(Cl)c(...,16.7,active
78,CHEMBL5418714,O=c1n(C23CC(F)(C2)C3)nc2n1[C@H](c1ccc(Cl)c(F)c...,17.1,active
79,CHEMBL5428474,Cc1c(F)cc([C@@H]2CCc3nn(C45CC(F)(C4)C5)c(=O)n3...,15.75,active
80,CHEMBL5413896,Cc1c(F)cc([C@@H]2CCc3nn(C45CC(C#N)(C4)C5)c(=O)...,16.1,active


Save the labelled EC50 DataFrame to a CSV file.

In [164]:
# Persist the labelled EC50 table; the row index is not written
dfe4.to_csv(path_or_buf="EC50_bioactivity_data_preprocessed.csv", index=False)

In [167]:
# Offer the preprocessed EC50 table as a browser download when running on Google Colab.
try:
    from google.colab import files
except ImportError:
    # google.colab is only importable inside Colab; without it there is nothing
    # to trigger, and the CSV written earlier stays in the working directory.
    files = None

if files is None:
    print("google.colab is not available; 'EC50_bioactivity_data_preprocessed.csv' was left in the working directory.")
else:
    files.download('EC50_bioactivity_data_preprocessed.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>