Coded with: https://www.youtube.com/watch?v=jBlTQjcKuaY

In [None]:
# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

# Search for the target protein

## Target search for coronavirus

In [None]:
# Handle for the ChEMBL "target" resource exposed by the web-resource client.
target = new_client.target
# Keyword search for targets matching "coronavirus".
target_query = target.search("coronavirus")
# Turn the search hits into a table; `targets` is reused by the next cell.
targets = pd.DataFrame.from_dict(target_query)
# Bare expression: shown as the notebook cell's output.
targets

## Select and retrieve bioactivity data for SARS coronavirus 3C-like proteinase

### We will assign the fifth entry to the selected_target.

In [None]:
# Pick SARS coronavirus 3C-like proteinase by its ChEMBL ID (CHEMBL3927)
# rather than by row position. The ranking of search hits may change over
# time, in which case a hard-coded index of 4 would silently select a
# different target and every later cell would process the wrong data.
matching_ids = targets.loc[
    targets.target_chembl_id == "CHEMBL3927", "target_chembl_id"
]
if matching_ids.empty:
    # Fail loudly instead of continuing with an unintended target.
    raise LookupError(
        'CHEMBL3927 was not found in the "coronavirus" search results'
    )
selected_target = matching_ids.iloc[0]
# Bare expression: shown as the notebook cell's output.
selected_target

### Here, we will retrieve only the bioactivity data for SARS coronavirus 3C-like proteinase (CHEMBL3927) that are reported as IC50 values in nM (nanomolar) units.

In [None]:
# Handle for the ChEMBL "activity" resource.
activity = new_client.activity
# Narrow in two steps: first to the chosen target, then to IC50 measurements.
# Only standard_type is constrained here; units are not part of the query.
target_activities = activity.filter(target_chembl_id=selected_target)
res = target_activities.filter(standard_type="IC50")

In [None]:
# Materialise the filtered activity records as a DataFrame for later cells.
df = pd.DataFrame.from_dict(res)

In [None]:
# Bare expression: shows the raw bioactivity table as the cell's output.
df

In [None]:
# Sanity check: list the distinct standard_type values in the results.
# Given the IC50 filter in the query, this is expected to show only "IC50".
df.standard_type.unique()

### Finally, we will save the resulting bioactivity data to a CSV file "bioactivity_data.csv".

In [None]:
# Persist the unprocessed query results; index=False leaves out the row index.
df.to_csv("bioactivity_data.csv", index=False)

# Handling missing data

### If any compound has a missing value in the standard_value column, drop it.

In [None]:
# Keep only the rows that actually carry a standard_value.
has_standard_value = df.standard_value.notna()
df2 = df.loc[has_standard_value]
# Bare expression: shows the filtered table as the cell's output.
df2

### Apparently, this dataset has no missing data. However, the code cell above can be reused for the bioactivity data of other target proteins.

# Data pre-processing of the bioactivity data

## Labeling compounds as either being active, inactive or intermediate

The bioactivity data are IC50 values. Compounds with values of 1,000 nM or less will be considered active, while those with values of 10,000 nM or more will be considered inactive. Compounds with values between 1,000 and 10,000 nM will be referred to as intermediate.

In [None]:
# Label every record by its standard_value, converted to float once:
#   >= 10000 -> "inactive", <= 1000 -> "active", anything else -> "intermediate".
bioactivity_class = [
    "inactive" if ic50 >= 10000
    else "active" if ic50 <= 1000
    else "intermediate"
    for ic50 in map(float, df2.standard_value)
]

## Collect the molecule_chembl_id values into a list

In [None]:
# Copy the molecule_chembl_id column into a plain Python list, in row order.
mol_cid = list(df2.molecule_chembl_id)

## Collect the canonical_smiles values into a list

In [None]:
# Copy the canonical_smiles column into a plain Python list, in row order.
canonical_smiles = list(df2.canonical_smiles)

## Collect the standard_value values into a list

In [None]:
# Copy the standard_value column into a plain Python list, in row order.
standard_value = list(df2.standard_value)

## Combine the 4 lists into a dataframe

In [None]:
# Stitch the four parallel lists into one table, one row per record.
column_names = [
    "molecule_chembl_id",
    "canonical_smiles",
    "bioactivity_class",
    "standard_value",
]
data_tuples = list(
    zip(mol_cid, canonical_smiles, bioactivity_class, standard_value)
)
df3 = pd.DataFrame(data_tuples, columns=column_names)

In [None]:
# Bare expression: shows the combined, labelled table as the cell's output.
df3

### Save the dataframe to a CSV file

In [None]:
# Persist the labelled table; index=False leaves out the row index.
df3.to_csv("bioactivity_preprocessed_data.csv", index=False)