# **Pre-processing of the bioactivity data**

In [1]:
# Mount Google Drive at /content/drive so later cells can read from and write to it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the raw bioactivity CSV from the mounted Drive and preview the first rows
import pandas as pd
file1 = "/content/drive/MyDrive/bioactivity/beta_secretase1_bioactivity_data.csv"
df = pd.read_csv(filepath_or_buffer=file1)
df.head(n=5)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,78857,[],CHEMBL653511,Inhibitory activity against Beta-secretase 1 w...,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,nM,UO_0000065,,413.0
1,,,391560,[],CHEMBL653332,Compound was tested for its inhibitory activit...,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,uM,UO_0000065,,0.002
2,,,391983,[],CHEMBL653512,Inhibition of human Beta-secretase 1,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,uM,UO_0000065,,0.46
3,,,395858,[],CHEMBL653512,Inhibition of human Beta-secretase 1,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,uM,UO_0000065,,9.0
4,,,395859,[],CHEMBL653512,Inhibition of human Beta-secretase 1,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,uM,UO_0000065,,5.6


# **Handling missing data**

If any compound has a missing value in the standard_value column, drop it.

In [None]:
# Tally the rows whose 'standard_value' entry is missing (NaN) and report the total
missing_values_count = df.standard_value.isnull().sum()
print(f"Number of compounds with missing standard_value: {missing_values_count}")

Number of compounds with missing standard_value: 148


In [None]:
# Keep only the rows that have a standard_value; rows where it is missing are dropped.
# The result is a new frame (df2); the original df is left untouched.
df2 = df.loc[df['standard_value'].notna()]
df2

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,78857,[],CHEMBL653511,Inhibitory activity against Beta-secretase 1 w...,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,nM,UO_0000065,,413.000
1,,,391560,[],CHEMBL653332,Compound was tested for its inhibitory activit...,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,uM,UO_0000065,,0.002
2,,,391983,[],CHEMBL653512,Inhibition of human Beta-secretase 1,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,uM,UO_0000065,,0.460
3,,,395858,[],CHEMBL653512,Inhibition of human Beta-secretase 1,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,uM,UO_0000065,,9.000
4,,,395859,[],CHEMBL653512,Inhibition of human Beta-secretase 1,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,uM,UO_0000065,,5.600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10717,"{'action_type': 'INHIBITOR', 'description': 'N...",,25076257,[],CHEMBL5254629,Inhibition of human BACE-1,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,uM,UO_0000065,,2.800
10718,"{'action_type': 'INHIBITOR', 'description': 'N...",,25076258,[],CHEMBL5254629,Inhibition of human BACE-1,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,uM,UO_0000065,,1.600
10719,"{'action_type': 'INHIBITOR', 'description': 'N...",,25076259,[],CHEMBL5254629,Inhibition of human BACE-1,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,uM,UO_0000065,,4.500
10720,"{'action_type': 'INHIBITOR', 'description': 'N...",,25076260,[],CHEMBL5254629,Inhibition of human BACE-1,B,,,BAO_0000190,...,Homo sapiens,Beta-secretase 1,9606,,,IC50,uM,UO_0000065,,2.400


# **Labeling compounds as either being active, inactive or intermediate**

The bioactivity data are reported as IC50 values. Compounds with values of 100 nM or less are considered active, while those with values of 1000 nM or more are considered inactive. Compounds with values between 100 and 1000 nM are referred to as intermediate.

In [None]:
## Label every molecule from its standard_value and collect the labels in a list:
##   >= 1000 -> "inactive", <= 100 -> "active", anything else -> "intermediate"
bioactivity_class = [
  "inactive" if float(ic50) >= 1000
  else "active" if float(ic50) <= 100
  else "intermediate"
  for ic50 in df2.standard_value
]

In [None]:
# Tally how many compounds received each bioactivity label

from collections import Counter

class_counts = Counter(bioactivity_class)

# Report the totals in a fixed order: active, inactive, intermediate
for label in ("active", "inactive", "intermediate"):
  print(f"Number of {label} compounds: {class_counts[label]}")

Number of active compounds: 4477
Number of inactive compounds: 3599
Number of intermediate compounds: 2498


**Assign the molecule_chembl_id to a list**

In [None]:
# Materialise the molecule_chembl_id column of df2 as a plain Python list
mol_cid = list(df2['molecule_chembl_id'])

**Assign canonical_smiles to a list**

In [None]:
# Materialise the canonical_smiles column of df2 as a plain Python list
canonical_smiles = list(df2['canonical_smiles'])

**Assign IC50(nM) to a list**

In [None]:
# Materialise the standard_value column of df2 as a plain Python list
standard_value = list(df2['standard_value'])

**Combine the 4 lists into a dataframe**

In [None]:
# Pair up the four parallel lists row by row, then build the reduced dataframe from those rows
data_tuples = list(
    zip(mol_cid, canonical_smiles, bioactivity_class, standard_value)
)
df3 = pd.DataFrame(
    data=data_tuples,
    columns=[
        'molecule_chembl_id',
        'canonical_smiles',
        'bioactivity_class',
        'standard_value',
    ],
)

In [None]:
# Preview the first rows of the combined dataframe
df3.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,standard_value
0,CHEMBL406146,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,intermediate,413.0
1,CHEMBL78946,CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...,active,2.0
2,CHEMBL324109,CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...,intermediate,460.0
3,CHEMBL114147,CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...,inactive,9000.0
4,CHEMBL419949,CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](Cc1ccccc1...,inactive,5600.0


In [None]:
# Write the preprocessed table to a CSV in the current working directory, omitting the index column
df3.to_csv('beta_secretase1_bioactivity_preprocessed_data.csv', index=False)

In [None]:
# Copy the preprocessed CSV into the bioactivity folder on the mounted Drive
! cp beta_secretase1_bioactivity_preprocessed_data.csv "/content/drive/MyDrive/bioactivity/"

In [2]:
#! ls "/content/drive/MyDrive/bioactivity/"