**PROJECT:** Integration of machine learning, QSAR, and polypharmacology for multitarget drug discovery in neuropsychiatric disorders: Prediction of serotonergic and dopaminergic receptor inhibitors

MSc. Caroline Mensor Folchini (UFPR)

***Code by Alexandre de F. Cobre*** [Github](https://github.com/AlexandreCOBRE/code)


# **Combining the compound fingerprint descriptors dataset and the biological activity dataset (pIC50)**

In [None]:
## Tasks to be performed:
## Step 1: import the descriptors dataset (independent variables)
## Step 2: Treat the descriptors dataset
## Step 3: import the dataset of "pIC50" values (dependent variable)
## Step 4: Treat the dependent variable dataset
## Step 5: Combine the two datasets (independent and dependent variables)
## Step 6: Save the final dataset

## **Step 1: import the descriptors dataset (independent variables)**

In [None]:
# Open Colab's file-upload widget to bring the descriptors CSV into the session.
from google.colab import files
uploaded = files.upload()

Saving DA_5HT_part4.csv to DA_5HT_part4.csv


In [None]:
## 1.1. Load the uploaded descriptors file into a DataFrame and preview it

import pandas as pd

df1 = pd.read_csv(filepath_or_buffer="DA_5HT_part4.csv")
df1

Unnamed: 0.1,Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,0,CHEMBL303519,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,CHEMBL292943,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,CHEMBL61682,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,CHEMBL64487,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,CHEMBL64597,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5623,5623,CHEMBL4864918,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5624,5624,CHEMBL5398630,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5625,5625,CHEMBL3183055,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5626,5626,CHEMBL2017291,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## **Step 2: Treat the descriptors dataset**

In [None]:
## 2.1. Eliminating uninformative variables: "Unnamed: 0" and "Name"
## Both columns are dropped in a single call; errors="ignore" lets this cell be
## re-run after the columns are already gone instead of raising a KeyError.
df1 = df1.drop(columns=["Unnamed: 0", "Name"], errors="ignore")

In [None]:
## 2.2. Viewing the descriptors dataset after removing the uninformative variables
df1

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5623,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5624,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5625,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5626,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## **Step 3: import the dataset of "pIC50" values (dependent variable)**

In [None]:
## 3.1: Importing the dataset containing the pIC50 values (dependent variable)
from google.colab import files
uploaded = files.upload()

Saving DA_5HT_dataset_3classes.csv to DA_5HT_dataset_3classes.csv


In [None]:
## 3.2. Load the dependent variable file into a DataFrame and show it
df2 = pd.read_csv(filepath_or_buffer="DA_5HT_dataset_3classes.csv")
display(df2)

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,0,CHEMBL303519,c1cnc(N2CCN(Cc3cccc4c3Cc3ccccc3-4)CC2)nc1,Intermediate,342.446,3.37000,0.0,4.0,5.008774
1,1,CHEMBL292943,COc1ccc(-c2cccc(CN3CCN(c4ncccn4)CC3)c2)cc1,Active,360.461,3.47440,0.0,5.0,7.301030
2,2,CHEMBL61682,Fc1ccc(-c2cncc(CN3CCN(c4ccccc4F)CC3)c2)cc1,Active,365.427,4.34900,0.0,3.0,7.602060
3,3,CHEMBL64487,COc1ccccc1-c1cccc(CN2CCN(c3ncccn3)CC2)c1,Active,360.461,3.47440,0.0,5.0,6.443697
4,4,CHEMBL64597,c1cnc(N2CCN(Cc3cccc(-c4ccsc4)c3)CC2)nc1,Active,336.464,3.52730,0.0,5.0,6.522879
...,...,...,...,...,...,...,...,...,...
5623,5623,CHEMBL4864918,CCCn1c(-c2ccccc2)cc(C(=O)NCCCN2CCN(c3cccc(Cl)c...,Inactive,479.068,5.46902,1.0,4.0,6.563837
5624,5624,CHEMBL5398630,CCCn1c(-c2ccccc2)cc(C(=O)NCCCN2CCN(c3cccc(C)c3...,Active,472.677,5.43246,1.0,4.0,6.647817
5625,5625,CHEMBL3183055,CCCn1c(-c2ccccc2)cc(C(=O)NCCCN2CCN(c3cccc(Cl)c...,Active,513.513,6.12242,1.0,4.0,6.202040
5626,5626,CHEMBL2017291,CCCn1c(-c2ccccc2)cc(C(=O)NCCCN2CCN(c3ccc(Cl)c(...,Inactive,549.974,6.54422,1.0,4.0,6.489455


## **Step 4: Treat the dependent variable dataset**

In [None]:
## 4.1. Keep only the "pIC50" column (the dependent variable)
df2 = df2.loc[:, "pIC50"]

In [None]:
## 4.2. Viewing the dependent variable
display(df2)

Unnamed: 0,pIC50
0,5.008774
1,7.301030
2,7.602060
3,6.443697
4,6.522879
...,...
5623,6.563837
5624,6.647817
5625,6.202040
5626,6.489455


## **Step 5: Combine the two datasets (independent and dependent variables)**

In [None]:
## 5.1. Checking that both variables have the same number of rows (samples)
## before they are combined; the cell displays the pair of shapes.
df1.shape, df2.shape

((5628, 881), (5628,))

In [None]:
## 5.2. Combining the independent and dependent variables

In [None]:
# Place the descriptor columns and the pIC50 values side by side (column-wise).
df3 = pd.concat(objs=[df1, df2], axis="columns")

In [None]:
## 5.3. Viewing the combined data (descriptor columns followed by pIC50)
df3

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.008774
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.301030
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.602060
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.443697
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.522879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5623,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.563837
5624,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.647817
5625,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.202040
5626,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.489455


In [None]:
# Remove rows with missing values.
# dropna() returns a new DataFrame; df3 itself is left unchanged.
df3_cleaned = df3.dropna()

In [None]:
# Show the frame produced by dropna() so that the result of the cleaning step
# is what gets inspected, rather than the untouched df3.
df3_cleaned

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.008774
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.301030
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.602060
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.443697
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.522879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5623,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.563837
5624,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.647817
5625,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.202040
5626,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.489455


## **Step 6: Save the final dataset**

In [None]:
# Save the cleaned dataset (rows with missing values removed) rather than the
# raw concatenation in df3, so the dropna() step is reflected in the file.
# The DataFrame index is still written as the first column (pandas default),
# keeping the file layout the same as before.
df3_cleaned.to_csv("DA_5HT_finaldataset.csv")