<a href="https://colab.research.google.com/github/KarimeZeraik/QSAR-and-ML/blob/main/QSAR_Trypanosoma_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**PROJECT:** Integration of machine learning, QSAR, and polypharmacology for multitarget drug discovery in neuropsychiatric disorders: Prediction of serotonergic and dopaminergic receptor inhibitors

MSc. Caroline Mensor Folchini (UFPR)

***Code by Alexandre de F. Cobre*** [Github](https://github.com/AlexandreCOBRE/code)





# Performing the necessary installations/uninstallations:

In [1]:
!pip install fastapi kaleido python-multipart uvicorn
!pip install chembl_webresource_client


Collecting kaleido
  Downloading kaleido-1.1.0-py3-none-any.whl.metadata (5.6 kB)
Collecting choreographer>=1.0.10 (from kaleido)
  Downloading choreographer-1.1.1-py3-none-any.whl.metadata (6.8 kB)
Collecting logistro>=1.0.8 (from kaleido)
  Downloading logistro-1.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting pytest-timeout>=2.4.0 (from kaleido)
  Downloading pytest_timeout-2.4.0-py3-none-any.whl.metadata (20 kB)
Downloading kaleido-1.1.0-py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.3/66.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading choreographer-1.1.1-py3-none-any.whl (52 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading logistro-1.1.0-py3-none-any.whl (7.9 kB)
Downloading pytest_timeout-2.4.0-py3-none-any.whl (14 kB)
Installing collected packages: logistro, pytest-timeout, choreographer, kaleido
Successfully installed choreogr

# Importing necessary libraries:

In [2]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

#**DATASET SELECTION:**

ChEMBL database (https://www.ebi.ac.uk/chembl/).

### Searching for target datasets

In [3]:
alvo = new_client.target
pesquisa_alvo = alvo.search('Serotonin')
ds = pd.DataFrame.from_dict(pesquisa_alvo)
ds

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Ovis aries,Serotonin N-acetyltransferase,14.0,False,CHEMBL5452,"[{'accession': 'Q29495', 'component_descriptio...",SINGLE PROTEIN,9940
1,[],Rattus norvegicus,Serotonin N-acetyltransferase,14.0,False,CHEMBL1075242,"[{'accession': 'Q64666', 'component_descriptio...",SINGLE PROTEIN,10116
2,[],Homo sapiens,Sodium-dependent serotonin transporter,12.0,False,CHEMBL228,"[{'accession': 'P31645', 'component_descriptio...",SINGLE PROTEIN,9606
3,[],Rattus norvegicus,Sodium-dependent serotonin transporter,12.0,False,CHEMBL313,"[{'accession': 'P31652', 'component_descriptio...",SINGLE PROTEIN,10116
4,[],Mus musculus,Sodium-dependent serotonin transporter,12.0,False,CHEMBL4642,"[{'accession': 'Q60857', 'component_descriptio...",SINGLE PROTEIN,10090
...,...,...,...,...,...,...,...,...,...
93,[],Mus musculus,5-hydroxytryptamine receptor 2C,7.0,False,CHEMBL3006,"[{'accession': 'P34968', 'component_descriptio...",SINGLE PROTEIN,10090
94,[],Rattus norvegicus,5-hydroxytryptamine receptor 2C,7.0,False,CHEMBL324,"[{'accession': 'P08909', 'component_descriptio...",SINGLE PROTEIN,10116
95,[],Homo sapiens,Monoamine transporter,7.0,False,CHEMBL2363064,"[{'accession': 'P31645', 'component_descriptio...",PROTEIN FAMILY,9606
96,[],Bos taurus,5-hydroxytryptamine 1D receptor,7.0,False,CHEMBL3638336,"[{'accession': 'Q8MI13', 'component_descriptio...",SINGLE PROTEIN,9913


### Searching the dataset for a specific target:

In [4]:
# Listing all possible targets ("alvos") for visualization
lista_alvos = ds['pref_name'].tolist()
print(lista_alvos)

['Serotonin N-acetyltransferase', 'Serotonin N-acetyltransferase', 'Sodium-dependent serotonin transporter', 'Sodium-dependent serotonin transporter', 'Sodium-dependent serotonin transporter', 'Sodium-dependent serotonin transporter', 'Monoamine transporters; Norepinephrine & serotonin', '5-hydroxytryptamine receptor 1D', '5-hydroxytryptamine receptor 1B', '5-hydroxytryptamine receptor 2B', '5-hydroxytryptamine receptor 1B', 'Serotonin 3 (5-HT3) receptor', 'Serotonin (5-HT) receptor', 'Serotonin 3 (5-HT3) receptor', 'Serotonin 1 (5-HT1) receptor', 'Serotonin 1 receptors; 5-HT1B & 5-HT1D', 'Serotonin (5-HT) receptor', 'Serotonin 3 (5-HT3) receptor', 'Serotonin 3 receptor (5HT3)', 'Serotonin receptor 2a and 2b (5HT2A and 5HT2B)', '5-hydroxytryptamine receptor 3A/3E', '5-hydroxytryptamine receptor 3A/3C', '5-HT1B/5-HT1F', '5-HT1D/5-HT1F', 'Serotonin 1 (5-HT1) receptor', '5-hydroxytryptamine receptor 3A', '5-hydroxytryptamine receptor 3A', 'Serotonin 4 (5-HT4) receptor', '5-hydroxytryptami

In [5]:
# Defining the target to be searched
alvo = "Serotonin"

# Checking if any element in pref_name contains this target:
contains_alvo = ds['pref_name'].str.contains(alvo)

# Getting the indices of the rows with the defined target:
indices_com_alvo = ds[contains_alvo].index.tolist()

if contains_alvo.any():
    print(f"At least one element contains the term: {alvo}")
    print(f"Indices of the rows with the term'{alvo}': {indices_com_alvo}")
else:
    print(f"No element contains the term: {alvo}")

At least one element contains the term: Serotonin
Indices of the rows with the term'Serotonin': [0, 1, 11, 12, 13, 14, 15, 16, 17, 18, 19, 24, 27, 30, 31, 32, 33, 34, 76, 77, 78, 79, 80, 91, 97]


### Converting IC50 values to a standard concentration unit (molar - M) and generating a single dataframe:

In [7]:
# prompt: select rows from df['target_chembl_id']

ensaios = ds[ds['target_chembl_id'].isin(['CHEMBL224', 'CHEMBL3155', 'CHEMBL225', 'CHEMBL214'])]


In [8]:
indices_com_ensaio = ensaios.index
indices_com_ensaio

Index([38, 39, 45, 87], dtype='int64')

In [9]:
# Creating a list to store individual DataFrames

dfs = []




# Iterating over the different indices:

for i in indices_com_ensaio :

    df_nM_i = []

    df_uM_i = []

    df_mM_i = []

    df_M_i = []

    ds_selecionado_i = ds.target_chembl_id[i]




    # Filtering bioactive compounds with IC50 data, in nM units, for each index:

    atividade = new_client.activity

    resultado_nM = atividade.filter(target_chembl_id=ds_selecionado_i).filter(standard_type="IC50").filter(units="nM")




    # Filtering bioactive compounds with IC50 data, in µM units, for each index:

    resultado_uM = atividade.filter(target_chembl_id=ds_selecionado_i).filter(standard_type="IC50").filter(units="uM")




    # Filtering bioactive compounds with IC50 data, in mM units, for each index:

    resultado_mM = atividade.filter(target_chembl_id=ds_selecionado_i).filter(standard_type="IC50").filter(units="mM")




    # Filtering bioactive compounds with IC50 data, in M units, for each index:

    resultado_M = atividade.filter(target_chembl_id=ds_selecionado_i).filter(standard_type="IC50").filter(units="M")




    # Creating a DataFrame for each unit:

    df_nM_i = pd.DataFrame.from_dict(resultado_nM)

    df_uM_i = pd.DataFrame.from_dict(resultado_uM)

    df_mM_i = pd.DataFrame.from_dict(resultado_mM)

    df_M_i = pd.DataFrame.from_dict(resultado_M)




    # Converting each DataFrame to a standard unit (M):

    if not df_nM_i.empty and 'value' in df_nM_i:

        df_nM_i['value'] = df_nM_i['value'].astype(float)

        df_nM_i['value'] *= 1e-9

    else:

        pass




    if not df_uM_i.empty and 'value' in df_uM_i:

        df_uM_i['value'] = df_uM_i['value'].astype(float)

        df_uM_i['value'] *= 1e-6

    else:

        pass




    if not df_mM_i.empty and 'value' in df_mM_i:

        df_mM_i['value'] = df_mM_i['value'].astype(float)

        df_mM_i['value'] *= 1e-3

    else:

        pass




    if not df_M_i.empty and 'value' in df_M_i:

        df_M_i['value'] = df_M_i['value'].astype(float)

    else:

        pass




    # Adding the DataFrames to the list:

    dfs.append(df_nM_i)

    dfs.append(df_uM_i)

    dfs.append(df_mM_i)

    dfs.append(df_M_i)




# Concatenating the individual DataFrames into a single DataFrame:

df_assays = pd.concat(dfs, ignore_index=True)

df_assays['units'] = 'M'




# Displaying the final DataFrame:

display(df_assays)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,221588,[],CHEMBL615582,Compound was evaluated for the binding affinit...,B,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 1A,9606,,,IC50,M,UO_0000065,,1.000000e-06
1,,,431267,[],CHEMBL615748,Agonistic activity of compound towards 5-hydro...,F,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 1A,9606,,,IC50,M,UO_0000065,,3.600000e-08
2,,,431269,[],CHEMBL615750,Agonistic activity of compound towards 5-hydro...,F,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 1A,9606,,,IC50,M,UO_0000065,,1.300000e-07
3,,Not Determined,437720,[],CHEMBL615749,Agonistic activity of compound towards 5-hydro...,F,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 1A,9606,,,IC50,M,UO_0000065,,
4,,Not Determined,437721,[],CHEMBL616259,Forskolin stimulated cAMP assessment of agonis...,F,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 1A,9606,,,IC50,M,UO_0000065,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3592,"{'action_type': 'ANTAGONIST', 'description': '...",,25986621,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5548750,Antagonist activity at recombinant human 5-HT2...,F,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 2C,9606,,,IC50,M,UO_0000065,,2.950000e-06
3593,"{'action_type': 'ANTAGONIST', 'description': '...",,25986622,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5548750,Antagonist activity at recombinant human 5-HT2...,F,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 2C,9606,,,IC50,M,UO_0000065,,5.530000e-06
3594,,,26298118,[],CHEMBL5697018,Inhibition of human 5-HT2C by intracellular Ca...,F,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 2C,9606,,,IC50,M,UO_0000065,,4.000000e-05
3595,,,26298119,[],CHEMBL5697018,Inhibition of human 5-HT2C by intracellular Ca...,F,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 2C,9606,,,IC50,M,UO_0000065,,4.000000e-05


In [10]:
df_assays["value"].isnull().sum()

np.int64(19)

In [11]:
# Assuming your DataFrame is df_assays
df_assays.dropna(subset=['value'], inplace=True)

In [12]:
df_assays

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,221588,[],CHEMBL615582,Compound was evaluated for the binding affinit...,B,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 1A,9606,,,IC50,M,UO_0000065,,1.000000e-06
1,,,431267,[],CHEMBL615748,Agonistic activity of compound towards 5-hydro...,F,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 1A,9606,,,IC50,M,UO_0000065,,3.600000e-08
2,,,431269,[],CHEMBL615750,Agonistic activity of compound towards 5-hydro...,F,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 1A,9606,,,IC50,M,UO_0000065,,1.300000e-07
13,,,449757,[],CHEMBL616260,Agonistic activity of compound towards 5-hydro...,F,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 1A,9606,,,IC50,M,UO_0000065,,2.500000e-08
14,,,449759,[],CHEMBL615750,Agonistic activity of compound towards 5-hydro...,F,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 1A,9606,,,IC50,M,UO_0000065,,2.100000e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3592,"{'action_type': 'ANTAGONIST', 'description': '...",,25986621,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5548750,Antagonist activity at recombinant human 5-HT2...,F,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 2C,9606,,,IC50,M,UO_0000065,,2.950000e-06
3593,"{'action_type': 'ANTAGONIST', 'description': '...",,25986622,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5548750,Antagonist activity at recombinant human 5-HT2...,F,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 2C,9606,,,IC50,M,UO_0000065,,5.530000e-06
3594,,,26298118,[],CHEMBL5697018,Inhibition of human 5-HT2C by intracellular Ca...,F,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 2C,9606,,,IC50,M,UO_0000065,,4.000000e-05
3595,,,26298119,[],CHEMBL5697018,Inhibition of human 5-HT2C by intracellular Ca...,F,,,BAO_0000190,...,Homo sapiens,5-hydroxytryptamine receptor 2C,9606,,,IC50,M,UO_0000065,,4.000000e-05


In [13]:
# Assigning the class of the compounds

classe_bioatividade = []
for i in df_assays.standard_value:
    if float(i) > 0.0010:
        classe_bioatividade.append("Inactive")
    elif float(i) <= 0.00010:
        classe_bioatividade.append("Active")
    else:
        classe_bioatividade.append("Intermediate")

In [14]:
# Visualizing bioactive compounds
df_assays.molecule_chembl_id

Unnamed: 0,molecule_chembl_id
0,CHEMBL26560
1,CHEMBL148860
2,CHEMBL148860
13,CHEMBL151266
14,CHEMBL151266
...,...
3592,CHEMBL5594692
3593,CHEMBL5567330
3594,CHEMBL1232461
3595,CHEMBL2017291


In [15]:
# Iterating over bioactive compounds
mol_cid = []
for i in df_assays.molecule_chembl_id:
    mol_cid.append(i)

In [16]:
# Printing the variable mol_cid
mol_cid

['CHEMBL26560',
 'CHEMBL148860',
 'CHEMBL148860',
 'CHEMBL151266',
 'CHEMBL151266',
 'CHEMBL146751',
 'CHEMBL146751',
 'CHEMBL355953',
 'CHEMBL553331',
 'CHEMBL305660',
 'CHEMBL338611',
 'CHEMBL359124',
 'CHEMBL359124',
 'CHEMBL153174',
 'CHEMBL153174',
 'CHEMBL39',
 'CHEMBL39',
 'CHEMBL39',
 'CHEMBL39',
 'CHEMBL155615',
 'CHEMBL155615',
 'CHEMBL284820',
 'CHEMBL423045',
 'CHEMBL423045',
 'CHEMBL31115',
 'CHEMBL135076',
 'CHEMBL133455',
 'CHEMBL133868',
 'CHEMBL27173',
 'CHEMBL39',
 'CHEMBL275854',
 'CHEMBL25800',
 'CHEMBL131736',
 'CHEMBL134519',
 'CHEMBL412876',
 'CHEMBL275008',
 'CHEMBL267559',
 'CHEMBL60318',
 'CHEMBL293923',
 'CHEMBL64878',
 'CHEMBL60447',
 'CHEMBL42',
 'CHEMBL59741',
 'CHEMBL60885',
 'CHEMBL64610',
 'CHEMBL292935',
 'CHEMBL61869',
 'CHEMBL64167',
 'CHEMBL59637',
 'CHEMBL61818',
 'CHEMBL60047',
 'CHEMBL418499',
 'CHEMBL61819',
 'CHEMBL64845',
 'CHEMBL54',
 'CHEMBL294730',
 'CHEMBL304438',
 'CHEMBL59823',
 'CHEMBL116735',
 'CHEMBL116463',
 'CHEMBL326538',
 'CHEMBL3

In [17]:
# Iterating canonical SMILES into a list
canonical_smiles = []
for i in df_assays.canonical_smiles:
    canonical_smiles.append(i)

In [18]:
# Iterating standard_value into a list
standard_value = []
for i in df_assays.standard_value:
    standard_value.append(i)

In [19]:
# Combining the four variables into the same DataFrame
dados_tupla = list(zip(mol_cid, canonical_smiles, classe_bioatividade, standard_value))
df3 = pd.DataFrame( dados_tupla,  columns=['molecule_chembl_id', 'canonical_smiles', 'classe_bioatividade', 'standard_value'])

In [20]:
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,classe_bioatividade,standard_value
0,CHEMBL26560,CCCN1CC[C@H]2c3cccc(O)c3CC[C@H]21,Inactive,1000.0
1,CHEMBL148860,COc1cccc2c1O[C@@H](CN1[C@@H]3CC[C@H]1C[C@](O)(...,Inactive,36.0
2,CHEMBL148860,COc1cccc2c1O[C@@H](CN1[C@@H]3CC[C@H]1C[C@](O)(...,Inactive,130.0
3,CHEMBL151266,COc1cccc2c1O[C@@H](CN1[C@@H]3CC[C@H]1C[C@](O)(...,Inactive,25.0
4,CHEMBL151266,COc1cccc2c1O[C@@H](CN1[C@@H]3CC[C@H]1C[C@](O)(...,Inactive,21.0
...,...,...,...,...
3573,CHEMBL5594692,COc1ccc2c3ccnc(C)c3n(CCN(C)C)c2c1.O=C(O)/C=C/C...,Inactive,2950.0
3574,CHEMBL5567330,COc1ccc2c(c1)N(CCN(C)C)/C(=C/C(C)=O)S2.O=C(O)/...,Inactive,5530.0
3575,CHEMBL1232461,CCNC(=O)C[C@@H]1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-...,Inactive,40000.0
3576,CHEMBL2017291,COc1cc2c(cc1-c1c(C)noc1C)ncc1[nH]c(=O)n([C@H](...,Inactive,40000.0


In [21]:

# Saving the DataFrame to a CSV file

df3.to_csv('dados_preprocessados.csv', index=False)
