#**Calculando os descritores de fingerprint das moléculas**

In [1]:
# Tarefas a ser exucutadas
## Passo 1:Instalar a biblioteca Padelpy
## Passo 2: Preparar o arquivo fingerprint.xml
## Passo 3: Importar o dataset tratada
## Passo 4: Preparar o subconjunto de dados para entrada no padelpy
## Passo 5: Calcular os descritores de fingerprint
## Passo 6: Visualizar os descritores calculados
## Passo 7: Salvar o conjunto de dados

### **Passo 1:Instalar a biblioteca Padelpy**

In [2]:
! pip install padelpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting padelpy
  Downloading padelpy-0.1.12-py2.py3-none-any.whl (20.9 MB)
[K     |████████████████████████████████| 20.9 MB 1.4 MB/s 
[?25hInstalling collected packages: padelpy
Successfully installed padelpy-0.1.12


###**Passo 2: Preparar o arquivo fingerprint.xml**

In [13]:
! wget https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
! unzip fingerprints_xml.zip

--2022-09-09 23:23:26--  https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip [following]
--2022-09-09 23:23:26--  https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10871 (11K) [application/zip]
Saving to: ‘fingerprints_xml.zip’


2022-09-09 23:23:26 (111 MB/s) - ‘fingerprints_xml.zip’ saved [10871/10871]

Archive:  fingerprints_xml.zip
  inflating: AtomPairs2DFingerprintCount.xml  
  inflating: AtomPairs2DFing

### **2.1. Criar uma lista e organizar o arquivos xml**

In [14]:
import glob
arquivos_xml = glob.glob("*.xml")
arquivos_xml.sort()
arquivos_xml

['AtomPairs2DFingerprintCount.xml',
 'AtomPairs2DFingerprinter.xml',
 'EStateFingerprinter.xml',
 'ExtendedFingerprinter.xml',
 'Fingerprinter.xml',
 'GraphOnlyFingerprinter.xml',
 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothFingerprinter.xml',
 'MACCSFingerprinter.xml',
 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount.xml',
 'SubstructureFingerprinter.xml']

In [15]:
lista_FP = ['AtomPairs2DCount',
 'AtomPairs2D',
 'EState',
 'CDKextended',
 'CDK',
 'CDKgraphonly',
 'KlekotaRothCount',
 'KlekotaRoth',
 'MACCS',
 'PubChem',
 'SubstructureCount',
 'Substructure']

###**Criando um dicionário**

In [16]:

fp = dict(zip(lista_FP, arquivos_xml))
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [17]:
fp['PubChem']

'PubchemFingerprinter.xml'

###**## Passo 3: Importar o dataset tratado**

In [20]:
from google.colab import files 
ploaded = files.upload() 

Saving dataset_3classes.csv to dataset_3classes (1).csv


In [23]:
import pandas as pd
df = pd.read_csv("dataset_3classes.csv")
display(df.head())

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,classe_bioatividade,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,0,CHEMBL105460,C=CCN(C(=O)OCc1ccc([N+](=O)[O-])cc1)C1CCN(CCC(...,Ativo,595.704,5.7114,0.0,9.0,6.474955
1,1,CHEMBL105606,C=CCN(C(=O)OCc1ccc([N+](=O)[O-])cc1)C1CCN(CCC(...,Ativo,518.618,4.6494,0.0,8.0,7.552842
2,2,CHEMBL105570,C=CCN(C(=O)OCc1ccc([N+](=O)[O-])cc1)C1CCN(CCC(...,Ativo,581.717,6.5817,0.0,7.0,8.0
3,3,CHEMBL83338,CN(C[C@@H](CCN1CCC2(CC1)C[S+]([O-])c1ccccc12)c...,Ativo,557.181,5.2894,0.0,4.0,7.455932
4,4,CHEMBL325357,C=CCN(C(=O)OCc1ccc([N+](=O)[O-])cc1)C1CCN(CCC(...,Ativo,567.69,6.4076,0.0,7.0,7.823909


##**Passo 4: Preparar o subconjunto de dados para entrada no padelpy**

In [24]:
df2 = pd.concat( [df['canonical_smiles'],df['molecule_chembl_id']], axis=1 )
df2.to_csv('molecule.smi', sep='\t', index=False, header=False)
df2

Unnamed: 0,canonical_smiles,molecule_chembl_id
0,C=CCN(C(=O)OCc1ccc([N+](=O)[O-])cc1)C1CCN(CCC(...,CHEMBL105460
1,C=CCN(C(=O)OCc1ccc([N+](=O)[O-])cc1)C1CCN(CCC(...,CHEMBL105606
2,C=CCN(C(=O)OCc1ccc([N+](=O)[O-])cc1)C1CCN(CCC(...,CHEMBL105570
3,CN(C[C@@H](CCN1CCC2(CC1)C[S+]([O-])c1ccccc12)c...,CHEMBL83338
4,C=CCN(C(=O)OCc1ccc([N+](=O)[O-])cc1)C1CCN(CCC(...,CHEMBL325357
...,...,...
2924,CC(=O)N[C@@H]1C[C@H](NC(C)(C)C)CC[C@@H]1N1CC[C...,CHEMBL4594419
2925,CC(=O)N[C@@H]1C[C@H](NC(C)(C)C)CC[C@@H]1N1CC[C...,CHEMBL4781426
2926,CC(=O)N[C@@H]1C[C@H](NC(C)(C)C)CC[C@@H]1N1CC[C...,CHEMBL4594419
2927,CC(=O)N[C@@H]1C[C@H](NC(C)(C)C)CC[C@@H]1N1CC[C...,CHEMBL4781426


##**Passo 5: Calcular os descritores de fingerprint**

In [25]:
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [31]:
# Calculando os descritores. Neste caso escolhi o Pubchem


from padelpy import padeldescriptor

fingerprint = 'PubChem'

fingerprint_output_file = ''.join([fingerprint,'.csv']) #Pubchem.csv
fingerprint_descriptortypes = fp[fingerprint]

padeldescriptor(mol_dir='molecule.smi', 
                d_file=fingerprint_output_file, #'Pubchem.csv'
                #descriptortypes='PubChemFingerprint.xml', 
                descriptortypes= fingerprint_descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=2,
                removesalt=True,
                log=True,
                fingerprints=True)

##**Passo 6: Visualizar os descritores calculados**

In [32]:
descritores = pd.read_csv(fingerprint_output_file)
descritores

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL105606,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL105460,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL105570,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL83338,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL105572,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2924,CHEMBL4781426,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2925,CHEMBL4594419,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2926,CHEMBL4781426,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2927,CHEMBL4594419,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## **Passo 7: Salvar o conjunto de dados**

In [33]:
descritores.to_csv("pubchem_descritores_3classes.csv")