## **Data curation**

This section was adapted by Alberto Marban from https://difacquim.gitbook.io/quimioinformatica

The original references were manually reviewed to resolve discrepancies among the pIC<sub>50</sub> values reported in ChEMBL for duplicated *S. aureus* compounds, which were listed in `df_fabi_s_aureus_IC50_for_manual_revision.xlsx`. The selected entries were written to a new file named `df_fabi_s_aureus_IC50_removal_same_smiles_assays_manual_selection.xlsx`.

In [None]:
!pip install -q rdkit 
!pip install -q molvs

In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import rdMolDescriptors
from molvs.standardize import Standardizer
from molvs.charge import Uncharger, Reionizer
from molvs.fragment import LargestFragmentChooser
from molvs.tautomer import TautomerCanonicalizer
from rdkit.Chem.rdmolops import GetFormalCharge, RemoveStereochemistry

In [11]:
# Load the Excel file that is the input of the curation workflow into a DataFrame.
# NOTE(review): this file name is spelled differently from the one quoted in the
# introductory text ("..._removal_same_smiles_assays_manual_selection.xlsx");
# confirm which spelling matches the file on disk.
data = pd.read_excel('df_fabi_s_aureus_IC50_removal_samesmailes_assays_munualselection.xlsx')

In [12]:
######################################### MolVS helpers ######################################################################
# Instantiate each MolVS helper once; the instances are called directly on RDKit molecules later on.
STD = Standardizer() # Standardize a molecule (applies the MolVS standardization steps to an RDKit Mol).
LFC = LargestFragmentChooser() # Select the largest fragment of a multi-fragment molecule, e.g. to drop the counter-ion of a salt.
UC = Uncharger() # Neutralize the molecule by adding/removing hydrogens.
RI = Reionizer() # Apply charge corrections and reionize so that the strongest acids ionize first.
TC = TautomerCanonicalizer()  # Return a canonical tautomer that is "reasonable" from a chemist's point of view, but isn't guaranteed to be the most energetically favourable one.

In [5]:
# List the column names of the loaded table.
data.columns

Index(['action_type', 'assay_chembl_id', 'assay_description',
       'canonical_smiles', 'ligand_efficiency', 'molecule_chembl_id',
       'molecule_pref_name', 'standard_type', 'standard_units',
       'standard_value', 'target_chembl_id', 'target_organism', 'type',
       'units', 'value'],
      dtype='object')

In [6]:
# Number of rows in the loaded table.
len(data)

248

In [10]:
#Función para curado
def pretreatment(smi):
    """Curate a SMILES string and return the SMILES of the processed molecule.

    The input is parsed with RDKit, standardized (``STD``) and reduced to its
    largest fragment (``LFC``). If that fragment only contains allowed
    elements it is further processed with ``UC``, ``RI`` and ``TC`` (in that
    order) and written back as SMILES. Stereochemistry is left untouched
    (``RemoveStereochemistry`` is not applied).

    Parameters
    ----------
    smi : str
        SMILES string to curate.

    Returns
    -------
    str
        The curated SMILES, or one of these sentinel strings:

        * ``"Error 1"`` - RDKit could not parse ``smi``.
        * ``"Error 2"`` - the largest fragment contains an element outside
          the allowed set.
        * ``"Something else was found"`` - any other exception was raised
          while processing (including a non-string input).
    """
    allowed_elements = {"H", "B", "C", "N", "O", "F", "Si", "P", "S", "Cl", "Se", "Br", "I"}
    try:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # RDKit could not parse the SMILES.
            return "Error 1"

        mol = STD(mol)
        mol = LFC(mol)

        # The element check runs on the largest fragment only, i.e. after
        # smaller fragments have already been discarded.
        actual_elements = {atom.GetSymbol() for atom in mol.GetAtoms()}
        if actual_elements - allowed_elements:
            # The molecule contains elements other than the allowed ones.
            return "Error 2"

        mol = UC(mol)
        mol = RI(mol)
        mol = TC(mol)
        return Chem.MolToSmiles(mol)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt/SystemExit still stop a long-running curation loop.
        return "Something else was found"

In [11]:
# New column with the curated SMILES (or an error flag) for every record.
data["NEW_SMILES"] = data["canonical_smiles"].map(pretreatment)
data.head()

Unnamed: 0,action_type,assay_chembl_id,assay_description,canonical_smiles,ligand_efficiency,molecule_chembl_id,molecule_pref_name,standard_type,standard_units,standard_value,target_chembl_id,target_organism,type,units,value,NEW_SMILES
0,,CHEMBL819485,Concentration required for the 50% inhibition ...,CC(=O)N(C)Cc1cc(C(=O)N(C)Cc2cc3ccccc3n2C)ccc1N,"{'bei': '13.67', 'le': '0.25', 'lle': '2.16', ...",CHEMBL109420,,IC50,nM,6700.0,CHEMBL3994,Staphylococcus aureus,IC50,uM,6.7,CC(=O)N(C)Cc1cc(C(=O)N(C)Cc2cc3ccccc3n2C)ccc1N
1,,CHEMBL819485,Concentration required for the 50% inhibition ...,CN(Cc1cc2ccccc2n1C)C(=O)CCc1ccc(N)nc1,"{'bei': '12.49', 'le': '0.23', 'lle': '1.28', ...",CHEMBL109484,,IC50,nM,93700.0,CHEMBL3994,Staphylococcus aureus,IC50,uM,93.7,CN(Cc1cc2ccccc2n1C)C(=O)CCc1ccc(=N)[nH]c1
2,,CHEMBL819485,Concentration required for the 50% inhibition ...,CN1Cc2cc(C(=O)N(C)Cc3cc4ccccc4n3C)ccc2NCC1=O,"{'bei': '12.41', 'le': '0.23', 'lle': '1.84', ...",CHEMBL109511,,IC50,nM,21200.0,CHEMBL3994,Staphylococcus aureus,IC50,uM,21.2,CN1Cc2cc(C(=O)N(C)Cc3cc4ccccc4n3C)ccc2NCC1=O
3,,CHEMBL819485,Concentration required for the 50% inhibition ...,CN(Cc1cc2ccccc2n1C)C(=O)c1ccccc1,,CHEMBL109805,,IC50,nM,100000.0,CHEMBL3994,Staphylococcus aureus,IC50,uM,100.0,CN(Cc1cc2ccccc2n1C)C(=O)c1ccccc1
4,,CHEMBL819485,Concentration required for the 50% inhibition ...,COC(=O)CC1Nc2ccc(C(=O)N(C)Cc3cc4ccccc4n3C)cc2C...,"{'bei': '10.66', 'le': '0.20', 'lle': '2.01', ...",CHEMBL110480,,IC50,nM,16500.0,CHEMBL3994,Staphylococcus aureus,IC50,uM,16.5,COC(=O)CC1Nc2ccc(C(=O)N(C)Cc3cc4ccccc4n3C)cc2C...


In [12]:
# Drop every record whose curation failed:
#   "Error 1"                  -> SMILES could not be read by RDKit
#   "Error 2"                  -> SMILES contains atoms that are not allowed
#   "Something else was found" -> any other error
data = (
    data[~data["NEW_SMILES"].isin(["Error 1", "Error 2", "Something else was found"])]
    .reset_index(drop=True)
)

In [13]:
# Remove duplicates: when several rows share the same curated SMILES,
# only the first occurrence is kept and the index is renumbered from 0.
data = data.drop_duplicates(subset=["NEW_SMILES"], keep="first", ignore_index=True)
print(data.shape)
data

(246, 16)


Unnamed: 0,action_type,assay_chembl_id,assay_description,canonical_smiles,ligand_efficiency,molecule_chembl_id,molecule_pref_name,standard_type,standard_units,standard_value,target_chembl_id,target_organism,type,units,value,NEW_SMILES
0,,CHEMBL819485,Concentration required for the 50% inhibition ...,CC(=O)N(C)Cc1cc(C(=O)N(C)Cc2cc3ccccc3n2C)ccc1N,"{'bei': '13.67', 'le': '0.25', 'lle': '2.16', ...",CHEMBL109420,,IC50,nM,6700.0,CHEMBL3994,Staphylococcus aureus,IC50,uM,6.70,CC(=O)N(C)Cc1cc(C(=O)N(C)Cc2cc3ccccc3n2C)ccc1N
1,,CHEMBL819485,Concentration required for the 50% inhibition ...,CN(Cc1cc2ccccc2n1C)C(=O)CCc1ccc(N)nc1,"{'bei': '12.49', 'le': '0.23', 'lle': '1.28', ...",CHEMBL109484,,IC50,nM,93700.0,CHEMBL3994,Staphylococcus aureus,IC50,uM,93.70,CN(Cc1cc2ccccc2n1C)C(=O)CCc1ccc(=N)[nH]c1
2,,CHEMBL819485,Concentration required for the 50% inhibition ...,CN1Cc2cc(C(=O)N(C)Cc3cc4ccccc4n3C)ccc2NCC1=O,"{'bei': '12.41', 'le': '0.23', 'lle': '1.84', ...",CHEMBL109511,,IC50,nM,21200.0,CHEMBL3994,Staphylococcus aureus,IC50,uM,21.20,CN1Cc2cc(C(=O)N(C)Cc3cc4ccccc4n3C)ccc2NCC1=O
3,,CHEMBL819485,Concentration required for the 50% inhibition ...,CN(Cc1cc2ccccc2n1C)C(=O)c1ccccc1,,CHEMBL109805,,IC50,nM,100000.0,CHEMBL3994,Staphylococcus aureus,IC50,uM,100.00,CN(Cc1cc2ccccc2n1C)C(=O)c1ccccc1
4,,CHEMBL819485,Concentration required for the 50% inhibition ...,COC(=O)CC1Nc2ccc(C(=O)N(C)Cc3cc4ccccc4n3C)cc2C...,"{'bei': '10.66', 'le': '0.20', 'lle': '2.01', ...",CHEMBL110480,,IC50,nM,16500.0,CHEMBL3994,Staphylococcus aureus,IC50,uM,16.50,COC(=O)CC1Nc2ccc(C(=O)N(C)Cc3cc4ccccc4n3C)cc2C...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241,,CHEMBL677574,Antibacterial activity against Staphylococcus ...,Cc1cc(C(=O)N2CCc3c(n(Cc4ccc(O)cc4)c4ccccc34)C2...,"{'bei': '12.98', 'le': '0.24', 'lle': '0.62', ...",CHEMBL73455,,IC50,nM,2900.0,CHEMBL3994,Staphylococcus aureus,IC50,uM,2.90,Cc1cc(C(=O)N2CCc3c(n(Cc4ccc(O)cc4)c4ccccc34)C2...
242,,CHEMBL677574,Antibacterial activity against Staphylococcus ...,O=C(c1ccc(O)c(Cl)c1)N1CCc2c(n(Cc3ccc(O)cc3)c3c...,"{'bei': '15.70', 'le': '0.30', 'lle': '1.85', ...",CHEMBL73580,,IC50,nM,160.0,CHEMBL3994,Staphylococcus aureus,IC50,uM,0.16,O=C(c1ccc(O)c(Cl)c1)N1CCc2c(n(Cc3ccc(O)cc3)c3c...
243,,CHEMBL677574,Antibacterial activity against Staphylococcus ...,Cc1cc(C(=O)N2CCc3c(n(Cc4ccc(O)cc4)c4ccccc34)C2...,"{'bei': '16.35', 'le': '0.30', 'lle': '2.13', ...",CHEMBL73953,,IC50,nM,180.0,CHEMBL3994,Staphylococcus aureus,IC50,uM,0.18,Cc1cc(C(=O)N2CCc3c(n(Cc4ccc(O)cc4)c4ccccc34)C2...
244,,CHEMBL677574,Antibacterial activity against Staphylococcus ...,O=C(c1ccc(O)cc1)N1CCc2c([nH]c3ccccc23)C1c1ccc(...,"{'bei': '11.84', 'le': '0.21', 'lle': '0.18', ...",CHEMBL74072,,IC50,nM,28100.0,CHEMBL3994,Staphylococcus aureus,IC50,uM,28.10,O=C(c1ccc(O)cc1)N1CCc2c([nH]c3ccccc23)C1c1ccc(...


In [14]:
# Number of rows remaining after error filtering and duplicate removal.
len(data)

246

In [None]:
# Save the curated dataframe as an Excel file (the row index is not written).
data.to_excel("s_aureus_fabI_curated_data.xlsx", index=False)