In [5]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.rdmolops import GetFormalCharge
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Draw import IPythonConsole
from molvs.standardize import Standardizer
from molvs.charge import Uncharger
from molvs.charge import Reionizer
from molvs.fragment import LargestFragmentChooser
from molvs.tautomer import TautomerCanonicalizer
from rdkit.Chem.rdmolops import RemoveStereochemistry
import pandas as pd

In [6]:
# MolVS helper objects, instantiated once here and reused for every molecule
# processed by MasterStandarization.
STD = Standardizer()            # molvs.standardize.Standardizer
LFC = LargestFragmentChooser()  # molvs.fragment.LargestFragmentChooser
UC = Uncharger()                # molvs.charge.Uncharger
RI = Reionizer()                # molvs.charge.Reionizer
TC = TautomerCanonicalizer()    # molvs.tautomer.TautomerCanonicalizer

In [7]:

# Element symbols a molecule may contain to be accepted by MasterStandarization.
# Built once at definition time instead of on every call.
_ALLOWED_ELEMENTS = frozenset(
    {"H", "B", "C", "N", "O", "F", "Si", "P", "S", "Cl", "Se", "Br", "I"}
)


def MasterStandarization(smi):
    """Curate one SMILES string and return the SMILES of the curated molecule.

    Steps, in order: parse with RDKit, apply the module-level Standardizer
    (STD) and LargestFragmentChooser (LFC), check the element whitelist, then
    apply Uncharger (UC), Reionizer (RI), RemoveStereochemistry and
    TautomerCanonicalizer (TC).

    Parameters
    ----------
    smi : str
        Input SMILES.

    Returns
    -------
    str
        The SMILES written by Chem.MolToSmiles for the curated molecule, or
        one of three marker strings:
        "Error 1"  - RDKit could not parse the SMILES (MolFromSmiles gave None);
        "Error 2"  - the molecule contains an element outside the allowed set;
        "Algo mas" - any other exception was raised during processing.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None.
            return "Error 1"

        mol = STD(mol)
        mol = LFC(mol)

        # The whitelist is checked after fragment selection, so atoms that only
        # occur in discarded fragments do not cause a rejection.
        if any(atom.GetSymbol() not in _ALLOWED_ELEMENTS for atom in mol.GetAtoms()):
            return "Error 2"

        mol = UC(mol)
        mol = RI(mol)
        RemoveStereochemistry(mol)  # modifies mol in place; return value unused
        mol = TC(mol)
        return Chem.MolToSmiles(mol)
    except Exception:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt
        # and SystemExit still propagate and a long curation run can be stopped.
        return "Algo mas"

In [10]:
import pandas as pd
#import pandas

In [28]:
# Load the compound library; the relative path is resolved against the
# current working directory.
DB = pd.read_csv("APEXBT_DiscoveryProbe-Epigenetics-Compound_Library.csv")
# Preview the first two rows. (A bare `DB.columns` in the middle of a cell is
# never displayed, so it was dropped; columns are shown in their own cell.)
DB.head(2)

Unnamed: 0,ID,SMILES,NEW_SMILES,AMW
0,A1206,CCCCCCCCCC(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N...,CCCCCCCCCC(=O)NC(Cc1c[nH]c2ccccc12)C(=O)NC(CC(...,1619.710366
1,A1402,Cl.Nc1ccn([C@H]2O[C@@H](CO)[C@H](O)C2(F)F)c(=O)n1,N=c1ccn(C2OC(CO)C(O)C2(F)F)c(=O)[nH]1,263.071762


In [29]:
# Number of rows, i.e. number of compounds
len(DB)

310

In [30]:
# Columns of the database
DB.columns

Index(['ID', 'SMILES', 'NEW_SMILES', 'AMW'], dtype='object')

In [34]:
# Keep only the columns of interest. The explicit .copy() makes DB an
# independent frame, so adding the NEW_SMILES column later is not a
# set-on-slice operation (which raises SettingWithCopyWarning in pandas
# versions that emit it).
DB = DB[["ID", "SMILES"]].copy()
# First row of the dataframe (database)
DB.head(1)

Unnamed: 0,ID,SMILES
0,A1206,CCCCCCCCCC(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N...


In [38]:
# Rename the columns
DB.columns = ['ID', "SMILES"]
# First row of the dataframe (database)
DB.head(1)

Unnamed: 0,ID,SMILES
0,A1206,CCCCCCCCCC(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N...


In [39]:
# Curate the database: run every input SMILES through MasterStandarization
DB['NEW_SMILES'] = list(map(MasterStandarization, DB['SMILES']))

In [40]:
# Row count after adding the NEW_SMILES column
len(DB)

310

In [41]:
# Last two rows, to compare SMILES against NEW_SMILES
DB.tail(2)

Unnamed: 0,ID,SMILES,NEW_SMILES
308,N2060,CCCCCCCCCCCCCC(=O)O[C@H]1[C@H](C)[C@]2(O)[C@H]...,CCCCCCCCCCCCCC(=O)OC1C(C)C2(O)C(C=C(CO)CC3(O)C...
309,N2487,O=c1c(O)c(-c2ccc(O)c(O)c2)oc2cc(O)ccc12,O=c1c(O)c(-c2ccc(O)c(O)c2)oc2cc(O)ccc12


In [43]:
# Drop every row whose NEW_SMILES is one of the markers returned by
# MasterStandarization instead of a SMILES:
#   "Error 1"  - SMILES (molecules) that RDKit could not read
#   "Error 2"  - SMILES (molecules) with atoms other than the allowed ones
#   "Algo mas" - an exception was raised during standardization
DB = DB[~DB["NEW_SMILES"].isin(["Error 1", "Error 2", "Algo mas"])].reset_index(drop=True)

In [44]:
# Row count after removing the rows that failed standardization
len(DB)

310

In [45]:
# Number of distinct curated SMILES
len(DB['NEW_SMILES'].unique())

310

In [46]:
# Keep unique SMILES only: for each repeated NEW_SMILES the first row is
# retained, and the index is renumbered afterwards.
DB = DB.drop_duplicates(subset=["NEW_SMILES"], keep ="first").reset_index(drop = True)
DB.shape

(310, 3)

In [47]:
#Calcular peso molécular
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt
#Definir función (AMW) para calcular peso molécular
def AMW(Smiles):
    """Return the molecular weight RDKit computes for a SMILES string.

    The SMILES is parsed with Chem.MolFromSmiles and the resulting molecule is
    passed directly to rdMolDescriptors.CalcExactMolWt; a failed parse is not
    checked for here.

    NOTE(review): the value comes from CalcExactMolWt (exact mass); if the
    name AMW is meant as "average molecular weight", confirm this is the
    intended descriptor.
    """
    return rdMolDescriptors.CalcExactMolWt(Chem.MolFromSmiles(Smiles))

In [48]:
# Molecular weight for each curated molecule
DB["AMW"] = [AMW(x) for x in DB["NEW_SMILES"]]
DB.head(2)

Unnamed: 0,ID,SMILES,NEW_SMILES,AMW
0,A1206,CCCCCCCCCC(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)N...,CCCCCCCCCC(=O)NC(Cc1c[nH]c2ccccc12)C(=O)NC(CC(...,1619.710366
1,A1402,Cl.Nc1ccn([C@H]2O[C@@H](CO)[C@H](O)C2(F)F)c(=O)n1,N=c1ccn(C2OC(CO)C(O)C2(F)F)c(=O)[nH]1,263.071762


In [22]:
# Save the curated database (unique compounds only) without the index column
DB.to_csv('APEXBT_DiscoveryProbe-Epigenetics-Compound_Library_CURADA.csv', sep = ',', index=False)