In [None]:
# prepare the environment
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import rdMolDescriptors
from molvs.standardize import Standardizer
from molvs.charge import Uncharger, Reionizer
from molvs.fragment import LargestFragmentChooser
from molvs.tautomer import TautomerCanonicalizer
from rdkit.Chem.rdmolops import GetFormalCharge, RemoveStereochemistry
from rdkit.Chem import PandasTools

In [None]:
# load the raw data set
# (pandas has no `pd.read`; CSV files are read with `pd.read_csv`)
data = pd.read_csv("foodb.csv")
data.head(2)

In [None]:
# list the column labels of the loaded table before choosing which ones to keep
data.columns

In [None]:
# keep only the compound identifier and the column used as the structure string
# NOTE(review): 'cas_number' is renamed to 'SMILES' in the next cell and parsed as
# SMILES further down — confirm that this column really holds SMILES in foodb.csv
columns_to_keep = ['public_id', 'cas_number']
data = data[columns_to_keep]
data.head(3)

In [None]:
# give the two retained columns the names used by the rest of the notebook
new_column_names = ['ID', 'SMILES']
data.columns = new_column_names
data.head(2)

In [None]:
# Optional smoke test: set N_TEST_ROWS to a small integer (e.g. 50) to run the
# rest of the notebook on the first rows only. Leave it as None (the default)
# to process the complete data set.
N_TEST_ROWS = None
if N_TEST_ROWS is not None:
    data = data.head(N_TEST_ROWS)

In [None]:
# drop the rows that have no SMILES value
# `dropna` returns a new frame, so the result must be assigned back; otherwise
# nothing is removed and both printed shapes are identical
print(data.shape)
data = data.dropna(subset=["SMILES"])
print(data.shape)

In [None]:
# sanity check: parse the first SMILES of the table and show the resulting molecule
smi = data["SMILES"].tolist()[0]
# imported for its side effect (presumably enables inline molecule rendering — confirm)
from rdkit.Chem.Draw import IPythonConsole
Chem.MolFromSmiles(smi)

In [None]:
# define functions
STD = Standardizer() # get the standardized version of a given SMILES string (canonical SMILES)
LFC = LargestFragmentChooser() # select the largest fragment from a salt (ionic compound)
UC = Uncharger() # charge corrections are applied to ensure correctly ionization
RI = Reionizer() # neutralize molecule by adding/removing hydrogens
TC = TautomerCanonicalizer()  # return a chemically reasonable tautomer

In [None]:
def MasterStandarization_no_chiral(smi):
    """Standardize a SMILES string and return it without stereochemistry.

    The molecule is parsed with RDKit, passed through STD and LFC, checked
    against a fixed set of allowed elements, and then passed through UC, RI,
    RemoveStereochemistry and TC before being written back as SMILES.

    Parameters
    ----------
    smi : str
        Input SMILES string.

    Returns
    -------
    str
        The standardized SMILES on success, otherwise one of the sentinel
        strings:
        "Error 1" if RDKit could not parse the input,
        "Error 2" if the molecule contains an element outside the allowed set,
        "Something else was found" if any other exception was raised
        (for example when the input is not a string).
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # RDKit returns None when it cannot parse the SMILES
            return "Error 1"

        mol = STD(mol)
        mol = LFC(mol)

        allowed_elements = {"H","B","C","N","O","F","Si","P","S","Cl","Se","Br","I"}
        actual_elements = {atom.GetSymbol() for atom in mol.GetAtoms()}
        if actual_elements - allowed_elements:
            # the molecule contains at least one element that is not allowed
            return "Error 2"

        mol = UC(mol)
        mol = RI(mol)
        # modifies the molecule in place, so the return value is not used
        RemoveStereochemistry(mol)
        mol = TC(mol)
        return Chem.MolToSmiles(mol)
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt / SystemExit
        # still stop a long-running batch instead of being recorded as a failure
        return "Something else was found"

In [None]:
def MasterStandarization_chiral(smi):
    """Standardize a SMILES string and return it with stereochemistry kept.

    The molecule is parsed with RDKit, passed through STD and LFC, checked
    against a fixed set of allowed elements, and then passed through UC, RI
    and TC before being written back as SMILES. Unlike the no-chiral variant,
    stereochemistry is not removed.

    Parameters
    ----------
    smi : str
        Input SMILES string.

    Returns
    -------
    str
        The standardized SMILES on success, otherwise one of the sentinel
        strings:
        "Error 1" if RDKit could not parse the input,
        "Error 2" if the molecule contains an element outside the allowed set,
        "Something else was found" if any other exception was raised
        (for example when the input is not a string).
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # RDKit returns None when it cannot parse the SMILES
            return "Error 1"

        mol = STD(mol)
        mol = LFC(mol)

        allowed_elements = {"H","B","C","N","O","F","Si","P","S","Cl","Se","Br","I"}
        actual_elements = {atom.GetSymbol() for atom in mol.GetAtoms()}
        if actual_elements - allowed_elements:
            # the molecule contains at least one element that is not allowed
            return "Error 2"

        mol = UC(mol)
        mol = RI(mol)
        # stereochemistry is intentionally left untouched in this variant
        mol = TC(mol)
        return Chem.MolToSmiles(mol)
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt / SystemExit
        # still stop a long-running batch instead of being recorded as a failure
        return "Something else was found"

In [None]:
# standardize every input SMILES while keeping stereochemistry
data["SMILES_chiral"] = list(map(MasterStandarization_chiral, data["SMILES"]))

In [None]:
# standardize every input SMILES with stereochemistry removed
data["SMILES_no_chiral"] = list(map(MasterStandarization_no_chiral, data["SMILES"]))
data.shape

In [None]:
# number of rows whose chiral standardization failed with an unexpected exception
unexpected_chiral = data["SMILES_chiral"] == "Something else was found"
int(unexpected_chiral.sum())

In [None]:
# number of rows whose non-chiral standardization failed with an unexpected exception
unexpected_no_chiral = data["SMILES_no_chiral"] == "Something else was found"
int(unexpected_no_chiral.sum())

In [None]:
# drop every row whose chiral standardization ended in a sentinel value:
#   "Error 1"                  -> rdkit could not read the SMILES
#   "Error 2"                  -> the molecule contains atoms outside the allowed set
#   "Something else was found" -> any other error
failed_chiral = data["SMILES_chiral"].isin(["Error 1", "Error 2", "Something else was found"])
data = data[~failed_chiral].reset_index(drop=True)

In [None]:
# drop every row whose non-chiral standardization ended in a sentinel value:
#   "Error 1"                  -> rdkit could not read the SMILES
#   "Error 2"                  -> the molecule contains atoms outside the allowed set
#   "Something else was found" -> any other error
failed_no_chiral = data["SMILES_no_chiral"].isin(["Error 1", "Error 2", "Something else was found"])
data = data[~failed_no_chiral].reset_index(drop=True)

In [None]:
# preview the first two rows of the table after the failed rows were removed
data.head(2)

In [None]:
# keep one row per distinct chiral SMILES (the first occurrence is retained)
# and renumber the index afterwards
deduplicated = data.drop_duplicates(subset=["SMILES_chiral"], keep="first")
data = deduplicated.reset_index(drop=True)
print(data.shape)
print(data.head(10))

In [None]:
# write the curated table to disk without the row index
output_path = "foodb_curated.csv"
data.to_csv(output_path, sep=",", index=False)