In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem as Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import rdMolDescriptors

In [None]:
def standardize_mol(mol):
    """Sanitize and normalize an RDKit molecule in place.

    The steps run in a fixed order: refresh the property cache, perceive
    conjugation and hybridization, run a partial sanitization, apply the
    ``rdMolStandardize`` normalizer, and refresh the property cache again.
    The molecule is not kekulized here; callers that need a Kekule form
    must call ``Chem.Kekulize`` on the result themselves.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Molecule to standardize. It is modified in place.

    Returns
    -------
    rdkit.Chem.Mol
        The same ``mol`` object that was passed in.

    Raises
    ------
    ValueError
        If ``mol`` is ``None``, which is what ``Chem.MolFromSmiles`` returns
        for input it cannot parse.
    """
    # Fail early with an explicit message instead of an AttributeError on
    # NoneType further down.
    if mol is None:
        raise ValueError(
            "standardize_mol() received None; the input could not be parsed "
            "into an RDKit molecule"
        )

    # Non-strict cache update so that the perception calls below have
    # valence information available without raising on unusual valences.
    mol.UpdatePropertyCache(strict=False)
    Chem.SetConjugation(mol)
    Chem.SetHybridization(mol)

    # XOR-ing flags out of SANITIZE_ALL switches those operations off, so
    # every sanitization step runs except CLEANUP and PROPERTIES.
    # NOTE(review): presumably skipped so that non-standard valences do not
    # raise, in line with strict=False above -- confirm.
    sanitize_ops = (
        Chem.SANITIZE_ALL ^ Chem.SANITIZE_CLEANUP ^ Chem.SANITIZE_PROPERTIES
    )
    Chem.SanitizeMol(mol, sanitizeOps=sanitize_ops)

    # Apply the rdMolStandardize normalization to the molecule in place.
    rdMolStandardize.NormalizeInPlace(mol)

    # Normalization may have changed the molecule, so refresh cached properties.
    mol.UpdatePropertyCache(strict=False)
    return mol

In [None]:
# Load ../data/atlas_data_R.csv.zip into a DataFrame and list its columns.
data = pd.read_csv("../data/atlas_data_R.csv.zip")
print("data columns", data.columns, flush=True)
# Report how many distinct (non-null) values the kegg_id column holds.
print("kegg_id", data["kegg_id"].nunique(dropna=True), flush=True)


In [None]:
# Load ../data/kegg_data_C.csv.zip into a DataFrame and list its columns.
data_c = pd.read_csv("../data/kegg_data_C.csv.zip")
print("data columns", data_c.columns, flush=True)
# Order the rows by ascending molecular weight so the lightest entries come first.
data_c = data_c.sort_values("molecular_weight", ascending=True)
# Show compound_id, smiles and formula for the first 50 rows of the sorted table.
print(data_c.head(50)[["compound_id", "smiles", "formula"]], flush=True)

In [None]:
# Worked example: run one SMILES string containing "[H:0]" placeholders
# through the standardization pipeline and print the results.
smi = "[H:0]/C=C/[H:0]"
print(f"input: {smi}", flush=True)
# Swap every "[H:0]" token for the "*" atom before parsing.
smi = smi.replace("[H:0]", "*")
print(f"replaced: {smi}", flush=True)

# Parse and standardize in one expression; standardize_mol returns the
# molecule it was given, so `mol` ends up bound to the standardized object.
mol = standardize_mol(Chem.MolFromSmiles(smi))
# standardize_mol does not kekulize, so do it explicitly here.
Chem.Kekulize(mol)
smi_out = Chem.MolToSmiles(mol)
print(f"output: {smi_out}", flush=True)
# Molecular formula of the standardized molecule.
print(f"formula: {rdMolDescriptors.CalcMolFormula(mol)}", flush=True)