In [1]:
from rdkit import Chem
from rdkit.Chem import Draw
import pandas as pd

# Load the catalyst table, pull out the SMILES and label columns as plain
# lists, and parse every SMILES string into an RDKit molecule.
data = pd.read_csv("catalysts.csv")
catalysts, labels = (list(data[column]) for column in ("smiles", "label"))
mols = list(map(Chem.MolFromSmiles, catalysts))

In [2]:
# Catalyst cores, as SMARTS queries. Both are biaryl phosphates; they differ in
# the outer fused rings: aromatic in patt_1, saturated (CCCC) in patt_2.
patt_1, patt_2 = map(
    Chem.MolFromSmarts,
    (
        "O=P(Oc1c2c3ccccc3cc1)(O)Oc4c2c5ccccc5cc4",
        "O=P(Oc1c2c3c(CCCC3)cc1)(O)Oc4c2c(CCCC5)c5cc4",
    ),
)

In [3]:
# Extract the substituent of each catalyst as a SMILES string. ReplaceCore
# strips the matched core and leaves a dummy atom on each side chain to mark
# where it was attached.
# Result: subs maps the label prefix (text before the first "_") to the SMILES
# of the first side-chain fragment; later rows with the same prefix overwrite
# earlier ones.
subs = {}
skipped = []  # labels for which no substituent could be extracted
for mol, label in zip(mols, labels):
    # MolFromSmiles yields None for a SMILES string it cannot parse.
    if mol is None:
        skipped.append(label)
        continue

    # Use the first core that is actually present in this catalyst.
    core_patt = next(
        (p for p in (patt_1, patt_2) if mol.HasSubstructMatch(p)), None
    )
    if core_patt is None:
        # Without this guard `tmp` would be undefined (first iteration) or
        # stale from the previous catalyst, silently storing the wrong
        # substituent under this label.
        skipped.append(label)
        continue

    tmp = Chem.ReplaceCore(mol, core_patt)
    rs = Chem.GetMolFrags(tmp, asMols=True) if tmp is not None else ()
    if not rs:
        # Nothing left once the core is removed, i.e. an unsubstituted core.
        skipped.append(label)
        continue

    subs[label.split("_")[0]] = Chem.MolToSmiles(rs[0])

if skipped:
    print(f"No substituent extracted for: {skipped}")

In [4]:
# Add each substituent to the NMe4+ core: one hydrogen of the explicit-H
# tetramethylammonium is replaced by the substituent, bonded through the atom
# that carried the dummy (attachment-point) atom.
# Result: results maps label -> assembled molecule, which also carries the
# label in its "Label" property.
results = {}
core = Chem.AddHs(Chem.MolFromSmiles("C[N+](C)(C)C"))
patt = Chem.MolFromSmarts("[H]")
for label, sub in subs.items():
    repl = Chem.RWMol(Chem.MolFromSmiles(sub))

    # Locate the dummy atom by atomic number instead of assuming it is atom 0;
    # the SMILES atom order is not guaranteed to start with the dummy.
    dummy = next((a for a in repl.GetAtoms() if a.GetAtomicNum() == 0), None)
    if dummy is None or dummy.GetDegree() == 0:
        print(f"Skipping {label}: no attachment point found in {sub!r}")
        continue
    dummy_idx = dummy.GetIdx()
    anchor_idx = dummy.GetNeighbors()[0].GetIdx()

    repl.RemoveAtom(dummy_idx)    # remove dummy atom
    # Removing an atom shifts every higher atom index down by one.
    if anchor_idx > dummy_idx:
        anchor_idx -= 1
    Chem.SanitizeMol(repl)

    # Bond the core to the former neighbour of the dummy atom, not to whatever
    # atom happens to have index 0.
    rms = Chem.ReplaceSubstructs(
        core, patt, repl, replacementConnectionPoint=anchor_idx
    )
    res = Chem.RemoveHs(rms[0])
    res.SetProp("Label", str(label))
    results[label] = res

In [5]:
# Draw every assembled molecule on one grid (six per row, 400x400 px cells),
# captioned with its "Label" property, and save the picture as subs.png.
grid_legends = []
for drawn in results.values():
    grid_legends.append(drawn.GetProp("Label"))
img = Draw.MolsToGridImage(
    results.values(),
    molsPerRow=6,
    subImgSize=(400, 400),
    legends=grid_legends,
)
img.save("subs.png")