In [1]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem, Draw
import pandas as pd
import re
import os
import numpy as np

from smarts_preprocess import get_correct_order_of_rxn_smarts
from rxn_smiles_preprocess import clean_rxn_smiles
from labling_rxn import labling_rxn

## SMARTS preprocess

In [2]:
# Demo inputs for get_correct_order_of_rxn_smarts: (description, reaction SMARTS).
# Judging by the recorded output of this cell, the helper renumbers the atom-map
# labels sequentially in order of first appearance -- confirm against
# smarts_preprocess.py before relying on that.
smarts_reorder_cases = [
    (
        "retro Ugi (product >> components)",
        "[C:7](=[O:8])[N:3]([#6:4])[C:5][C+0:1](=[OH0+0:9])[N+0:2]>>[C-:1]#[NH0+:2].[NX3H2:3][#6;!$(C=[C,O,N,S]);!$(C#*):4].[CX3;!$(*O);!$(*S);!$(*N):5](=[O]).[CX3:7](=[O:8])[OX2H1:9]",
    ),
    (
        "4-component Ugi",
        "[C-:6]#[NH0+:8].[NX3H2:3][#6;!$(C=[C,O,N,S]);!$(C#*):4].[CX3;!$(*O);!$(*S);!$(*N):5](=[O]).[CX3:1](=[O:2])[OX2H1:7]>>[C:1](=[O:2])[N:3]([#6:4])[C:5][C+0:6](=[OH0+0:7])[N+0:8]",
    ),
    (
        "4-component Ugi with the mapped oxygen ([O:10]) as an extra product",
        "[C-:6]#[NH0+:8].[NX3H2:3][#6;!$(C=[C,O,N,S]);!$(C#*):4].[CX3;!$(*O);!$(*S);!$(*N):5](=[O:10]).[CX3:1](=[O:2])[OX2H1:7]>>[C:1](=[O:2])[N:3]([#6:4])[C:5][C+0:6](=[OH0+0:7])[N+0:8].[O:10]",
    ),
    (
        "retro amino acid 4r3c Ugi",
        "[C:4]1(=[O:5])[#6:3][#6:2][N+0:1]1[C:7][C+0:9](=[OH0:6])[N+0:10].[O:8]>>[NX3H2,NX4H3+:1][#6;!$(C=[C,O,N,S]);!$(C#*):2][#6:3][CX3:4](=[O:5])[OX1H0-,OX2H1:6].[CX3;!$(*O);!$(*S);!$(*N):7](=[O:8]).[C-:9]#[N+:10]",
    ),
    (
        "amino acid 4r3c Ugi (starts from a non-contiguous map number, :100)",
        "[NX3H2,NX4H3+:100][#6;!$(C=[C,O,N,S]);!$(C#*):4][#6:3][CX3:1](=[O:2])[OX1H0-,OX2H1:8].[CX3;!$(*O);!$(*S);!$(*N):6](=[O:10]).[C-:7]#[N+:9]>>[C:1]1(=[O:2])[#6:3][#6:4][N+0:100]1[C:6][C+0:7](=[OH0:8])[N+0:9].[O:10]",
    ),
]

# Print one reordered SMARTS per case, in the order listed above.
for _case_name, raw_smarts in smarts_reorder_cases:
    print(get_correct_order_of_rxn_smarts(raw_smarts))

[C:1](=[O:2])[N:3]([#6:4])[C:5][C+0:6](=[OH0+0:7])[N+0:8]>>[C-:6]#[NH0+:8].[NX3H2:3][#6;!$(C=[C,O,N,S]);!$(C#*):4].[CX3;!$(*O);!$(*S);!$(*N):5](=[O]).[CX3:1](=[O:2])[OX2H1:7]
[C-:1]#[NH0+:2].[NX3H2:3][#6;!$(C=[C,O,N,S]);!$(C#*):4].[CX3;!$(*O);!$(*S);!$(*N):5](=[O]).[CX3:6](=[O:7])[OX2H1:8]>>[C:6](=[O:7])[N:3]([#6:4])[C:5][C+0:1](=[OH0+0:8])[N+0:2]
[C-:1]#[NH0+:2].[NX3H2:3][#6;!$(C=[C,O,N,S]);!$(C#*):4].[CX3;!$(*O);!$(*S);!$(*N):5](=[O:6]).[CX3:7](=[O:8])[OX2H1:9]>>[C:7](=[O:8])[N:3]([#6:4])[C:5][C+0:1](=[OH0+0:9])[N+0:2].[O:6]
[C:1]1(=[O:2])[#6:3][#6:4][N+0:5]1[C:6][C+0:7](=[OH0:8])[N+0:9].[O:10]>>[NX3H2,NX4H3+:5][#6;!$(C=[C,O,N,S]);!$(C#*):4][#6:3][CX3:1](=[O:2])[OX1H0-,OX2H1:8].[CX3;!$(*O);!$(*S);!$(*N):6](=[O:10]).[C-:7]#[N+:9]
[NX3H2,NX4H3+:1][#6;!$(C=[C,O,N,S]);!$(C#*):2][#6:3][CX3:4](=[O:5])[OX1H0-,OX2H1:6].[CX3;!$(*O);!$(*S);!$(*N):7](=[O:8]).[C-:9]#[N+:10]>>[C:4]1(=[O:5])[#6:3][#6:2][N+0:1]1[C:7][C+0:9](=[OH0:6])[N+0:10].[O:8]


## Rxn SMILES preprocess

In [4]:
# Reaction template shared by the three 4-component Ugi cases below.
ugi_4c_template = "[C-:1]#[N+:2].[NX3H2:3][#6;!$(C=[C,O,N,S]);!$(C#*):4].[CX3;!$(*O);!$(*S);!$(*N):5](=[O:6]).[CX3:7](=[O:8])[OX1H0-,OX2H1:9]>>[C:7](=[O:8])[N:3]([#6:4])[C:5][C+0:1](=[OH0+0:9])[N+0:2].[O:6]"

# Demo inputs for clean_rxn_smiles: (raw reaction SMILES, reaction SMARTS template).
# The recorded output of this cell shows a dot-separated reactant string for the
# first four cases and None for the wildcard case; the exact cleaning rules live
# in rxn_smiles_preprocess.py.
rxn_cleaning_cases = [
    # oxo acid 5r-4c-3c Ugi, with 2 products
    (
        "C[C@@H](C(O)=O)[C@@H](C(C)=O)O.COC1=CC=C(C=C1)CN.COC(OC)CC2=C([N+]#[C-])C=CC=C2>>COC(OC)CC3=C(C=CC=C3)NC([C@@]4([C@H]([C@H](C(C4CC5=CC=C(C=C5)OC)=O)C)O)C)=O.COC(OC)CC6=C(C=CC=C6)NC([C@]7([C@H]([C@H](C(C7CC8=CC=C(C=C8)OC)=O)C)O)C)=O",
        "[NX3H2,NX4H3+:1][#6;!$(C=[C,O,N,S]);!$(C#*):2].[CX3;!$(*O);!$(*S);!$(*N):3](=[O:4])[#6:5]~[#6:6][CX3:7](=[O:8])[OX1H0-,OX2H1:9].[C-:10]#[N+:11]>>[O:8]=[C:7]1[N+0:1]([#6:2])[C:3]([C+0:10]([N+0:11])=[OH0:9])[#6:5]~[#6:6]1.[O:4]",
    ),
    # amino acid 4r-4c-3c Ugi
    (
        "CC(CC(O)=O)N.COC(C[N+]#[C-])=O.O=CCC1=CC=CC=C1>>COC(CNC(C(N2C(CC2=O)C)CC3=CC=CC=C3)=O)=O",
        "[NX3H2,NX4H3+:1][#6;!$(C=[C,O,N,S]);!$(C#*):2][#6:3][CX3:4](=[O:5])[OX1H0-,OX2H1:6].[CX3;!$(*O);!$(*S);!$(*N):7](=[O:8]).[C-:9]#[N+:10]>>[C:4]1(=[O:5])[#6:3][#6:2][N+0:1]1[C:7][C+0:9](=[OH0:6])[N+0:10].[O:8]",
    ),
    # 4c Ugi (the product side is just the string "CCCC")
    (
        "[C-]#[N+]C3CCCCC3.NCc1cc2c(OCO2)cc1.CC(C)(C)OC(C(C(OC(C)(C)C)=O)Oc1ccc(C([H])=O)cc1)=O.CC(C1=CC=CC=C1)C(CC(O)=O)=O>>CCCC",
        ugi_4c_template,
    ),
    # 4c Ugi with salt ([Cl-].[K+] among the reactants)
    (
        "[Cl-].[K+].C=O.OC(C1=CC2=CC(O)=CC=C2N1)=O.[C-]#[N+]C3CCCCC3.NCC4=C(C=C(C=C4)Cl)Cl>>OC5=CC=C6NC(C(N(CC7=CC=C(C=C7Cl)Cl)CC(NC8CCCCC8)=O)=O)=CC6=C5",
        ugi_4c_template,
    ),
    # 4c Ugi with a wildcard atom ([*]) in the aldehyde
    (
        "[*]C=O.OC(C1=CC2=CC(O)=CC=C2N1)=O.[C-]#[N+]C3CCCCC3.NCC4=C(C=C(C=C4)Cl)Cl>>OC5=CC=C6NC(C(N(CC7=CC=C(C=C7Cl)Cl)CC(NC8CCCCC8)=O)=O)=CC6=C5",
        ugi_4c_template,
    ),
]

# Print the cleaned result of every case, in the order listed above.
for raw_rxn_smiles, template_smarts in rxn_cleaning_cases:
    print(clean_rxn_smiles(raw_rxn_smiles, template_smarts))

COc1ccc(CN)cc1.CC(=O)[C@@H](O)[C@@H](C)C(=O)O.[C-]#[N+]c1ccccc1CC(OC)OC
CC(N)CC(=O)O.O=CCc1ccccc1.[C-]#[N+]CC(=O)OC
[C-]#[N+]C1CCCCC1.NCc1ccc2c(c1)OCO2.CC(C)(C)OC(=O)C(Oc1ccc(C=O)cc1)C(=O)OC(C)(C)C.CC(C(=O)CC(=O)O)c1ccccc1
[C-]#[N+]C1CCCCC1.NCc1ccc(Cl)cc1Cl.C=O.O=C(O)c1cc2cc(O)ccc2[nH]1
None


## Labeling Rxn

In [15]:
# Smoke test for labling_rxn: each reaction SMARTS template is paired with an
# input SMILES string, the labeled result is parsed by RDKit, and the drawing
# is written to disk as a PNG for visual inspection.

# 4c Ugi
patt_0 = "[C-:1]#[N+:2].[NX3H2:3][#6;!$(C=[C,O,N,S]);!$(C#*):4].[CX3;!$(*O);!$(*S);!$(*N):5](=[O:6]).[CX3:7](=[O:8])[OX1H0-,OX2H1:9]>>[C:7](=[O:8])[N:3]([#6:4])[C:5][C+0:1](=[OH0+0:9])[N+0:2].[O:6]"
react_smi_0 = "[C-]#[N+]C1CCCCC1.NCc1ccc2c(c1)OCO2.CC(C)(C)OC(=O)C(Oc1ccc(C=O)cc1)C(=O)OC(C)(C)C.CC(C(=O)CC(=O)O)c1ccccc1"

# amino acid 4r-4c-3c ugi
patt_1 = "[NX3H2,NX4H3+:1][#6;!$(C=[C,O,N,S]);!$(C#*):2][#6:3][CX3:4](=[O:5])[OX1H0-,OX2H1:6].[CX3;!$(*O);!$(*S);!$(*N):7](=[O:8]).[C-:9]#[N+:10]>>[C:4]1(=[O:5])[#6:3][#6:2][N+0:1]1[C:7][C+0:9](=[OH0:6])[N+0:10].[O:8]"
react_smi_1 = "CC(N)CC(=O)O.O=CCc1ccccc1.[C-]#[N+]CC(=O)OC"

# oxo acid 5r-4c-3c ugi
patt_2 = "[NX3H2,NX4H3+:1][#6;!$(C=[C,O,N,S]);!$(C#*):2].[CX3;!$(*O);!$(*S);!$(*N):3](=[O:4])[#6:5]~[#6:6][CX3:7](=[O:8])[OX1H0-,OX2H1:9].[C-:10]#[N+:11]>>[O:8]=[C:7]1[N+0:1]([#6:2])[C:3]([C+0:10]([N+0:11])=[OH0:9])[#6:5]~[#6:6]1.[O:4]"
react_smi_2 = "COc1ccc(CN)cc1.CC(=O)[C@@H](O)[C@@H](C)C(=O)O.[C-]#[N+]c1ccccc1CC(OC)OC"

# retro 4c ugi (the input here is the Ugi product plus "O")
patt_3 = "[C:1](=[O:2])[N:3]([#6:4])[C:5][C+0:6](=[OH0+0:7])[N+0:8].[O:9]>>[C-:6]#[NH0+:8].[NX3H2:3][#6;!$(C=[C,O,N,S]);!$(C#*):4].[CX3;!$(*O);!$(*S);!$(*N):5](=[O:9]).[CX3:1](=[O:2])[OX2H1:7]"
react_smi_3 = "CC(C)(OC(C(C(OC(C)(C)C)=O)Oc1ccc(C(C(NC2CCCCC2)=O)N(C(c3ccc(OC(C(OC(C)(C)C)=O)C(OC(C)(C)C)=O)cc3)=O)Cc4cc5c(OCO5)cc4)cc1)=O)C.O"

# Maps each reaction SMARTS to the SMILES it is applied to. Because the SMARTS
# is the key, every pattern can appear only once in this table.
test_dict = {patt_0: react_smi_0, patt_1: react_smi_1, patt_2: react_smi_2, patt_3: react_smi_3}

# Image.save() does not create missing parent directories, so make sure the
# output folder exists; otherwise this cell fails on a fresh checkout.
labeled_img_dir = "./labling_rxn_test"
os.makedirs(labeled_img_dir, exist_ok=True)

for idx, (rxn_smarts, rxn_smiles) in enumerate(test_dict.items()):
    rxn_smi = labling_rxn(rxn_smarts, rxn_smiles)
    # NOTE(review): the labeled string is parsed with SMARTS semantics here; if
    # labling_rxn returns a reaction SMILES, useSmiles=True may be intended -- confirm.
    rxn = AllChem.ReactionFromSmarts(rxn_smi)
    img = Draw.ReactionToImage(rxn, subImgSize=(600, 600))
    img.save(f"{labeled_img_dir}/test{idx}.png")
    print(f"Labeled test rxn SMILES {idx} successfully")

Labeled test rxn SMILES 0 successfully
Labeled test rxn SMILES 1 successfully
Labeled test rxn SMILES 2 successfully
Labeled test rxn SMILES 3 successfully
