# 1 Start

~~~
# Create and activate a dedicated conda environment with Python 3.10.
conda create -n ENZRetro python=3.10
conda activate ENZRetro
# Replace this placeholder with the path to your local copy of the project.
cd path/to/this/project
# Install the pinned PyTorch packages from the cu121 wheel index first,
# then the remaining dependencies listed in requirements.txt.
pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121
pip install -r requirements.txt
~~~

# 2 train

Before training, make sure the pretrained model is present at "model/pretrain/model" inside this project; download it first if it is missing.
~~~shell
# List the available command-line options.
python train.py --help
# Train; replace <task_name> and <ec_level> with real values (see --help).
python train.py --task <task_name> --ec <ec_level>
~~~
Model files will be generated.  
You can also download "model.zip" and unzip it to get all the trained models.  

# 3 val

~~~
# List the available command-line options.
python test.py --help
# Evaluate; replace <task_name> and <ec_level> with real values (see --help).
python test.py --task <task_name> --ec <ec_level>
~~~
The files "result.txt" and "target.txt" will be generated. See "result_summary.ipynb" for how to use them.  
You can also download "test_result.zip" and unzip it.

# 4 Call as API

### 4.1 Obtain SSREdits based on product

In [2]:
from product2SSREdits import Product2SSREdits
# Instantiate the product -> SSREdits predictor once; the cells below reuse `p2s`.
p2s = Product2SSREdits()

The input can be either a list of SMILES or a SMILES string.

In [3]:
smiles_lst = ["O=C(NCc1ccccc1S(=O)(=O)C1CC1)C(F)(F)F",
                    "C[Si](C)(C)OC(=O)C=CCBr"]
# p2s returns ((dec_output, score_tensor), sentence_lst), so:
#   result[0][0] -> dec_output (token ids)
#   result[0][1] -> scores
#   result[1]    -> SSREdits strings
result = p2s(smiles_lst)

Each output element contains `len(smiles_lst) * 10` data points. For example, if 20 data points are output, the first 10 correspond to the result of the first SMILES in `smiles_lst`, and the remaining 10 correspond to the result of the next SMILES.

score

In [4]:
# Scores for every candidate: 10 per input SMILES, in input order (20 values here).
result[0][1]

[0.43010473251342773,
 0.2086336314678192,
 0.0927426666021347,
 0.05440981686115265,
 0.04763583093881607,
 0.036632318049669266,
 0.03546731919050217,
 0.035047732293605804,
 0.03366789221763611,
 0.025658011436462402,
 0.6153191924095154,
 0.10858681052923203,
 0.052005402743816376,
 0.04673460125923157,
 0.033947933465242386,
 0.0335654616355896,
 0.031513746827840805,
 0.02667132206261158,
 0.026163635775446892,
 0.025491988286376]

SSREdits

In [5]:
# Predicted SSREdits strings, ordered like the scores (10 per input SMILES).
result[1]

['[Delete][Bond]CN:1.2[Attaching][Group]*OC(=O)C(F)(F)F:1',
 '[Delete][Bond]CN:1.2[Attaching][Group]*OCC:1',
 '[Delete][Bond]CN:1.2[Attaching][Group]*O:1',
 '[Delete][Bond]CN:1.2[Attaching][Group]*OC:1',
 '[Delete][Bond]CN:2.3[Attaching][Group]*Br:3',
 '[Delete][Bond]CC:3.4[Attaching][Group]*Br:3[Attaching][Group]*B(O)O:4',
 '[Delete][Bond]CC:13.14[Delete][Bond]CC:14.15[Change][Bond]C=C:13.15[Attaching][Group]*[S+](C)(C)=O:14',
 '[Delete][Bond]CN:2.3[Delete][Bond]CC:3.4[Attaching][Group]*=O:3',
 '[Delete][Bond]CC:13.14[Delete][Bond]CC:14.15[Change][Bond]C=C:13.15[Attaching][Group]*I:14[Attaching][Group]*I:14',
 '[Delete][Bond]CN:1.2[Attaching][Group]*OC1CCCCO1:1',
 '[Delete][Bond]CBr:9.10[Attaching][Group]*N1C(=O)CCC1=O:10',
 '[Delete][Bond]O[SiH3]:1.4[Attaching][Group]*Cl:1',
 '[Delete][Bond]CBr:9.10[Attaching][Group]*C(Br)(Br)Br:10[Attaching][Group]*O:9',
 '[Delete][Bond]O[SiH3]:1.4[Attaching][Group]*Br:1',
 '[Change][Bond]C=C:7.8',
 '[Delete][Bond]CO:4.5[Attaching][Group]*Br:5',
 '[

original token ids

In [6]:
# Raw token-id tensor (dec_output); presumably one padded row per candidate — TODO confirm.
result[0][0]

tensor([[  1,  29,  36,  72,  81, 100,  74,  99,  80,  27,  31, 101,  78,  72,
          75,  77,  78,  79,  72,  75,  87,  79,  75,  87,  79,  87, 100,  74,
           2,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
           6,   6,   6,   6,   6,   6],
        [  1,  29,  36,  72,  81, 100,  74,  99,  80,  27,  31, 101,  78,  72,
          72, 100,  74,   2,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
           6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
           6,   6,   6,   6,   6,   6],
        [  1,  29,  36,  72,  81, 100,  74,  99,  80,  27,  31, 101,  78, 100,
          74,   2,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
           6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
           6,   6,   6,   6,   6,   6],
        [  1,  29,  36,  72,  81, 100,  74,  99,  80,  27,  31, 101,  78,  72,
         100,  74,   2,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
           

In [7]:
# A single SMILES string is accepted as well as a list.
SMILES = "O=C(NCc1ccccc1S(=O)(=O)C1CC1)C(F)(F)F"
# Note: this rebinds `result`, replacing the two-molecule output from the earlier cell.
result = p2s(SMILES)
# Show the scores together with the SSREdits strings (10 candidates for the one input).
result[0][1], result[1]

([0.43011507391929626,
  0.2086338996887207,
  0.09273155778646469,
  0.05441027879714966,
  0.04763505607843399,
  0.036633994430303574,
  0.035466790199279785,
  0.03504745662212372,
  0.033667389303445816,
  0.02565857209265232],
 ['[Delete][Bond]CN:1.2[Attaching][Group]*OC(=O)C(F)(F)F:1',
  '[Delete][Bond]CN:1.2[Attaching][Group]*OCC:1',
  '[Delete][Bond]CN:1.2[Attaching][Group]*O:1',
  '[Delete][Bond]CN:1.2[Attaching][Group]*OC:1',
  '[Delete][Bond]CN:2.3[Attaching][Group]*Br:3',
  '[Delete][Bond]CC:3.4[Attaching][Group]*Br:3[Attaching][Group]*B(O)O:4',
  '[Delete][Bond]CC:13.14[Delete][Bond]CC:14.15[Change][Bond]C=C:13.15[Attaching][Group]*[S+](C)(C)=O:14',
  '[Delete][Bond]CN:2.3[Delete][Bond]CC:3.4[Attaching][Group]*=O:3',
  '[Delete][Bond]CC:13.14[Delete][Bond]CC:14.15[Change][Bond]C=C:13.15[Attaching][Group]*I:14[Attaching][Group]*I:14',
  '[Delete][Bond]CN:1.2[Attaching][Group]*OC1CCCCO1:1'])

The usage of the other two modules is almost the same.

In [8]:
from product2SSREdits import Product2EC, Product2SSREditsEC

### 4.2 Obtain the EC number based on the product

In [9]:
# ec_num=2 presumably selects two-level EC numbers (the output looks like '2.7') — TODO confirm.
p2e = Product2EC(ec_num=2)
smiles_lst = ["CCC=CCC=CCC=CCC=CCC=CCC=CCCC(=O)O[C@H](COC(=O)CCCCCCCCCCCC=CCC=CCCCCC)COP(=O)(O)OCC[N+](C)(C)C", 
                  "C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1C[C@H](O)C2=O"]
# Note: `smiles_lst` and `result` are rebound here and no longer hold the values from section 4.1.
result = p2e(smiles_lst)
# Display the first three predicted EC numbers.
result[1][:3]

['2.7', '2.1', '2.3']

### 4.3 Obtain EC numbers and SSREdits based on the product.

In [10]:
# Reference records for the two products used below, kept as real comments
# instead of a bare string literal (which would be evaluated as a statement).
# Presumably the first pair are the dataset source lines and the second pair
# the corresponding targets (EC number followed by SSREdits) — TODO confirm.
#
# [EC][Backward]CCCCC=CCCCCCCCC(=O)OC[C@H](COC(=O)CCCCCCCCCCCCC)OC(=O)CCCCCCCCCCCCC[EC]2
# [EC][Backward]*SC(=O)C=CCCCCCCCCCCCCC[EC]2
#
# [EC]2.3[Delete][Bond]CO:13.15[Attaching][Group]*SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)(O)O:13[Change][Atom]C@@:17
# [EC]1.3[Change][Bond]CC:4.5[Attaching][Group]NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](OP(=O)(O)O)[C@@H]3O)[C@@H](O)[C@H]2O)c1:-1

# ec_num=2 presumably requests two-level EC numbers (the predictions below end
# in e.g. "[EC]2.3") — TODO confirm.
p2se = Product2SSREditsEC(ec_num=2)
smiles_lst = [
    "CCCCC=CCCCCCCCC(=O)OC[C@H](COC(=O)CCCCCCCCCCCCC)OC(=O)CCCCCCCCCCCCC",
    "*SC(=O)C=CCCCCCCCCCCCCC",
]
# Note: `smiles_lst` and `result` are rebound here, replacing earlier values.
result = p2se(smiles_lst)
# Display the first three predicted strings (SSREdits followed by an [EC] tag).
result[1][:3]

['[Delete][Bond]CO:13.15[Attaching][Group]*SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)(O)O:13[Change][Atom]C@@:17[EC]2.3',
 '[Delete][Bond]CO:13.15[Attaching][Group]*SCCNC(=O)CCNC(=O)C(O)C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)(O)O:13[Change][Atom]C@@:17[EC]2.3',
 '[Delete][Bond]CO:13.15[Attaching][Group]*SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@@H](O)C1OP(=O)(O)O:13[Change][Atom]C@@:17[EC]2.3']

### 4.4 Predict EC numbers based on the product and SSREdits.

In [11]:
from product2SSREdits import ProductSSREdits2EC

# Reference records for the two inputs used below, kept as real comments
# instead of a bare string literal (which would be evaluated as a statement).
# Presumably the first pair are the dataset source lines and the last two lines
# the expected EC numbers — TODO confirm.
#
# [EC]4[EC]CCC=CCC=CCC=CCC=CCC=CCC=CCCC(=O)O[C@H](COC(=O)CCCCCCCCCCCC=CCC=CCCCCC)COP(=O)(O)OCC[N+](C)(C)C<<[Delete][Bond]CN:58.61[Attaching][Group]*[S+](CC[C@H](N)C(=O)O)C[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1O:61[Change][Atom]C:24
# [EC]4[EC]O=C(O)C1=C[C@@H](O)[C@@H](O)[C@H](O)C1<<[Change][Bond]C=O:5.6[Attaching][Group][H+]:-1[Attaching][Group]NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)[C@@H](O)[C@H]2O)C=CC1:-1[Change][Atom]C:5
#
# 2.1.1.71
# 1.1.1.25

# The positional 4 is presumably the EC level (the predictions below have four
# fields, e.g. '2.1.1.71') — TODO confirm against the class signature.
po2e = ProductSSREdits2EC(4)
# Each input is "<product SMILES><<<SSREdits>": the product and its edits joined by "<<".
input_lst = [
    "CCC=CCC=CCC=CCC=CCC=CCC=CCCC(=O)O[C@H](COC(=O)CCCCCCCCCCCC=CCC=CCCCCC)COP(=O)(O)OCC[N+](C)(C)C<<[Delete][Bond]CN:58.61[Attaching][Group]*[S+](CC[C@H](N)C(=O)O)C[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1O:61[Change][Atom]C:24",
    "O=C(O)C1=C[C@@H](O)[C@@H](O)[C@H](O)C1<<[Change][Bond]C=O:5.6[Attaching][Group][H+]:-1[Attaching][Group]NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)(O)OP(=O)(O)OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)[C@@H](O)[C@H]2O)C=CC1:-1[Change][Atom]C:5",
]
# Note: `result` is rebound here, replacing the value from the previous section.
result = po2e(input_lst)
# Display the first three predicted EC numbers.
result[1][:3]

['2.1.1.71', '2.1.1.76', '2.1.7.2']