## 0. 设置参数

In [1]:
import pandas as pd
from rdkit import Chem
from tqdm import tqdm
import numpy as np



# Directory holding the raw splits; later cells also write their outputs here.
sv_path = "/gaozhangyang/experiments/MotifRetro/data/uspto_50k/"

# Read the three raw splits relative to sv_path so that switching dataset
# only requires changing one line.
train = pd.read_csv(sv_path + "raw_train.csv")
valid = pd.read_csv(sv_path + "raw_val.csv")
test = pd.read_csv(sv_path + "raw_test.csv")
data = pd.concat([train, valid, test], axis=0)

# Each entry is a reaction SMILES of the form "<left>>><right>". The left side
# is stored as `target` and the right side as `source` (per the column name the
# left side holds the reactants and the right side the product).
# Iterating the column directly avoids building a full row Series per reaction.
tasks = []
for rxn_smiles in tqdm(data["reactants>reagents>production"]):
    target, source = rxn_smiles.split(">>")
    tasks.append([target, source])
    
    
    
# sv_path = "/gaozhangyang/experiments/MotifRetro/data/uspto_hard/"
# data = pd.read_csv("/gaozhangyang/experiments/MotifRetro/data/uspto_hard/USPTOHard.csv")
# tasks = []
# for idx in tqdm(range(len(data))):
#     target, source = data.loc[idx,"substrates"], data.loc[idx,"product"]
#     tasks.append([target, source])


# sv_path = "/gaozhangyang/experiments/MotifRetro/data/uspto_full/"
# data = pd.read_csv("/gaozhangyang/experiments/MotifRetro/data/uspto_full/USPTOFull.csv")
# tasks = []
# for idx in tqdm(range(len(data))):
#     target, source = data.loc[idx,"rxn_smiles"].split(">>")
#     tasks.append([target, source])
    

100%|██████████| 50016/50016 [00:03<00:00, 13221.98it/s]


## 1. 提取 fragment

In [4]:
import sys; sys.path.append("/gaozhangyang/experiments/MotifRetro")
import pandas as pd
from rdkit import Chem
import json
from copy import copy
from src.utils.feat_utils import atom_to_edit_tuple, get_bond_tuple, fix_incomplete_mappings, reac_to_canonical, renumber_atoms_for_mapping, mark_reactants
from rdkit.Chem import Draw
from src.utils.chem_utils import remove_am
from src.feat.reaction_actions import StopAction, AddMotifAction
from tqdm import tqdm
import numpy as np
from src.feat.featurize_gzy_psvae import ReactionSampleGenerator as ReactionSampleGenerator_gzy

from joblib import Parallel, delayed, cpu_count
from tqdm import tqdm

def pmap_multi(pickleable_fn, data, n_jobs=None, verbose=1, desc=None, **kwargs):
    """
    Parallel map using joblib, unpacking every item as positional arguments.

    Parameters
    ----------
    pickleable_fn : callable
        Function to map over data.
    data : iterable
        Data over which we want to parallelize the function call. Each item
        must itself be an iterable of positional arguments, because it is
        unpacked with ``*`` when calling :attr:`pickleable_fn`.
    n_jobs : int, optional
        The maximum number of concurrently running jobs. By default, it is one less than
        the number of CPUs (but never less than one).
    verbose: int, optional
        The verbosity level, forwarded to ``joblib.Parallel``. If nonzero, the
        function prints the progress messages.
        The frequency of the messages increases with the verbosity level. If above 10,
        it reports all iterations. If above 50, it sends the output to stdout.
    desc : str, optional
        Label for the tqdm progress bar that wraps the iteration over
        :attr:`data`.
    kwargs
        Additional keyword arguments for :attr:`pickleable_fn`.

    Returns
    -------
    list
        The i-th element of the list corresponds to the output of
        ``pickleable_fn(*data[i], **kwargs)``.
    """
    if n_jobs is None:
        # On a single-core machine `cpu_count() - 1` is 0, which joblib rejects.
        n_jobs = max(1, cpu_count() - 1)

    # Passing `data` itself (no enumerate wrapper) lets tqdm pick up len(data)
    # and show a total when the input is sized.
    results = Parallel(n_jobs=n_jobs, verbose=verbose, timeout=None)(
        delayed(pickleable_fn)(*args, **kwargs) for args in tqdm(data, desc=desc)
    )

    return results

# utils
# Load the action vocabulary; the context manager closes the file handle.
with open("/gaozhangyang/experiments/MotifRetro/dataset_code/action_vocab.json", 'r') as vocab_file:
    action_vocab = json.load(vocab_file)


def _int_key_if_possible(key):
    """Return ``int(key)`` when the key parses as an integer, else the key unchanged."""
    try:
        return int(key)
    except ValueError:
        return key


# JSON object keys are always strings, so property values that parse as
# integers are converted back to int in the nested 'atom' / 'bond' lookup tables.
props = action_vocab['prop2oh']
prop2oh = {
    type_key: {
        prop_name: {
            _int_key_if_possible(prop_val): val_oh
            for prop_val, val_oh in values.items()
        }
        for prop_name, values in props[type_key].items()
    }
    for type_key in ('atom', 'bond')
}

action_vocab['prop2oh'] = prop2oh



# visualization
from rdkit import Chem
from rdkit.Chem.Draw import rdMolDraw2D, MolsToGridImage

# Shared drawing options used when rendering molecule grids.
drawOptions = rdMolDraw2D.MolDrawOptions()
for _option_name, _option_value in (
    ("prepareMolsBeforeDrawing", False),  # molecules are prepared explicitly in prepare_mol
    ("bondLineWidth", 4),
    ("minFontSize", 12),
):
    setattr(drawOptions, _option_name, _option_value)


def prepare_mol(mol, new_am):
    """
    Prepare a molecule for drawing and collect the atoms to highlight.

    Parameters
    ----------
    mol : molecule
        Molecule whose atoms expose ``GetAtomMapNum()``.
    new_am : container of int
        Atom-map numbers; every atom whose map number is in this container
        is reported for highlighting.

    Returns
    -------
    tuple
        ``(mol_draw, highlight_idx)``: the drawing-ready molecule and the list
        of positions (in ``mol.GetAtoms()`` order) of the matching atoms.
    """
    highlight_idx = [
        atom_pos
        for atom_pos, atom in enumerate(mol.GetAtoms())
        if atom.GetAtomMapNum() in new_am
    ]

    try:
        mol_draw = rdMolDraw2D.PrepareMolForDrawing(mol)
    except Chem.KekulizeException:
        # Kekulization failed: prepare without it, then run every
        # sanitization step except the kekulize one.
        mol_draw = rdMolDraw2D.PrepareMolForDrawing(mol, kekulize=False)
        sanitize_ops = Chem.SANITIZE_ALL ^ Chem.SANITIZE_KEKULIZE
        Chem.SanitizeMol(mol_draw, sanitize_ops)

    return mol_draw, highlight_idx

def plot_states(states, target_mol, source_mol):
    """
    Draw a grid of intermediate molecules, highlighting the newly added atoms.

    Parameters
    ----------
    states : iterable of molecules
        Molecules to draw, one grid cell each.
    target_mol, source_mol : molecules
        Atom-map numbers present in ``target_mol`` but absent from
        ``source_mol`` define the atoms that get highlighted.

    Returns
    -------
    The image object returned by ``MolsToGridImage`` (5 molecules per row,
    500x500 sub-images, using the module-level ``drawOptions``).
    """
    target_am = {atom.GetAtomMapNum() for atom in target_mol.GetAtoms()}
    source_am = {atom.GetAtomMapNum() for atom in source_mol.GetAtoms()}
    # Kept as a set: prepare_mol only needs membership tests.
    new_am = target_am - source_am

    mol_list = []
    highlightAtomLists = []
    for one in states:
        mol, highlight = prepare_mol(one, new_am)
        mol_list.append(mol)
        highlightAtomLists.append(highlight)

    # prepare_mol returns ATOM indices, so they must go to highlightAtomLists;
    # passing them as highlightBondLists would treat them as bond indices.
    return MolsToGridImage(
        mol_list,
        molsPerRow=5,
        subImgSize=(500, 500),
        drawOptions=drawOptions,
        highlightAtomLists=highlightAtomLists,
    )


from src.utils.chem_utils import MultiElement
from src.utils.retro_utils import preprocess_mols, get_synthons, match_synthons_reactants, get_frag_mol




def handle_per_task(target, source):
    """
    Extract the fragment molecules for a single reaction.

    Parameters
    ----------
    target, source : str
        The two sides of one reaction, passed to ``preprocess_mols``.

    Returns
    -------
    list of tuple or None
        One ``(frag_mol, source_mol, target_mol)`` tuple per matched reactant,
        or ``None`` when any step fails. On failure the offending ``source``
        and ``target`` are printed so the reaction can be inspected later.
    """
    try:
        target_mol, source_mol = preprocess_mols(target, source)

        motifretro_sample_generator = ReactionSampleGenerator_gzy(
            Chem.rdchem.RWMol(source_mol),
            target_mol,
            keep_actions_list=None,
            action_vocab=action_vocab,
            use_motif_action=True,
            only_get_synthon=True,
        )

        synthons = MultiElement(get_synthons(motifretro_sample_generator)).mols
        reactants = MultiElement(target_mol).mols
        reactants = match_synthons_reactants(synthons, reactants)

        # synthons[k] is paired with reactants[k]; indexing (rather than zip)
        # keeps a length mismatch an error instead of silently truncating.
        return [
            (get_frag_mol(synthons[k], reactant), source_mol, target_mol)
            for k, reactant in enumerate(reactants)
        ]
    except Exception:
        # Best-effort: skip reactions that cannot be processed. Only Exception
        # is caught so KeyboardInterrupt / SystemExit still stop the worker.
        print(source)
        print(target)
        print()
        return None
    


In [5]:
# One (target, source) argument tuple per reaction; pmap_multi unpacks each
# tuple into handle_per_task(target, source).
task_args = [(target, source) for target, source in tasks]
frag_mol_list_raw = pmap_multi(handle_per_task, task_args)

[Parallel(n_jobs=126)]: Using backend LokyBackend with 126 concurrent workers.
315it [00:19, 55.86it/s][Parallel(n_jobs=126)]: Done 198 tasks      | elapsed:    6.8s
[Parallel(n_jobs=126)]: Done 548 tasks      | elapsed:    8.8s
[Parallel(n_jobs=126)]: Done 998 tasks      | elapsed:   10.7s
[Parallel(n_jobs=126)]: Done 1548 tasks      | elapsed:   12.6s
[Parallel(n_jobs=126)]: Done 2198 tasks      | elapsed:   15.0s


[C:1]([CH2:2][CH:3]=[CH2:4])([C:5](=[O:6])[O:7][CH2:8][CH3:9])([CH:10]([F:11])[F:12])[NH2:13]
O=C([C:1]([CH2:2][CH:3]=[CH2:4])([C:5](=[O:6])[O:7][CH2:8][CH3:9])[CH:10]([F:11])[F:12])[NH2:13]



[Parallel(n_jobs=126)]: Done 2948 tasks      | elapsed:   17.6s


[CH3:1][C:2](=[O:3])[c:8]1[cH:7][cH:6][c:5]([OH:4])[cH:10][cH:9]1
[CH3:1][C:2](=[O:3])[O:4][c:5]1[cH:6][cH:7][cH:8][cH:9][cH:10]1



[Parallel(n_jobs=126)]: Done 3798 tasks      | elapsed:   20.3s
[Parallel(n_jobs=126)]: Done 4748 tasks      | elapsed:   22.9s
[Parallel(n_jobs=126)]: Done 5798 tasks      | elapsed:   26.0s
[Parallel(n_jobs=126)]: Done 6948 tasks      | elapsed:   27.7s


[O:1]=[C:2]([O:3][CH2:4][c:5]1[cH:6][cH:7][cH:8][cH:9][cH:10]1)[N:11]1[CH2:12][CH2:13][C:14]2([CH2:15][CH2:16]1)[CH2:17][C:18](=[O:20])[NH:19][c:21]1[cH:22][cH:23][cH:24][cH:25][c:26]12
[O:1]=[C:2]([O:3][CH2:4][c:5]1[cH:6][cH:7][cH:8][cH:9][cH:10]1)[N:11]1[CH2:12][CH2:13][C:14]2([CH2:15][CH2:16]1)[CH2:17][C:18](=[N:19][OH:20])[c:21]1[cH:22][cH:23][cH:24][cH:25][c:26]12



[Parallel(n_jobs=126)]: Done 8198 tasks      | elapsed:   29.8s


[O:1]=[C:3]1[NH:2][CH2:4][CH2:5][c:6]2[o:7][c:8]3[cH:9][cH:10][cH:11][cH:12][c:13]3[c:14]21
[OH:1][N:2]=[C:3]1[CH2:4][CH2:5][c:6]2[o:7][c:8]3[cH:9][cH:10][cH:11][cH:12][c:13]3[c:14]21



[Parallel(n_jobs=126)]: Done 9548 tasks      | elapsed:   32.3s
[Parallel(n_jobs=126)]: Done 10998 tasks      | elapsed:   35.7s


[CH3:1][O:2][c:3]1[cH:4][cH:5][c:6]2[c:7]([cH:8]1)[C:9](=[O:11])[NH:10][CH2:12][CH2:13]2
[CH3:1][O:2][c:3]1[cH:4][cH:5][c:6]2[c:7]([cH:8]1)[C:9](=[N:10][OH:11])[CH2:12][CH2:13]2



[Parallel(n_jobs=126)]: Done 12548 tasks      | elapsed:   39.5s


[C@H:1]1([CH3:3])[C@H:2]([NH:15][CH:16]2[CH2:17][CH2:18][CH2:19][CH2:20][CH2:21]2)[CH2:14][CH2:13][N:5]([C:6](=[O:7])[O:8][C:9]([CH3:10])([CH3:11])[CH3:12])[CH2:4]1
O=[C:1]1[CH:2]([CH3:3])[CH2:4][N:5]([C:6](=[O:7])[O:8][C:9]([CH3:10])([CH3:11])[CH3:12])[CH2:13][CH2:14]1.[NH2:15][CH:16]1[CH2:17][CH2:18][CH2:19][CH2:20][CH2:21]1





[NH2:1][C@H:17]1[CH2:33][CH2:4][N:5]([C:6](=[O:7])[c:8]2[cH:9][cH:10][c:11]([CH2:12][N:13]3[CH2:14][CH2:15][N:16]([CH:2]4[CH2:3][CH2:19][N:20]([c:21]5[n:22][cH:23][c:24]([C:25]([F:26])([F:27])[F:28])[cH:29][c:30]5[F:31])[CH2:32][CH2:18]4)[C:34]3=[O:35])[cH:36][cH:37]2)[CH2:38]1
CC(C)(C)OC(=O)[NH:1][C@H:2]1[CH2:3][CH2:4][N:5]([C:6](=[O:7])[c:8]2[cH:9][cH:10][c:11]([CH2:12][N:13]3[CH2:14][CH2:15][N:16]([CH:17]4[CH2:18][CH2:19][N:20]([c:21]5[n:22][cH:23][c:24]([C:25]([F:26])([F:27])[F:28])[cH:29][c:30]5[F:31])[CH2:32][CH2:33]4)[C:34]3=[O:35])[cH:36][cH:37]2)[CH2:38]1





[C@@H:1]1([O:3][CH3:4])[CH2:8][O:6][CH2:7][CH2:29][C@@H:30]1[NH:31][C@@H:16]1[CH2:5][CH2:28][C@:11]([CH2:10][CH3:9])([C:12](=[O:13])[N:20]2[C@@H:2]3[CH2:15][N:14]([C:21](=[O:22])[O:23][C:24]([CH3:25])([CH3:26])[CH3:27])[C@@H:18]([CH2:17]3)[CH2:19]2)[CH2:32]1
O=[C:1]1[CH:2]([O:3][CH3:4])[CH2:5][O:6][CH2:7][CH2:8]1.[CH3:9][CH2:10][C@:11]1([C:12](=[O:13])[N:14]2[CH2:15][C@@H:16]3[CH2:17][C@H:18]2[CH2:19][N:20]3[C:21](=[O:22])[O:23][C:24]([CH3:25])([CH3:26])[CH3:27])[CH2:28][CH2:29][C@@H:30]([NH2:31])[CH2:32]1



[Parallel(n_jobs=126)]: Done 14198 tasks      | elapsed:   43.6s


[N:1]#[C:2][c:3]1[cH:4][n:5](-[c:6]2[c:7]([Cl:8])[cH:9][c:10]([C:11]([F:12])([F:13])[F:14])[cH:15][c:16]2[Cl:17])[n:18][c:19]1[N+:20](=[O:21])[O-:22]
[N:1]#[C:2][c:3]1[cH:4][n:5](-[c:6]2[c:7]([Cl:8])[cH:9][c:10]([C:11]([F:12])([F:13])[F:14])[cH:15][c:16]2[Cl:17])[n:18][c:19]1[NH2:20].[OH:21][OH:22]



[Parallel(n_jobs=126)]: Done 15948 tasks      | elapsed:   49.2s


[c:1]1([NH2:33])[n:2][cH:3][cH:4][c:5]([O:6][c:7]2[cH:8][cH:9][c:10]([NH:11][C:12]([c:13]3[c:14]([CH3:15])[n:16]([CH3:17])[n:18](-[c:19]4[cH:20][cH:21][cH:22][cH:23][cH:24]4)[c:25]3=[O:26])=[O:27])[c:28]([F:29])[cH:30]2)[c:31]1[Cl:32]
O=C([c:1]1[n:2][cH:3][cH:4][c:5]([O:6][c:7]2[cH:8][cH:9][c:10]([NH:11][C:12]([c:13]3[c:14]([CH3:15])[n:16]([CH3:17])[n:18](-[c:19]4[cH:20][cH:21][cH:22][cH:23][cH:24]4)[c:25]3=[O:26])=[O:27])[c:28]([F:29])[cH:30]2)[c:31]1[Cl:32])[NH2:33]

[C:1](=[O:2])([c:3]1[cH:4][cH:5][cH:6][cH:7][cH:8]1)[NH:18][C@@H:14]1[CH2:12][C@@H:13]([CH3:16])[CH2:19][CH2:11][C@@H:17]1[CH:10]([CH3:9])[CH3:15]
Cl[C:1](=[O:2])[c:3]1[cH:4][cH:5][cH:6][cH:7][cH:8]1.[CH3:9][CH:10]1[CH2:11][CH2:12][CH:13]([CH:14]([CH3:15])[CH3:16])[CH:17]([NH2:18])[CH2:19]1



[Parallel(n_jobs=126)]: Done 17798 tasks      | elapsed:   52.7s


[CH3:1][CH2:2][C:3](=[O:4])[NH:5][c:6]1[c:7]([O:8][CH3:9])[cH:10][cH:11][c:12]([C:15](=[O:16])[CH2:17][CH3:18])[c:13]1[OH:14]
[CH3:1][CH2:2][C:3](=[O:4])[NH:5][c:6]1[c:7]([O:8][CH3:9])[cH:10][cH:11][cH:12][c:13]1[O:14][C:15](=[O:16])[CH2:17][CH3:18]



[Parallel(n_jobs=126)]: Done 19748 tasks      | elapsed:   55.9s
[Parallel(n_jobs=126)]: Done 21798 tasks      | elapsed:   59.4s


[O:1]=[C:3]1[NH:2][CH2:4][CH2:5][S:6][c:7]2[s:8][cH:9][cH:10][c:11]21
[OH:1][N:2]=[C:3]1[CH2:4][CH2:5][S:6][c:7]2[s:8][cH:9][cH:10][c:11]21





[C@H:1]1([CH3:3])[C@@H:2]([N:18]2[CH2:17][CH:16]=[C:15]([c:14]3[c:8]([CH3:7])[cH:9][c:10]([Br:11])[cH:12][n:13]3)[CH2:20][CH2:19]2)[CH2:6][CH2:5][O:4]1
O=[C:1]1[CH:2]([CH3:3])[O:4][CH2:5][CH2:6]1.[CH3:7][c:8]1[cH:9][c:10]([Br:11])[cH:12][n:13][c:14]1[C:15]1=[CH:16][CH2:17][NH:18][CH2:19][CH2:20]1



[Parallel(n_jobs=126)]: Done 23948 tasks      | elapsed:  1.1min


[C@@H:1]12[C@:3]3([OH:12])[CH2:4][CH2:15][C@H:10]([NH:25][CH3:24])[C@H:2]4[C@:11]3([CH2:13][CH2:5][N:6]1[CH2:7][CH:8]=[CH2:9])[c:17]1[c:16]([cH:22][cH:21][c:19]([OH:20])[c:18]1[O:23]4)[CH2:14]2
O=[C:1]1[C@H:2]2[C@@:3]34[CH2:4][CH2:5][N:6]([CH2:7][CH:8]=[CH2:9])[C@@H:10]([C@:11]3([OH:12])[CH2:13][CH2:14]1)[CH2:15][c:16]1[c:17]4[c:18]([c:19]([OH:20])[cH:21][cH:22]1)[O:23]2.[CH3:24][NH2:25]



[Parallel(n_jobs=126)]: Done 26198 tasks      | elapsed:  1.1min


[O:1]=[C:3]1[NH:2][CH:14]2[CH2:13][CH:12]([c:11]3[c:4]1[cH:5][c:6]([Br:7])[c:8]([F:9])[cH:10]3)[CH2:15]2
[OH:1][N:2]=[C:3]1[c:4]2[cH:5][c:6]([Br:7])[c:8]([F:9])[cH:10][c:11]2[CH:12]2[CH2:13][CH:14]1[CH2:15]2





[c:1]1([NH2:13])[c:2]([C:3]([F:4])([F:5])[F:6])[cH:7][c:8]([CH3:9])[n:10][c:11]1[Cl:12]
O=C([c:1]1[c:2]([C:3]([F:4])([F:5])[F:6])[cH:7][c:8]([CH3:9])[n:10][c:11]1[Cl:12])[NH2:13]



[Parallel(n_jobs=126)]: Done 28548 tasks      | elapsed:  1.3min


[NH2:1][c:2]1[cH:3][c:4]([O:5][c:6]2[cH:7][cH:8][c:9]([N:10]([CH2:11][c:12]3[cH:13][cH:14][c:15]([F:16])[cH:17][cH:18]3)[C:19](=[O:20])[C:21]3([C:22]([NH2:23])=[O:24])[CH2:25][CH2:26]3)[cH:27][c:28]2[F:29])[cH:30][cH:31][n:32]1
O=C([NH2:1])[c:2]1[cH:3][c:4]([O:5][c:6]2[cH:7][cH:8][c:9]([N:10]([CH2:11][c:12]3[cH:13][cH:14][c:15]([F:16])[cH:17][cH:18]3)[C:19](=[O:20])[C:21]3([C:22]([NH2:23])=[O:24])[CH2:25][CH2:26]3)[cH:27][c:28]2[F:29])[cH:30][cH:31][n:32]1





[C@@H:1]1([c:14]2[cH:15][cH:16][cH:17][cH:18][cH:19]2)[CH2:12][c:11]2[c:6]([cH:7][cH:8][cH:9][cH:10]2)[NH:5][C:3](=[O:4])[C@H:13]1[NH2:2]
COC(=O)[C:1]1([NH2:2])[C:3](=[O:4])[NH:5][c:6]2[cH:7][cH:8][cH:9][cH:10][c:11]2[CH2:12][CH:13]1[c:14]1[cH:15][cH:16][cH:17][cH:18][cH:19]1



[Parallel(n_jobs=126)]: Done 30998 tasks      | elapsed:  1.4min


[CH3:1][C:2]([CH3:3])([CH3:4])[O:5][C:6](=[O:7])[N:8]1[CH2:9][CH2:10][C@H:12]([C@H:11]([OH:13])[CH:15]2[CH2:16][CH2:17]2)[CH2:14]1
[CH3:1][C:2]([CH3:3])([CH3:4])[O:5][C:6](=[O:7])[N:8]1[CH2:9][CH2:10][C@H:11]([CH:12]=[O:13])[CH2:14]1.[Mg+][CH:15]1[CH2:16][CH2:17]1



[Parallel(n_jobs=126)]: Done 33548 tasks      | elapsed:  1.4min


[NH2:1][c:2]1[cH:3][c:4]([O:5][c:6]2[cH:7][cH:8][c:9]([NH:10][C:11](=[O:12])[C:13]3([C:14](=[O:15])[NH:16][c:17]4[cH:18][cH:19][c:20]([F:21])[cH:22][cH:23]4)[CH2:24][CH2:25]3)[c:26]([F:27])[cH:28]2)[cH:29][cH:30][n:31]1
O=C([NH2:1])[c:2]1[cH:3][c:4]([O:5][c:6]2[cH:7][cH:8][c:9]([NH:10][C:11](=[O:12])[C:13]3([C:14](=[O:15])[NH:16][c:17]4[cH:18][cH:19][c:20]([F:21])[cH:22][cH:23]4)[CH2:24][CH2:25]3)[c:26]([F:27])[cH:28]2)[cH:29][cH:30][n:31]1

[CH2:1]([CH:2]([CH:3]1[O:4][CH2:5][CH:6]([CH2:7][CH2:8][CH2:9][CH2:10][OH:11])[O:12]1)[Cl:14])[Cl:13]
[CH2:1]=[CH:2][CH:3]1[O:4][CH2:5][CH:6]([CH2:7][CH2:8][CH2:9][CH2:10][OH:11])[O:12]1.[Cl:13][Cl:14]





[c:1]1([NH2:33])[n:2][cH:3][cH:4][c:5]([O:6][c:7]2[cH:8][cH:9][c:10]([NH:11][C:12]([c:13]3[c:14]([CH3:15])[n:16]([CH3:17])[n:18](-[c:19]4[cH:20][cH:21][cH:22][cH:23][cH:24]4)[c:25]3=[O:26])=[O:27])[cH:28][c:29]2[F:30])[c:31]1[Cl:32]
O=C([c:1]1[n:2][cH:3][cH:4][c:5]([O:6][c:7]2[cH:8][cH:9][c:10]([NH:11][C:12]([c:13]3[c:14]([CH3:15])[n:16]([CH3:17])[n:18](-[c:19]4[cH:20][cH:21][cH:22][cH:23][cH:24]4)[c:25]3=[O:26])=[O:27])[cH:28][c:29]2[F:30])[c:31]1[Cl:32])[NH2:33]



[Parallel(n_jobs=126)]: Done 36198 tasks      | elapsed:  1.5min


[O:1]=[C:2]([c:5]1[c:4]([OH:3])[cH:9][cH:8][cH:7][cH:6]1)[O:10][c:11]1[cH:12][cH:13][cH:14][cH:15][cH:16]1
[O:1]=[C:2]([O:3][c:4]1[cH:5][cH:6][cH:7][cH:8][cH:9]1)[O:10][c:11]1[cH:12][cH:13][cH:14][cH:15][cH:16]1





[c:1]1([NH2:32])[n:2][cH:3][cH:4][c:5]([O:6][c:7]2[cH:8][cH:9][c:10]([NH:11][C:12]([c:13]3[c:14]([CH3:15])[n:16]([CH3:17])[n:18](-[c:19]4[cH:20][cH:21][cH:22][cH:23][cH:24]4)[c:25]3=[O:26])=[O:27])[cH:28][c:29]2[Cl:30])[cH:31]1
O=C([c:1]1[n:2][cH:3][cH:4][c:5]([O:6][c:7]2[cH:8][cH:9][c:10]([NH:11][C:12]([c:13]3[c:14]([CH3:15])[n:16]([CH3:17])[n:18](-[c:19]4[cH:20][cH:21][cH:22][cH:23][cH:24]4)[c:25]3=[O:26])=[O:27])[cH:28][c:29]2[Cl:30])[cH:31]1)[NH2:32]



[Parallel(n_jobs=126)]: Done 38948 tasks      | elapsed:  1.7min


[C:1]1([CH3:2])([NH2:15])[CH2:3][CH2:4][N:5]([C:6]([O:7][C:8]([CH3:9])([CH3:10])[CH3:11])=[O:12])[CH2:13][CH2:14]1
O=C([C:1]1([CH3:2])[CH2:3][CH2:4][N:5]([C:6]([O:7][C:8]([CH3:9])([CH3:10])[CH3:11])=[O:12])[CH2:13][CH2:14]1)[NH2:15]





[C:1]1(=[O:2])[CH2:3][CH2:4][C:5]([CH2:6][NH:7][C:8](=[O:9])[C:10]1
C1C[O:2][C:1]2(O1)[CH2:3][CH2:4][C:5]([CH2:6][NH:7][C:8](=[O:9])[C:10]([F:11])([F:12])[F:13])([c:14]1[cH:15][cH:16][cH:17][c:18]([Cl:19])[cH:20]1)[CH2:21][CH2:22]2



[Parallel(n_jobs=126)]: Done 41798 tasks      | elapsed:  1.8min


[C@H:1]12[CH2:2][CH2:24][C@@H:23]([N:6]3[CH2:5][CH2:37][CH2:38][CH2:34]3)[CH2:41][N:40]1[CH2:39][C@@H:4]([CH2:3][c:25]1[cH:26][nH:27][c:28]3[cH:29][cH:30][cH:31][cH:32][c:33]13)[N:35]([C:7](=[O:8])[c:9]1[cH:10][c:11]([C:12]([F:13])([F:14])[F:15])[cH:16][c:17]([C:18]([F:19])([F:20])[F:21])[cH:22]1)[CH2:36]2
O=[C:1]1[CH2:2][CH2:3][C@@H:4]2[CH2:5][N:6]([C:7](=[O:8])[c:9]3[cH:10][c:11]([C:12]([F:13])([F:14])[F:15])[cH:16][c:17]([C:18]([F:19])([F:20])[F:21])[cH:22]3)[C@H:23]([CH2:24][c:25]3[cH:26][nH:27][c:28]4[cH:29][cH:30][cH:31][cH:32][c:33]34)[CH2:34][N:35]2[CH2:36]1.[CH2:37]1[CH2:38][CH2:39][NH:40][CH2:41]1





[c:1]1([NH:33][C@H:18]2[CH2:31][CH2:30][N:21]([C@H:20]3[CH2:19][CH2:17][C@@H:28]([N:29]([CH3:22])[CH:32]([CH3:24])[CH3:25])[CH2:26][C@H:23]3[CH2:27][CH2:16][CH3:15])[C:34]2=[O:35])[cH:2][cH:3][n:4][c:5]2[cH:6][cH:7][c:8]([C:9]([F:10])([F:11])[F:12])[cH:13][c:14]12
Cl[c:1]1[cH:2][cH:3][n:4][c:5]2[cH:6][cH:7][c:8]([C:9]([F:10])([F:11])[F:12])[cH:13][c:14]12.[CH3:15][CH2:16][CH2:17][C@@H:18]1[CH2:19][C@H:20]([N:21]([CH3:22])[CH:23]([CH3:24])[CH3:25])[CH2:26][CH2:27][C@@H:28]1[N:29]1[CH2:30][CH2:31][C@H:32]([NH2:33])[C:34]1=[O:35]



[Parallel(n_jobs=126)]: Done 44748 tasks      | elapsed:  1.9min


[C:1]([NH:3][C@@H:2]([C:5]([CH3:6])([CH3:7])[CH3:35])[C:28]([N:26]([CH3:27])[C@H:30](/[CH:21]=[C:19](/[C:17]([O:16][CH2:15][CH3:14])=[O:18])[CH3:20])[CH:23]([CH3:24])[CH3:25])=[O:29])(=[O:13])[C@H:22]([NH:31][CH3:4])[C:32]([c:8]1[cH:9][cH:10][cH:11][s:12]1)([CH3:33])[CH3:34]
O[C:1]([C@@H:2]([NH:3][CH3:4])[C:5]([CH3:6])([CH3:7])[c:8]1[cH:9][cH:10][cH:11][s:12]1)=[O:13].[CH3:14][CH2:15][O:16][C:17](=[O:18])/[C:19]([CH3:20])=[CH:21]/[C@H:22]([CH:23]([CH3:24])[CH3:25])[N:26]([CH3:27])[C:28](=[O:29])[C@@H:30]([NH2:31])[C:32]([CH3:33])([CH3:34])[CH3:35]

[CH2:1]=[CH:2][C:3]([NH:4][C:6]([CH3:5])([CH3:7])[CH3:8])=[O:9]
[CH2:1]=[CH:2][C:3]#[N:4].[CH3:5][C:6]([CH3:7])([CH3:8])[OH:9]





[CH3:1][C:2]([CH3:3])([CH3:4])[NH:6][C:7](=[O:5])[CH:8]1[CH2:9][NH:10][CH2:11][CH2:12][NH:13]1
[CH3:1][C:2]([CH3:3])([CH3:4])[OH:5].[N:6]#[C:7][CH:8]1[CH2:9][NH:10][CH2:11][CH2:12][NH:13]1



[Parallel(n_jobs=126)]: Done 47798 tasks      | elapsed:  2.0min
50016it [02:02, 409.37it/s]
[Parallel(n_jobs=126)]: Done 50016 out of 50016 | elapsed:  2.0min finished


In [6]:
# Drop failed reactions (None) and flatten the per-reaction lists in a single
# linear pass; sum(list_of_lists, []) copies the accumulator at every step.
frag_mol_list_raw = [
    frag_record
    for per_reaction in frag_mol_list_raw
    if per_reaction is not None
    for frag_record in per_reaction
]

# Each record is (frag_mol, source_mol, target_mol); convert all three to
# SMILES, stripping atom-map numbers from the fragment only.
frag_source_target_list = [
    (
        Chem.MolToSmiles(remove_am(frag_mol)),
        Chem.MolToSmiles(source_mol),
        Chem.MolToSmiles(target_mol),
    )
    for frag_mol, source_mol, target_mol in frag_mol_list_raw
]

print(f"valid frag mol:{len(frag_source_target_list)}")

valid frag mol:85544


In [7]:
frag_smiles_list = [frag_smi for frag_smi, _source_smi, _target_smi in frag_source_target_list]
# Split every fragment SMILES on "." into its disconnected components and
# flatten in one linear pass (sum(..., []) is quadratic in the list length).
frag_smiles_list_splitted = [
    component
    for frag_smi in frag_smiles_list
    for component in frag_smi.split(".")
]

# We only consider the single-point edit case

def canonical_smiles_per_task(smi):
    """
    Round-trip a SMILES string through RDKit without sanitization.

    ``sanitize=False`` prevents ``MolFromSmiles`` from automatically turning
    ``C`` into ``c``.
    """
    unsanitized_mol = Chem.MolFromSmiles(smi, sanitize=False)
    return Chem.MolToSmiles(unsanitized_mol)

frag_smiles_list_splitted = [canonical_smiles_per_task(one) for one in frag_smiles_list_splitted]



from collections import Counter
# most_common() orders by descending count and keeps first-seen order for ties,
# matching a stable sort of the counts with reverse=True.
frag_smiles_count = dict(Counter(frag_smiles_list_splitted).most_common())

# Context managers guarantee both files are flushed and closed.
with open(sv_path+"frag_smiles_count.json", "w") as count_file:
    json.dump(frag_smiles_count, count_file, indent=4)

# One fragment SMILES per line, most frequent first (no trailing newline).
with open(sv_path+"frag_smiles.txt","w") as f:
    f.write("\n".join(frag_smiles_count))

In [8]:
# Build the frame in one shot from the (frag, source, target) tuples; filling
# it cell by cell with .loc re-grows the frame for every new row.
data_with_frag = pd.DataFrame(frag_source_target_list, columns=["frag", "source", "target"])
# Write next to the other outputs so switching sv_path switches this file too.
data_with_frag.to_csv(sv_path + "data_with_frag.csv")

In [28]:
# Print the index of every entry in frag_smiles_list that contains the queried
# fragment among its "."-separated components (each index printed once).
query_frag_smiles = "O=c1*c(=O)c2ccccc12"
for idx, one in enumerate(frag_smiles_list):
    if any(canonical_smiles_per_task(smi) == query_frag_smiles for smi in one.split(".")):
        print(idx)

368
409
948
1296
1541
1637
2705
2722
3458
3956
4383
5127
6101
7453
7578
7878
8203
8308
8314
8945
9409
10583
10717
11437
11523
12776
13048
13207
13269
14570
15651
18407
18759
20203
20214
20754
20772
20866
21411
22325
23671
24376
24891
24958
25516
26657
26658
26702
26832
26965
28561
29157
29949
30928
31018
31514
31928
32191
32337
32393
32902
33254
35595
37185
37888
38493
41887
42164
42753
42853
43519
44912
45478
46905
47963
48499
49631
49632
50971
51249
51293
51665
51803
52357
52931
53683
53718
53780
53903
54248
54551
56198
57092
57235
57340
57729
59882
60643
61188
61217
61373
62284
63336
63382
63506
64281
64500
65007
65010
65933
66860
66863
68895
68912
69096
73058
73360
75295
75622
75834
76027
76348
76591
77480
77580
77735
78628
79306
79860
80192
80202
80461
80878
81114
81204
81896
82106
82121
82367
82605
82647
84780
