In [189]:
import logging
import os.path
import networkx as nx
from PIL import Image
import numpy as np
import re
import selfies as sf
import sys
import time
import argparse
import torch
from torch.utils.data import DataLoader
from rdkit import Chem
from torch.utils.data import Dataset
from typing import Dict, List, Tuple
from utils.chem_utils import ATOM_FDIM, BOND_FDIM, get_atom_features_sparse, get_bond_features
from utils.rxn_graphs import RxnGraph
from utils.data_utils import get_graph_features_from_smi, load_vocab, make_vocab, \
    tokenize_selfies_from_smiles, tokenize_smiles, S2SDataset, G2SDataset

In [168]:
# Sum the entries of ATOM_FDIM as imported from utils.chem_utils
# (presumably the per-field sizes of the atom feature vector — confirm against that module).
sum(ATOM_FDIM)

105

In [192]:
from rdkit import Chem
from rdkit.Chem import Draw
import time
from rdkit.Chem import AllChem
def DrawSMILES(smiles, output_dir="./images"):
    """Render a SMILES string to a timestamped PNG file.

    Parameters
    ----------
    smiles: str
        SMILES string to parse with RDKit and draw.
    output_dir: str, default "./images"
        Directory that receives the PNG; it is created when missing.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles``.
    """
    import os  # local import: keeps this cell runnable on its own

    print("SMILES:", smiles)
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with a clear message instead of inside the drawing call.
        raise ValueError(f"Invalid SMILES, cannot draw: {smiles!r}")

    os.makedirs(output_dir, exist_ok=True)
    timestamp = time.time()
    image_path = os.path.join(output_dir, "output" + str(timestamp) + ".png")
    Draw.MolToFile(mol, image_path)
    # Report the path that was actually written (the old message omitted the
    # "output" prefix and the directory).
    print("图片已保存为" + image_path)
DrawSMILES("CC(C)[C@@H](N)C")

SMILES: CC(C)[C@@H]C


START /usr/bin/eog "/tmp/tmpg34kxo4x.PNG"





In [183]:
# Two-component SMILES ("." separates the molecules) used to probe the graph featurizer.
smiles = "CC1=CC(=CC=C1)N(C=C(COC2=CC=C(C=C2)C(C)C)N)N.C1[C@@H]2[C@H]([C@@H]3[C@H]1C(=O)OC3=O)C(=O)OC2=O"
# Show element [2] of the returned tuple; presumably one (start, length) atom scope
# per molecule — TODO confirm against utils.data_utils.get_graph_features_from_smi.
get_graph_features_from_smi((0,smiles,False))[2]

array([[ 0, 24],
       [24, 17]], dtype=int32)

In [114]:
# Hand-built replacement for parsed command-line arguments; handed to G2SDataset below.
args = argparse.Namespace(
    train_bin="/home/chenlidong/poly2SMILES/preprocessed/sample_g2s_series_rel_smiles_smiles/train_0.npz",
    vocab_file="/home/chenlidong/poly2SMILES/preprocessed/sample_g2s_series_rel_smiles_smiles/vocab_smiles.txt",
    verbose=True,
    batch_type="tokens",
    train_batch_size=64,
    enable_amp=False,
    compute_graph_distance=True,
    task="reaction_prediction",
)

In [115]:
# Build the training dataset from the preprocessed .npz file named in args.train_bin.
train_dataset = G2SDataset(args, file=args.train_bin)

In [116]:
# List the instance attributes of the dataset object (vars() returns its __dict__).
var = vars(train_dataset)
print(var.keys())

dict_keys(['args', 'a_scopes', 'b_scopes', 'a_features', 'b_features', 'a_graphs', 'b_graphs', 'src_token_ids', 'src_lens', 'tgt_token_ids', 'tgt_lens', 'data_indices', 'batch_sizes', 'batch_starts', 'batch_ends', 'vocab', 'vocab_tokens', 'a_scopes_indices', 'b_scopes_indices', 'a_features_indices', 'b_features_indices', 'data_size'])


In [117]:
# Inspect batch_ends on the freshly constructed dataset (train_dataset.batch() is only called further down).
print(var['batch_ends'])

[]


In [118]:
# Prepare the dataset for iteration in three steps whose order matters:
# sort, shuffle inside buckets of 1000, then cut into batches.
# (Sort key and the meaning of batch_type="tokens" live in G2SDataset — not visible here.)
train_dataset.sort()
train_dataset.shuffle_in_bucket(bucket_size=1000)
train_dataset.batch(
            batch_type=args.batch_type,
            batch_size=args.train_batch_size
        )

In [119]:
# The loader fetches one dataset item at a time (batch_size=1) and the collate_fn
# unwraps the single-element list, so each iteration yields that item unchanged.
# Presumably every item is already a full batch produced by train_dataset.batch() — TODO confirm.
train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=1,
        shuffle=True,
        collate_fn=lambda _batch: _batch[0],
        pin_memory=True
    )

In [171]:
# for batch_idx, batch in enumerate(train_loader):
#     print(batch.agraph)
#     print(batch.fnode)
#     break

In [113]:
# NOTE(review): this cell's execution count (113) is lower than the cells above it (114-119),
# so the output shown was produced by an earlier kernel state — re-run top to bottom to confirm.
var = vars(train_dataset)
print(var['batch_ends'])

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100]


In [72]:
def scale_value(value, min_value, max_value, scale=9.999):
    """Linearly map ``value`` from ``[min_value, max_value]`` onto ``[0, scale]``.

    Parameters
    ----------
    value:
        Number to rescale. Values outside the range are extrapolated, not clamped.
    min_value, max_value:
        Bounds of the source range; they must differ, otherwise the division
        below is by zero.
    scale: float, default 9.999
        Upper end of the target range. The default keeps the original
        hard-coded factor (presumably chosen so the result stays below 10 and
        has a single integer digit — confirm with the tokenizer's expectations).

    Returns
    -------
    The rescaled value.
    """
    scaled_value = (value - min_value) / (max_value - min_value) * scale
    return scaled_value

def unscale_value(scaled_value, min_value, max_value, scale=9.999):
    """Map a value from ``[0, scale]`` back onto ``[min_value, max_value]``.

    Parameters
    ----------
    scaled_value:
        Number expressed on the ``[0, scale]`` range.
    min_value, max_value:
        Bounds of the original range.
    scale: float, default 9.999
        Upper end of the scaled range; must be the factor that was used when
        the value was scaled. The default keeps the original hard-coded factor.

    Returns
    -------
    The value expressed on the original range.
    """
    value = scaled_value / scale * (max_value - min_value) + min_value
    return value

# Known minimum and maximum of the raw values; used as the range bounds
# passed to scale_value / unscale_value.
min_value = 0.6870349645614624
max_value = 2.5119409561157227

In [None]:
# Convert one scaled number back to the raw range defined by min_value/max_value.
unscale_value(0.6652,min_value,max_value)

: 

In [None]:
def tokenize_smiles(smi: str) -> str:
    """Split a SMILES string into tokens and return them joined by single spaces.

    A bracket expression ``[...]``, ``Br``, ``Cl`` and a two-digit ring closure
    ``%NN`` each form one token; every other recognised symbol is its own token.

    Raises
    ------
    AssertionError
        If the tokens do not concatenate back to ``smi``, i.e. the input
        contains a character the pattern does not cover.
    """
    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    tokens = re.findall(pattern, smi)
    # Round-trip check: anything the pattern skipped makes the rebuilt string shorter.
    rebuilt = "".join(tokens)
    assert smi == rebuilt, f"Tokenization mismatch. smi: {smi}, tokens: {tokens}"

    return " ".join(tokens)

def canonicalize_smiles(smiles, remove_atom_number=False, trim=True, suppress_warning=False):
    """Return the RDKit canonical (isomeric) SMILES for ``smiles``.

    Parameters
    ----------
    smiles: str
        Input SMILES string.
    remove_atom_number: bool, default False
        Clear the ``molAtomMapNumber`` property on every atom before writing.
    trim: bool, default True
        Replace molecules with fewer than two heavy atoms by ``"CC"``.
    suppress_warning: bool, default False
        Skip the log message emitted when ``trim`` replaces a molecule.

    Returns
    -------
    str
        The canonical SMILES, ``"CC"`` for a trimmed molecule, or ``""`` when
        RDKit cannot parse the input.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return ""

    if trim and mol.GetNumHeavyAtoms() < 2:
        if not suppress_warning:
            logging.info(f"Problematic smiles: {smiles}, setting it to 'CC'")
        return "CC"          # TODO: hardcode to ignore

    if remove_atom_number:
        for atom in mol.GetAtoms():
            atom.ClearProp('molAtomMapNumber')
    return Chem.MolToSmiles(mol, isomericSmiles=True)

def escape_special_characters(string):
    """Return ``string`` with every ``[``, ``]`` and ``*`` preceded by a backslash."""
    escaped_string = string
    for special in ('[', ']', '*'):
        escaped_string = escaped_string.replace(special, "\\" + special)
    return escaped_string


: 

In [None]:
# Tokenize one polymer repeat unit ("*" marks the open ends) and print the result.
text = "*CCCc1nccnc1CCCN1C(=O)[C@@H]2[C@@H](CC(C)=C3CO[C@H]4[C@H]5OCC6=C(C)C[C@@H]7C(=O)N(*)C(=O)[C@@H]7[C@@H]6[C@H]5C[C@H]4[C@@H]32)C1=O"
print(tokenize_smiles(text))

: 

In [None]:
"* C C C c 1 n c c n c 1 C C C N 1 C ( = O ) [C@@H] 2 [C@@H] ( C C ( C ) = C 3 C O [C@H] 4 [C@H] 5 O C C 6 = C ( C ) C [C@@H] 7 C ( = O ) N ( * ) C ( = O ) [C@@H] 7 [C@@H] 6 [C@H] 5 C [C@H] 4 [C@@H] 3 2 ) C 1 = O"
"* C C C c 1 n c c n c 1 C C C N 1 C ( = O ) [C@H] 2 [C@@H] 3 C ( = C ( C ) C [C@H] 2 C 1 = O ) C O [C@@H] 1 [C@H] 3 C [C@H] 2 [C@@H] 1 O C C 1 = C ( C ) C [C@@H] 3 C ( = O ) N ( * ) C ( = O ) [C@@H] 3 [C@@H] 1 2"
"* C C C c 1 n c c n c 1 C C C N 1 C ( = O ) [C@@H] 2 [C@@H] ( C C ( C ) = C 3 C O [C@H] 4 [C@H] 5 O C C 6 = C ( C ) C [C@@H] 7 C ( = O ) N ( * ) C ( = O ) [C@@H] 7 [C@@H] 6 [C@H] 5 C [C@H] 4 [C@@H] 3 2 ) C 1 = O"

: 

In [None]:
# Canonicalize, tokenize and backslash-escape one repeat unit, then print the result.
smiles = "*CCCc1nccnc1CCCN1C(=O)c2ccc3c4c(Oc5cc(C(C)(C)C)cc(C(C)(C)C)c5)cc5c6c(ccc(c7c(Oc8cc(C(C)(C)C)cc(C(C)(C)C)c8)cc(c2c37)C1=O)c64)C(=O)N(*)C5=O"
print(escape_special_characters(tokenize_smiles(canonicalize_smiles(smiles))))


: 

In [None]:
def numEmbedding(num, precision=4):
    """Encode a decimal number as space-separated digit/position tokens.

    Each digit becomes ``_<digit>_<position>_`` where the position is the
    power of ten it stands for (``0`` for units, ``-1`` for tenths, ...).
    A sign becomes ``_+_`` / ``_-_`` and the decimal point ``_._``.
    The fractional part is right-padded with zero tokens up to ``precision``
    places, so ``"1.11"`` yields
    ``_1_0_ _._ _1_-1_ _1_-2_ _0_-3_ _0_-4_``.

    Parameters
    ----------
    num:
        Number (or its string form) to encode; converted with ``str``.
    precision: int, default 4
        Minimum number of fractional digits in the output. Longer fractions
        are kept as they are.

    Returns
    -------
    str
        The tokens joined by single spaces.

    Raises
    ------
    ValueError
        If ``str(num)`` is not a plain decimal number (for example ``"nan"``
        or scientific notation such as ``"1e-05"``).
    """
    text = str(num)
    matched = re.fullmatch(r"\s*?(\+|-)?(\d+)(\.)?(\d+)?\s*", text)
    if matched is None:
        # Previously an unparsable value silently produced meaningless padding tokens.
        raise ValueError(f"Cannot tokenize {text!r}: expected a plain decimal number")

    sign, units, dot, decimals = matched.groups()
    decimals = decimals or ""

    tokens = []
    if sign:
        tokens.append(f"_{sign}_")

    # Integer digits, most significant first; the last one has position 0.
    highest = len(units) - 1
    tokens.extend(f"_{digit}_{highest - offset}_" for offset, digit in enumerate(units))

    # Emit the point when the input had one, or when padding is about to add
    # fractional digits to an integer input.
    if dot or len(decimals) < precision:
        tokens.append("_._")

    tokens.extend(f"_{digit}_-{position}_" for position, digit in enumerate(decimals, 1))

    # Pad from the first missing fractional position. The old padding derived
    # the position from the token count, which was only right for inputs with
    # exactly one integer digit, a decimal point and no sign.
    tokens.extend(f"_0_-{position}_" for position in range(len(decimals) + 1, precision + 1))

    return " ".join(tokens)

: 

In [None]:
# Worked example of the target encoding: scale a raw value into the range set by
# min_value/max_value, round to 4 decimals, then turn the digits into tokens.
val1 = 1.0978717803955078
scale_num = scale_value(val1,min_value,max_value)
scale_num = round(scale_num, 4)
scale_num_embedding = numEmbedding(scale_num)
# Bare last expression: the notebook echoes the token string.
scale_num_embedding

: 

In [None]:
import os
import pandas as pd
import time
# Folder that holds the input CSV files
folder_path = "/home/chenlidong/data/800w_store/sample"  # replace with the actual folder path


# Row counters incremented by src() and tgt() below.
# NOTE: a `global` statement at module level has no effect; only the
# declarations inside the functions matter.
global index_src
global index_tgt
index_src = 0
index_tgt = 0
def src(val1, val2):
    """Build one source line from two SMILES strings.

    Each input is canonicalized and tokenized; the two token strings are
    joined with ``" . "``. Every call also increments the module-level
    ``index_src`` counter and prints it as a progress trace.
    """
    global index_src
    index_src += 1
    print(f"src{index_src}")
    tokenized = [tokenize_smiles(canonicalize_smiles(smi)) for smi in (val1, val2)]
    return " . ".join(tokenized)
def tgt(val1, val2):
    """Build one target line: number tokens followed by a tokenized SMILES.

    ``val1`` is scaled with the module-level ``min_value``/``max_value``,
    rounded to 4 decimals and encoded by ``numEmbedding``; ``val2`` is
    canonicalized and tokenized. Every call also increments the module-level
    ``index_tgt`` counter and prints it as a progress trace.
    """
    global index_tgt
    index_tgt += 1
    print(f"tgt{index_tgt}")
    scaled = round(scale_value(val1, min_value, max_value), 4)
    number_tokens = numEmbedding(scaled)
    smiles_tokens = tokenize_smiles(canonicalize_smiles(val2))
    return number_tokens + " " + smiles_tokens


# Convert every CSV in folder_path into (source, target) token lines, then
# shuffle all rows together and write the two columns to separate files.
frames = []
# sorted(): os.listdir returns names in arbitrary order, and the seeded
# shuffle below is only repeatable when the rows arrive in a fixed order.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".csv"):  # only process CSV files
        continue
    file_path = os.path.join(folder_path, filename)
    # Read the file (its first line is skipped) and shuffle its rows;
    # seeded so the whole pipeline can be reproduced.
    data = pd.read_csv(file_path, skiprows=1)
    data = data.sample(frac=1, random_state=12138).reset_index(drop=True)

    df_cur = pd.DataFrame(index=None)
    # .iloc makes the positional column access explicit; plain row[8] on a
    # string-labelled row relies on a deprecated positional fallback.
    df_cur[0] = data.apply(lambda row: src(row.iloc[8], row.iloc[9]), axis=1)
    df_cur[1] = data.apply(lambda row: tgt(row.iloc[1], row.iloc[0]), axis=1)
    frames.append(df_cur)

if not frames:
    raise FileNotFoundError(f"No .csv files found in {folder_path}")

# Concatenate once: concatenating inside the loop re-copies all earlier rows
# on every iteration.
df_all = pd.concat(frames)

shuffled_df = df_all.sample(frac=1, random_state=12138)
shuffled_df[0].to_csv("/home/chenlidong/data/800w_process/sample/src.csv",index=False, header=False)
shuffled_df[1].to_csv("/home/chenlidong/data/800w_process/sample/tgt.csv",index=False, header=False)

        # df_src = pd.DataFrame(index=None)
        # df_tgt = pd.DataFrame(index=None)
        # df_tgt = data.apply(lambda row: tgt(row[1], row[0]), axis=1)
        # df_src = data.apply(lambda row: src(row[8], row[9]), axis=1)
        # df_src.to_csv("/home/chenlidong/data/800w_process/sample/src.csv",mode='a',index=False, header=False)
        # df_tgt.to_csv("/home/chenlidong/data/800w_process/sample/tgt.csv",mode='a',index=False, header=False)
        # print(df_tgt)
        # print(df_src)
        # time.sleep(100)






: 

In [None]:
import pandas as pd
# Load the training table; despite the .txt extension it is parsed with
# read_csv defaults (comma-separated, first line used as the header).
df = pd.read_csv("/home/chenlidong/data/grap2smiles/train.txt")

def src(val1, val2):
    """Build one source line from two SMILES strings.

    Each input is canonicalized and tokenized; the two token strings are
    joined with ``" . "``.
    """
    tokenized = [tokenize_smiles(canonicalize_smiles(smi)) for smi in (val1, val2)]
    return " . ".join(tokenized)
def tgt(val1, val2):
    """Build one target line: number tokens followed by a tokenized SMILES.

    ``val1`` is scaled with the module-level ``min_value``/``max_value``,
    rounded to 4 decimals and encoded by ``numEmbedding``; ``val2`` is
    canonicalized and tokenized.
    """
    scaled = round(scale_value(val1, min_value, max_value), 4)
    number_tokens = numEmbedding(scaled)
    smiles_tokens = tokenize_smiles(canonicalize_smiles(val2))
    return number_tokens + " " + smiles_tokens


# Turn the loaded table into one-column source/target frames and save them.
df_src = pd.DataFrame(index=None)
df_tgt = pd.DataFrame(index=None)
# .iloc makes the positional column access explicit; plain row[1] on a
# string-labelled row relies on a deprecated positional fallback.
df_tgt[1] = df.apply(lambda row: tgt(row.iloc[1], row.iloc[0]), axis=1)
df_src[0] = df.apply(lambda row: src(row.iloc[8], row.iloc[9]), axis=1)
# Both frames come from the same table, so they must be saved under the same
# split name; the source file was previously written as "test_src.csv".
# NOTE(review): the split name follows the table loaded into `df` — adjust
# both paths together when processing another split.
df_src.to_csv("./data/Ma/train_src.csv",index=False)
df_tgt.to_csv("./data/Ma/train_tgt.csv",index=False)
# print(src("C1=CC(=NC=C1C2=CN=C3N2N=C(C=C3)N)N","C1C[C@H]2[C@@H](C[C@@H]1[C@H]3CC[C@@H]4[C@H](C3)C(=O)OC4=O)C(=O)OC2=O"))
# print(tgt(1.2131241123123124,"*c1ccc(-c2cnc3ccc(N4C(=O)[C@H]5CC[C@@H]([C@H]6CC[C@H]7C(=O)N(*)C(=O)[C@H]7C6)C[C@H]5C4=O)nn23)cn1"))

: 

In [None]:
import pandas as pd
# Load the test table (rebinding `df`) and show its column labels.
df = pd.read_csv("/home/chenlidong/data/grap2smiles/test.txt")
df.columns

: 

In [None]:
# NOTE(review): `df2` is not assigned in any cell visible in this notebook; on a fresh
# kernel this raises NameError unless it is defined elsewhere — confirm or remove.
print(df2.iloc[0][0])

: 

In [None]:
# Spot-check the number tokenizer on a short string input.
numEmbedding("1.11")

: 

In [None]:
# Source/target file pair for each data split.
train_src = "/home/chenlidong/Graph2SMILES/data/Ma/train_src.csv"
train_tgt = "/home/chenlidong/Graph2SMILES/data/Ma/train_tgt.csv"
val_src = "/home/chenlidong/Graph2SMILES/data/Ma/val_src.csv"
val_tgt = "/home/chenlidong/Graph2SMILES/data/Ma/val_tgt.csv"
test_src = "/home/chenlidong/Graph2SMILES/data/Ma/test_src.csv"
test_tgt = "/home/chenlidong/Graph2SMILES/data/Ma/test_tgt.csv"
# Split name -> list of (source, target) path pairs, the structure handed to make_vocab.
fns = {
    "train": [(train_src, train_tgt)],
    "val": [(val_src, val_tgt)],
    "test": [(test_src, test_tgt)]
}
# Write the vocabulary to ./vocab.txt.
# tokenized=True presumably marks the files as already space-tokenized — TODO confirm in utils.data_utils.
make_vocab(
    fns=fns,
    vocab_file="./vocab.txt",
    tokenized=True
)

: 

In [None]:
# Symbols and their counts, as parallel lists.
names = ['*', 'C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'H', 'As', 'Al', 'I', 'B', 'Sb', 'Sn', 'Se', 'Ge', 'In', 'Pb', 'Te', 'Bi']
frequencies = [8205087, 8205087, 8205087, 8205087, 1913804, 422611, 133356, 23862, 302380, 314574, 1358, 85165, 582, 97262, 2910, 2328, 2328, 10282, 1746, 194, 194, 582, 388]

# Pair every symbol with its count.
combined_list = list(zip(names, frequencies))

# Order the pairs by count, largest first; equal counts keep their input order.
sorted_list = sorted(combined_list, key=lambda pair: pair[1], reverse=True)

# Split the ordered pairs back into two parallel lists.
sorted_names = [name for name, _count in sorted_list]
sorted_freq = [count for _name, count in sorted_list]

print(sorted_names)


: 

In [18]:
import numpy as np
from rdkit import Chem
from typing import List


# Symbols for different atoms
# The position of a symbol in this list is its feature index; the last two
# entries are the '*' placeholder and the 'unk' fallback.
ATOM_LIST = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe',
             'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se', 'Ti',
             'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb',
             'W', 'Ru', 'Nb', 'Re', 'Te', 'Rh', 'Ta', 'Tc', 'Ba', 'Bi', 'Hf', 'Mo', 'U', 'Sm', 'Os', 'Ir',
             'Ce', 'Gd', 'Ga', 'Cs', '*', 'unk']
ATOM_DICT = {symbol: i for i, symbol in enumerate(ATOM_LIST)}

# Every category list below is paired with a value -> index lookup dict;
# the index is the position of the value in its list.

# Atom degrees 0 .. MAX_NB - 1.
MAX_NB = 10
DEGREES = list(range(MAX_NB))
HYBRIDIZATION = [Chem.rdchem.HybridizationType.SP,
                 Chem.rdchem.HybridizationType.SP2,
                 Chem.rdchem.HybridizationType.SP3,
                 Chem.rdchem.HybridizationType.SP3D,
                 Chem.rdchem.HybridizationType.SP3D2]
HYBRIDIZATION_DICT = {hb: i for i, hb in enumerate(HYBRIDIZATION)}

FORMAL_CHARGE = [-1, -2, 1, 2, 0]
FC_DICT = {fc: i for i, fc in enumerate(FORMAL_CHARGE)}

VALENCE = [0, 1, 2, 3, 4, 5, 6]
VALENCE_DICT = {vl: i for i, vl in enumerate(VALENCE)}

# NOTE(review): 2 is absent from this list, so an atom with two hydrogens is
# not found in NUM_Hs_DICT and gets the fallback index used by
# get_atom_features_sparse (4, the same index as 5 hydrogens). Changing the
# list would change the feature encoding — confirm whether this is intended.
NUM_Hs = [0, 1, 3, 4, 5]
NUM_Hs_DICT = {nH: i for i, nH in enumerate(NUM_Hs)}

CHIRAL_TAG = [Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
              Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
              Chem.rdchem.ChiralType.CHI_UNSPECIFIED]
CHIRAL_TAG_DICT = {ct: i for i, ct in enumerate(CHIRAL_TAG)}

# CIP labels stored as strings; "None" is the literal string used when an atom has no label.
RS_TAG = ["R", "S", "None"]
RS_TAG_DICT = {rs: i for i, rs in enumerate(RS_TAG)}

# Bond types; index 0 (None) stands for "no bond".
BOND_TYPES = [None,
              Chem.rdchem.BondType.SINGLE,
              Chem.rdchem.BondType.DOUBLE,
              Chem.rdchem.BondType.TRIPLE,
              Chem.rdchem.BondType.AROMATIC]
# Bond order as a float -> bond type (1.5 is aromatic).
BOND_FLOAT_TO_TYPE = {
    0.0: BOND_TYPES[0],
    1.0: BOND_TYPES[1],
    2.0: BOND_TYPES[2],
    3.0: BOND_TYPES[3],
    1.5: BOND_TYPES[4],
}

BOND_STEREO = [Chem.rdchem.BondStereo.STEREOE,
               Chem.rdchem.BondStereo.STEREOZ,
               Chem.rdchem.BondStereo.STEREONONE]

# Difference between two bond orders -> class index.
BOND_DELTAS = {-3: 0, -2: 1, -1.5: 2, -1: 3, -0.5: 4, 0: 5, 0.5: 6, 1: 7, 1.5: 8, 2: 9, 3: 10}
BOND_FLOATS = [0.0, 1.0, 2.0, 3.0, 1.5]

RXN_CLASSES = list(range(10))

# ATOM_FDIM = len(ATOM_LIST) + len(DEGREES) + len(FORMAL_CHARGE) + len(HYBRIDIZATION) \
#             + len(VALENCE) + len(NUM_Hs) + 1
# Number of categories of each atom feature, in the order the featurizer emits
# them; the trailing 2 is the binary aromaticity flag.
ATOM_FDIM = [len(ATOM_LIST), len(DEGREES), len(FORMAL_CHARGE), len(HYBRIDIZATION), len(VALENCE),
             len(NUM_Hs), len(CHIRAL_TAG), len(RS_TAG), 2]
# BOND_FDIM = 6
BOND_FDIM = 9
BINARY_FDIM = 5 + BOND_FDIM
INVALID_BOND = -1

In [19]:
def get_atom_features_sparse(atom: Chem.Atom, rxn_class: int = None, use_rxn_class: bool = False) -> List[int]:
    """Encode an RDKit atom as a list of categorical feature indices.

    The first entry is the atom symbol's index in ``ATOM_DICT`` (the ``"unk"``
    index for symbols that are not listed). For an ordinary atom it is
    followed by the indices for degree, formal charge, hybridization, total
    valence, total hydrogen count, chiral tag, CIP label and the aromaticity
    flag. For a ``"*"`` or ``"unk"`` symbol those slots are filled with the
    sentinel ``999999999`` instead.

    Parameters
    ----------
    atom: Chem.Atom,
        Atom object from RDKit
    rxn_class: int, None
        Reaction class appended as the last entry when ``use_rxn_class`` is set
    use_rxn_class: bool, default False,
        Whether the output carries one extra slot for the reaction class

    Returns
    -------
    List[int]
        ``len(ATOM_FDIM)`` entries, plus one when ``use_rxn_class`` is True.
    """
    symbol = atom.GetSymbol()
    features = [ATOM_DICT.get(symbol, ATOM_DICT["unk"])]

    if symbol in ("*", "unk"):
        # Placeholder atom: pad every remaining slot (including the reaction
        # class slot, when requested) with the sentinel value.
        n_padding = len(ATOM_FDIM) if use_rxn_class else len(ATOM_FDIM) - 1
        features += [999999999] * n_padding
        return features

    degree = atom.GetDegree()
    features += [
        degree if degree in DEGREES else 9,
        # Each lookup falls back to a fixed index when the value is not listed.
        FC_DICT.get(atom.GetFormalCharge(), 4),
        HYBRIDIZATION_DICT.get(atom.GetHybridization(), 4),
        VALENCE_DICT.get(atom.GetTotalValence(), 6),
        NUM_Hs_DICT.get(atom.GetTotalNumHs(), 4),
        CHIRAL_TAG_DICT.get(atom.GetChiralTag(), 2),
        RS_TAG_DICT.get(atom.GetPropsAsDict().get("_CIPCode", "None"), 2),
        int(atom.GetIsAromatic()),
    ]

    if use_rxn_class:
        features.append(rxn_class)

    return features

In [21]:
# Featurize the "*" placeholder atom: only the symbol index is real, the
# remaining slots are the 999999999 sentinel.
get_atom_features_sparse(Chem.Atom("*"), use_rxn_class=False, rxn_class=0)

[63,
 999999999,
 999999999,
 999999999,
 999999999,
 999999999,
 999999999,
 999999999,
 999999999]