In [3]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import rdRGroupDecomposition as rdRGD
from rdkit.Chem.Draw import rdMolDraw2D
from IPython.display import SVG
import pandas as pd
import json
import numpy as np

In [136]:
def extract_double_bond_positions(mol):
    """Collect the atom-index pair of every double bond in a molecule.

    Parameters:
    mol : rdkit.Chem.Mol
        Molecule whose bonds are inspected.

    Returns:
    list of tuples
        One ``(begin_atom_idx, end_atom_idx)`` tuple per double bond,
        in the order the bonds are iterated.
    """
    # A bond counts as double when its numeric bond order is exactly 2.0.
    return [
        (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
        for bond in mol.GetBonds()
        if bond.GetBondTypeAsDouble() == 2.0
    ]

def print_mols(mols,sub_img_size):
    """Render each molecule as an SVG annotated with atom and bond indices.

    Parameters:
    mols : iterable of rdkit.Chem.Mol
        Molecules to draw, one drawing per molecule.
    sub_img_size : tuple
        Unpacked into ``MolDraw2DSVG(*sub_img_size)``, i.e. (width, height)
        of each individual drawing.

    Returns:
    list of IPython.display.SVG
        One SVG object per input molecule, in input order.
    """
    annotated_svgs = []
    for mol in mols:
        drawer = rdMolDraw2D.MolDraw2DSVG(*sub_img_size)
        options = drawer.drawOptions()
        options.addBondIndices = True
        # The option is named `addAtomIndices`; the former `atomIndices`
        # is not a MolDrawOptions field, so atom indices were never drawn.
        options.addAtomIndices = True
        drawer.DrawMolecule(mol)
        drawer.FinishDrawing()
        annotated_svgs.append(SVG(drawer.GetDrawingText()))
    return annotated_svgs

def merge_svg(svg_list, svg_per_row, offset_x, offset_y):
    """Tile several SVG drawings into one SVG laid out as a grid.

    Parameters:
    svg_list : sequence
        Objects exposing their SVG markup through a ``data`` attribute.
    svg_per_row : int
        Number of tiles placed on each row.
    offset_x, offset_y : number
        Horizontal / vertical step between tiles; also used to size the
        merged canvas.

    Returns:
    IPython.display.SVG
        The merged drawing, tiles filled row by row from the top-left.
    """
    # Ceiling division: rows needed to hold every tile.
    row_count = (len(svg_list) + svg_per_row - 1) // svg_per_row
    width = svg_per_row * offset_x
    height = row_count * offset_y

    # Outer container first, then one translated <g> group per tile.
    pieces = [
        f'<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 {width} {height}" width="{width}" height="{height}">\n'
    ]
    for index, svg in enumerate(svg_list):
        row, column = divmod(index, svg_per_row)
        x_position = column * offset_x
        y_position = row * offset_y
        pieces.append(f'<g transform="translate({x_position}, {y_position})">\n')
        pieces.append(svg.data)
        pieces.append('\n</g>\n')
    pieces.append('</svg>')

    return SVG("".join(pieces))

def get_best_core(mols,cores):
    """Pick, for every molecule, the largest core that it contains.

    Parameters:
    mols : iterable
        Molecules exposing ``HasSubstructMatch(core)``.
    cores : list
        Candidate cores exposing ``GetNumAtoms()``.

    Returns:
    tuple of (list, list)
        ``tag_cores[i]`` is the chosen core for ``mols[i]`` and
        ``tag_core_ids[i]`` its position in ``cores``; both are None when
        no core matches. Ties on atom count keep the earliest core.
    """
    sizes = [core.GetNumAtoms() for core in cores]

    def _largest_match(mol):
        # Only a strictly larger core replaces the current choice.
        chosen, chosen_id, chosen_size = None, None, 0
        for idx, (core, size) in enumerate(zip(cores, sizes)):
            if mol.HasSubstructMatch(core) and size > chosen_size:
                chosen, chosen_id, chosen_size = core, idx, size
        return chosen, chosen_id

    picks = [_largest_match(mol) for mol in mols]
    tag_cores = [core for core, _ in picks]
    tag_core_ids = [idx for _, idx in picks]
    return tag_cores,tag_core_ids

def pred_core_and_chain(mols,cores):
    """Split every molecule into its best-matching core and its R-groups.

    Parameters:
    mols : list of rdkit.Chem.Mol
        Molecules to decompose.
    cores : list of rdkit.Chem.Mol
        Candidate cores; the largest matching one is used per molecule
        (see ``get_best_core``).

    Returns:
    tuple of (list, list)
        ``res_list`` holds one entry per input molecule, in input order:
        the R-group decomposition row (a dict, extended with a
        ``'core_id'`` key giving the index of the core used), or None when
        the molecule could not be decomposed. ``unmatched_list`` holds the
        molecules whose entry is None.
    """
    tag_cores,tag_core_ids = get_best_core(mols,cores)

    res_list = []
    unmatched_list = []
    for mol,core,tag_core_id in zip(mols,tag_cores,tag_core_ids):
        rows = []
        if core is not None:
            rows,_ = rdRGD.RGroupDecompose([core],[mol])
        if not rows:
            # Either no core matched, or the decomposition returned no row
            # for this molecule. Record a None placeholder so res_list
            # stays aligned with mols instead of raising IndexError.
            res_list.append(None)
            unmatched_list.append(mol)
            continue
        row = rows[0]
        row['core_id'] = tag_core_id
        res_list.append(row)
    return res_list,unmatched_list

def convert_to_lib(item:dict,smiles:str):
    """Turn one decomposition row into a library record.

    Parameters:
    item : dict or None
        A row as produced by ``pred_core_and_chain``: key ``'core_id'``,
        key ``'Core'`` (the core molecule) and keys starting with ``'R'``
        (R-group molecules). None marks a molecule that was not decomposed.
    smiles : str
        SMILES of the source molecule, stored verbatim in the record.

    Returns:
    dict
        ``{"smiles", "core", "chain"}``. ``core`` carries ``abstract_core``
        (the core index) plus, when a ``'Core'`` entry exists, its
        ``smarts`` and ``double_bond`` positions. ``chain`` maps each
        R-group's ``frag_idx`` property to its ``internalRgroupSmiles``
        property. For a None ``item`` the record has
        ``abstract_core`` None and an empty ``chain``.
    """
    if item is None:
        # pred_core_and_chain emits None placeholders for unmatched
        # molecules; keep the record schema uniform instead of crashing.
        return {
            "smiles": smiles,
            "core": {'abstract_core': None},
            "chain": {},
        }

    lib_item = {
        "smiles": smiles,
        "core": {'abstract_core':item['core_id']},
        "chain": {},
    }
    for k,v in item.items():
        if k == "Core":
            lib_item['core']['smarts'] = Chem.MolToSmarts(v)
            lib_item['core']['double_bond'] = extract_double_bond_positions(v)
        elif k.startswith("R"):
            # R-group molecules carry their fragment index and SMILES as
            # molecule properties.
            v_dict = v.GetPropsAsDict()
            lib_item['chain'][v_dict['frag_idx']] = v_dict['internalRgroupSmiles']
    return lib_item

def save_to_json(obj, filename):
    """
    Write a Python object to a JSON file.

    Parameters:
    obj: the object to serialize (dict, list, ...).
    filename: destination file name, including its path.
    """
    with open(filename, 'w', encoding='utf-8') as handle:
        # Non-ASCII characters are written as-is; nesting is indented by 4.
        handle.write(json.dumps(obj, ensure_ascii=False, indent=4))

In [5]:
# Load the 'Isomeric SMILES' column of the taxane spreadsheet; the [1:] slice drops the first entry.
# NOTE(review): confirm that skipping the first row is intentional — the reason is not visible in this notebook.
taxane_smiles = pd.read_excel('/mnt/data/daiql/taxane_datas/taxane_mols_info_v241118.xlsx')['Isomeric SMILES'][1:]

In [6]:
# Load the candidate core patterns from the 'smarts' column of the core spreadsheet.
cores_smarts = pd.read_excel('/mnt/data/daiql/taxane_lib/taxane_raw_core_smarts.xlsx')['smarts']

In [7]:
# Parse the SMILES into molecules and the SMARTS into query cores, keeping the source order.
# NOTE(review): presumably RDKit yields None for a string it cannot parse — nothing here checks for that; confirm the inputs are clean.
taxane_mols = [Chem.MolFromSmiles(s) for s in taxane_smiles]
core_mols = [Chem.MolFromSmarts(s) for s in cores_smarts]

In [125]:
# Give each molecule its largest matching core and split it into core + R-groups.
# `res` has one entry per molecule (None where no core matched); `unmatched` lists those molecules.
res, unmatched = pred_core_and_chain(taxane_mols,core_mols)

In [89]:
# Draw the decomposed core of every matched molecule in a 9-column grid of 200x200 tiles.
# `res` holds None placeholders for unmatched molecules, so skip them before reading 'Core'.
img = merge_svg(print_mols([i['Core'] for i in res if i is not None],(200,200)),9,200,200)

In [90]:
# Save the core grid. The markup carries no XML declaration, so write it as UTF-8
# (the XML default) instead of relying on the platform's locale encoding.
with open('/mnt/data/daiql/taxane_lib/taxane_lib_cores.svg','w',encoding='utf-8') as f:
    f.write(img.data)

In [126]:
# Build one library record per molecule by pairing each decomposition row with its source SMILES.
lib_list = [convert_to_lib(item,s) for item,s in zip(res,taxane_smiles)]

In [92]:
# Persist the library records as JSON.
save_to_json(lib_list,'/mnt/data/daiql/taxane_lib/taxane_lib.json')

In [93]:
# Tabulate the library records, one row per record.
lib_df = pd.DataFrame(lib_list)

In [94]:
# Export the library table to Excel.
lib_df.to_excel('/mnt/data/daiql/taxane_lib/taxane_lib.xlsx')

In [95]:
# Gather the chain value of every R-group across all library records (duplicates kept).
all_chains = []
for item in lib_list:
    all_chains += list(item['chain'].values())
# Reduce to the distinct chain values.
unique_chains = np.unique(all_chains)
# Displayed as (number of distinct chains, total number of chains).
len(unique_chains),len(all_chains)

(110, 5926)

In [96]:
# Parse each distinct chain SMILES back into a molecule for drawing.
unique_chain_mols = [Chem.MolFromSmiles(x) for x in unique_chains]

In [97]:
# Draw the distinct chains in a 4-column grid of 200x200 tiles (rebinds `img`).
img = merge_svg(print_mols(unique_chain_mols,(200,200)),4,200,200)

In [98]:
# Save the chain grid. The markup carries no XML declaration, so write it as UTF-8
# (the XML default) instead of relying on the platform's locale encoding.
with open('/mnt/data/daiql/taxane_lib/taxane_chain.svg','w',encoding='utf-8') as f:
    f.write(img.data)

In [99]:
# Single-column table of the distinct chain SMILES.
chains_df = pd.DataFrame({
    "chain":unique_chains
})

In [100]:
# Export the chain table to Excel.
chains_df.to_excel("/mnt/data/daiql/taxane_lib/taxane_chain.xlsx")

In [101]:
# Load the reference-standard table; later cells read its 'smiles' and 'ID' columns.
taxane_st_smiles = pd.read_excel('/mnt/data/daiql/taxane_datas/标品smiles表.xlsx')

In [102]:
# Keep only the rows whose 'smiles' cell is a str (presumably drops empty cells read as NaN — confirm).
taxane_st_smiles = taxane_st_smiles[taxane_st_smiles['smiles'].map(lambda x: isinstance(x, str))]

In [103]:
# Parse the reference-standard SMILES into molecules, in table order.
taxane_st_mols = [Chem.MolFromSmiles(s) for s in taxane_st_smiles['smiles'].to_list()]

In [134]:
# Decompose the reference standards against the same candidate cores.
# NOTE(review): this rebinds `unmatched`, replacing the list produced for the main taxane set.
res_st, unmatched = pred_core_and_chain(taxane_st_mols,core_mols)

In [137]:
# Build the reference-standard library keyed by the table's 'ID' column.
# NOTE(review): a repeated ID would keep only the last record (dict semantics) — confirm IDs are unique.
st_lib_dict = {ID:convert_to_lib(item,s) for item,s,ID in zip(res_st,taxane_st_smiles['smiles'],taxane_st_smiles['ID'])}

In [106]:
# Draw the decomposed core of every matched reference standard in a 9-column grid of 200x200 tiles.
# `res_st` holds None placeholders for unmatched molecules, so skip them before reading 'Core'.
img = merge_svg(print_mols([i['Core'] for i in res_st if i is not None],(200,200)),9,200,200)

In [107]:
# Save the reference-standard core grid. The markup carries no XML declaration, so write it
# as UTF-8 (the XML default) instead of relying on the platform's locale encoding.
with open('/mnt/data/daiql/taxane_lib/taxane_st_lib_cores.svg','w',encoding='utf-8') as f:
    f.write(img.data)

In [138]:
# Persist the reference-standard library as JSON.
save_to_json(st_lib_dict,'/mnt/data/daiql/taxane_lib/taxane_st_lib.json')

In [139]:
# A dict of dicts yields one column per ID; transpose so each ID becomes a row.
st_lib_df = pd.DataFrame(st_lib_dict).transpose()

In [140]:
# Export the reference-standard library table to Excel.
st_lib_df.to_excel('/mnt/data/daiql/taxane_lib/taxane_st_lib.xlsx')