This notebook illustrates the data-processing workflow for the reactant library used in **editing mode**.  
Because **some** molecular editing reactions are **two-component** (bi-component) reactions, the process involves selecting an additional reactant beyond the molecule being optimized.

In [None]:
from dataprocess import *

In [None]:
# Load the raw inputs for editing mode.
# Building-block table: the file must provide molecular SMILES.
bb_csv_path = "raw/editing/building_block.csv"
# Reaction-rule table: the file must provide reaction template SMARTS, reaction name/code,
# and SMARTS for reactant 1 and reactant 2.
rule_csv_path = "raw/editing/template_edit_rule_v1.csv"

df_bb = pd.read_csv(bb_csv_path)
df_r = pd.read_csv(rule_csv_path)

In [None]:
# The Enamine building-block library is supplemented with building blocks collected
# manually from the literature on bi-component molecular editing reactions, which
# broadens the applicability of those reactions.
df_extra_bb = pd.read_csv("raw/editing/extra_building_blocks.csv")
# Only the SMILES column of the extra table is appended; the combined frame is re-indexed from 0.
extra_smiles = df_extra_bb.loc[:, ['SMILES']]
df_bb = pd.concat((df_bb, extra_smiles), ignore_index=True)

In [None]:
# Add reaction tags to every building block.
# Rows for which no sanitized SMILES is produced are dropped.
df_bb['Sanitized_SMILES'] = sanitize_and_strip_salts(df_bb['SMILES'])
# reset_index is chained so df_bb becomes a fresh frame instead of a filtered slice that is
# mutated in place; this keeps the 'func_group' assignment below free of SettingWithCopyWarning.
df_bb = df_bb[df_bb['Sanitized_SMILES'].notna()].reset_index(drop=True)

# In editing mode, only building blocks compatible with bi-component reactions are considered.
# .copy() detaches the filtered rules from the original frame so the column assignments
# below operate on an independent DataFrame rather than on a slice.
df_r = df_r[df_r['uni/bi'] == 'bi'].copy()
print(f"Total number of bimolecular reactions: {len(df_r)}")

# Compile the reactant SMARTS of each rule into query molecules; a missing SMARTS becomes None.
df_r['mol_1'] = df_r['reactant1'].apply(lambda x: Chem.MolFromSmarts(x) if not pd.isna(x) else None)
df_r['mol_2'] = df_r['reactant2'].apply(lambda x: Chem.MolFromSmarts(x) if not pd.isna(x) else None)
df_r['name'] = df_r['name'].astype(str)
reactants = df_r[['name', 'mol_1', 'mol_2']].values.tolist()

# One task per building block: (SMILES, full list of [name, mol_1, mol_2] rules).
# NOTE(review): tagging uses the raw 'SMILES' column, not 'Sanitized_SMILES' — confirm this is intended.
data = [(smiles, reactants) for smiles in df_bb['SMILES']]

n_processes=80
with Pool(processes=n_processes) as pool:
    # pool.map preserves input order, so results[i] belongs to row i of df_bb.
    results = pool.map(process_row, data)

df_bb['func_group'] = results
df_bb.to_csv('processed/editing/building_block_add_react_tag.csv', index=False)

In [None]:
# Reload the tagged building blocks from the CSV written by the tagging step.
tagged_bb_path = "processed/editing/building_block_add_react_tag.csv"
df_bb = pd.read_csv(tagged_bb_path)

In [None]:
# Data post-processing: apply the filters one after another, each on the previous result.
valid_atoms = ['C', 'O', 'N', 'P', 'S', 'F', 'Cl', 'Br', 'I', 'B']
max_ring_count = 4
max_atoms = 7  # passed to filter_by_max_ring_atoms

# Each entry is (filter function, extra positional arguments following the DataFrame).
filter_steps = [
    (filter_no_react, ()),
    (filter_valid_atoms, (valid_atoms,)),
    (filter_by_ring_count, (max_ring_count,)),
    (filter_by_max_ring_atoms, (max_atoms,)),
    # (filter_by_mw, (max_mw,)),  # disabled step; max_mw is not defined in this cell
]
for apply_filter, extra_args in filter_steps:
    df_bb = apply_filter(df_bb, *extra_args)

In [None]:
# Write the filtered building blocks to disk (row index is not saved).
output_filename = "processed/editing/building_block_processed.csv"

df_bb.to_csv(path_or_buf=output_filename, index=False)
print('data saved at ' + output_filename)

In [None]:
# Generate smiles_list.json.gz
import gzip
output_filename_s = 'final/editing/smiles_list.json.gz'
# Re-read the processed table from disk; `df` is reused by the mask_dict cell.
df = pd.read_csv("processed/editing/building_block_processed.csv")
smiles_list = df['SMILES'].to_list()
# Serialize to JSON text first, then write it through a gzip text-mode handle.
smiles_json = json.dumps(smiles_list)
with gzip.open(output_filename_s, 'wt', encoding='UTF-8') as gz_file:
    gz_file.write(smiles_json)
n_mols = len(smiles_list)
print(f'SMILES list saved at {output_filename_s},total {n_mols} mols')

In [None]:
# Generate mask_dict.json.gz
# NOTE: this cell relies on `df` (the processed building-block table) and `gzip` from the
# smiles_list cell; using the same `df` keeps the indices stored here aligned with smiles_list.
import re

df_r = pd.read_csv("raw/editing/template_edit_rule_v1.csv")
df_r['name'] = df_r['name'].astype(str)
bi_list = df_r.loc[df_r["uni/bi"] == "bi", "name"].tolist()
# Two tags per bi-component reaction: one for each reactant slot.
output_list = [name + '_reactant_1' for name in bi_list] + [name + '_reactant_2' for name in bi_list]

# 'func_group' was read back from CSV, so each cell is text (or NaN for an empty cell).
func_groups = df['func_group'].tolist()
mask_dict = {}
for key in output_list:
    # Match the tag only as a whole token. A plain substring test (`key in val`) would let
    # e.g. '1_reactant_1' also match inside '11_reactant_1' and put molecules in the wrong mask.
    tag_pattern = re.compile(r'(?<!\w)' + re.escape(key) + r'(?!\w)')
    # mask_dict collects the molecule indices corresponding to each reaction tag;
    # non-string cells (e.g. NaN) carry no tags and are skipped instead of raising TypeError.
    mask_dict[key] = [
        i for i, val in enumerate(func_groups)
        if isinstance(val, str) and tag_pattern.search(val)
    ]

output_filename_r = 'final/editing/mask_dict.json.gz'
with gzip.open(output_filename_r, 'wt', encoding='UTF-8') as f_out:
    json.dump(mask_dict, f_out)
print(f'Mask dictionary saved at {output_filename_r}')

# default=0 keeps this summary from raising ValueError when there are no bi-component tags.
max_list_length = max((len(lst) for lst in mask_dict.values()), default=0)
print(f'The maximum number of selectable molecules for a single tag in mask_dict is {max_list_length}')