In [None]:
import os
from rdkit import Chem

def modify_atoms(mol):
    """Reset selected atom labels and clear isotopes on every atom of ``mol``.

    For each atom returned by ``mol.GetAtoms()``:
      * if the ``molAtomMapNumber`` property is present and equals '1', '2'
        or '3', it is overwritten with '0';
      * a non-zero isotope is set back to 0.

    The molecule is modified in place.

    Args:
        mol: Object exposing ``GetAtoms()``; a falsy value is tolerated.

    Returns:
        The same ``mol`` object after modification, or ``None`` when ``mol``
        is falsy.
    """
    if mol:
        resettable_labels = ('1', '2', '3')
        for current in mol.GetAtoms():
            labelled = current.HasProp('molAtomMapNumber')
            if labelled and current.GetProp('molAtomMapNumber') in resettable_labels:
                current.SetProp('molAtomMapNumber', '0')
            if current.GetIsotope() != 0:
                current.SetIsotope(0)
        return mol

    return None

def process_sdf_file(file_path):
    """Clean every readable molecule of an SDF file and save a ``_processed`` copy.

    Each record that RDKit can parse is passed through ``modify_atoms`` and
    written to a sibling file whose name is the input name with ``_processed``
    inserted before the extension (``x.sdf`` -> ``x_processed.sdf``).
    Records the supplier yields as ``None`` are skipped.

    Args:
        file_path: Path of the SDF file to read.

    Returns:
        None. A confirmation line is printed once the output has been written.
    """
    # Derive the output name from the extension only. A plain
    # str.replace(".sdf", ...) would also rewrite ".sdf" inside directory
    # names and would leave an extension-less path unchanged (so the output
    # would overwrite the input).
    stem, extension = os.path.splitext(file_path)
    output_file = f"{stem}_processed{extension}"

    # Open the input first so that a missing input does not leave an empty
    # output file behind. Explicit hydrogens are kept (removeHs=False).
    suppl = Chem.SDMolSupplier(file_path, removeHs=False)

    # Stream records straight to the writer instead of holding the whole file
    # in memory; the context manager closes the writer even if a write fails.
    with Chem.SDWriter(output_file) as writer:
        for mol in suppl:
            if mol is None:
                continue
            writer.write(modify_atoms(mol))

    # Print out the message indicating the file has been processed
    print(f"File {file_path} has been processed and saved as {output_file}")

# Folder that is scanned for SDF files
directory = "./"

# Handle every file in that folder named MEL_*_<digits>.sdf
for filename in os.listdir(directory):
    if not (filename.startswith("MEL_") and filename.endswith(".sdf")):
        continue
    # The last underscore-separated token before the extension must be numeric
    if not filename[:-4].split('_')[-1].isdigit():
        continue
    file_path = os.path.join(directory, filename)
    process_sdf_file(file_path)

print("Processing complete.")



In [None]:
import os
from rdkit import Chem
from rdkit.Chem import rdFMCS, AllChem
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

def load_fragments(fragment_file):
    """Read all parseable molecules from an SDF file of fragments.

    Args:
        fragment_file: Path of the SDF file holding the fragments.

    Returns:
        List of the molecules the supplier could parse, in file order;
        records yielded as ``None`` are dropped. Hydrogens are kept
        (``removeHs=False``).
    """
    logging.info(f"Loading fragments from {fragment_file}")
    reader = Chem.SDMolSupplier(fragment_file, removeHs=False)
    fragments = [entry for entry in reader if entry is not None]
    logging.info(f"Loaded {len(fragments)} fragments from {fragment_file}")
    return fragments

def calculate_substructure_similarity(mol, fragment, fragment_index):
    """Score how much of ``mol`` and ``fragment`` is covered by their MCS.

    The maximum common substructure is searched with ``completeRingsOnly``
    and ``ringMatchesRingOnly`` enabled. The score is

        shared / (heavy(mol) + heavy(fragment) - shared)

    where ``shared`` is the heavy-atom count of the molecule built from the
    MCS SMARTS string.

    Args:
        mol: Molecule to compare.
        fragment: Fragment to compare against.
        fragment_index: Identifier of the fragment; returned unchanged.

    Returns:
        Tuple ``(similarity, fragment_index)``. The similarity is 0.0 when the
        search was canceled, when the SMARTS could not be turned into a
        molecule, or when any exception was raised (the exception is logged,
        not propagated).
    """
    no_match = (0.0, fragment_index)
    try:
        search = rdFMCS.FindMCS([mol, fragment], completeRingsOnly=True, ringMatchesRingOnly=True)
        if search.canceled:
            logging.warning(f"MCS calculation canceled for molecule and fragment {fragment_index}")
            return no_match

        core = Chem.MolFromSmarts(search.smartsString)
        if core is None:
            return no_match

        # Only heavy atoms enter the ratio
        shared = core.GetNumHeavyAtoms()
        union = mol.GetNumHeavyAtoms() + fragment.GetNumHeavyAtoms() - shared
        return shared / union, fragment_index
    except Exception as e:
        logging.error(f"Error in calculating substructure similarity: {e}")
        return 0.0, fragment_index

def process_sdf_file(file_path, fragment, fragment_index):
    """Score every readable molecule of an SDF file against one fragment.

    Each molecule is annotated in place with two string properties,
    ``MaxSimilarity`` and ``BestFragmentIndex``, taken from
    ``calculate_substructure_similarity``. Progress is logged every 10000
    molecules.

    Note: the first notebook cell defines a one-argument function with the
    same name; running this cell rebinds ``process_sdf_file`` to this version.

    Args:
        file_path: Path of the SDF file to read (hydrogens are kept).
        fragment: Fragment molecule to compare against.
        fragment_index: Identifier of the fragment, forwarded to the scorer.

    Returns:
        List of ``(molecule, similarity)`` tuples in file order, or an empty
        list if anything raised while reading or scoring (the error is logged).
    """
    logging.info(f"Processing file {file_path}")
    try:
        mols = [entry for entry in Chem.SDMolSupplier(file_path, removeHs=False) if entry is not None]
        total = len(mols)
        logging.info(f"Loaded {total} molecules from {file_path}")

        scored = []
        for done, candidate in enumerate(mols, start=1):
            score, matched_index = calculate_substructure_similarity(candidate, fragment, fragment_index)
            candidate.SetProp("MaxSimilarity", str(score))
            candidate.SetProp("BestFragmentIndex", str(matched_index))
            scored.append((candidate, score))
            if done % 10000 == 0:
                logging.info(f"Processed {done}/{total} molecules in file {file_path}")
    except Exception as e:
        logging.error(f"Error processing SDF file {file_path}: {e}")
        return []

    return scored

def main(top_n=100):
    """Select, for every fragment, the most similar library molecules.

    Fragments are loaded from ``R284_D.sdf``. Every ``MEL_*_processed.sdf``
    file in the current directory is scored against each fragment with
    ``process_sdf_file``, and the ``top_n`` highest-scoring molecules per
    fragment are appended to ``frags_for_enum_D_250_MCS1.sdf``.

    NOTE(review): the output file name (and the original log message) mention
    250, but the code has always kept 100 molecules per fragment. The default
    preserves the 100 that were actually selected; pass ``top_n=250`` if that
    was the intent.

    Args:
        top_n: Maximum number of molecules written per fragment.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    fragment_file = "R284_D.sdf"
    fragments = load_fragments(fragment_file)

    directory = "./"
    # os.listdir returns names in arbitrary order; sorting makes the
    # tie-breaking between equally similar molecules reproducible.
    files_to_process = sorted(
        os.path.join(directory, filename)
        for filename in os.listdir(directory)
        if filename.startswith("MEL_") and filename.endswith("_processed.sdf")
    )
    if not files_to_process:
        logging.warning(f"No MEL_*_processed.sdf files found in {directory}")

    output_file = os.path.join(directory, "frags_for_enum_D_250_MCS1.sdf")

    with Chem.SDWriter(output_file) as writer:
        for i, fragment in enumerate(fragments):
            logging.info(f"Processing fragment {i + 1}/{len(fragments)}")

            best_hits = []
            scored_count = 0
            for file in files_to_process:
                similarities = process_sdf_file(file, fragment, i + 1)
                scored_count += len(similarities)

                # Keep only the running top_n after each file so memory does
                # not grow with the library size. The sort is stable, so the
                # survivors are the same as sorting everything once at the end.
                best_hits.extend(similarities)
                best_hits.sort(key=lambda pair: pair[1], reverse=True)
                del best_hits[top_n:]

                logging.info(f"Total similarities accumulated so far for fragment {i + 1}: {scored_count}")

            logging.info(f"Selected top {len(best_hits)} molecules (limit {top_n}) for current fragment")

            for mol, _ in best_hits:
                writer.write(mol)

            logging.info(f"Completed processing for fragment {i + 1}")

    logging.info("Processing complete.")

if __name__ == "__main__":
    main()

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem

# Read an SDF file into a dictionary of molecules keyed by their identifier tuple
def read_sdf(file_path):
    """Index the parseable molecules of an SDF file by their identifier.

    The identifier is the tuple of the string properties
    ``(synton_id_1, synton_id_2, synton_id_3, rxn_ID, const_synth)``.
    When several records share an identifier, the last one read wins.

    Args:
        file_path: Path of the SDF file to read (supplier defaults are used).

    Returns:
        Dict mapping identifier tuple -> molecule. Records the supplier
        yields as ``None`` are skipped. A record lacking one of the five
        properties makes ``GetProp`` raise, and that error propagates.
    """
    id_fields = ('synton_id_1', 'synton_id_2', 'synton_id_3', 'rxn_ID', 'const_synth')
    return {
        tuple(record.GetProp(field) for field in id_fields): record
        for record in Chem.SDMolSupplier(file_path)
        if record is not None
    }

# Replace the structures of the fragment file with the matching structures from the parent files
def update_structures(fragment_file, parent_files, output_file='updated_frags_for_enum_D_125_MCS.sdf'):
    """Swap each fragment-file molecule for its counterpart from the parent files.

    Molecules are matched on the identifier tuple produced by ``read_sdf``.
    For every match the parent molecule replaces the fragment-file molecule,
    gets a ``Parent_rxn`` property holding the parent file name, and inherits
    the non-empty ``MaxSimilarity`` / ``BestFragmentIndex`` properties of the
    molecule it replaces. If several parent files contain the same
    identifier, the last file in ``parent_files`` wins. Molecules with no
    match in any parent file are written unchanged (without ``Parent_rxn``).

    Args:
        fragment_file: SDF file with the selected molecules.
        parent_files: Iterable of SDF file paths to take the structures from.
        output_file: Path of the SDF file to write; defaults to the name that
            was previously hard-coded.

    Returns:
        None. The result is written to ``output_file``.
    """
    id_props = ('synton_id_1', 'synton_id_2', 'synton_id_3', 'rxn_ID', 'const_synth')
    carried_props = ("MaxSimilarity", "BestFragmentIndex")

    frag_mols = read_sdf(fragment_file)

    for parent_file in parent_files:
        parent_mols = read_sdf(parent_file)
        # Walk the (small) selection and look each identifier up in the parent
        # file rather than scanning every parent record. The snapshot keeps
        # the iteration independent of the in-loop reassignments.
        for identifier, current_mol in list(frag_mols.items()):
            parent_mol = parent_mols.get(identifier)
            if parent_mol is None:
                continue

            # Read the properties to carry over before the structure is swapped
            carried = {
                name: current_mol.GetProp(name) if current_mol.HasProp(name) else ""
                for name in carried_props
            }

            # Replace the structure and record which parent file it came from
            frag_mols[identifier] = parent_mol
            parent_mol.SetProp("Parent_rxn", parent_file)

            # Restore only the properties that had a non-empty value
            for name in carried_props:
                if carried[name]:
                    parent_mol.SetProp(name, carried[name])

    # The context manager closes the writer even if a write fails
    with Chem.SDWriter(output_file) as writer:
        for identifier, mol in frag_mols.items():
            # Ensure the identifier properties are present on the written record
            for name, value in zip(id_props, identifier):
                mol.SetProp(name, value)
            writer.write(mol)

# Fragment file produced by the similarity search
fragment_file = 'frags_for_enum_D_250_MCS1.sdf'

# First and last record number of each parent chunk, as used in the file names
chunk_bounds = [
    (1, 200001),
    (200002, 400002),
    (400003, 600003),
    (600004, 800004),
    (800005, 1000005),
    (1000006, 1200006),
    (1200007, 1400007),
    (1400008, 1600008),
    (1600009, 1800009),
]
parent_files = [
    f'MEL_2comp_charged07172022_chunk_{first}_{last}.sdf'
    for first, last in chunk_bounds
]

# Swap in the parent structures and write the updated SDF
update_structures(fragment_file, parent_files)


