# In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import os
from collections import defaultdict

# Output locations: directory for generated XYZ geometries, and the text log
# (relative to the working directory) that collects SMILES which failed
xyz_folder = "/Users/stella/Documents/tierney/project/notebooks/xyz"
failed_log = "failed_smiles.txt"

# Input CSV; the processing loop reads its solute_smiles / solute_inchikey columns
file_path = "/Users/stella/Documents/tierney/project/final_reduced_dataset.csv"

# Next free suffix per InChIKey, so repeated keys get distinct XYZ file names
xyz_counter = defaultdict(int)

# Side effects last: make sure the output directory exists, then load the data
os.makedirs(xyz_folder, exist_ok=True)
df = pd.read_csv(file_path)

def smiles_to_xyz(smiles, index, role, inchikey):
    """Convert a SMILES string to an XYZ file with 3D coordinates.

    The molecule is parsed with RDKit, hydrogens are added, a 3D conformer
    is embedded and then relaxed with MMFF. The result is written to
    ``PM6_<inchikey>_<n>.xyz`` inside ``xyz_folder``, where ``n`` is the
    first suffix (starting from the per-key value in ``xyz_counter``) that
    does not collide with an existing file.

    Any failure is appended to ``failed_log`` and printed; nothing is
    raised to the caller.

    Args:
        smiles: SMILES string of the molecule.
        index: Row identifier supplied by the caller. Currently unused;
            kept so existing call sites keep working.
        role: Label written to the failure log (e.g. "solute").
        inchikey: Key used to build the output file name.

    Returns:
        None.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Invalid SMILES: {smiles}")

        # Skip kekulization for single-atom molecules
        if mol.GetNumAtoms() == 1:
            print(f"Skipping kekulization for single-atom molecule: {smiles}")
        else:
            # Attempt to sanitize (skip kekulization if it fails).
            # Catch Exception rather than everything so Ctrl-C still works.
            try:
                Chem.SanitizeMol(mol, Chem.SANITIZE_ALL ^ Chem.SANITIZE_KEKULIZE)
            except Exception:
                print(f"Skipping kekulization for {smiles}")

        # Generate 3D coordinates
        mol = Chem.AddHs(mol)
        conf_id = AllChem.EmbedMolecule(mol)
        if conf_id < 0:
            # EmbedMolecule signals failure with -1; random starting
            # coordinates often succeed where the default start does not.
            conf_id = AllChem.EmbedMolecule(mol, useRandomCoords=True)
        if conf_id < 0:
            raise ValueError(f"3D embedding failed for {smiles}")

        # Best-effort relaxation: -1 means MMFF could not be set up for this
        # molecule, in which case the embedded geometry is written as-is.
        if AllChem.MMFFOptimizeMolecule(mol) == -1:
            print(f"MMFF setup failed for {smiles}; writing unoptimized geometry")

        # Save XYZ file with the new naming convention
        current_count = xyz_counter[inchikey]
        xyz_filename = os.path.join(xyz_folder, f"PM6_{inchikey}_{current_count}.xyz")

        # Ensure the filename is unique (files may exist from earlier runs)
        while os.path.exists(xyz_filename):
            current_count += 1
            xyz_filename = os.path.join(xyz_folder, f"PM6_{inchikey}_{current_count}.xyz")

        conformer = mol.GetConformer(conf_id)
        with open(xyz_filename, "w", encoding="utf-8") as f:
            # XYZ layout: atom count, blank comment line, one line per atom
            f.write(f"{mol.GetNumAtoms()}\n\n")
            for atom in mol.GetAtoms():
                pos = conformer.GetAtomPosition(atom.GetIdx())
                f.write(f"{atom.GetSymbol()} {pos.x} {pos.y} {pos.z}\n")

        # Update the counter for the next occurrence
        xyz_counter[inchikey] = current_count + 1

    except Exception as e:
        # Log failure and continue with the next molecule
        with open(failed_log, "a", encoding="utf-8") as log_file:
            log_file.write(f"{role}: {smiles}\t{str(e)}\n")
        print(f"Failed to process {role} SMILES: {smiles} - {e}")

# Walk the dataset row by row and write one XYZ file for each solute
for idx, row in df.iterrows():
    smiles_to_xyz(
        row["solute_smiles"],
        idx,
        "solute",
        row["solute_inchikey"],
    )

print("XYZ file generation complete!")

# Output: XYZ file generation complete!
