In [1]:
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

# Input table; the cells below read their SMILES / InChIKey values from `df`.
df = pd.read_csv(filepath_or_buffer="filtered_data_with_smiles.csv")

# Folder that receives the generated .xyz files.
# exist_ok=True keeps this cell re-runnable once the folder is already there.
output_dir = "xyz_files"
os.makedirs(name=output_dir, exist_ok=True)

# Function to generate XYZ coordinates from SMILES
def smiles_to_xyz(smiles, inchikey, seed=42):
    """Build a 3D geometry from a SMILES string and return it as XYZ text.

    The molecule is parsed, explicit hydrogens are added, a conformer is
    embedded with ETKDG and then relaxed with the UFF force field.

    Args:
        smiles: SMILES string of the molecule. Anything that is not a
            non-empty string (e.g. a NaN cell from pandas) is reported as a
            conversion failure instead of being handed to RDKit.
        inchikey: Identifier used in log messages and in the XYZ comment line.
        seed: Random seed for the ETKDG embedding, so that re-running the
            notebook reproduces the same coordinates. Per the RDKit docs a
            value of -1 requests a non-deterministic embedding.

    Returns:
        The XYZ block as a single string (atom count, comment line, one
        "symbol x y z" line per atom, no trailing newline), or None when the
        SMILES cannot be parsed or the embedding fails.
    """
    # RDKit raises on non-string input, so reject it up front with the same
    # message used for unparsable SMILES.
    if not isinstance(smiles, str) or not smiles.strip():
        print(f"❌ Failed to convert SMILES for InChIKey: {inchikey}")
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print(f"❌ Failed to convert SMILES for InChIKey: {inchikey}")
        return None

    mol = Chem.AddHs(mol)  # Explicit hydrogens are needed for a sensible 3D geometry

    # Seed the embedding: without it every run yields different coordinates.
    params = AllChem.ETKDG()
    params.randomSeed = seed

    # EmbedMolecule returns the new conformer id, or -1 when embedding fails.
    if AllChem.EmbedMolecule(mol, params) < 0:
        print(f"⚠️ Embedding failed for InChIKey: {inchikey}")
        return None

    # UFFOptimizeMolecule status: 0 = converged, 1 = more iterations needed,
    # -1 = the force field could not be set up. The geometry is still returned
    # in the non-zero cases (as before), but the condition is now reported.
    status = AllChem.UFFOptimizeMolecule(mol)
    if status == -1:
        print(f"⚠️ UFF setup failed for InChIKey: {inchikey}; geometry left unoptimized")
    elif status != 0:
        print(f"⚠️ UFF optimization did not converge for InChIKey: {inchikey}")

    # XYZ format: atom count, a free-text comment line, then one line per atom.
    conf = mol.GetConformer()
    xyz_data = [f"{mol.GetNumAtoms()}", f"InChIKey: {inchikey}"]
    for i, atom in enumerate(mol.GetAtoms()):
        pos = conf.GetAtomPosition(i)
        xyz_data.append(f"{atom.GetSymbol()} {pos.x:.4f} {pos.y:.4f} {pos.z:.4f}")

    return "\n".join(xyz_data)

# Loop through dataset and generate XYZ files using InChIKey as filename
# Fail fast on a missing column: the previous row.get(...) lookups returned
# None in that case, so every row was skipped without any message.
required_cols = ["solute_smiles", "solute_inchikey"]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"Missing required column(s) in input data: {missing_cols}")

# One output file per InChIKey: drop rows with a missing SMILES/InChIKey, then
# keep the first row per key so a repeated key is not re-embedded and its file
# overwritten.
solutes = df[required_cols].dropna().drop_duplicates(subset="solute_inchikey")

saved_count = 0
failed_count = 0
for smiles, inchikey in solutes.itertuples(index=False, name=None):
    xyz_content = smiles_to_xyz(smiles, inchikey)
    if not xyz_content:
        failed_count += 1
        continue

    # Use InChIKey as the filename
    xyz_filename = os.path.join(output_dir, f"{inchikey}.xyz")
    with open(xyz_filename, "w", encoding="utf-8") as xyz_file:
        # Terminate the last atom line so the file ends with a newline.
        xyz_file.write(xyz_content + "\n")
    saved_count += 1
    print(f"✅ XYZ file saved: {xyz_filename}")

print(f"Saved {saved_count} XYZ file(s); {failed_count} molecule(s) could not be converted")

✅ XYZ file saved: xyz_files/ISAOCJYIOMOJEB-UHFFFAOYNA-N.xyz
✅ XYZ file saved: xyz_files/NGTRZJDYCCOXBA-UHFFFAOYNA-N.xyz
✅ XYZ file saved: xyz_files/MKASXAGBWHIGCF-UHFFFAOYNA-N.xyz
✅ XYZ file saved: xyz_files/RXKNNAKAVAHBNK-UHFFFAOYNA-N.xyz
✅ XYZ file saved: xyz_files/CWRYPZZKDGJXCA-UHFFFAOYNA-N.xyz
✅ XYZ file saved: xyz_files/DFYRUELUNQRZTB-UHFFFAOYNA-N.xyz
✅ XYZ file saved: xyz_files/ZUOUZKKEUPVFJK-UHFFFAOYNA-N.xyz
✅ XYZ file saved: xyz_files/WBYWAXJHAXSJNI-KZFATGLANA-N.xyz
✅ XYZ file saved: xyz_files/ONUFSRWQCKNVSL-UHFFFAOYNA-N.xyz
✅ XYZ file saved: xyz_files/KZTYYGOKRVBIMI-UHFFFAOYNA-N.xyz
✅ XYZ file saved: xyz_files/XMTQQYYKAHVGBJ-XWKXFZRBNA-N.xyz
✅ XYZ file saved: xyz_files/GVEPBJHOBDJJJI-UHFFFAOYNA-N.xyz
✅ XYZ file saved: xyz_files/CKAPSXZOOQJIBF-UHFFFAOYNA-N.xyz
✅ XYZ file saved: xyz_files/VHHHONWQHHHLTI-UHFFFAOYNA-N.xyz
✅ XYZ file saved: xyz_files/BMLIZLVNXIYGCK-WXRBYKJCNA-N.xyz
✅ XYZ file saved: xyz_files/UFWIBTONFRDIAS-UHFFFAOYNA-N.xyz
✅ XYZ file saved: xyz_files/JDCMOHAFGDQQ

In [4]:
import os

# Count the directory entries whose name ends in ".xyz".
xyz_files_count = sum(1 for entry in os.listdir('xyz_files') if entry.endswith('.xyz'))

print(f"Total number of XYZ files: {xyz_files_count}")


Total number of XYZ files: 282
