In [6]:
import pandas as pd

# Load dataset
data = pd.read_csv('/Users/stella/Documents/tierney/project/final_reduced_dataset.csv')

# Get unique solvent SMILES (avoid duplicates)
solvent_smiles = data[['solvent_smiles', 'solvent_name']].drop_duplicates().values.tolist()
pd.DataFrame(solvent_smiles, columns=['solvent_smiles', 'solvent_name']).to_csv('solvent_smiles.csv', index=False)

In [9]:
from rdkit import Chem
from rdkit.Chem import AllChem
import os

# Create output directories
os.makedirs("solvent_xyz", exist_ok=True)

# Create/clear the failure log file
failed_log = "failed_smiles_solv.txt"
with open(failed_log, 'w') as f:
    f.write("Failed Solvent SMILES Processing Log\n")
    f.write("="*40 + "\n\n")

def log_failure(solvent_name, smiles, reason):
    """Log failed conversions to the error file"""
    with open(failed_log, 'a') as f:
        f.write(f"Solvent: {solvent_name}\n")
        f.write(f"SMILES: {smiles}\n")
        f.write(f"Reason: {reason}\n")
        f.write("-"*40 + "\n")

for smiles, solvent_name in solvent_smiles:
    # Create valid filename
    solv_name = "".join(c if c.isalnum() else "_" for c in solvent_name)
    filename = f"PM6_{solv_name}.xyz"
    
    # SMILES parsing check
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        error_msg = f"Invalid SMILES string"
        print(f"{error_msg}: {solvent_name} ({smiles})")
        log_failure(solvent_name, smiles, error_msg)
        continue
        
    try:
        # Generate 3D structure
        mol = Chem.AddHs(mol)
        embed_result = AllChem.EmbedMolecule(mol)
        if embed_result == -1:
            error_msg = "Failed to generate 3D coordinates"
            raise RuntimeError(error_msg)
            
        opt_result = AllChem.MMFFOptimizeMolecule(mol)
        if opt_result == -1:
            error_msg = "Failed to optimize geometry"
            raise RuntimeError(error_msg)
        
        # Write XYZ file
        with open(f"solvent_xyz/{filename}", 'w') as f:
            f.write(f"{mol.GetNumAtoms()}\n")
            f.write(f"{smiles}\n")  # Keep original SMILES as comment
            for atom in mol.GetAtoms():
                pos = mol.GetConformer().GetAtomPosition(atom.GetIdx())
                f.write(f"{atom.GetSymbol()} {pos.x:.4f} {pos.y:.4f} {pos.z:.4f}\n")
        
        print(f"Generated: {filename}")
        
    except Exception as e:
        error_msg = str(e)
        print(f"Failed {solvent_name}: {error_msg}")
        log_failure(solvent_name, smiles, error_msg)

print(f"\nProcessing complete. Failures logged to: {failed_log}")

Generated: PM6_1_2_dibromoethane.xyz
Generated: PM6_1_2_dichloroethane.xyz
Generated: PM6_1_2_propanediol.xyz
Generated: PM6_1_4_dioxane.xyz
Generated: PM6_1_butanol.xyz
Generated: PM6_1_butanol.xyz
Generated: PM6_1_chlorobutane.xyz
Generated: PM6_1_chlorooctane.xyz
Generated: PM6_1_decanol.xyz
Generated: PM6_1_decanol.xyz
Generated: PM6_1_heptanol.xyz
Generated: PM6_1_heptanol.xyz
Generated: PM6_1_hexanol.xyz
Generated: PM6_1_hexanol.xyz
Generated: PM6_1_octanol.xyz
Generated: PM6_1_octanol.xyz
Generated: PM6_1_pentanol.xyz
Generated: PM6_1_pentanol.xyz
Generated: PM6_1_propanol.xyz
Generated: PM6_1_propanol.xyz
Generated: PM6_1_propanol.xyz
Generated: PM6_1_tert_butoxy_2_propanol.xyz
Generated: PM6_2_2_4_trimethylpentane.xyz
Generated: PM6_2_butanol.xyz
Generated: PM6_2_butoxyethanol.xyz
Generated: PM6_2_ethoxyethanol.xyz
Generated: PM6_2_ethyl_1_hexanol.xyz
Generated: PM6_2_isopropoxyethanol.xyz
Generated: PM6_2_methyl_1_butanol.xyz
Generated: PM6_2_methyl_1_pentanol.xyz
Generated: 

[14:42:16] SMILES Parse Error: syntax error while parsing: #NAME?
[14:42:16] SMILES Parse Error: check for mistakes around position 1:
[14:42:16] #NAME?
[14:42:16] ^
[14:42:16] SMILES Parse Error: Failed parsing SMILES '#NAME?' for input: '#NAME?'
[14:42:16] SMILES Parse Error: syntax error while parsing: #NAME?
[14:42:16] SMILES Parse Error: check for mistakes around position 1:
[14:42:16] #NAME?
[14:42:16] ^
[14:42:16] SMILES Parse Error: Failed parsing SMILES '#NAME?' for input: '#NAME?'


Generated: PM6_benzyl_alcohol.xyz
Generated: PM6_butanone.xyz
Generated: PM6_butanone.xyz
Generated: PM6_butyl_acetate.xyz
Generated: PM6_butyl_acetate.xyz
Generated: PM6_butyronitrile.xyz
Generated: PM6_butyronitrile.xyz
Generated: PM6_carbon_disulfide.xyz
Generated: PM6_carbon_disulfide.xyz
Generated: PM6_carbon_tetrachloride.xyz
Generated: PM6_carbon_tetrachloride.xyz
Generated: PM6_carbon_tetrachloride.xyz
Generated: PM6_chlorobenzene.xyz
Generated: PM6_chlorobenzene.xyz
Generated: PM6_chlorocyclohexane.xyz
Generated: PM6_chloroform.xyz
Generated: PM6_chloroform.xyz
Generated: PM6_cis_1_2_dimethylcyclohexane.xyz
Generated: PM6_cis_1_3_dimethylcyclohexane.xyz
Generated: PM6_cis_1_4_dimethylcyclohexane.xyz
Generated: PM6_cyclohexane.xyz
Generated: PM6_cyclohexanone.xyz
Generated: PM6_cyclohexanone.xyz
Generated: PM6_cyclohexanone.xyz
Generated: PM6_cyclooctane.xyz
Generated: PM6_cyclopentanol.xyz
Generated: PM6_cyclopentanol.xyz
Generated: PM6_decane.xyz
Generated: PM6_decane.xyz
Gen

[14:42:17] SMILES Parse Error: syntax error while parsing: #NAME?
[14:42:17] SMILES Parse Error: check for mistakes around position 1:
[14:42:17] #NAME?
[14:42:17] ^
[14:42:17] SMILES Parse Error: Failed parsing SMILES '#NAME?' for input: '#NAME?'


Generated: PM6_diisopropyl_ether.xyz
Generated: PM6_dimethylacetamide.xyz
Generated: PM6_dimethylacetamide.xyz
Generated: PM6_dodecane.xyz
Generated: PM6_ethanol.xyz
Generated: PM6_ethanol.xyz
Generated: PM6_ethanol.xyz
Generated: PM6_ethyl_acetate.xyz
Generated: PM6_ethyl_acetate.xyz
Generated: PM6_ethyl_acetate.xyz
Generated: PM6_ethylbenzene.xyz
Generated: PM6_ethylene_glycol.xyz
Generated: PM6_ethylene_glycol.xyz
Generated: PM6_ethylene_glycol.xyz
Generated: PM6_fluorobenzene.xyz
Generated: PM6_formamide.xyz
Generated: PM6_heptane.xyz
Generated: PM6_hexadecane.xyz
Generated: PM6_hexane.xyz
Generated: PM6_isopropyl_myristate.xyz
Generated: PM6_m_xylene.xyz
Generated: PM6_methanol.xyz
Generated: PM6_methanol.xyz
Generated: PM6_methyl_acetate.xyz
Generated: PM6_methyl_butyrate.xyz
Generated: PM6_methyl_tert_butyl_ether.xyz
Generated: PM6_methylcyclohexane.xyz
Generated: PM6_nitromethane.xyz
Generated: PM6_nitromethane.xyz
Generated: PM6_nonane.xyz
Generated: PM6_o_xylene.xyz
Generated

[14:42:21] SMILES Parse Error: syntax error while parsing: #NAME?
[14:42:21] SMILES Parse Error: check for mistakes around position 1:
[14:42:21] #NAME?
[14:42:21] ^
[14:42:21] SMILES Parse Error: Failed parsing SMILES '#NAME?' for input: '#NAME?'


Generated: PM6_undecane.xyz
Generated: PM6_water.xyz

Processing complete. Failures logged to: failed_smiles_solv.txt


In [11]:
from rdkit import Chem
from rdkit.Chem import AllChem
import os

# Create output directory
os.makedirs("solvent_xyz", exist_ok=True)

# List of solvents to reprocess (from your log)
failed_but_now_valid = {
    'acetone': 'CC(=O)C',
    'benzene': 'C1=CC=CC=C1',
    'diethyl ether': 'CCOCC', 
    'toluene': 'CC1=CC=CC=C1'
}

def generate_xyz(solvent_name, smiles):
    """Helper function to generate XYZ file for one solvent"""
    # Create valid filename
    safe_name = "".join(c if c.isalnum() else "_" for c in solvent_name)
    filename = f"PM6_{safe_name}.xyz"
    
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print(f"Still invalid SMILES: {solvent_name} ({smiles})")
        return False
        
    try:
        mol = Chem.AddHs(mol)
        if AllChem.EmbedMolecule(mol) == -1:
            raise RuntimeError("3D embedding failed")
        if AllChem.MMFFOptimizeMolecule(mol) == -1:
            raise RuntimeError("Optimization failed")
            
        with open(f"solvent_xyz/{filename}", 'w') as f:
            f.write(f"{mol.GetNumAtoms()}\n")
            f.write(f"{smiles}\n")
            for atom in mol.GetAtoms():
                pos = mol.GetConformer().GetAtomPosition(atom.GetIdx())
                f.write(f"{atom.GetSymbol()} {pos.x:.4f} {pos.y:.4f} {pos.z:.4f}\n")
        
        print(f"Successfully generated: {filename}")
        return True
        
    except Exception as e:
        print(f"Failed {solvent_name}: {str(e)}")
        return False

# Process the corrected solvents
print("\nReprocessing previously failed solvents:")
success_count = 0
for solvent_name, smiles in failed_but_now_valid.items():
    if generate_xyz(solvent_name, smiles):
        success_count += 1

print(f"\nSuccessfully processed {success_count}/{len(failed_but_now_valid)} solvents")


Reprocessing previously failed solvents:
Successfully generated: PM6_acetone.xyz
Successfully generated: PM6_benzene.xyz
Successfully generated: PM6_diethyl_ether.xyz
Successfully generated: PM6_toluene.xyz

Successfully processed 4/4 solvents


In [12]:
import os
from pathlib import Path

xyz_folder = "solvent_xyz"  # Folder with PM6_solvent.xyz files
output_folder = "/Users/stella/Documents/tierney/project/solvent_mopac_gas"
os.makedirs(output_folder, exist_ok=True)

for xyz_file in Path(xyz_folder).glob("PM6_*.xyz"):
    solvent_name = xyz_file.stem.replace("PM6_", "")
    
    with open(xyz_file) as f:
        xyz_data = f.readlines()[2:]  # Skip first 2 lines
    
    with open(f"{output_folder}/{xyz_file.name.replace('.xyz', '.mop')}", 'w') as f:
        f.write("PM6 PRECISE CHARGE=0:AUX LARGE OPT FORCE THERMO PM6 T=128H RECALC=5 GNORM=0.01 LET SCFCRT=0.0000001\n")  # Corrected keywords
        f.write(f"{solvent_name}\n\n")
        f.writelines(xyz_data)

In [None]:
import zipfile
import os

# Folders to compress
folders_to_compress = ["mopac_solvents_gas"]

# Name of the output zip file
output_zip = "PM6_solvent_mopac_job.zip"

# Create a zip archive
with zipfile.ZipFile(output_zip, "w") as zipf:
    for folder in folders_to_compress:
        for root, dirs, files in os.walk(folder):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, start=os.path.dirname(folder))
                zipf.write(file_path, arcname)

print(f"Compressed folders into {output_zip}")

Compressed folders into PM6_solvent_mopac_job.zip
