In [1]:
import os
import pandas as pd

# Load the solvent EPS table and the solute/solvent records from the working directory.
solvent_eps = pd.read_csv("solvent_eps_csv.csv")
filtered_data = pd.read_csv("filtered_data_with_smiles.csv")

# Attach the EPS column to every record by solvent name. The left join keeps
# records whose solvent has no match in the EPS table; their EPS is NaN.
eps_lookup = solvent_eps.loc[:, ['solvent_name', 'EPS']]
merged_data = pd.merge(filtered_data, eps_lookup, how='left', on='solvent_name')

# Save the merged table (row index omitted) for later inspection or reuse.
merged_data.to_csv("merged_data.csv", index=False)

# Input directory with the XYZ geometries and output directory for the generated
# .mop files, both resolved against the current working directory.
xyz_dir = os.path.join(os.getcwd(), "xyz_files")
mop_dir = os.path.join(os.getcwd(), "xyz_to_mop")
os.makedirs(mop_dir, exist_ok=True)

# RSOLV value written into every header; it is the same for all solvents,
# so it is set once here rather than on every loop iteration.
rsolv_value = 1.3

# Names of XYZ files that could not be found. Each name is recorded once, even
# when the same solute appears in several rows, so the final count is a count
# of files rather than of rows.
missing_xyz_files = []
_missing_seen = set()

# Create one .mop file per row of the merged data that has an EPS value.
for index, row in merged_data.iterrows():
    solvent_name = row['solvent_name']
    eps_value = row['EPS']
    solute_inchikey = row['solute_inchikey']

    # Skip if the EPS value is missing (no match in the solvent EPS table).
    if pd.isna(eps_value):
        print(f"Skipping {solute_inchikey} because EPS value is missing.")
        continue

    # Any solvent whose name contains "water" gets the fixed "_water" suffix;
    # every other solvent uses its own name with spaces replaced by underscores.
    if 'water' in solvent_name.lower():
        mop_filename = f"{solute_inchikey}_water.mop"
    else:
        mop_filename = f"{solute_inchikey}_{solvent_name.replace(' ', '_')}.mop"

    mop_filepath = os.path.join(mop_dir, mop_filename)

    # The geometry is looked up by solute InChIKey only, so the same XYZ file
    # is reused for every solvent of that solute.
    xyz_filename = f"{solute_inchikey}.xyz"
    xyz_filepath = os.path.join(xyz_dir, xyz_filename)

    # Read the geometry BEFORE opening the output file, so a missing or
    # unreadable XYZ file never leaves an empty or truncated .mop behind.
    try:
        with open(xyz_filepath, 'r') as xyz_file:
            # Drop the first two lines (usually the atom count and comment
            # line of an XYZ file); the rest is written out as coordinates.
            atom_lines = xyz_file.readlines()[2:]
    except FileNotFoundError:
        if xyz_filename not in _missing_seen:
            _missing_seen.add(xyz_filename)
            missing_xyz_files.append(xyz_filename)
        print(f"Warning: {xyz_filename} does not exist. Skipping.")
        continue

    # If the XYZ file does not end with a newline, terminate the last coordinate
    # line so that the newline written below really produces an empty line.
    if atom_lines and not atom_lines[-1].endswith("\n"):
        atom_lines[-1] += "\n"

    mop_header = f"AUX LARGE OPT FORCE THERMO PM6-D3H4X T=128H RECALC=5 GNORM=0.01 LET SCFCRT=0.0000001 RSOLV={rsolv_value} EPS={eps_value}\n"

    with open(mop_filepath, 'w') as mop_file:
        # Keyword line, then the XYZ filename on its own line, then an empty line.
        mop_file.write(mop_header)
        mop_file.write(f"{xyz_filename}\n\n")

        # Atom coordinates, followed by an empty line at the end of the file.
        mop_file.writelines(atom_lines)
        mop_file.write("\n")

    print(f"Created {mop_filename} with EPS={eps_value} and RSOLV={rsolv_value}")

print(f"\nTotal number of missing XYZ files: {len(missing_xyz_files)}")
print(f"Missing XYZ files: {missing_xyz_files}")


Created ISAOCJYIOMOJEB-UHFFFAOYNA-N_1,2-dichloroethane.mop with EPS=10.45 and RSOLV=1.3
Created NGTRZJDYCCOXBA-UHFFFAOYNA-N_1,2-dichloroethane.mop with EPS=10.45 and RSOLV=1.3
Created MKASXAGBWHIGCF-UHFFFAOYNA-N_1,2-dichloroethane.mop with EPS=10.45 and RSOLV=1.3
Created RXKNNAKAVAHBNK-UHFFFAOYNA-N_1,2-dichloroethane.mop with EPS=10.45 and RSOLV=1.3
Created CWRYPZZKDGJXCA-UHFFFAOYNA-N_1,2-dichloroethane.mop with EPS=10.45 and RSOLV=1.3
Created DFYRUELUNQRZTB-UHFFFAOYNA-N_1,2-dichloroethane.mop with EPS=10.45 and RSOLV=1.3
Created ZUOUZKKEUPVFJK-UHFFFAOYNA-N_1,2-dichloroethane.mop with EPS=10.45 and RSOLV=1.3
Created WBYWAXJHAXSJNI-KZFATGLANA-N_1,2-dichloroethane.mop with EPS=10.45 and RSOLV=1.3
Created ONUFSRWQCKNVSL-UHFFFAOYNA-N_1,2-dichloroethane.mop with EPS=10.45 and RSOLV=1.3
Created KZTYYGOKRVBIMI-UHFFFAOYNA-N_1,2-dichloroethane.mop with EPS=10.45 and RSOLV=1.3
Created XMTQQYYKAHVGBJ-XWKXFZRBNA-N_1,2-dichloroethane.mop with EPS=10.45 and RSOLV=1.3
Created GVEPBJHOBDJJJI-UHFFFAOYN