In [2]:
import pandas as pd
import re, os

In [3]:
# path containing gas phase calculations
gas_path = 'output_gas/eloise_clayton_gas_input'
# path containing solution phase calculation
sol_path = 'output_sol/scratch/chmbnn/mopac_jobs/eloise_clayton'

In [None]:
# Function to extract compound name from filename (everything between the first underscore and .aux)
def extract_compound(filename):
    # Split the filename by underscores
    parts = filename.split('_')
    
    # Ensure there are at least two parts to safely extract compound name
    if len(parts) < 2:
        print(f"⚠️ Warning: Filename does not have enough underscores: {filename}")
        return None  # Return None if filename structure is unexpected

    # Compound name is everything between the first underscore and before the file extension (.aux)
    compound_name = "_".join(parts[1:]).split('.')[0]  # Join everything after the first underscore and remove the file extension (.aux)
    return compound_name

# Function to extract data and write to CSV (either gas or solution phase)
def extract_and_write_data(path, phase, output_filename):
    rows = []

    for filename in os.listdir(path):
        if filename.endswith('.aux'):  # Only consider .aux files
            inchikey = filename.split('_')[0]  # Extract InChIKey
            
            # Extract compound name using the helper function
            compound_name = extract_compound(filename)
            if not compound_name:
                continue  # Skip this file if the compound name couldn't be extracted

            full_filename = os.path.join(path, filename)

            # Initialize variables for thermodynamic values
            enthalpy = None
            entropy = None

            print(f"🔍 Searching in: {filename} ({phase} phase)")  # Debugging message

            # Read .aux file to find thermodynamic values
            with open(full_filename, 'r', encoding="utf-8") as f:
                lines = f.readlines()
                for i, line in enumerate(lines):
                    line = line.strip()

                    # Check for enthalpy and entropy lines
                    if 'ENTHALPY_TOT:CAL/MOL' in line:
                        if i + 1 < len(lines):  # Check if there is a next line
                            next_line = lines[i + 1].strip()
                            values = next_line.split()
                            if values:
                                enthalpy = values[0]

                    if 'ENTROPY_TOT:CAL/K/MOL' in line:
                        if i + 1 < len(lines):
                            next_line = lines[i + 1].strip()
                            values = next_line.split()
                            if values:
                                entropy = values[0]

                    # Break once both values are found
                    if enthalpy and entropy:
                        break

            # If both values are found, store them in a row
            if enthalpy and entropy:
                rows.append({'solute_inchikey': inchikey, 'solvent_name': compound_name, f'H_{phase}': enthalpy, f'S_{phase}': entropy})

    df = pd.DataFrame(rows)
    df.to_csv(output_filename, index=False)
    print(f"✅ Data for {phase} phase written to {output_filename}")

# Step 1: Extract data for gas phase and write to CSV
extract_and_write_data(gas_path, 'gas', 'gas_phase_data.csv')

# Step 2: Extract data for solution phase and write to CSV
extract_and_write_data(sol_path, 'sol', 'solution_phase_data.csv')

# Step 3: Combine gas and solution phase data based on InChIKey and Compound
df_gas = pd.read_csv('gas_phase_data.csv')
df_sol = pd.read_csv('solution_phase_data.csv')

🔍 Searching in: ASWVTGNCAZCNNR-WYUMXYHSNA-N_1-octanol.aux (gas phase)
🔍 Searching in: DFXQXFGFOLXAPO-KZFATGLANA-N_1,4-dioxane.aux (gas phase)
🔍 Searching in: ISAOCJYIOMOJEB-UHFFFAOYNA-N_2-butoxyethanol.aux (gas phase)
🔍 Searching in: PJANXHGTPQOBST-VAWYXSNFNA-N_ethylbenzene.aux (gas phase)
🔍 Searching in: WPYMKLBDIGXBTP-FZOZFQFYNA-N_methanol.aux (gas phase)
🔍 Searching in: MWPLVEDNUUSJAV-UHFFFAOYNA-N_THF.aux (gas phase)
🔍 Searching in: BBEAQIROQSPTKN-UHFFFAOYNA-N_2-ethoxyethanol.aux (gas phase)
🔍 Searching in: QQVIHTHCMHWDBS-FLKJISBTNA-N_propyl_acetate.aux (gas phase)
🔍 Searching in: JDCMOHAFGDQQJX-UHFFFAOYNA-N_octane.aux (gas phase)
🔍 Searching in: RCINICONZNJXQF-GXKQXQCDNA-N_acetonitrile.aux (gas phase)
🔍 Searching in: KGCNHWXDPDPSBV-UHFFFAOYNA-N_benzene.aux (gas phase)
🔍 Searching in: SJEBAWHUJDUKQK-UHFFFAOYNA-N_2-propanol.aux (gas phase)
🔍 Searching in: TYFQFVWCELRYAO-FLKJISBTNA-N_2-propanol.aux (gas phase)
🔍 Searching in: WQZGKKKJIJFFOK-GASJEMHNNA-N_methanol.aux (gas phase)
🔍 Sear