In [33]:
import pandas as pd
import os

# path containing gas phase calculations
gas_path = 'output_gas/eloise_clayton_gas_input'
# path containing solution phase calculation
sol_path = 'output_sol/scratch/chmbnn/mopac_jobs/eloise_clayton'

In [10]:
import pandas as pd
import os

# Initialize DataFrame with only four required columns
df_dip_vol = pd.DataFrame(columns=['solute_inchikey', 'solvent_name', 'volume', 'sol_dip'])

# Function to extract data from files
for filename in os.listdir(sol_path):
    if filename.endswith('.aux'):  # Process only .aux files
        # Extract solute InChIKey (first part before first underscore)
        solute_inchikey, solvent_name_with_aux = filename.split('_', 1)  # Split only on first underscore
        solvent_name = solvent_name_with_aux.replace('.aux', '')  # Remove the .aux extension

        # Open the .aux file
        full_filename = os.path.join(sol_path, filename)
        with open(full_filename, 'r') as f:
            sol_dip = None
            volume = None
            
            # Read through the file and extract relevant data
            for line in f:
                if 'DIPOLE' in line:
                    dipole_line = line.split('=')
                    sol_dip_string = dipole_line[1].strip().replace('+', '', 1)
                    sol_dip_string_split = sol_dip_string.split('D')
                    sol_dip = float(sol_dip_string_split[0]) * pow(10, float(sol_dip_string_split[1]))
                
                if 'VOLUME' in line:
                    volume_line = line.split('=')
                    volume_string = volume_line[1].strip().replace('+', '', 1)
                    volume_string_split = volume_string.split('D')
                    volume = float(volume_string_split[0]) * pow(10, float(volume_string_split[1]))

        # Append data only if both values are found
        if sol_dip is not None and volume is not None:
            new_row = pd.DataFrame({
                'solute_inchikey': [solute_inchikey], 
                'solvent_name': [solvent_name], 
                'volume': [volume], 
                'sol_dip': [sol_dip]
            })
            df_dip_vol = pd.concat([df_dip_vol, new_row], ignore_index=True)

# Display the final DataFrame
print(df_dip_vol)
df_dip_vol.to_csv("df_dip_vol.csv", index=False)

  df_dip_vol = pd.concat([df_dip_vol, new_row], ignore_index=True)


                  solute_inchikey     solvent_name   volume    sol_dip
0     ASWVTGNCAZCNNR-WYUMXYHSNA-N        1-octanol  317.959  13.102700
1     DFXQXFGFOLXAPO-KZFATGLANA-N      1,4-dioxane  201.346   7.845330
2     ISAOCJYIOMOJEB-UHFFFAOYNA-N  2-butoxyethanol  261.408   6.429330
3     PJANXHGTPQOBST-VAWYXSNFNA-N     ethylbenzene  236.432   0.001968
4     WPYMKLBDIGXBTP-FZOZFQFYNA-N         methanol  146.053   3.994800
...                           ...              ...      ...        ...
3546  NIHNNTQXNPWCJQ-UHFFFAOYNA-N          heptane  206.519   0.588175
3547  LVHBHZANLOWSRM-HJYFZBQUNA-N    ethyl_acetate  151.416   4.026400
3548  FFGPTBGBLSHEPO-ZHLVXTBQNA-N     acetonitrile  282.696   6.204150
3549  PJANXHGTPQOBST-VAWYXSNFNA-N        1-octanol  236.432   0.004270
3550  BBEAQIROQSPTKN-UHFFFAOYNA-N     acetophenone  242.367   0.003805

[3551 rows x 4 columns]


In [15]:
import pandas as pd
import os

# Initialize DataFrame to store results with columns for solute, solvent, HOMO, and LUMO
df_homo_lumo = pd.DataFrame(columns=['solute_inchikey', 'solvent_name', 'HOMO', 'LUMO'])

# Loop through files in the given directory
for file_name in os.listdir(sol_path):
    if file_name.endswith('.aux'):  # Process only .aux files
        # Extract solute InChIKey (first part before the first underscore)
        solute_inchikey, solvent_with_ext = file_name.split('_', 1)  # Split at the first underscore
        solvent_name = solvent_with_ext.replace('.aux', '')  # Remove the .aux extension

        # Open the .aux file
        full_file_path = os.path.join(sol_path, file_name)
        with open(full_file_path, 'r') as f:
            lines = f.read().split('\n')

        # Locate the EIGENVALUES section
        for i, line in enumerate(lines):
            if 'EIGENVALUES' in line:
                index = i
                break  # Stop after finding the first occurrence

        # Extract MO energies
        MOs = []
        index += 1
        while 'MOLECULAR_ORBITAL_OCCUPANCIES' not in lines[index]:
            MOs.extend(lines[index].strip().split())  # Append energy values
            index += 1

        # Extract MO occupancies
        index += 1
        occupancies = []
        while '###' not in lines[index]:
            occupancies.extend(lines[index].strip().split())  # Append occupancy values
            index += 1

        # Find HOMO and LUMO indices
        lumo_index = occupancies.index('0.0000')  # LUMO is the first unoccupied orbital
        homo_index = lumo_index - 1  # HOMO is the last occupied orbital

        # Extract HOMO and LUMO energy values
        homo = MOs[homo_index]
        lumo = MOs[lumo_index]

        # Create a new DataFrame row
        new_row = pd.DataFrame({
            'solute_inchikey': [solute_inchikey], 
            'solvent_name': [solvent_name], 
            'HOMO': [homo], 
            'LUMO': [lumo]
        })

        # Append to the main DataFrame
        df_homo_lumo = pd.concat([df_homo_lumo, new_row], ignore_index=True)

print(df_homo_lumo)
df_homo_lumo.to_csv('df_homo_lumo.csv', index=False) 

                  solute_inchikey     solvent_name     HOMO    LUMO
0     ASWVTGNCAZCNNR-WYUMXYHSNA-N        1-octanol   -9.005  -0.692
1     DFXQXFGFOLXAPO-KZFATGLANA-N      1,4-dioxane  -10.846  -2.299
2     ISAOCJYIOMOJEB-UHFFFAOYNA-N  2-butoxyethanol   -9.908  -1.074
3     PJANXHGTPQOBST-VAWYXSNFNA-N     ethylbenzene   -8.869  -0.537
4     WPYMKLBDIGXBTP-FZOZFQFYNA-N         methanol  -10.205  -1.004
...                           ...              ...      ...     ...
3546  NIHNNTQXNPWCJQ-UHFFFAOYNA-N          heptane   -8.870  -0.355
3547  LVHBHZANLOWSRM-HJYFZBQUNA-N    ethyl_acetate  -11.201  -0.928
3548  FFGPTBGBLSHEPO-ZHLVXTBQNA-N     acetonitrile   -8.975  -0.734
3549  PJANXHGTPQOBST-VAWYXSNFNA-N        1-octanol   -8.947  -0.614
3550  BBEAQIROQSPTKN-UHFFFAOYNA-N     acetophenone   -8.266  -1.481

[3551 rows x 4 columns]


In [17]:
import pandas as pd
import os

solvent_path = "output_solvents/eloise_clayton_solvents"

# Initialize DataFrame to store results with columns for solvent, HOMO, and LUMO
df_homo_lumo = pd.DataFrame(columns=['solvent_name', 'HOMO', 'LUMO'])

# Function to extract solvent information from filename (assuming the filename is in the format 'solvent.aux')
def extract_solvent(filename):
    # Extract solvent from filename (everything before .aux)
    if filename.endswith('.aux'):
        solvent = filename.replace('.aux', '')  # Remove the file extension to get the solvent name
        return solvent
    else:
        return None

# Function to extract data from files
for filename in os.listdir(solvent_path):  # Changed 'sol_path' to 'solvent_path'
    if filename.endswith('.aux'):
        # Extract solvent name (filename without .aux)
        solvent = extract_solvent(filename)
        
        # Open the .aux file
        full_filename = os.path.join(solvent_path, filename) 
        with open(full_filename, 'r') as f:
            lines = f.read().split('\n')
        
        # Find data for MO energies and occupancies
        for line in lines:
            if 'EIGENVALUES' in line:
                index = lines.index(line)
        MOs = []
        index = index + 1
        while 'MOLECULAR_ORBITAL_OCCUPANCIES' not in lines[index]:
            MO_line = lines[index]
            MO_line = MO_line.replace('\n','')
            index = index + 1
            MO_line_split = MO_line.split()
            MOs = MOs + MO_line_split
        index = index + 1
        occupancies = []
        while '###' not in lines[index]:
            occupancy_line = lines[index]
            occupancy_line = occupancy_line.replace('\n','')
            index = index + 1
            occupancy_line_split = occupancy_line.split()
            occupancies = occupancies + occupancy_line_split
        
        # Debugging: print the occupancies to see what values are present
        print(f"Occupancies in {filename}: {occupancies}")
        
        try:
            # Adjusting to handle different types of occupancy representations
            lumo_index = None
            for i, occ in enumerate(occupancies):
                if float(occ) == 0.0:
                    lumo_index = i
                    break

            if lumo_index is None:
                print(f"⚠️ Warning: No LUMO found for {filename}")
                continue  # Skip the file if LUMO cannot be found

            homo_index = lumo_index - 1
            homo = MOs[homo_index]
            lumo = MOs[lumo_index]
            
            # Create a new DataFrame row with solvent, HOMO, and LUMO
            new_row = pd.DataFrame({'solvent_name': [solvent], 'HOMO': [homo], 'LUMO': [lumo]})
            
            # Concatenate the new row with the existing DataFrame using pd.concat
            df_homo_lumo = pd.concat([df_homo_lumo, new_row], ignore_index=True)
        
        except Exception as e:
            print(f"⚠️ Error processing {filename}: {e}")

print(df_homo_lumo)
df_homo_lumo.to_csv("df_sol_homo_lumo.csv", index=False)

Occupancies in nitromethane.aux: ['2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000']
Occupancies in N-methylformamide.aux: ['2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000']
Occupancies in 1-chlorooctane.aux: ['2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '2.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '

In [18]:
import pandas as pd

df_homo_lumo = pd.read_csv("df_homo_lumo.csv")
df_sol_homo_lumo = pd.read_csv("df_sol_homo_lumo.csv")

# Merge the two DataFrames on 'solvent_name' 
df_merged = pd.merge(df_homo_lumo, df_sol_homo_lumo, on='solvent_name', how='left')

# Calculate Lsolv_Hsol and Lsol_Hsolv
# 'LUMO_x' and 'HOMO_x' are from df_homo_lumo (solute), while 'LUMO_y' and 'HOMO_y' are from df_sol_homo_lumo (solvent)
df_merged['Lsolv_Hsol'] = df_merged['LUMO_y'] - df_merged['HOMO_y']
df_merged['Lsol_Hsolv'] = df_merged['HOMO_y'] - df_merged['LUMO_y']

# Store the result in df_results, which contains all the columns from df_merged
df_results = df_merged 
df_results.to_csv("homo_lumo_data.csv", index=False)

In [38]:
import os
import pandas as pd

def get_index_positions(list_of_elems, element):
    """Returns the indexes of all occurrences of a given element in the list."""
    return [i for i, e in enumerate(list_of_elems) if e == element]

# DataFrame to store results
df_charges = pd.DataFrame(columns=['solute_inchikey', 'solvent_name', 'O_charges', 'C_charges', 'Most_neg', 'Most_pos', 'Het_charges'])

# Read .aux files
for filename in os.listdir(sol_path):
    if filename.endswith('.aux'):  # Ensure only .aux files are processed
        
        # Extract `solute_inchikey` (before first `_`) and `solvent_name` (between `_` and `.aux`)
        parts = filename.split('_', 1)  # Split at the first `_`
        solute_inchikey = parts[0]  # Everything before `_`
        solvent_name = parts[1].replace('.aux', '')  # Everything between `_` and `.aux`

        full_filename = os.path.join(sol_path, filename)
        with open(full_filename, 'r') as f:
            lines = f.readlines()  

        # Find charge data from ATOM_CHARGES
        charges = []
        for i, line in enumerate(lines):
            if 'ATOM_CHARGES' in line:
                index = i + 1  # Move to the next line
                charge_values = []
                # Collect charge values from subsequent lines
                while 'GRADIENTS' not in lines[index] and '=' not in lines[index]:  
                    charge_values.extend(lines[index].split())  # Split and add the charge values
                    index += 1
                
                charges.extend(charge_values)  # Add charges from the current block
        
        # Convert charge values to floats
        charges_number = []
        for charge in charges:
            try:
                charges_number.append(float(charge))  # Convert to float
            except ValueError:
                continue  # Ignore any values that can't be converted to a float

        # Find atom list (now using ATOM_EL)
        atoms = []
        for i, line in enumerate(lines):
            if 'ATOM_EL' in line:
                index = i + 1
                break
        else:
            continue  # Skip file if "ATOM_EL" is not found
        
        while 'ATOM_CORE' not in lines[index]:  # Read atom types
            atom_line_split = lines[index].split()  # Split by space
            atoms.extend(atom_line_split)  # Add atoms to list
            index += 1
        
        # Ensure the atoms and charges are aligned
        if len(atoms) != len(charges_number):
            print(f"Warning: Mismatch between number of atoms and charges in {filename}. Skipping this file.")
            print(f"Number of atoms: {len(atoms)}")
            print(f"Number of charges: {len(charges_number)}")
            continue  # Skip the file if the number of atoms does not match the number of charges
        
        # Compute descriptors
        O_indexes = get_index_positions(atoms, 'O')
        O_charges = sum(charges_number[i] for i in O_indexes if i < len(charges_number))  
        C_indexes = get_index_positions(atoms, 'C')
        C_charges = sum(charges_number[i] for i in C_indexes if i < len(charges_number)) 
        
        Most_pos = max(charges_number)
        Most_neg = min(charges_number)
        
        Het_charges = sum(charges_number[i] for i in range(len(atoms)) if atoms[i] not in ['C', 'H'] and i < len(charges_number))  
     
        new_row = pd.DataFrame([{
            'solute_inchikey': solute_inchikey,
            'solvent_name': solvent_name,
            'O_charges': O_charges,
            'C_charges': C_charges,
            'Most_neg': Most_neg,
            'Most_pos': Most_pos,
            'Het_charges': Het_charges
        }])
        df_charges = pd.concat([df_charges, new_row], ignore_index=True)

print(df_charges)
df_charges.to_csv("df_charges.csv",index=False)

  df_charges = pd.concat([df_charges, new_row], ignore_index=True)


                  solute_inchikey     solvent_name  O_charges  C_charges  \
0     ASWVTGNCAZCNNR-WYUMXYHSNA-N        1-octanol   -2.00246   -1.03262   
1     DFXQXFGFOLXAPO-KZFATGLANA-N      1,4-dioxane   -2.02333    0.30218   
2     ISAOCJYIOMOJEB-UHFFFAOYNA-N  2-butoxyethanol   -1.11268   -1.03120   
3     PJANXHGTPQOBST-VAWYXSNFNA-N     ethylbenzene    0.00000   -1.84616   
4     WPYMKLBDIGXBTP-FZOZFQFYNA-N         methanol   -1.20642   -0.04109   
...                           ...              ...        ...        ...   
3546  NIHNNTQXNPWCJQ-UHFFFAOYNA-N          heptane    0.00000   -1.58032   
3547  LVHBHZANLOWSRM-HJYFZBQUNA-N    ethyl_acetate   -2.31315    0.72943   
3548  FFGPTBGBLSHEPO-ZHLVXTBQNA-N     acetonitrile   -0.69721   -0.60639   
3549  PJANXHGTPQOBST-VAWYXSNFNA-N        1-octanol    0.00000   -1.91149   
3550  BBEAQIROQSPTKN-UHFFFAOYNA-N     acetophenone    0.00000   -1.65758   

      Most_neg  Most_pos  Het_charges  
0     -1.00535   2.56317     -2.02227  
1     -