In [4]:
import pandas as pd
import re, os

In [20]:
# path containing gas phase calculations
gas_path = '/Users/stella/Documents/tierney/project/output MOPAC/gas_mopac_files' 
# path containing solution phase calculation
sol_path = '/Users/stella/Documents/tierney/project/output MOPAC/sol_mopac_files' 

In [6]:
import pandas as pd
import os
import re

def process_aux_file(filename, gas_path):
    """Process .aux files to extract thermodynamic data"""
    try:
        # Extract InChIkey from filename (PM6_{inchikey}_{count}.aux)
        parts = filename.split('_')
        if len(parts) >= 3:
            inchikey = parts[1]  # Second part is the InChIKey
            compound = inchikey  # Using InChIKey as compound name
            
            # Initialize variables
            enthalpy = None
            entropy = None
            
            # Read .aux file
            full_filename = os.path.join(gas_path, filename)
            with open(full_filename, 'r', encoding="utf-8") as f:
                content = f.read()
                
                # Extract enthalpy (first value at 298K)
                enthalpy_match = re.search(r'ENTHALPY_TOT:CAL/MOL\[\d+\]=\s+([\d.\s-]+)', content)
                if enthalpy_match:
                    enthalpy = enthalpy_match.group(1).split()[0]
                
                # Extract entropy (first value at 298K)
                entropy_match = re.search(r'ENTROPY_TOT:CAL/K/MOL\[\d+\]=\s+([\d.\s-]+)', content)
                if entropy_match:
                    entropy = entropy_match.group(1).split()[0]
                
                if enthalpy and entropy:
                    return {
                        'InChIkey': inchikey,
                        'H_gas': enthalpy,
                        'S_gas': entropy
                    }
    
    except Exception as e:
        print(f"Error processing {filename}: {str(e)}")
    return None

def extract_thermo_data(gas_path, output_filename):
    """Process all .aux files in directory and save results to CSV"""
    rows = []
    
    for filename in os.listdir(gas_path):
        if filename.startswith('PM6_') and filename.endswith('.aux'):
            result = process_aux_file(filename, gas_path)
            if result:
                rows.append(result)
    
    # Create DataFrame and save to CSV
    if rows:
        df = pd.DataFrame(rows)
        df.to_csv(output_filename, index=False)
        print(f"Thermodynamic data written to {output_filename}")
        return df
    else:
        print("No valid .aux files found with thermodynamic data")
        return pd.DataFrame()

# Main execution
gas_path = '/Users/stella/Documents/tierney/project/output MOPAC/gas_mopac_files'
thermo_data = extract_thermo_data(gas_path, 'gas_phase_thermo_data.csv')

# Display results
print(thermo_data)

Thermodynamic data written to gas_phase_thermo_data.csv
                         InChIkey    H_gas    S_gas
0     CMWTZPSULFXXJA-GTNTULIANA-N  10510.0  128.280
1     YASYVMFAVPKPKE-JSWHHWTPNA-N   8749.1  113.838
2     GVIJJXMXTUZIOD-UHFFFAOYNA-N   7228.7  101.488
3     QUEKGYQTRJVEQC-KZFATGLANA-N   7284.9  103.513
4     DAUAQNGYDSHRET-KZFATGLANA-N   8280.9  108.438
...                           ...      ...      ...
3759  CMWTZPSULFXXJA-GTNTULIANA-N  10482.4  127.440
3760  GVIJJXMXTUZIOD-UHFFFAOYNA-N   7531.9  102.196
3761  PJANXHGTPQOBST-VAWYXSNFNA-N   7806.0  104.500
3762  CIWBSHSKHKDKBQ-JLAZNSOCNA-N   8386.9  109.398
3763  GJCOSYZMQJWQCA-UHFFFAOYNA-N   6901.8   96.774

[3764 rows x 3 columns]


In [7]:
print(thermo_data.dtypes)

InChIkey    object
H_gas       object
S_gas       object
dtype: object


In [8]:
thermo_data['H_gas'] = pd.to_numeric(thermo_data['H_gas'], errors='coerce')
thermo_data['S_gas'] = pd.to_numeric(thermo_data['S_gas'], errors='coerce')

In [9]:
gas_thermo_data = thermo_data.groupby('InChIkey').mean().reset_index()
gas_thermo_data.to_csv('ave_gas_thermo_data.csv', index=False)

In [10]:
import pandas as pd
import os
import re

def process_sol_file(filename, sol_path):
    """Process solution phase .aux files to extract thermodynamic data and solvent info"""
    try:
        # Extract InChIkey and solvent from filename (PM6_{solute_inchikey}_{solvent_name}.aux)
        parts = filename.split('_')
        if len(parts) >= 3:
            inchikey = parts[1]  # Second part is the solute InChIKey
            solvent = parts[2].replace('.aux', '')  # Third part is solvent name
            compound = inchikey  # Using InChIKey as compound name
            
            # Initialize variables
            enthalpy = None
            entropy = None
            
            # Read .aux file
            full_filename = os.path.join(sol_path, filename)
            with open(full_filename, 'r', encoding="utf-8") as f:
                content = f.read()
                
                # Extract enthalpy (first value at 298K)
                enthalpy_match = re.search(r'ENTHALPY_TOT:CAL/MOL\[\d+\]=\s+([\d.\s-]+)', content)
                if enthalpy_match:
                    enthalpy = enthalpy_match.group(1).split()[0]
                
                # Extract entropy (first value at 298K)
                entropy_match = re.search(r'ENTROPY_TOT:CAL/K/MOL\[\d+\]=\s+([\d.\s-]+)', content)
                if entropy_match:
                    entropy = entropy_match.group(1).split()[0]
                
                if enthalpy and entropy:
                    return {
                        'InChIkey': inchikey,
                        'Solvent': solvent,
                        'H_sol': enthalpy,
                        'S_sol': entropy
                    }
    
    except Exception as e:
        print(f"Error processing {filename}: {str(e)}")
    return None

def extract_sol_data(sol_path, output_filename):
    """Process all solution phase .aux files in directory and save results to CSV"""
    rows = []
    
    for filename in os.listdir(sol_path):
        if filename.startswith('PM6_') and filename.endswith('.aux'):
            result = process_sol_file(filename, sol_path)
            if result:
                rows.append(result)
    
    # Create DataFrame and save to CSV
    if rows:
        df = pd.DataFrame(rows)
        df.to_csv(output_filename, index=False)
        print(f"Solution phase thermodynamic data written to {output_filename}")
        print(f"Found {len(df)} solute-solvent pairs")
        print(f"Unique solutes: {df['InChIkey'].nunique()}")
        print(f"Unique solvents: {df['Solvent'].nunique()}")
        return df
    else:
        print("No valid solution phase .aux files found with thermodynamic data")
        return pd.DataFrame()

# Main execution - Solution Phase Only
sol_path = '/Users/stella/Documents/tierney/project/output MOPAC/sol_mopac_files'
sol_thermo_data = extract_sol_data(sol_path, 'sol_thermo_data.csv')

# Display solution phase results
print(sol_thermo_data.head())

Solution phase thermodynamic data written to sol_thermo_data.csv
Found 3249 solute-solvent pairs
Unique solutes: 283
Unique solvents: 84
                      InChIkey             Solvent   H_sol    S_sol
0  GVEPBJHOBDJJJI-UHFFFAOYNA-N          chloroform  7570.9  102.618
1  GVIJJXMXTUZIOD-UHFFFAOYNA-N         1,4-dioxane  7720.1  104.363
2  WBYWAXJHAXSJNI-KZFATGLANA-N             acetone  6928.1   99.698
3  RYYVLZVUVIJVGH-UHFFFAOYNA-N             acetone  9122.7  115.734
4  GPSDUZXPYCFOSQ-BGGKNDAXNA-N  2-methyl-1-butanol  6621.6   96.033


In [11]:
# merging dataframes from gas and solution calculations
df_thermo = pd.merge(gas_thermo_data, sol_thermo_data, on='InChIkey')
print(f"row count: {len(df_thermo)}")

row count: 3249


In [12]:
# calculate G_sol and DeltaG_sol descriptors, and remove the rest
# of the data from df_thermo

df_thermo['H_gas'] = df_thermo['H_gas'].astype(float)
df_thermo['S_gas'] = df_thermo['S_gas'].astype(float)
df_thermo['H_sol'] = df_thermo['H_sol'].astype(float)
df_thermo['S_sol'] = df_thermo['S_sol'].astype(float)
df_thermo['G_gas'] = df_thermo['H_gas']-(df_thermo['S_gas']*298)
df_thermo['G_sol'] = df_thermo['H_sol']-(df_thermo['S_sol']*298)
df_thermo['DeltaG_sol'] = df_thermo['G_sol']-df_thermo['G_gas']
df_thermo.drop(['H_gas', 'S_gas', 'H_sol', 'S_sol', 'G_gas'], axis = 1)

# Save the processed DataFrame to a CSV file
output_filename = 'G_sol_thermo_data.csv'
df_thermo.to_csv(output_filename, index=False)
print(f"Thermodynamic data written to {output_filename}")
print(f"row count: {len(df_thermo)}")

Thermodynamic data written to G_sol_thermo_data.csv
row count: 3249


In [13]:
# Calculate thermodynamic descriptors
df_thermo['H_gas'] = df_thermo['H_gas'].astype(float)
df_thermo['S_gas'] = df_thermo['S_gas'].astype(float)
df_thermo['H_sol'] = df_thermo['H_sol'].astype(float)
df_thermo['S_sol'] = df_thermo['S_sol'].astype(float)

# Calculate Gibbs free energies
df_thermo['G_gas'] = df_thermo['H_gas'] - (df_thermo['S_gas'] * 298)
df_thermo['G_sol'] = df_thermo['H_sol'] - (df_thermo['S_sol'] * 298)
df_thermo['DeltaG_sol'] = df_thermo['G_sol'] - df_thermo['G_gas']

# Remove intermediate columns (using inplace)
df_thermo.drop(['H_gas', 'S_gas', 'H_sol', 'S_sol', 'G_gas'], axis=1, inplace=True)

# Save the processed DataFrame
output_filename = 'G_thermo_data.csv'
df_thermo.to_csv(output_filename, index=False)
print(f"Thermodynamic data written to {output_filename}")
print(f"Final columns: {df_thermo.columns.tolist()}")
print(f"row count: {len(df_thermo)}")

Thermodynamic data written to G_thermo_data.csv
Final columns: ['InChIkey', 'Solvent', 'G_sol', 'DeltaG_sol']
row count: 3249


In [None]:
import os
# will read straight from solution structures and bypass gas phase structures

df_dip_vol = pd.DataFrame(columns = ['volume', 'sol_dip'])

for filename in os.listdir(sol_path):
    if filename.endswith('.aux'):
        # Extract the InChIKey from the filename
        parts = filename.split('_')
        if len(parts) >= 3 and parts[0] == 'PM6':
            solute_inchikey = parts[1]  # Extract the solute InChIKey
            solvent = parts[2].replace('.aux', '')  # Third part is solvent name
        # open aux file
        full_filename = os.path.join(sol_path, filename)
        f = open(full_filename, 'r')
        # find thermodynamic values
        for line in f:
            if 'DIPOLE' in line:
                dipole_line = line.split('=')
                sol_dip_string = dipole_line[1].replace('\n','')
                sol_dip_string = sol_dip_string.replace('+','',1)
                sol_dip_string_split = sol_dip_string.split('D')
                sol_dip = float(sol_dip_string_split[0])*pow(10,float(sol_dip_string_split[1]))
                # print('sol_dip (Debye) =', sol_dip)
            if 'VOLUME' in line:
                volume_line = line.split('=')
                volume_string = volume_line[1].replace('\n','')
                volume_string = volume_string.replace('+','',1)
                volume_string_split = volume_string.split('D')
                volume = float(volume_string_split[0])*pow(10,float(volume_string_split[1]))
                # print('volume (A^3)=', volume)
        # save extracted data to dataframe
        new_row = {'InChIkey':solute_inchikey,'Solvent':solvent, 'volume':volume, 'sol_dip':sol_dip}
        df_dip_vol = pd.concat([df_dip_vol, pd.DataFrame([new_row])], ignore_index=True)
df_dip_vol
df_dip_vol.to_csv('/Users/stella/Documents/tierney/project/dip_data.csv', index=False)

In [36]:
import re
import os
import pandas as pd

def parse_scientific_number(s):
    """Parse numbers like '+0.325051D+00' or '0.243071E+03' into floats."""
    s = s.strip().replace('+', '').replace(' ', '')
    if 'D' in s:
        base, exponent = s.split('D')
    elif 'E' in s:
        base, exponent = s.split('E')
    else:
        return float(s)  # No exponent
    return float(base) * 10 ** float(exponent)

# Initialize DataFrame
df_dip_vol = pd.DataFrame(columns=['InChikey', 'Solvent', 'volume', 'sol_dip'])
skipped_files = []

for filename in os.listdir(sol_path):
    if filename.endswith('.aux'):
        parts = filename.split('_')
        if len(parts) >= 3 and parts[0] == 'PM6':
            solute_inchikey = parts[1]
            solvent = parts[2].replace('.aux', '')
        else:
            skipped_files.append(f"{filename}: Malformed filename")
            continue

        full_path = os.path.join(sol_path, filename)
        try:
            with open(full_path, 'r') as f:
                content = f.read()

                # Case-insensitive search for DIPOLE
                dipole_match = re.search(r'DIPOLE:DEBYE\s*=\s*([+-]?\d+\.\d+D[+-]?\d+)', content, re.IGNORECASE)
                if not dipole_match:
                    skipped_files.append(f"{filename}: Missing DIPOLE")
                    continue
                sol_dip = parse_scientific_number(dipole_match.group(1))

                # Case-insensitive search for VOLUME
                volume_match = re.search(r'VOLUME:CUBIC ANGSTROMS\s*=\s*([+-]?\d+\.\d+D[+-]?\d+)', content, re.IGNORECASE)
                if not volume_match:
                    skipped_files.append(f"{filename}: Missing VOLUME")
                    continue
                volume = parse_scientific_number(volume_match.group(1))

                # Add to DataFrame
                new_row = {
                    'InChikey': solute_inchikey,
                    'Solvent': solvent,
                    'volume': volume,
                    'sol_dip': sol_dip
                }
                df_dip_vol = pd.concat([df_dip_vol, pd.DataFrame([new_row])], ignore_index=True)

        except Exception as e:
            skipped_files.append(f"{filename}: Error reading file ({str(e)})")
            continue

# Save results
df_dip_vol.to_csv('dipole_vol_data.csv', index=False)

# Log skipped files
with open('skipped_files.log', 'w') as f:
    f.write("\n".join(skipped_files))

print(f"Processed {len(df_dip_vol)} files. Skipped {len(skipped_files)} files (see skipped_files.log).")

  df_dip_vol = pd.concat([df_dip_vol, pd.DataFrame([new_row])], ignore_index=True)


Processed 1206 files. Skipped 2043 files (see skipped_files.log).


In [None]:
import os
import glob
import pandas as pd
import re

def parse_mopac_aux(filepath):
    """Parse MOPAC .aux file to extract HOMO/LUMO energies"""
    try:
        with open(filepath, 'r') as f:
            content = f.read()
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return None, None

    # Find EIGENVALUES section
    eig_match = re.search(r'EIGENVALUES\[.*?\]=\s*((?:-?\d+\.\d+\s*)+)', content)
    if not eig_match:
        print(f"No EIGENVALUES found in {filepath}")
        return None, None
    
    # Convert scientific notation (1.23D+05 → 1.23e+05) and parse numbers
    try:
        eig_values = [float(x.replace('D', 'e')) for x in eig_match.group(1).split()]
    except:
        print(f"Couldn't parse eigenvalues in {filepath}")
        return None, None

    # Find OCCUPANCIES section
    occ_match = re.search(r'MOLECULAR_ORBITAL_OCCUPANCIES\[.*?\]=\s*((?:\d+\.\d+\s*)+)', content)
    if not occ_match:
        print(f"No OCCUPANCIES found in {filepath}")
        return None, None
    
    try:
        occ_values = [float(x) for x in occ_match.group(1).split()]
    except:
        print(f"Couldn't parse occupancies in {filepath}")
        return None, None

    # Verify data
    if not eig_values or not occ_values:
        print(f"Empty data in {filepath}")
        return None, None
    if len(eig_values) != len(occ_values):
        print(f"Mismatched data lengths in {filepath}")
        return None, None

    # Find HOMO (last occupied) and LUMO (first unoccupied)
    try:
        homo_idx = len([occ for occ in occ_values if occ > 0]) - 1
        lumo_idx = homo_idx + 1
        homo = eig_values[homo_idx]
        lumo = eig_values[lumo_idx]
        print(f"{os.path.basename(filepath)} - HOMO: {homo:.3f} eV, LUMO: {lumo:.3f} eV")
        return homo, lumo
    except IndexError:
        print(f"Error finding HOMO/LUMO in {filepath}")
        return None, None

# Configuration
input_dir = "/Users/stella/Documents/tierney/project/solvent_output_mopac"
output_file = "/Users/stella/Documents/tierney/project/solvent_homo_lumo.csv"

# Process all PM6_*.aux files
results = []
for aux_path in glob.glob(os.path.join(input_dir, "PM6_*.aux")):
    solvent = os.path.basename(aux_path)[4:-4]  # Remove "PM6_" and ".aux"
    homo, lumo = parse_mopac_aux(aux_path)
    
    if homo is not None and lumo is not None:
        results.append({
            "Solvent": solvent,
            "HOMO": homo,
            "LUMO": lumo,
            "Band Gap": lumo - homo
        })

# Save results
if results:
    solv_homo_lumo_df = pd.DataFrame(results)
    solv_homo_lumo_df.to_csv(output_file, index=False)
    print(f"\nSuccess! Processed {len(results)} solvents.")
    print("Sample results:")
    print(solv_homo_lumo_df.head())
else:
    print("\nNo valid data extracted. Check error messages above.")

print(f"row count: {len(solv_homo_lumo_df)}")

PM6_1_octanol.aux - HOMO: -10.392 eV, LUMO: 2.809 eV
PM6_propylene_glycol.aux - HOMO: -10.492 eV, LUMO: 2.203 eV
PM6_acetic_acid.aux - HOMO: -11.368 eV, LUMO: 0.429 eV
PM6_toluene.aux - HOMO: -9.283 eV, LUMO: 0.447 eV
PM6_nonane.aux - HOMO: -10.800 eV, LUMO: 4.006 eV
PM6_formamide.aux - HOMO: -10.614 eV, LUMO: 0.922 eV
PM6_chloroform.aux - HOMO: -11.344 eV, LUMO: -0.696 eV
PM6_3_methyl_1_butanol.aux - HOMO: -10.196 eV, LUMO: 2.981 eV
PM6_hexane.aux - HOMO: -11.015 eV, LUMO: 4.396 eV
PM6_THF.aux - HOMO: -9.540 eV, LUMO: 2.345 eV
PM6_cis_1_4_dimethylcyclohexane.aux - HOMO: -10.566 eV, LUMO: 4.044 eV
PM6_heptane.aux - HOMO: -11.017 eV, LUMO: 4.067 eV
PM6_benzene.aux - HOMO: -9.662 eV, LUMO: 0.389 eV
PM6_ethylbenzene.aux - HOMO: -9.305 eV, LUMO: 0.467 eV
PM6_1_2_dichloroethane.aux - HOMO: -10.790 eV, LUMO: 0.425 eV
PM6_2_methyl_1_propanol.aux - HOMO: -10.254 eV, LUMO: 2.927 eV
PM6_propionitrile.aux - HOMO: -12.372 eV, LUMO: 1.294 eV
PM6_cyclohexane.aux - HOMO: -10.594 eV, LUMO: 4.148 eV
PM

In [16]:
import os
import pandas as pd
import re

def parse_homo_lumo(filepath):
    """Extract HOMO/LUMO from MOPAC .aux file"""
    try:
        with open(filepath, 'r') as f:
            content = f.read()
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return None, None

    # Extract EIGENVALUES
    eig_match = re.search(r'EIGENVALUES\[.*?\]=\s*((?:-?\d+\.\d+(?:D[+-]\d+)?\s*)+)', content)
    if not eig_match:
        print(f"No EIGENVALUES in {os.path.basename(filepath)}")
        return None, None
    
    # Extract OCCUPANCIES
    occ_match = re.search(r'MOLECULAR_ORBITAL_OCCUPANCIES\[.*?\]=\s*((?:\d+\.\d+\s*)+)', content)
    if not occ_match:
        print(f"No OCCUPANCIES in {os.path.basename(filepath)}")
        return None, None

    try:
        # Convert scientific notation (1.23D+05 → 1.23e+05)
        eigenvalues = [float(x.replace('D', 'e')) for x in eig_match.group(1).split()]
        occupancies = [float(x) for x in occ_match.group(1).split()]
        
        # Get HOMO (last occupied) and LUMO (first unoccupied)
        homo_idx = len([occ for occ in occupancies if occ > 0]) - 1
        return eigenvalues[homo_idx], eigenvalues[homo_idx + 1]
    except Exception as e:
        print(f"Parse error in {os.path.basename(filepath)}: {e}")
        return None, None

# Configuration
input_dir = "/Users/stella/Documents/tierney/project/output MOPAC/sol_mopac_files"
output_csv = os.path.join(input_dir, "/Users/stella/Documents/tierney/project/solu_homo_lumo.csv")
results = []

# Process files
for filename in os.listdir(input_dir):
    if filename.startswith('PM6_') and filename.endswith('.aux'):
        try:
            # Extract InChIKey and Solvent from filename (PM6_{inchikey}_{solvent}.aux)
            parts = filename[4:-4].split('_')  # Remove 'PM6_' and '.aux'
            inchikey = parts[0]
            solvent = '_'.join(parts[1:])  # Handle solvents with underscores
            
            # Parse HOMO/LUMO
            full_path = os.path.join(input_dir, filename)
            homo, lumo = parse_homo_lumo(full_path)
            
            if homo is not None and lumo is not None:
                results.append({
                    'InChIKey': inchikey,
                    'Solvent': solvent,
                    'HOMO': homo,
                    'LUMO': lumo
                })
        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Create and save DataFrame
if results:
    solu_homo_lumo_df = pd.DataFrame(results)
    solu_homo_lumo_df.to_csv(output_csv, index=False)
    print(f"\nSuccess! Saved {len(solu_homo_lumo_df)} records to:\n{output_csv}")
    print("\nSample data:")
    print(solu_homo_lumo_df.head())
else:
    print("\nNo valid data found. Check error messages above.")

print(f"row count: {len(solu_homo_lumo_df)}")



Success! Saved 3249 records to:
/Users/stella/Documents/tierney/project/solu_homo_lumo.csv

Sample data:
                      InChIKey             Solvent   HOMO   LUMO
0  GVEPBJHOBDJJJI-UHFFFAOYNA-N          chloroform -8.754 -1.359
1  GVIJJXMXTUZIOD-UHFFFAOYNA-N         1,4-dioxane -8.529 -0.561
2  WBYWAXJHAXSJNI-KZFATGLANA-N             acetone -9.757 -0.747
3  RYYVLZVUVIJVGH-UHFFFAOYNA-N             acetone -9.428 -0.709
4  GPSDUZXPYCFOSQ-BGGKNDAXNA-N  2-methyl-1-butanol -9.828 -0.881
row count: 3249


In [19]:
import pandas as pd

df_solv_homo_lumo = pd.read_csv("/Users/stella/Documents/tierney/project/solvent_homo_lumo.csv")
df_solu_homo_lumo = pd.read_csv("/Users/stella/Documents/tierney/project/solu_homo_lumo.csv")

# Convert to float
df_solv_homo_lumo['HOMO'] = df_solv_homo_lumo['HOMO'].astype(float)
df_solv_homo_lumo['LUMO'] = df_solv_homo_lumo['LUMO'].astype(float)
df_solu_homo_lumo['HOMO'] = df_solu_homo_lumo['HOMO'].astype(float)
df_solu_homo_lumo['LUMO'] = df_solu_homo_lumo['LUMO'].astype(float)

# Merge DataFrames (keep all solute rows)
merged_df = pd.merge(
    df_solu_homo_lumo,
    df_solv_homo_lumo[['Solvent', 'HOMO', 'LUMO']],
    on='Solvent',
    how='left',
    suffixes=('_solu', '_solv')
)

# Calculate descriptors
merged_df['Lsolu_Hsolv'] = merged_df['LUMO_solu'] - merged_df['HOMO_solv']
merged_df['Lsolv_Hsolu'] = merged_df['LUMO_solv'] - merged_df['HOMO_solu']

# Save to CSV
merged_df.to_csv("/Users/stella/Documents/tierney/project/merged_results.csv", index=False)

# Verify
print("Shape before merge:", df_solu_homo_lumo.shape[0], "rows")
print("Shape after merge:", merged_df.shape[0], "rows")
print("Missing solvents:", merged_df[merged_df['HOMO_solv'].isna()]['Solvent'].unique())

# Drop the original HOMO/LUMO columns if needed
homo_lumo_df = merged_df.drop(['HOMO_solu', 'LUMO_solu', 'HOMO_solv', 'LUMO_solv'], axis=1)

# keep the original df_solu_homo_lumo structure:
df_solu_homo_lumo = merged_df


print(f"Final columns: {merged_df.columns.tolist()}")
print(f"row count: {len(merged_df)}")

Shape before merge: 3249 rows
Shape after merge: 3249 rows
Missing solvents: ['1,4-dioxane' '2-methyl-1-butanol' '2-butoxyethanol'
 '4-methyl-2-pentanol' 'peg400' 'dmso' '1-decanol' '2-propanol'
 '3-methyl-1-butanol' '1-hexanol' '2-methyl-1-propanol' '1-heptanol'
 '1-propanol' '1-butanol' 'n-methylformamide' '1-pentanol' 'o-xylene'
 'thf' '1-octanol' '3-methoxy-1-butanol' '2-ethoxyethanol' '2-pentanol'
 '2-methyl-2-propanol' '2-ethyl-1-hexanol' '1,2-dichloroethane'
 '2-methyl-1-pentanol' '2-butanol' 'p-xylene'
 'cis-1,2-dimethylcyclohexane' '2-propoxyethanol' 'm-xylene'
 'tert-butylcyclohexane' '2-isopropoxyethanol' 'n-methyl-2-pyrrolidone'
 '1-tert-butoxy-2-propanol' '3,7-dimethyl-1-octanol' '1-chlorobutane'
 'cis-1,4-dimethylcyclohexane' '2,2,4-trimethylpentane' 'dmf'
 'cis-1,3-dimethylcyclohexane' '2-methyl-2-butanol' '1,2-propanediol'
 'trans-1,2-dimethylcyclohexane' 'trans-1,4-dimethylcyclohexane'
 '1-chlorooctane' '1,2-dibromoethane']
Final columns: ['InChIKey', 'Solvent', 'HOMO_

In [48]:
# Read the CSV files (assuming you haven't done this yet)
import pandas as pd

df_solv_homo_lumo = pd.read_csv("/Users/stella/Documents/tierney/project/solvent_homo_lumo.csv")
df_solu_homo_lumo = pd.read_csv("/Users/stella/Documents/tierney/project/solu_homo_lumo.csv")

# Convert to float
df_solv_homo_lumo['HOMO'] = df_solv_homo_lumo['HOMO'].astype(float)
df_solv_homo_lumo['LUMO'] = df_solv_homo_lumo['LUMO'].astype(float)
df_solu_homo_lumo['HOMO'] = df_solu_homo_lumo['HOMO'].astype(float)
df_solu_homo_lumo['LUMO'] = df_solu_homo_lumo['LUMO'].astype(float)

# Replace underscores with hyphens/commas in df_solv_homo_lumo
df_solv_homo_lumo['Solvent'] = df_solv_homo_lumo['Solvent'].str.replace('_', '-')
# Special case for "1,4-dioxane" (replace "1-4" with "1,4")
df_solv_homo_lumo['Solvent'] = df_solv_homo_lumo['Solvent'].str.replace('1-4', '1,4')

missing_data = {
    'Solvent': ['peg400', 'dmso', 'n-methylformamide', 'thf', '1,2-dichloroethane',
               'cis-1,2-dimethylcyclohexane', 'n-methyl-2-pyrrolidone',
               '3,7-dimethyl-1-octanol', '2,2,4-trimethylpentane', 'dmf',
               'cis-1,3-dimethylcyclohexane', '1,2-propanediol',
               'trans-1,2-dimethylcyclohexane', '1,2-dibromoethane'],
    'HOMO': [-9.772, -8.417, -9.962, -9.54, -10.79,
            -10.506, -9.331, -10.255, -10.792, -9.435,
            -10.533, -10.4, -10.559, -10.59],
    'LUMO': [1.118, 0.948, 1.024, 2.345, 0.425,
            4.043, 1.173, 2.869, 4.159, 1.134,
            4.012, 2.234, 3.96, -0.439],
}

# Convert to DataFrame and add to existing solvent data
df_solv_homo_lumo = pd.concat([
    df_solv_homo_lumo, 
    pd.DataFrame(missing_data)
], ignore_index=True)

# Merge DataFrames (keep all solute rows)
merged_df = pd.merge(
    df_solu_homo_lumo,
    df_solv_homo_lumo[['Solvent', 'HOMO', 'LUMO']],
    on='Solvent',
    how='left',
    suffixes=('_solu', '_solv')
)

# Calculate descriptors
merged_df['Lsolu_Hsolv'] = merged_df['LUMO_solu'] - merged_df['HOMO_solv']
merged_df['Lsolv_Hsolu'] = merged_df['LUMO_solv'] - merged_df['HOMO_solu']

# Save to CSV
merged_df.to_csv("/Users/stella/Documents/tierney/project/homo_lumo.csv", index=False)

# Verify
print("Shape before merge:", df_solu_homo_lumo.shape[0], "rows")
print("Shape after merge:", merged_df.shape[0], "rows")
print("Missing solvents:", merged_df[merged_df['HOMO_solv'].isna()]['Solvent'].unique())

# Drop the original HOMO/LUMO columns
homo_lumo_df = merged_df.drop(['HOMO_solu', 'LUMO_solu', 'HOMO_solv', 'LUMO_solv'], axis=1)
homo_lumo_df.to_csv("/Users/stella/Documents/tierney/project/homo_lumo.csv", index=False)
print(f"Final columns: {homo_lumo_df.columns.tolist()}")


Shape before merge: 3249 rows
Shape after merge: 3249 rows
Missing solvents: []
Final columns: ['InChIKey', 'Solvent', 'Lsolu_Hsolv', 'Lsolv_Hsolu']


In [45]:
homo_lumo_df = merged_df.drop(['HOMO_solu', 'LUMO_solu', 'HOMO_solv', 'LUMO_solv'], axis=1)
merged_df.to_csv("/Users/stella/Documents/tierney/project/merged_results.csv", index=False)
print(f"Final columns: {merged_df.columns.tolist()}")

Final columns: ['InChIkey', 'Solvent', 'HOMO_solu', 'LUMO_solu', 'HOMO_solv', 'LUMO_solv', 'Lsolu_Hsolv', 'Lsolv_Hsolu']


In [None]:
 'LUMO_solu', 'HOMO_solv', 'LUMO_solv'], axis=1)
merged_df.to_csv("/Users/stella/Documents/tierney/project/merged_results.csv", index=False)

In [21]:
import os
import pandas as pd

def get_index_positions(list_of_elems, element):
    return [i for i, x in enumerate(list_of_elems) if x == element]

def parse_section(lines, start_marker):
    """Helper function to parse multi-line sections"""
    section_data = []
    capturing = False
    for line in lines:
        if start_marker in line:
            capturing = True
            continue
        if capturing:
            if ']=' in line:  # End of section
                break
            section_data.extend(line.strip().split())
    return section_data

def get_charge_descriptors(sol_path):
    df_charges = pd.DataFrame(columns=['InChIkey', 'Solvent', 'O_charges', 'C_charges',
                                     'Most_neg', 'Most_pos', 'Het_charges'])

    for filename in os.listdir(sol_path):
        if not filename.endswith('.aux'):
            continue
            
        try:
            # Extract InChIkey and solvent
            parts = filename.split('_')
            if len(parts) < 3:
                print(f"Skipping bad filename: {filename}")
                continue
                
            inchikey = parts[1]
            solvent = parts[2].replace('.aux', '')

            # Read file
            with open(os.path.join(sol_path, filename), 'r') as f:
                lines = f.readlines()

            # Parse atoms (handles multi-line ATOM_EL)
            atoms = parse_section(lines, 'ATOM_EL[')
            
            # Parse charges (handles multi-line ATOM_CHARGES)
            charges = parse_section(lines, 'ATOM_CHARGES[')

            # Validate
            if not atoms or not charges:
                print(f"Skipping {filename}: missing atom or charge data")
                continue
                
            if len(charges) != len(atoms):
                print(f"Debug {filename}: {len(atoms)} atoms, charges: {charges[:10]}...")
                continue

            # Convert charges
            try:
                charges_number = [float(x) for x in charges]
            except ValueError:
                print(f"Charge conversion failed in {filename}")
                continue

            # Calculate descriptors
            O_charges = sum(charges_number[i] for i in get_index_positions(atoms, 'O'))
            C_charges = sum(charges_number[i] for i in get_index_positions(atoms, 'C'))
            Most_pos = max(charges_number)
            Most_neg = min(charges_number)
            Het_charges = sum(c for a, c in zip(atoms, charges_number) if a not in {'C', 'H'})

            # Store results
            df_charges = pd.concat([
                df_charges,
                pd.DataFrame([{
                    'InChIkey': inchikey,
                    'Solvent': solvent,
                    'O_charges': O_charges,
                    'C_charges': C_charges,
                    'Most_neg': Most_neg,
                    'Most_pos': Most_pos,
                    'Het_charges': Het_charges
                }])
            ], ignore_index=True)

        except Exception as e:
            print(f"Error with {filename}: {str(e)}")
            continue

    return df_charges

# Usage
sol_path = '/Users/stella/Documents/tierney/project/output MOPAC/sol_mopac_files'
df_charges = get_charge_descriptors(sol_path)

# Save and show results
output_path = '/Users/stella/Documents/tierney/project/charge_descriptors.csv'
df_charges.to_csv(output_path, index=False)
print(f"Saved results for {len(df_charges)} compounds to {output_path}")
print(df_charges.head())

  df_charges = pd.concat([


Saved results for 3249 compounds to /Users/stella/Documents/tierney/project/charge_descriptors.csv
                      InChIkey             Solvent O_charges  C_charges  \
0  GVEPBJHOBDJJJI-UHFFFAOYNA-N          chloroform         0   -1.61070   
1  GVIJJXMXTUZIOD-UHFFFAOYNA-N         1,4-dioxane         0   -1.27891   
2  WBYWAXJHAXSJNI-KZFATGLANA-N             acetone  -1.10051   -0.44112   
3  RYYVLZVUVIJVGH-UHFFFAOYNA-N             acetone   -1.2109    0.32621   
4  GPSDUZXPYCFOSQ-BGGKNDAXNA-N  2-methyl-1-butanol   -1.0889   -0.48849   

   Most_neg  Most_pos Het_charges  
0  -0.15618   0.16410           0  
1  -0.13958   0.16914    -0.03833  
2  -0.60299   0.68712    -1.10051  
3  -0.60868   0.69004    -2.22267  
4  -0.60654   0.65746     -1.0889  


In [37]:
import pandas as pd

df_thermo = pd.read_csv('/Users/stella/Documents/tierney/project/G_thermo_data.csv')
df_dip_vol = pd.read_csv('/Users/stella/Documents/tierney/project/dip_data.csv')
merged_df = pd.read_csv('/Users/stella/Documents/tierney/project/merged_results.csv')
df_charges = pd.read_csv('/Users/stella/Documents/tierney/project/charge_descriptors.csv')

# 1. First check column names in each DataFrame
print("Column names check:")
print("df_thermo:", df_thermo.columns.tolist())
print("df_dip_vol:", df_dip_vol.columns.tolist())
print("merged_df:", merged_df.columns.tolist())
print("df_charges:", df_charges.columns.tolist())

df_thermo_dip_vol = pd.merge(df_thermo, df_dip_vol, on='InChIkey')




Column names check:
df_thermo: ['InChIkey', 'Solvent', 'G_sol', 'DeltaG_sol']
df_dip_vol: ['volume', 'sol_dip', 'InChIkey', 'Solvent']
merged_df: ['InChIkey', 'Solvent', 'HOMO_solu', 'LUMO_solu', 'HOMO_solv', 'LUMO_solv', 'Lsolu_Hsolv', 'Lsolv_Hsolu']
df_charges: ['InChIkey', 'Solvent', 'O_charges', 'C_charges', 'Most_neg', 'Most_pos', 'Het_charges']


In [38]:
df_thermo_dip_vol_homo_lumo = pd.merge(df_thermo_dip_vol, merged_df, on='InChIkey')

In [4]:
df_all = pd.merge(df_thermo_dip_vol_homo_lumo, df_charges, on='InChIkey')

print(f"Merged DataFrame shape: {df_all.shape}")

In [5]:
print(f"Merged DataFrame shape: {df_all.shape}")

Merged DataFrame shape: (89323553, 19)


In [51]:
import pandas as pd
df_thermo = pd.read_csv('/Users/stella/Documents/tierney/project/G_thermo_data.csv')
df_dip_vol = pd.read_csv('/Users/stella/Documents/tierney/project/dip_data.csv')
merged_df = pd.read_csv('/Users/stella/Documents/tierney/project/homo_lumo.csv')
df_charges = pd.read_csv('/Users/stella/Documents/tierney/project/charge_descriptors.csv')
# 1. Verify expected row counts
print("Original row counts:")
print(f"df_thermo: {len(df_thermo)} rows")
print(f"df_dip_vol: {len(df_dip_vol)} rows") 
print(f"merged_df: {len(merged_df)} rows")
print(f"df_charges: {len(df_charges)} rows")

# 2. Check for duplicate InChIkey+Solvent combinations
for name, df in [('thermo', df_thermo), ('dip_vol', df_dip_vol),
                 ('homo_lumo', merged_df), ('charges', df_charges)]:
    dupes = df.duplicated(subset=['InChIkey', 'Solvent']).sum()
    print(f"{name} duplicates: {dupes}")
    if dupes > 0:
        df = df.drop_duplicates(subset=['InChIkey', 'Solvent'], keep='first')

# 3. Perform the corrected merge - CRITICAL STEP
# Merge on BOTH InChIkey AND Solvent
df_all = (df_thermo
          .merge(df_dip_vol, on=['InChIkey', 'Solvent'], how='inner')
          .merge(merged_df, on=['InChIkey', 'Solvent'], how='inner')
          .merge(df_charges, on=['InChIkey', 'Solvent'], how='inner'))

# 4. Verify final output
print(f"\nFinal merged shape: {df_all.shape}")
print("Expected columns:", df_all.columns.tolist())
print("Sample data:")
print(df_all.head())

# 5. Save results
output_path = '/Users/stella/Documents/tierney/project/total_data.csv'
df_all.to_csv(output_path, index=False)
print(f"\nSaved {len(df_all)} rows to {output_path}")

Original row counts:
df_thermo: 3249 rows
df_dip_vol: 3249 rows
merged_df: 3249 rows
df_charges: 3249 rows
thermo duplicates: 0
dip_vol duplicates: 0
homo_lumo duplicates: 0
charges duplicates: 0

Final merged shape: (3249, 13)
Expected columns: ['InChIkey', 'Solvent', 'G_sol', 'DeltaG_sol', 'volume', 'sol_dip', 'Lsolu_Hsolv', 'Lsolv_Hsolu', 'O_charges', 'C_charges', 'Most_neg', 'Most_pos', 'Het_charges']
Sample data:
                      InChIkey                 Solvent      G_sol  DeltaG_sol  \
0  AAOVKJBEBIDNHE-UHFFFAOYNA-N  n-methyl-2-pyrrolidone -29429.188   -877.9704   
1  AAOVKJBEBIDNHE-UHFFFAOYNA-N               1-octanol -29190.814   -639.5964   
2  AAOVKJBEBIDNHE-UHFFFAOYNA-N                   water -29262.224   -711.0064   
3  AAOVKJBEBIDNHE-UHFFFAOYNA-N                 ethanol -29257.446   -706.2284   
4  ACWBQPMHZXGDFX-XGLYSIDGNA-N              chloroform -41792.200  -1033.5428   

    volume  sol_dip  Lsolu_Hsolv  Lsolv_Hsolu  O_charges  C_charges  Most_neg  \
0  324.523

In [50]:
 
# 1. Verify expected row counts and columns
print("Original row counts and columns:")
for name, df in [('thermo', df_thermo), ('dip_vol', df_dip_vol),
                 ('homo_lumo', merged_df), ('charges', df_charges)]:
    print(f"\n{name}:")
    print(f"Rows: {len(df)}")
    print("Columns:", df.columns.tolist())

Original row counts and columns:

thermo:
Rows: 3249
Columns: ['InChIkey', 'Solvent', 'G_sol', 'DeltaG_sol']

dip_vol:
Rows: 3249
Columns: ['volume', 'sol_dip', 'InChIkey', 'Solvent']

homo_lumo:
Rows: 3249
Columns: ['InChIKey', 'Solvent', 'Lsolu_Hsolv', 'Lsolv_Hsolu']

charges:
Rows: 3249
Columns: ['InChIkey', 'Solvent', 'O_charges', 'C_charges', 'Most_neg', 'Most_pos', 'Het_charges']
