In [4]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('/Users/stella/Downloads/tierney/project/original datasets/cross_solvent_solubility_dataset_BSc_2024.csv')

# Extract the desired columns
extracted_data = df[['MW', 'solute_inchikey']]

# If you want to save the extracted data to a new CSV file
extracted_data.to_csv('mw_data.csv', index=False)

# Display the extracted data (optional)
print(extracted_data.head())

        MW              solute_inchikey
0  312.453  WWYNJERNGUHSAO-XUDSTZEENA-N
1  312.453  WWYNJERNGUHSAO-XUDSTZEENA-N
2  249.310  BZZFPIRKNVWTKJ-UHFFFAOYNA-N
3  193.224  HQKJYAGNQSMRJK-FZOZFQFYNA-N
4  380.784  BUYMVQAILCEWRR-UHFFFAOYNA-N


In [6]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('mw_data.csv')

# Drop duplicates based on solute_inchikey, keeping the first occurrence
df_unique = df.drop_duplicates(subset=['solute_inchikey'], keep='first')
df_unique.to_csv('MW_data.csv', index=False)


# Print the number of duplicates removed
print(f"Original rows: {len(df)}")
print(f"Unique rows: {len(df_unique)}")
print(f"Duplicates removed: {len(df) - len(df_unique)}")

Original rows: 8562
Unique rows: 3268
Duplicates removed: 5294


In [4]:
%pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.9.6-cp313-cp313-macosx_10_15_x86_64.whl.metadata (4.0 kB)
Collecting Pillow (from rdkit)
  Downloading pillow-11.1.0-cp313-cp313-macosx_10_13_x86_64.whl.metadata (9.1 kB)
Downloading rdkit-2024.9.6-cp313-cp313-macosx_10_15_x86_64.whl (30.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.0/30.0 MB[0m [31m758.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hDownloading pillow-11.1.0-cp313-cp313-macosx_10_13_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m273.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: Pillow, rdkit
Successfully installed Pillow-11.1.0 rdkit-2024.9.6
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import os
import re
import mmap  
from concurrent.futures import ThreadPoolExecutor

def process_sol_file(filename, sol_path):
    """Process a single .aux file with mmap for fast reads"""
    try:
        with open(os.path.join(sol_path, filename), 'rb') as f:
            # Use mmap for zero-copy file access
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mmapped_file:
                # Binary regex for performance
                mw_match = re.search(br'MOLECULAR_WEIGHT:AMU=([+-]?\d+\.\d+D[+-]?\d{2})', mmapped_file)
                sasa_match = re.search(br'AREA:SQUARE ANGSTROMS=([+-]?\d+\.\d+D[+-]?\d{2})', mmapped_file)
                
                if mw_match and sasa_match:
                    return {
                        "InChIkey": filename.split('_')[1],
                        "Solvent": filename.split('_')[2].replace('.aux', ''),
                        "MW": float(mw_match.group(1).replace(b'D', b'E')),
                        "SASA": float(sasa_match.group(1).replace(b'D', b'E'))
                    }
    except Exception as e:
        print(f"Error in {filename}: {str(e)}")
    return None

def extract_sol_data(sol_path, output_csv, batch_size=500, max_workers=8):
    """Process files in parallel batches"""
    files = [f for f in os.listdir(sol_path) if f.startswith('PM6_') and f.endswith('.aux')]
    
    # Write CSV header
    pd.DataFrame(columns=["InChIkey", "Solvent", "MW", "SASA"]).to_csv(output_csv, index=False)
    
    # Process in chunks
    for i in range(0, len(files), batch_size):
        batch = files[i:i + batch_size]
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(executor.map(lambda f: process_sol_file(f, sol_path), batch))
        
        # Append results incrementally
        pd.DataFrame([r for r in results if r]).to_csv(
            output_csv, mode='a', header=False, index=False
        )
        print(f"Processed batch {i//batch_size + 1}/{(len(files)//batch_size)+1}")

# Run it
sol_path = '/Users/stella/Documents/aux_files'
extract_sol_data(sol_path, 'sasa_mw_data.csv')

Processed batch 1/7
Processed batch 2/7
Processed batch 3/7
Processed batch 4/7
Processed batch 5/7
Processed batch 6/7
Processed batch 7/7


In [42]:
import pandas as pd

# 1. Load data
df_sasa = pd.read_csv('/Users/stella/Downloads/tierney/project/sasa_mw_data.csv')
df_all = pd.read_csv('/Users/stella/Downloads/tierney/project/total_data.csv')

# 2. Verify column names exist
required_cols = ['InChIkey', 'Solvent']  # or ['solute_inchikey', 'solvent']?
for col in required_cols:
    if col not in df_sasa.columns:
        raise ValueError(f"sasa_mw_data.csv missing column: {col}")
    if col not in df_all.columns:
        raise ValueError(f"total_data.csv missing column: {col}")

# 3. Perform the merge
df_merge = df_sasa.merge(df_all, on=required_cols, how='inner')

# 4. Verify output
print(f"\nFinal merged shape: {df_merge.shape}")
print("Unique InChIkeys:", df_merge['InChIkey'].nunique())
print("Unique Solvents:", df_merge['Solvent'].nunique())
print("\nSample data:")
print(df_merge.head(3))

# 5. Save results
output_path = '/Users/stella/Downloads/tierney/project/all_data.csv'
df_merge.to_csv(output_path, index=False)
print(f"\nSaved {len(df_merge)} rows to {output_path}")


Final merged shape: (3249, 15)
Unique InChIkeys: 283
Unique Solvents: 84

Sample data:
                      InChIkey      Solvent       MW     SASA      G_sol  \
0  GVEPBJHOBDJJJI-UHFFFAOYNA-N   chloroform  202.255  227.244 -23009.264   
1  GVIJJXMXTUZIOD-UHFFFAOYNA-N  1,4-dioxane  216.315  223.495 -23380.074   
2  WBYWAXJHAXSJNI-KZFATGLANA-N      acetone  148.161  181.325 -22781.904   

   DeltaG_sol   volume   sol_dip  Lsolu_Hsolv  Lsolv_Hsolu  O_charges  \
0 -445.344204  243.071  0.325051        9.985        8.058    0.00000   
1 -381.436316  239.592  1.712100        9.121       10.254    0.00000   
2 -522.368375  182.608  2.469010        9.490       10.372   -1.10051   

   C_charges  Most_neg  Most_pos  Het_charges  
0   -1.61070  -0.15618   0.16410      0.00000  
1   -1.27891  -0.13958   0.16914     -0.03833  
2   -0.44112  -0.60299   0.68712     -1.10051  

Saved 3249 rows to /Users/stella/Downloads/tierney/project/all_data.csv
