In [6]:
import pandas as pd
import requests
import re

In [7]:
# Workbook holding the solvent table that the rest of the notebook annotates.
file_path = "solvents_manClean.xlsx"

# Read only the 'solvents' worksheet; the later cells add property columns to `df`.
df = pd.read_excel(io=file_path, sheet_name='solvents')

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Solvent_A,SMILES_solvent,Name_in_other_dataset,Data_source_for_SMILES_name,Comment
0,hydrogenated tetrapropylene,CCCC(C)CC(C)CC(C)C,hydrogenated tetrapropene; TPH (hydrogenated t...,PubChem,"search for 2,4,6-Trimethylnonane in PubChem. ..."
1,chloroform,C(Cl)(Cl)Cl,Chloroform,PubChem,
2,1-dodecanol,CCCCCCCCCCCCO,,PubChem,
3,n-octane,CCCCCCCC,,PubChem,
4,toluene,CC1=CC=CC=C1,Toluene,PubChem,


In [8]:
def get_cid_from_smiles(smiles):
    """Look up the PubChem compound ID (CID) for a SMILES string via PUG REST.

    The SMILES is sent as a form field in a POST request instead of being
    pasted into the URL path, because SMILES routinely contain characters
    ('/', '\\', '#', '%', '?') that change the meaning of a URL path.

    Parameters
    ----------
    smiles : str
        SMILES string to resolve. Non-string or blank values (for example NaN
        from an empty spreadsheet cell) are rejected without a request.

    Returns
    -------
    str or None
        The first CID in the response, as a string, or None when the lookup
        fails. A failure message is printed in that case.
    """
    if not isinstance(smiles, str) or not smiles.strip():
        print(f"Failed to retrieve CID for SMILES {smiles}")
        return None

    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/cids/TXT'
    try:
        # A timeout keeps one stalled request from hanging a whole df.apply run.
        response = requests.post(url, data={'smiles': smiles.strip()}, timeout=30)
    except requests.RequestException:
        print(f"Failed to retrieve CID for SMILES {smiles}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve CID for SMILES {smiles}")
        return None

    # The TXT payload holds one CID per line; keep only the first so that the
    # value can be dropped straight into a follow-up URL.
    cids = response.text.split()
    # NOTE(review): PubChem appears to answer "0" for structures it does not
    # hold - treated as "not found" here; confirm against the PUG REST docs.
    if not cids or cids[0] == '0':
        print(f"Failed to retrieve CID for SMILES {smiles}")
        return None
    return cids[0]

def extract_properties(sections, property_name):
    """Collect the text values stored under a given heading in a section tree.

    Walks ``sections`` depth-first. For every section whose 'TOCHeading'
    equals ``property_name``, the 'String' of the first 'StringWithMarkup'
    element of each 'Information' entry is collected. Entries without such a
    string (missing or empty 'StringWithMarkup', missing or null 'Value') are
    skipped instead of raising.

    Parameters
    ----------
    sections : list of dict or None
        Section dictionaries; each may carry a nested 'Section' list.
    property_name : str
        Heading to look for.

    Returns
    -------
    list of str
        Matching non-empty strings in traversal order (empty if none found).
    """
    properties = []
    for section in sections or []:
        if 'TOCHeading' in section and section['TOCHeading'] == property_name:
            for info in section.get('Information') or []:
                value_block = info.get('Value') or {}
                # An empty or null 'StringWithMarkup' list must not be indexed.
                markup = value_block.get('StringWithMarkup') or [{}]
                value = (markup[0] or {}).get('String', '')
                if value:
                    properties.append(value)
        # Recursively search in subsections
        if 'Section' in section:
            properties.extend(extract_properties(section['Section'], property_name))
    return properties

def get_property_from_cid(cid, property_name):
    """Fetch the text values PubChem lists under one heading for a compound.

    Requests the PUG View JSON record for ``cid`` restricted to the heading
    ``property_name`` and passes its sections to ``extract_properties``.

    Parameters
    ----------
    cid : str or int
        PubChem compound ID.
    property_name : str
        Heading to retrieve, e.g. 'Boiling Point'.

    Returns
    -------
    list of str or None
        Strings found under the heading (possibly an empty list), or None when
        the request fails or the response has no 'Record'/'Section' structure.
        A failure message is printed in those cases.
    """
    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'
    try:
        # Let requests build the query string so any character in the heading
        # is encoded correctly (a space still becomes '+').
        response = requests.get(url, params={'heading': property_name}, timeout=30)
    except requests.RequestException:
        print(f"Failed to retrieve {property_name} data for CID {cid}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve {property_name} data for CID {cid}")
        return None

    try:
        sections = response.json()['Record']['Section']
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, or lacked the expected structure: report it
        # rather than returning None without a trace.
        print(f"Failed to retrieve {property_name} data for CID {cid}")
        return None
    return extract_properties(sections, property_name)


In [9]:
def parse_temperature(value_str):
    """Extract the first temperature from free text and convert it to kelvin.

    Recognised notations: a number followed by a degree sign and C, F or K
    ("80.1 °C", "176 °F"), the spelled-out form "61.2 deg C", and a bare
    kelvin value ("353 K"). The mis-decoded degree sign (U+00C2 U+00B0) that
    the previous pattern required is still accepted.

    A hyphen directly after a digit is treated as a range separator, not a
    minus sign, so "60-62 °C" yields 62 °C rather than -62 °C.

    Parameters
    ----------
    value_str : str
        Text that may contain a temperature.

    Returns
    -------
    float or None
        Temperature in kelvin, or None if nothing parseable is found or the
        input is not a string.
    """
    if not isinstance(value_str, str):
        return None

    pattern = (
        # Signed decimal; the minus sign must not follow a digit (range guard).
        r'((?:(?<!\d)-)?(?:\d+\.?\d*|\.\d+))'
        r'\s*'
        r'(?:'
        # Degree marker ("°", mojibake "Â°", or "deg") followed by the unit.
        r'(?:\u00c2?\u00b0|deg\.?)\s*([CFK])'
        # Kelvin written without a degree marker; \b rejects e.g. "KPa".
        r'|(K)\b'
        r')'
    )
    match = re.search(pattern, value_str)
    if match is None:
        return None  # Could not parse temperature

    temp = float(match.group(1))
    unit = match.group(2) or match.group(3)
    if unit == 'C':
        return temp + 273.15
    if unit == 'F':
        return (temp - 32) * 5 / 9 + 273.15
    # The pattern only admits C, F or K, so this is kelvin.
    return temp


In [10]:
def extract_bp(smiles):
    """Return a boiling point in kelvin for a SMILES string, using PubChem.

    Resolves the SMILES to a CID, fetches the 'Boiling Point' entries and
    returns the first one that ``parse_temperature`` can convert. Entries that
    cannot be parsed are skipped, so a free-text first entry no longer hides
    usable values further down the list.

    Parameters
    ----------
    smiles : str
        SMILES string of the compound.

    Returns
    -------
    float or None
        Boiling point in kelvin, or None if the CID lookup fails, no entries
        are returned, or no entry can be parsed.
    """
    cid = get_cid_from_smiles(smiles)
    if not cid:
        return None
    print(f"CID for SMILES {smiles} is {cid}")

    boiling_points = get_property_from_cid(cid, 'Boiling Point')
    for entry in boiling_points or []:
        temp_k = parse_temperature(entry)
        # Compare with None explicitly so a legitimate 0.0 is not discarded.
        if temp_k is not None:
            return temp_k
    return None

# One PubChem lookup per row; rows without a usable value get None.
df['Boiling_point_K'] = df['SMILES_solvent'].apply(extract_bp)

CID for SMILES CCCC(C)CC(C)CC(C)C is 10953968
Failed to retrieve Boiling Point data for CID 10953968
CID for SMILES C(Cl)(Cl)Cl is 6212
CID for SMILES CCCCCCCCCCCCO is 8193
CID for SMILES CCCCCCCC is 356
CID for SMILES CC1=CC=CC=C1 is 1140
CID for SMILES CC(C)C1=CC=C(C=C1)C(C)C is 7486
CID for SMILES CC(C)(C)C1=CC=CC=C1 is 7366
CID for SMILES CCCCCCCCCCCC is 8182
CID for SMILES CCCCN1C=C[N+](=C1)C.CCCCN1C=C[N+](=C1)C.C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F.C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F is 131725277
Failed to retrieve Boiling Point data for CID 131725277
CID for SMILES C1=CC=C(C=C1)[N+](=O)[O-] is 7416
CID for SMILES C(CCl)Cl is 11
CID for SMILES CCCCCCCCO is 957
CID for SMILES CCCCCC is 8058
CID for SMILES CCOCC is 3283
CID for SMILES C1=CC=CC=C1 is 241
CID for SMILES C(Cl)Cl is 6344
CID for SMILES CCl is 6327
CID for SMILES C1CCC(=O)CC1 is 7967
CID for SMILES C1=CC=C(C(=C1)C(F)(F)F)[N+](=O)[O-] is 9795
CID for SMILES C1=CC=C(C=C1)S(=O)(=O)C(F)(F)F is 555605
Failed to retri

In [11]:
def extract_mp(smiles):
    """Return a melting point in kelvin for a SMILES string, using PubChem.

    Resolves the SMILES to a CID, fetches the 'Melting Point' entries and
    returns the first one that ``parse_temperature`` can convert. Entries that
    cannot be parsed are skipped, so a free-text first entry no longer hides
    usable values further down the list.

    Parameters
    ----------
    smiles : str
        SMILES string of the compound.

    Returns
    -------
    float or None
        Melting point in kelvin, or None if the CID lookup fails, no entries
        are returned, or no entry can be parsed.
    """
    cid = get_cid_from_smiles(smiles)
    if not cid:
        return None
    print(f"CID for SMILES {smiles} is {cid}")

    # Process melting points
    melting_points = get_property_from_cid(cid, 'Melting Point')
    for entry in melting_points or []:
        temp_k = parse_temperature(entry)
        # Compare with None explicitly so a legitimate 0.0 is not discarded.
        if temp_k is not None:
            return temp_k
    return None

# One PubChem lookup per row; rows without a usable value get None.
df['Melting_point_K'] = df['SMILES_solvent'].apply(extract_mp)

CID for SMILES CCCC(C)CC(C)CC(C)C is 10953968
Failed to retrieve Melting Point data for CID 10953968
CID for SMILES C(Cl)(Cl)Cl is 6212
CID for SMILES CCCCCCCCCCCCO is 8193
CID for SMILES CCCCCCCC is 356
CID for SMILES CC1=CC=CC=C1 is 1140
CID for SMILES CC(C)C1=CC=C(C=C1)C(C)C is 7486
CID for SMILES CC(C)(C)C1=CC=CC=C1 is 7366
CID for SMILES CCCCCCCCCCCC is 8182
CID for SMILES CCCCN1C=C[N+](=C1)C.CCCCN1C=C[N+](=C1)C.C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F.C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F is 131725277
Failed to retrieve Melting Point data for CID 131725277
CID for SMILES C1=CC=C(C=C1)[N+](=O)[O-] is 7416
CID for SMILES C(CCl)Cl is 11
CID for SMILES CCCCCCCCO is 957
CID for SMILES CCCCCC is 8058
CID for SMILES CCOCC is 3283
CID for SMILES C1=CC=CC=C1 is 241
CID for SMILES C(Cl)Cl is 6344
CID for SMILES CCl is 6327
CID for SMILES C1CCC(=O)CC1 is 7967
CID for SMILES C1=CC=C(C(=C1)C(F)(F)F)[N+](=O)[O-] is 9795
CID for SMILES C1=CC=C(C=C1)S(=O)(=O)C(F)(F)F is 555605
Failed to retri

In [12]:
def extract_density(smiles):
    """Return the first 'Density' entry PubChem lists for a SMILES string.

    The entry is passed through as the raw text returned by
    ``get_property_from_cid``; no unit parsing is done here.

    Returns None when the CID lookup fails or no density entry is found.
    """
    cid = get_cid_from_smiles(smiles)
    if not cid:
        return None

    print(f"CID for SMILES {smiles} is {cid}")
    densities = get_property_from_cid(cid, 'Density')

    # Keep only the leading entry; an empty or missing result maps to None.
    return densities[0] if densities else None

# Fill the column with one lookup per row.
df['Density'] = df['SMILES_solvent'].apply(extract_density)

CID for SMILES CCCC(C)CC(C)CC(C)C is 10953968
Failed to retrieve Density data for CID 10953968
CID for SMILES C(Cl)(Cl)Cl is 6212
CID for SMILES CCCCCCCCCCCCO is 8193
CID for SMILES CCCCCCCC is 356
CID for SMILES CC1=CC=CC=C1 is 1140
CID for SMILES CC(C)C1=CC=C(C=C1)C(C)C is 7486
CID for SMILES CC(C)(C)C1=CC=CC=C1 is 7366
CID for SMILES CCCCCCCCCCCC is 8182
CID for SMILES CCCCN1C=C[N+](=C1)C.CCCCN1C=C[N+](=C1)C.C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F.C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F is 131725277
Failed to retrieve Density data for CID 131725277
CID for SMILES C1=CC=C(C=C1)[N+](=O)[O-] is 7416
CID for SMILES C(CCl)Cl is 11
CID for SMILES CCCCCCCCO is 957
CID for SMILES CCCCCC is 8058
CID for SMILES CCOCC is 3283
CID for SMILES C1=CC=CC=C1 is 241
CID for SMILES C(Cl)Cl is 6344
CID for SMILES CCl is 6327
CID for SMILES C1CCC(=O)CC1 is 7967
CID for SMILES C1=CC=C(C(=C1)C(F)(F)F)[N+](=O)[O-] is 9795
Failed to retrieve Density data for CID 9795
CID for SMILES C1=CC=C(C=C1)S(=O)(=O)C

In [13]:
def extract_solubility(smiles):
    """Return every 'Solubility' entry PubChem lists for a SMILES string.

    Unlike the single-value extractors in this notebook, the whole list of
    raw text entries from ``get_property_from_cid`` is returned.

    Returns None when the CID lookup fails or no solubility entry is found.
    """
    cid = get_cid_from_smiles(smiles)
    if not cid:
        return None

    print(f"CID for SMILES {smiles} is {cid}")
    solubilities = get_property_from_cid(cid, 'Solubility')

    # An empty list or a failed lookup both collapse to None.
    return solubilities or None

# Fill the column with one lookup per row; cells hold lists of strings or None.
df['solubility'] = df['SMILES_solvent'].apply(extract_solubility)

CID for SMILES CCCC(C)CC(C)CC(C)C is 10953968
Failed to retrieve Solubility data for CID 10953968
CID for SMILES C(Cl)(Cl)Cl is 6212
CID for SMILES CCCCCCCCCCCCO is 8193
CID for SMILES CCCCCCCC is 356
CID for SMILES CC1=CC=CC=C1 is 1140
CID for SMILES CC(C)C1=CC=C(C=C1)C(C)C is 7486
CID for SMILES CC(C)(C)C1=CC=CC=C1 is 7366
CID for SMILES CCCCCCCCCCCC is 8182
CID for SMILES CCCCN1C=C[N+](=C1)C.CCCCN1C=C[N+](=C1)C.C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F.C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F is 131725277
Failed to retrieve Solubility data for CID 131725277
CID for SMILES C1=CC=C(C=C1)[N+](=O)[O-] is 7416
CID for SMILES C(CCl)Cl is 11
CID for SMILES CCCCCCCCO is 957
CID for SMILES CCCCCC is 8058
CID for SMILES CCOCC is 3283
CID for SMILES C1=CC=CC=C1 is 241
CID for SMILES C(Cl)Cl is 6344
CID for SMILES CCl is 6327
CID for SMILES C1CCC(=O)CC1 is 7967
CID for SMILES C1=CC=C(C(=C1)C(F)(F)F)[N+](=O)[O-] is 9795
Failed to retrieve Solubility data for CID 9795
CID for SMILES C1=CC=C(C=C1)S

In [14]:
# A spreadsheet cell can only hold a scalar, but the 'solubility' column holds
# Python lists. Flatten list/tuple cells to '; '-separated text on a copy, so
# `df` keeps its list values for any further work in the notebook.
export_df = df.copy()
for column in export_df.select_dtypes(include='object').columns:
    export_df[column] = export_df[column].apply(
        lambda cell: '; '.join(map(str, cell)) if isinstance(cell, (list, tuple)) else cell
    )

export_df.to_excel('output_solvents_pubchem.xlsx', index=False)