In [1]:
import pandas as pd
import requests
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from is_PFA import is_PFA
from tqdm import tqdm

## Load data and filter fully fluorinated PFAs

In [2]:
# Parse the two Reaxys SD-file exports into a single list of RDKit molecules.
# Falsy entries are dropped (presumably the None placeholders the supplier
# yields for records it cannot parse - confirm against the RDKit docs).
suppl = Chem.rdmolfiles.SDMolSupplier('../data/reaxys_bps_1.sdf')
mols = list(filter(None, suppl))
print(len(mols))
suppl2 = Chem.rdmolfiles.SDMolSupplier('../data/reaxys_bps_2.sdf')
mols.extend(filter(None, suppl2))
print(len(mols))

517
950


In [3]:
# Keep only the molecules that is_PFA accepts with alkane_only=True.
PFAs = [
    mol for mol in mols
    if is_PFA(mol, alkane_only=True, print_reason=False)
]

In [4]:
len(PFAs)

276

## Filter data for molecules where the boiling point was measured at 760 torr

In [5]:
# Keep only molecules that carry both a boiling point ('BP.BP') and the
# pressure it was measured at ('BP.P').
# HasProp is queried instead of calling GetProp inside a bare `except:`, so
# unrelated errors (and KeyboardInterrupt) are no longer silently swallowed.
mols_filtered = [
    mol for mol in PFAs
    if mol.HasProp('BP.BP') and mol.HasProp('BP.P')
]
len(mols_filtered)

76

In [6]:
# Keep only boiling points measured at about 760 torr (755-765 accepted).
# Some records were measured at pressures close to, but not exactly, 760; the
# expected difference this causes is smaller than the expected error of the
# predictions, so those records are kept.
mols_filtered = [
    mol for mol in mols_filtered
    if 755 <= float(mol.GetProp('BP.P')) <= 765
]
len(mols_filtered)

31

## Deduplicate data and extract relevant information

In [7]:
def _bp_midpoint(text):
    """Return the midpoint of a 'low-high' range string, or None if it is not one.

    Every '-' except a leading sign is tried as the range separator, so ranges
    with negative bounds (e.g. '-5--3' or '-5 - -3') are parsed correctly
    instead of failing on a naive split('-').
    """
    for pos, char in enumerate(text):
        if char != '-' or pos == 0:
            continue
        try:
            return (float(text[:pos]) + float(text[pos + 1:])) / 2
        except ValueError:
            # This '-' was a sign or exponent marker, not the separator.
            continue
    return None


# Collect (name, boiling point, identifier) for every filtered molecule.
# The 'IDE.INCHI' field is stored under the 'InChIKey' column name; the
# displayed output of this notebook shows InChIKey-style values for it.
data = []
for mol in mols_filtered:
    if mol is None:
        continue
    name = mol.GetProp('IDE.CN')
    bp = mol.GetProp('BP.BP')
    inchikey = mol.GetProp('IDE.INCHI')

    try:
        bp_value = float(bp)
    except ValueError:
        # Not a plain number: accept a 'low-high' range and use its midpoint.
        bp_value = _bp_midpoint(bp)

    if bp_value is None:
        # Unparseable values are still skipped, but no longer silently.
        print(f'skipping {name!r}: unparseable boiling point {bp!r}')
        continue
    data.append((name, bp_value, inchikey))

df = pd.DataFrame(data, columns=['name', 'boiling_point', 'InChIKey'])

# Collapse duplicate structures: one row per InChIKey, keeping the first name
# seen and averaging the boiling points of the group.
df_reaxys = df.groupby('InChIKey', as_index=False).agg(
    name=('name', 'first'),
    boiling_point=('boiling_point', 'mean'),
)
def select_per(name):
    """Pick one name out of a '|'-separated list of synonyms.

    Returns the first synonym that contains "per" (case-insensitive); if none
    does, the first synonym in the list is returned.
    """
    synonyms = name.split('|')
    return next((s for s in synonyms if 'per' in s.lower()), synonyms[0])

# Reduce each '|'-separated name list to a single display name.
df_reaxys['name'] = df_reaxys['name'].map(select_per)

print('num of molecules: ', len(df_reaxys))
print(df_reaxys.head())

num of molecules:  12
                      InChIKey                          name  boiling_point
0  AILNJPONTDNFHN-UHFFFAOYSA-N  perfluoro-3,4-dimethylhexane        103.750
1  BPHQIXJDBIHMLT-UHFFFAOYSA-N                 Perfluordecan        147.000
2  KAVGMUDTWQVPDF-UHFFFAOYSA-N               perfluorobutane         -1.700
3  LGUZHRODIJCVOC-UHFFFAOYSA-N              perfluoroheptane         82.454
4  MPEFSWGYIJNMCW-UHFFFAOYSA-N           perfluoroisopentane         30.120


In [8]:
# Keep a handle on the Reaxys identifier column.
# NOTE(review): inchi_1 is not referenced by any later cell visible here -
# confirm it is still needed.
inchi_1 = df_reaxys.InChIKey

In [9]:
df_reaxys

Unnamed: 0,InChIKey,name,boiling_point
0,AILNJPONTDNFHN-UHFFFAOYSA-N,"perfluoro-3,4-dimethylhexane",103.75
1,BPHQIXJDBIHMLT-UHFFFAOYSA-N,Perfluordecan,147.0
2,KAVGMUDTWQVPDF-UHFFFAOYSA-N,perfluorobutane,-1.7
3,LGUZHRODIJCVOC-UHFFFAOYSA-N,perfluoroheptane,82.454
4,MPEFSWGYIJNMCW-UHFFFAOYSA-N,perfluoroisopentane,30.12
5,NBQYGIPVNCVJJP-UHFFFAOYSA-N,"perfluoro(2,3-dimethylbutane)",60.0
6,NJCBUSHGCBERSK-UHFFFAOYSA-N,perfluoropentane,-4.5
7,NNZABWQPYXLJGH-UHFFFAOYSA-N,"Perfluor(3-ethyl-2,3-dimethylpentan)",135.5
8,UVWPNDVAQBNQBG-UHFFFAOYSA-N,perfluorononane,125.0
9,WMIYKQLTONQJES-UHFFFAOYSA-N,Hexafluoroethane,-78.23


# Additional data

## Acyclic

In [14]:
# Load the additional boiling-point table (semicolon-separated). Later cells
# read its 'smiles', 'molecular_formula', 'bp', 'unit' and 'source' columns.
df_2 = pd.read_csv('../data/bp_data.csv', sep=';')

In [15]:
# Parse every SMILES string into an RDKit molecule object.
df_2['Mol'] = df_2['smiles'].map(Chem.MolFromSmiles)

In [16]:
# Sanity check: render any molecule that is_PFA rejects with alkane_only=True.
# The cell produces no output when every molecule passes.
for mol in df_2.Mol:
    if is_PFA(mol, alkane_only=True, print_reason=False):
        continue
    display(mol)

In [17]:
# Compute an InChIKey per molecule; it is the key used for deduplication and
# for joining with the Reaxys table.
df_2['InchiKey'] = df_2['Mol'].map(Chem.inchi.MolToInchiKey)

### Convert to the same units and deduplicate data

In [18]:
# Convert Kelvin rows to degrees Celsius and relabel their unit. The mask is
# computed once, before the 'unit' column is rewritten.
is_kelvin = df_2['unit'] == 'K'
df_2.loc[is_kelvin, 'bp'] = df_2.loc[is_kelvin, 'bp'].sub(273.15)
df_2.loc[is_kelvin, 'unit'] = 'C'

In [19]:
# One row per InChIKey: keep the first SMILES / formula / Mol of each group and
# summarise the boiling points by mean and standard deviation (bp_std is NaN
# for structures with a single measurement).
df_grouped = (
    df_2
    .drop(columns='source')
    .groupby('InchiKey', as_index=False)
    .agg(
        smiles=('smiles', 'first'),
        molecular_formula=('molecular_formula', 'first'),
        bp_mean=('bp', 'mean'),
        bp_std=('bp', 'std'),
        Mol=('Mol', 'first'),
    )
)



In [20]:
df_grouped

Unnamed: 0,InchiKey,smiles,molecular_formula,bp_mean,bp_std,Mol
0,AQPUCGPFMVEJGS-UHFFFAOYSA-N,C(C(C(C(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F)(...,C16F34,239.0,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...
1,BPHQIXJDBIHMLT-UHFFFAOYSA-N,C(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F)(C(C(C(...,C10F22,144.2,4.019437e-14,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...
2,COQIQRBKEGPRSG-UHFFFAOYSA-N,C(C(F)(F)F)(C(F)(F)F)(C(F)(F)F)F,C4F10,-0.3,,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...
3,CPMLSIADGFUFBX-UHFFFAOYSA-N,C(C(C(C(F)(F)F)(F)F)(F)F)(C(C(F)(F)F)(F)F)(C(F...,C7F16,81.5,,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...
4,GLSWXDROFHXXOQ-UHFFFAOYSA-N,C(C(F)(F)F)(C(F)(F)F)(C(F)(F)F)C(F)(F)F,C5F12,30.1,,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...
5,GTTNGNCJHFPTQV-UHFFFAOYSA-N,C(C(C(C(F)(F)F)(C(F)(F)F)F)(F)F)(C(F)(F)F)(C(F...,C8F18,104.0,,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...
6,GXSFAIDPYIEIEF-UHFFFAOYSA-N,C(C(C(F)(F)F)(F)F)(C(C(F)(F)F)(F)F)(C(F)(F)F)F,C6F14,58.18,,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...
7,KAVGMUDTWQVPDF-UHFFFAOYSA-N,C(C(C(F)(F)F)(F)F)(C(F)(F)F)(F)F,C4F10,-1.3,3.22653e-14,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...
8,LGUZHRODIJCVOC-UHFFFAOYSA-N,C(C(C(C(F)(F)F)(F)F)(F)F)(C(C(C(F)(F)F)(F)F)(F...,C7F16,82.35,0.07071068,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...
9,MPEFSWGYIJNMCW-UHFFFAOYSA-N,C(C(C(F)(F)F)(F)F)(C(F)(F)F)(C(F)(F)F)F,C512,30.06,0.06557439,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...


### Fetch IUPAC names from PubChem

In [21]:
def fetch_name(smiles, timeout=30):
    """Query PubChem PUG REST for a compound given as a SMILES string.

    The SMILES is sent as a URL argument (``?smiles=...``) rather than being
    spliced into the URL path, so that characters such as '/', '\\' or '#'
    are percent-encoded by ``requests`` instead of corrupting the path.

    Parameters
    ----------
    smiles : str
        SMILES string of the compound to look up.
    timeout : float, optional
        Seconds to wait for PubChem before ``requests`` raises a timeout
        error, instead of blocking indefinitely.

    Returns
    -------
    dict
        The decoded JSON response. For a successful lookup the requested
        properties (CanonicalSMILES, MolecularFormula, IUPACName) are found
        under ``res['PropertyTable']['Properties']``.
    """
    pugrest_prolog = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
    pugrest_input = "compound/smiles"
    pugrest_operation = "property/CanonicalSMILES,MolecularFormula,IUPACName"
    pugrest_output = "json"
    pugrest_url = "/".join((pugrest_prolog, pugrest_input, pugrest_operation, pugrest_output))
    response = requests.get(pugrest_url, params={"smiles": smiles}, timeout=timeout)
    return response.json()

In [22]:
fetch_name('C(C(F)(F)F)(C(F)(F)F)(C(F)(F)F)F')

{'PropertyTable': {'Properties': [{'CID': 67724,
    'MolecularFormula': 'C4F10',
    'CanonicalSMILES': 'C(C(F)(F)F)(C(F)(F)F)(C(F)(F)F)F',
    'IUPACName': '1,1,1,2,3,3,3-heptafluoro-2-(trifluoromethyl)propane'}]}}

In [23]:
# Look up PubChem metadata for every deduplicated structure.
# Exactly one entry is appended to each list per row - None placeholders when
# the lookup fails or a property is missing - so the three lists always stay
# aligned with df_grouped and can be assigned to it as columns.
iupac_names = []
canonical_smiles_list = []
molecular_formulas = []

for smiles in tqdm(df_grouped['smiles']):
    try:
        props = fetch_name(smiles)['PropertyTable']['Properties'][0]
        # Build the whole record first so a missing key cannot leave the
        # lists with different lengths.
        record = (props['IUPACName'], props['CanonicalSMILES'], props['MolecularFormula'])
    except Exception as e:
        # Best effort: report the failing structure and keep going.
        print(f'PubChem lookup failed for {smiles}: {e!r}')
        record = (None, None, None)
    iupac_names.append(record[0])
    canonical_smiles_list.append(record[1])
    molecular_formulas.append(record[2])

100%|██████████████████████████████████████████████████████████████████████████████████| 25/25 [00:20<00:00,  1.23it/s]


In [24]:
# Attach the PubChem lookup results as new columns (one value per row).
df_grouped = df_grouped.assign(
    IUPAC_name=iupac_names,
    canonical_smiles=canonical_smiles_list,
    pubchem_mf=molecular_formulas,
)

In [25]:
# Check if molecular formulas written by hand are correct
(df_grouped['pubchem_mf'] == df_grouped['molecular_formula']).all()

False

In [26]:
df_grouped[df_grouped['pubchem_mf'] != df_grouped['molecular_formula']]

Unnamed: 0,InchiKey,smiles,molecular_formula,bp_mean,bp_std,Mol,IUPAC_name,canonical_smiles,pubchem_mf
9,MPEFSWGYIJNMCW-UHFFFAOYSA-N,C(C(C(F)(F)F)(F)F)(C(F)(F)F)(C(F)(F)F)F,C512,30.06,0.065574,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...,"1,1,1,2,2,3,4,4,4-nonafluoro-3-(trifluoromethy...",C(C(C(F)(F)F)(F)F)(C(F)(F)F)(C(F)(F)F)F,C5F12
24,ZKJTZMGEXCUNIE-UHFFFAOYSA-N,C(C(C(F)(F)F)(C(F)(F)F)F)(C(F)(F)F)(C(F)(F)F)C...,C7F14,82.2,,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...,"1,1,1,2,4,4,4-heptafluoro-2,3,3-tris(trifluoro...",C(C(C(F)(F)F)(C(F)(F)F)F)(C(F)(F)F)(C(F)(F)F)C...,C7F16


In [27]:
# Use the PubChem molecular formula and canonical smiles: the hand-written
# formulas disagree with PubChem for two rows in the check above
# ('C512' vs 'C5F12' and 'C7F14' vs 'C7F16').
df_grouped = df_grouped.drop(columns=['molecular_formula', 'smiles'])

In [28]:
df_grouped[df_grouped['pubchem_mf'] == 'C5F12']

Unnamed: 0,InchiKey,bp_mean,bp_std,Mol,IUPAC_name,canonical_smiles,pubchem_mf
4,GLSWXDROFHXXOQ-UHFFFAOYSA-N,30.1,,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...,"1,1,1,3,3,3-hexafluoro-2,2-bis(trifluoromethyl...",C(C(F)(F)F)(C(F)(F)F)(C(F)(F)F)C(F)(F)F,C5F12
9,MPEFSWGYIJNMCW-UHFFFAOYSA-N,30.06,0.06557439,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...,"1,1,1,2,2,3,4,4,4-nonafluoro-3-(trifluoromethy...",C(C(C(F)(F)F)(F)F)(C(F)(F)F)(C(F)(F)F)F,C5F12
11,NJCBUSHGCBERSK-UHFFFAOYSA-N,29.3,6.153481e-15,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...,"1,1,1,2,2,3,3,4,4,5,5,5-dodecafluoropentane",C(C(C(F)(F)F)(F)F)(C(C(F)(F)F)(F)F)(F)F,C5F12


### Combine the reaxys and other datasets

In [29]:
# Outer-join the curated table with the Reaxys table on the InChIKey so that
# compounds present in only one of the two sources are kept.
df_combined = (
    df_grouped
    .merge(df_reaxys, how='outer', left_on='InchiKey', right_on='InChIKey')
    .assign(
        # Disagreement between the sources; NaN when only one source has the compound.
        bp_diff=lambda d: d['bp_mean'] - d['boiling_point'],
        # Coalesce the duplicated columns: the first series wins, the second
        # fills its gaps.
        InChIKey=lambda d: d['InchiKey'].combine_first(d['InChIKey']),
        name=lambda d: d['name'].combine_first(d['IUPAC_name']),
        bp=lambda d: d['bp_mean'].combine_first(d['boiling_point']),
    )
    .drop(columns=['InchiKey', 'IUPAC_name', 'bp_mean', 'boiling_point'])
)

In [30]:
# Round the standard deviation to two decimals. This also collapses tiny
# values such as the 4.0e-14 shown earlier to 0.0 (presumably floating-point
# noise from repeated identical measurements).
df_combined['bp_std'] = df_combined['bp_std'].round(2)

In [31]:
# Flag compounds whose two sources disagree by more than 5 degrees in either
# direction; a signed comparison would miss rows where the Reaxys value is
# the higher one.
df_combined[df_combined['bp_diff'].abs() > 5]

Unnamed: 0,bp_std,Mol,canonical_smiles,pubchem_mf,InChIKey,name,bp_diff,bp
12,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...,C(C(C(F)(F)F)(F)F)(C(C(F)(F)F)(F)F)(F)F,C5F12,NJCBUSHGCBERSK-UHFFFAOYSA-N,perfluoropentane,33.8,29.3


Perfluoropentane has very different boiling points in Reaxys and in the other sources: Reaxys reports -4.5 °C, while the other sources report about 29 °C. Other Reaxys entries that do not state the pressure are also close to 29 °C, so I will use that value.

In [32]:
df_combined

Unnamed: 0,bp_std,Mol,canonical_smiles,pubchem_mf,InChIKey,name,bp_diff,bp
0,,,,,AILNJPONTDNFHN-UHFFFAOYSA-N,"perfluoro-3,4-dimethylhexane",,103.75
1,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...,C(C(C(C(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F)(...,C16F34,AQPUCGPFMVEJGS-UHFFFAOYSA-N,"1,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11...",,239.0
2,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...,C(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F)(C(C(C(...,C10F22,BPHQIXJDBIHMLT-UHFFFAOYSA-N,Perfluordecan,-2.8,144.2
3,,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...,C(C(F)(F)F)(C(F)(F)F)(C(F)(F)F)F,C4F10,COQIQRBKEGPRSG-UHFFFAOYSA-N,"1,1,1,2,3,3,3-heptafluoro-2-(trifluoromethyl)p...",,-0.3
4,,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...,C(C(C(C(F)(F)F)(F)F)(F)F)(C(C(F)(F)F)(F)F)(C(F...,C7F16,CPMLSIADGFUFBX-UHFFFAOYSA-N,"1,1,1,2,2,3,3,4,5,5,6,6,6-tridecafluoro-4-(tri...",,81.5
5,,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...,C(C(F)(F)F)(C(F)(F)F)(C(F)(F)F)C(F)(F)F,C5F12,GLSWXDROFHXXOQ-UHFFFAOYSA-N,"1,1,1,3,3,3-hexafluoro-2,2-bis(trifluoromethyl...",,30.1
6,,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...,C(C(C(C(F)(F)F)(C(F)(F)F)F)(F)F)(C(F)(F)F)(C(F...,C8F18,GTTNGNCJHFPTQV-UHFFFAOYSA-N,"1,1,1,2,3,3,5,5,5-nonafluoro-2,4,4-tris(triflu...",,104.0
7,,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...,C(C(C(F)(F)F)(F)F)(C(C(F)(F)F)(F)F)(C(F)(F)F)F,C6F14,GXSFAIDPYIEIEF-UHFFFAOYSA-N,"1,1,1,2,2,3,4,4,5,5,5-undecafluoro-3-(trifluor...",,58.18
8,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...,C(C(C(F)(F)F)(F)F)(C(F)(F)F)(F)F,C4F10,KAVGMUDTWQVPDF-UHFFFAOYSA-N,perfluorobutane,0.4,-1.3
9,0.07,<rdkit.Chem.rdchem.Mol object at 0x000001B6178...,C(C(C(C(F)(F)F)(F)F)(F)F)(C(C(C(F)(F)F)(F)F)(F...,C7F16,LGUZHRODIJCVOC-UHFFFAOYSA-N,perfluoroheptane,-0.104,82.35


In [33]:
# Pin the perfluoropentane boiling point to the non-Reaxys value chosen in the
# note above. 'bp' already prefers bp_mean over the Reaxys value (the table
# displayed above shows 29.3 for this row), so this makes the decision
# explicit rather than changing the merged number.
df_combined.loc[df_combined['InChIKey'] == 'NJCBUSHGCBERSK-UHFFFAOYSA-N', 'bp'] = 29.30

In [34]:
# Switch to the column names used in the exported CSV.
df_combined = df_combined.rename(columns={
    'bp': 'boiling_point',
    'canonical_smiles': 'smiles',
    'pubchem_mf': 'molecular_formula',
})

In [35]:
# fill missing smiles
def fetch_smiles(inchikey, timeout=30):
    """Return the PubChem CanonicalSMILES for an InChIKey.

    Parameters
    ----------
    inchikey : str
        InChIKey of the compound to look up.
    timeout : float, optional
        Seconds to wait for PubChem before ``requests`` raises a timeout
        error, instead of blocking indefinitely.

    Returns
    -------
    str
        The 'CanonicalSMILES' value of the first entry in the response's
        property table.

    Raises
    ------
    KeyError
        If the response does not contain the expected
        ``PropertyTable -> Properties -> CanonicalSMILES`` structure.
    """
    pugrest_url = "/".join((
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug",
        "compound/inchikey/" + inchikey,
        "property/CanonicalSMILES",
        "json",
    ))
    res = requests.get(pugrest_url, timeout=timeout).json()
    return res['PropertyTable']['Properties'][0]['CanonicalSMILES']

In [36]:
# Rows that came only from Reaxys have no SMILES yet; look them up by InChIKey.
missing_smiles = df_combined['smiles'].isna()
df_combined.loc[missing_smiles, 'smiles'] = (
    df_combined.loc[missing_smiles, 'InChIKey'].map(fetch_smiles)
)

In [37]:
# fill missing formulas:
def fetch_data(inchikey, timeout=30):
    """Return PubChem's property record for an InChIKey.

    Parameters
    ----------
    inchikey : str
        InChIKey of the compound to look up.
    timeout : float, optional
        Seconds to wait for PubChem before ``requests`` raises a timeout
        error, instead of blocking indefinitely.

    Returns
    -------
    dict
        The first entry of the response's property table; the request asks
        for 'CanonicalSMILES', 'MolecularFormula' and 'IUPACName'.

    Raises
    ------
    KeyError
        If the response does not contain ``PropertyTable -> Properties``.
    """
    pugrest_url = "/".join((
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug",
        "compound/inchikey/" + inchikey,
        "property/CanonicalSMILES,MolecularFormula,IUPACName",
        "json",
    ))
    res = requests.get(pugrest_url, timeout=timeout).json()
    return res['PropertyTable']['Properties'][0]

In [38]:
def fill_nan_with_fetch(row):
    """Return the formula / SMILES / name triple for one row, fetching it if needed.

    Rows that already have a molecular formula are passed through unchanged.
    For rows without one, all three fields are taken from fetch_data, looked
    up by the row's InChIKey.

    NOTE(review): for fetched rows an existing 'name' and 'smiles' are
    replaced as well, not only the missing formula - confirm this is intended.
    """
    wanted = ['molecular_formula', 'smiles', 'name']
    if not pd.isna(row['molecular_formula']):
        return row[wanted]

    record = fetch_data(row['InChIKey'])
    return pd.Series({
        'molecular_formula': record['MolecularFormula'],
        'smiles': record['CanonicalSMILES'],
        'name': record['IUPACName'],
    })

# Apply the fill row by row; rows that already have a formula pass through
# unchanged, the others trigger one lookup each via fill_nan_with_fetch.
df_combined[['molecular_formula', 'smiles', 'name']] = df_combined.apply(fill_nan_with_fetch, axis=1)

In [39]:
# Export the curated table (semicolon-separated, selected columns, no index).
df_combined.to_csv(
    '../data/combined_data.csv',
    sep=';',
    columns=['name', 'smiles', 'InChIKey', 'molecular_formula', 'boiling_point'],
    index=False,
)