In [1]:
import os
import pandas as pd
import pubchempy as pcp
from pubchempy import get_compounds
from pka_lookup import pka_lookup_pubchem
from pubchemprops.pubchemprops import get_cid_by_name, get_first_layer_props, get_second_layer_props

path = os.path.join('your', 'directory', 'file.txt')

"""
The Pubchempy python package here is used to have easy access to PubChem library. 
We can retrieve the information on PubChem for molecules which the user demands the chromatography separation.
If the name of the compound matches with a PubChem page name, the properrties are retrieved and placed in the dictionnary properties.
"""

def get_compound_info(compound_name):
    compound = pcp.get_compounds(compound_name, 'name')
    if compound:
        compound = compound[0]
        properties = compound.to_dict(properties=['IUPACName', 'MolecularFormula', 'MolecularWeight', 'IsomericSMILES', 'XLogP', 'BoilingPt', 'Solubility', 'SaturatedVaporPressure', 'pKa', 'ExactMass', 'Fingerprint3D'])
        return properties
    else:
        print(f"Compound '{compound_name}' not found on PubChem.")
        return None

#Get the pKa value of a compound
def get_pka():
    
    
    if 'pKa' in text_pka:
    pKa_value = text_pka['pKa']
print(pKa_value)

"""THe df_info function gets an input of the user and uses the get_compound_info to write it in a df if possible"""

def df_info():
    compound_name = input("Enter the name of the compound like this (water, ethanol, methanol): ")
    compounds = ["water", "ethanol", "methanol"]

    df = pd.DataFrame()

    # Tries to get info from each compound
    for compound_name in compound_names:
        compound_info = get_compound_info(compound_name.strip())
        if compound_info:
            df = df.append(compound_info, ignore_index=True)
        else:
            print(f"Could not retrieve information for the compound '{compound_name.strip()}'.")
    print(df)

Boiling_temp_threshold = 300


def df_from_name(compound):
    easy_properties = get_first_layer_props('acetone', ['MolecularWeight', 'IUPACName', 'CanonicalSMILES', 'InChI'])


if __name__ == "__main__":
    main()

TypeError: main() missing 8 required positional arguments: 'boiling_temp', 'solubility', 'pKa', 'p_star', 'Boiling_temp_threshold', 'solubility_threshold', 'pKa_threshold', and 'p_star_threshold'

In [None]:
"""
This following notebook is the builind block which is used:
"""

In [28]:
import pandas as pd
from pubchemprops.pubchemprops import get_cid_by_name, get_first_layer_props, get_second_layer_props
import urllib.error
import urllib.parse
from pka_lookup import pka_lookup_pubchem
import pubchempy as pcp
import re
import json

#Finds the pKa using the code of Khoi Van.
def find_pka(inchikey_string):
    text_pka = pka_lookup_pubchem(inchikey_string, "inchikey")
    if text_pka is not None and 'pKa' in text_pka:
        pKa_value = text_pka['pKa']
        return pKa_value
    else:
        return None

def find_boiling_point(name):
    text_dict = get_second_layer_props(str(name), ['Boiling Point', 'Vapor Pressure'])
    
    Boiling_point_values = []
    #finds all celsius
    pattern_celsius = r'([-+]?\d*\.\d+|\d+) °C'
    pattern_F = r'([-+]?\d*\.\d+|\d+) °F'
    
    for item in text_dict['Boiling Point']:
        # Check if the item has a key 'Value' and 'StringWithMarkup'
        if 'Value' in item and 'StringWithMarkup' in item['Value']:
            # Access the 'String' key inside the nested dictionary
            string_value = item['Value']['StringWithMarkup'][0]['String']
            match_celsius = re.search(pattern_celsius, string_value)
            if match_celsius:
                celsius = float(match_celsius.group(1))
                Boiling_point_values.append(celsius)

            #Search for Farenheit values, if found: converts farenheit to celsius before adding to the list
            match_F = re.search(pattern_F, string_value)
            if match_F:
                fahrenheit_temp = float(match_F.group(1))
                celsius_from_F = round(((fahrenheit_temp - 32) * (5/9)), 2)
                Boiling_point_values.append(celsius_from_F)

    #get the mean value
    Boiling_temp = round((sum(Boiling_point_values) / len(Boiling_point_values)), 2)
    return Boiling_temp

"""
This code takes as input a list of compound written like: acetone, water. The code allows spaces, wrong names and unknown pubchem names.
Then it iterates through each of them to find if they exist on pubchem, and if they do,
then 'CID', 'MolecularFormula', 'MolecularWeight', 'InChI', 'InChIKey', 'IUPACName', 'XLogP', 'pKa',  and 'BoilingPoint' is added into a list and then a data frame.
The code takes time as find_pka(inchikey_string) and find_boiling_point(name) request URL to find the string on the Pubchem page, then extract it using regex. 
The boiling point is a mean of all the values (references) found.
"""

def get_df_properties():
    compound_name = input("Enter the name of the compound like this: water, ethanol, methanol =")
    compound_list = [compound.strip() for compound in compound_name.split(',')]
    
    compound_properties = []
    valid_properties = []
    for compound_name in compound_list:
        compound_name_encoded = urllib.parse.quote(compound_name.strip())
        try: 
            first_data = get_first_layer_props(compound_name_encoded, ['MolecularFormula', 'MolecularWeight', 'InChI', 'InChIKey', 'IUPACName', 'XLogP'])
            compound_info = {}
            for prop in ['CID', 'MolecularFormula', 'MolecularWeight', 'InChI', 'InChIKey', 'IUPACName', 'XLogP']:
                compound_info[prop] = first_data.get(prop)
            #print(first_data)
            
            #adds pKa value
            pka_value = find_pka(first_data['InChIKey'])
            if pka_value is not None:
                compound_info['pKa'] = pka_value
            else:
                pass
            
            #adds boiling point, solubility
            compound_info['BoilingPoint'] = find_boiling_point(compound_name_encoded)
    
            # When every property has been added to compound_info, add to the properties. This makes sure all properties have the right keys
            compound_properties.append(compound_info)
        
        except urllib.error.HTTPError as e:
            if e.code == 404:
                print(f'{compound_name} not found on PubChem')
            else:
                print(f'An error occurred: {e}')
    
    for prop in compound_properties:
        if isinstance(prop, dict):
            valid_properties.append(prop)
    df = pd.DataFrame(valid_properties)
    # Set the property names from the first dictionary as column headers
    if len(valid_properties) > 0:
        df = df.reindex(columns=valid_properties[0].keys())

    #print(df)
    return(df)

Enter the name of the compound like this: water, ethanol, methanol = L-phenylalanine, Malonic acid


    CID MolecularFormula MolecularWeight  \
0  6140         C9H11NO2          165.19   
1   867           C3H4O4          104.06   

                                               InChI  \
0  InChI=1S/C9H11NO2/c10-8(9(11)12)6-7-4-2-1-3-5-...   
1  InChI=1S/C3H4O4/c4-2(5)1-3(6)7/h1H2,(H,4,5)(H,...   

                      InChIKey                            IUPACName  XLogP  \
0  COLNVLDHVKWLRT-QMMMGPOBSA-N  (2S)-2-amino-3-phenylpropanoic acid   -1.5   
1  OFOBLEOULBTSOW-UHFFFAOYSA-N                    propanedioic acid   -0.8   

               pKa  BoilingPoint  
0              2.2         295.0  
1  2.85 (at 25 °C)         140.0  


In [28]:
from pubchemprops.pubchemprops import get_cid_by_name, get_first_layer_props, get_second_layer_props
import urllib.error

def df_from_name(compound):
    
    try: 
        1st_datas = get_first_layer_props(compound, ['MolecularFormula', 'MolecularWeight', 'InChI', 'InChIKey', 'IUPACName', 'XLogP'])
        
    except urllib.error.HTTPError as e:
        if e.code == 404:
            print(f'{compound} not found on PubChem')
        else:
            print(f'An error occurred: {e}')



    
df_from_name('water')

all is good


In [14]:
from pubchempy import get_compounds

def info_from_name(compound_name):
    compound = pcp.get_compounds(compound_name, 'name')
    if compound:
        compound = compound[0]
        properties = compound.to_dict(properties=['XLogP', 'BoilingPt', 'Solubility', 'SaturatedVaporPressure', 'pKa', 'ExactMass', 'Fingerprint3D'])
        return properties
    else:
        print(f"Compound '{compound_name}' not found on PubChem.")
        return None


print .xlogp

AttributeError: 'Compound' object has no attribute 'XLogP'

In [None]:
pcp.get_compounds('C1=CC2=C(C3=C(C=CC=N3)C=C2)N=C1', 'smiles')

In [4]:
import pubchempy as pcp
from pubchempy import Compound, get_compounds

df1 = pcp.get_compounds('C20H41Br', 'formula', as_dataframe=True)
df2 = pcp.get_substances([1, 2, 3, 4], as_dataframe=True)
df3 = pcp.get_properties(['isomeric_smiles', 'xlogp'], 'methanol', 'name', as_dataframe=True)
print(df3)

    IsomericSMILES  XLogP
CID                      
887             CO   -0.5


In [11]:
import pandas as pd
from pubchemprops.pubchemprops import get_cid_by_name, get_first_layer_props, get_second_layer_props
import re
import json

def find_boiling_point(name):
    text_dict = get_second_layer_props(str(name), ['Boiling Point', 'Vapor Pressure'])
    print(text_dict)
    print()
    
    Boiling_point_values = []
    #finds all celsius
    pattern_celsius = r'([-+]?\d*\.\d+|\d+) °C'
    pattern_F = r'([-+]?\d*\.\d+|\d+) °F'
    
    for item in text_dict['Boiling Point']:
        # Check if the item has a key 'Value' and 'StringWithMarkup'
        if 'Value' in item and 'StringWithMarkup' in item['Value']:
            # Access the 'String' key inside the nested dictionary
            string_value = item['Value']['StringWithMarkup'][0]['String']
            match_celsius = re.search(pattern_celsius, string_value)
            print(match_celsius)
            if match_celsius:
                celsius = float(match_celsius.group(1))
                Boiling_point_values.append(celsius)

            #Search for Farenheit values, if found: converts farenheit to celsius before adding to the list
            match_F = re.search(pattern_F, string_value)
            print(match_F)
            if match_F:
                fahrenheit_temp = float(match_F.group(1))
                celsius_from_F = round(((fahrenheit_temp - 32) * (5/9)), 2)
                Boiling_point_values.append(celsius_from_F)

    #get the mean value
    Boiling_temp = round((sum(Boiling_point_values) / len(Boiling_point_values)), 2)
    return Boiling_temp
        
        
    print(Boiling_point_values)
    print(Boiling_temp)
print(find_boiling_point('acetone'))

{'Boiling Point': [{'ReferenceNumber': 2, 'Reference': ['National Toxicology Program, Institute of Environmental Health Sciences, National Institutes of Health (NTP). 1992. National Toxicology Program Chemical Repository Database. Research Triangle Park, North Carolina.'], 'Value': {'StringWithMarkup': [{'String': '133 °F at 760 mmHg (NTP, 1992)'}]}}, {'ReferenceNumber': 39, 'Description': 'PEER REVIEWED', 'Reference': ['Haynes, W.M. (ed.). CRC Handbook of Chemistry and Physics. 95th Edition. CRC Press LLC, Boca Raton: FL 2014-2015, p. 3-4'], 'Value': {'StringWithMarkup': [{'String': '56.08 °C'}]}}, {'ReferenceNumber': 40, 'Reference': ['The Good Scents Company Information System'], 'Value': {'StringWithMarkup': [{'String': '56.00 to 57.00 °C. @ 760.00 mm Hg'}]}}, {'ReferenceNumber': 55, 'Value': {'StringWithMarkup': [{'String': '56 °C'}]}}, {'ReferenceNumber': 101, 'Value': {'StringWithMarkup': [{'String': '133 °F'}]}}, {'ReferenceNumber': 139, 'Value': {'StringWithMarkup': [{'String'

In [10]:
from pubchemprops.pubchemprops import get_cid_by_name, get_second_layer_props

name = 'methanol'
methanol_values = get_second_layer_props(str(name), ['Boiling Point'])
#, 'Vapor Pressure'
print(methanol_values)

{'Boiling Point': [{'ReferenceNumber': 1, 'Reference': ['National Toxicology Program, Institute of Environmental Health Sciences, National Institutes of Health (NTP). 1992. National Toxicology Program Chemical Repository Database. Research Triangle Park, North Carolina.'], 'Value': {'StringWithMarkup': [{'String': '148.3 °F at 760 mmHg (NTP, 1992)'}]}}, {'ReferenceNumber': 45, 'Description': 'PEER REVIEWED', 'Reference': ["O'Neil, M.J. (ed.). The Merck Index - An Encyclopedia of Chemicals, Drugs, and Biologicals. Cambridge, UK:  Royal Society of Chemistry, 2013., p. 1106"], 'Value': {'StringWithMarkup': [{'String': '64.7 °C at 760 mm Hg'}]}}, {'ReferenceNumber': 46, 'Reference': ['The Good Scents Company Information System'], 'Value': {'StringWithMarkup': [{'String': '64.00 to 65.00 °C. @ 760.00 mm Hg'}]}}, {'ReferenceNumber': 56, 'Value': {'StringWithMarkup': [{'String': '65 °C'}]}}, {'ReferenceNumber': 92, 'Value': {'StringWithMarkup': [{'String': '147 °F'}]}}, {'ReferenceNumber': 12

In [26]:
from pubchemprops.pubchemprops import get_cid_by_name, get_second_layer_props

name = 'water'
methanol_values = get_second_layer_props(str(name), ['Boiling Point', 'Vapor Pressure'])
print(methanol_values)

{'Boiling Point': [{'ReferenceNumber': 1, 'Value': {'StringWithMarkup': [{'String': '212 °F at 760 mmHg'}]}}, {'ReferenceNumber': 46, 'Reference': ['MSDS'], 'Value': {'Number': [100]}}, {'ReferenceNumber': 69, 'Description': 'PEER REVIEWED', 'Reference': ['Haynes, W.M. (ed.). CRC Handbook of Chemistry and Physics. 94th Edition. CRC Press LLC, Boca Raton: FL 2013-2014, p. 4-98'], 'Value': {'StringWithMarkup': [{'String': '99.974 °C'}]}}], 'Vapor Pressure': [{'ReferenceNumber': 68, 'Value': {'StringWithMarkup': [{'String': '23.75 [mmHg]'}]}}, {'ReferenceNumber': 69, 'Description': 'PEER REVIEWED', 'Reference': ["Lewis, R.J. Sr. (ed) Sax's Dangerous Properties of Industrial Materials. 11th Edition. Wiley-Interscience, Wiley & Sons, Inc. Hoboken, NJ. 2004., p. V3: 3692"], 'Value': {'StringWithMarkup': [{'String': 'VP: 760 mm Hg at 100 °C'}]}}, {'ReferenceNumber': 69, 'Description': 'PEER REVIEWED', 'Reference': ['Haynes, W.M. (ed.). CRC Handbook of Chemistry and Physics. 94th Edition. CRC 

In [14]:
from pka_lookup import pka_lookup_pubchem
import pubchempy as pcp
import re
import json

cas = "7732-18-5"
text_pka = pka_lookup_pubchem(cas)
#print(text_pka)

if 'pKa' in text_pka:
    pKa_value = text_pka['pKa']
print(pKa_value)

None


"\nif 'pKa' in text_pka:\n    pKa_value = text_pka['pKa']\nprint(pKa_value)\n"

In [None]:
def find_pka(inchikey_string):
    text_pka = pka_lookup_pubchem(inchikey_string, "inchikey")
    pKa_value = text_pka['pKa']
    return pKa_value

In [6]:
import pandas as pd
from pubchemprops.pubchemprops import get_cid_by_name, get_first_layer_props, get_second_layer_props
import urllib.error
import urllib.parse
from pka_lookup import pka_lookup_pubchem
import pubchempy as pcp
import re
import json

def find_pka(inchikey_string):
    text_pka = pka_lookup_pubchem(inchikey_string, "inchikey")
    if text_pka is not None and 'pKa' in text_pka:
        pKa_value = float(text_pka['pKa'])
        return pKa_value
    else:
        return None
"""
water = XLYOFNOQVPJJNP-UHFFFAOYSA-N
acetone = CSCPPACGZOOCGX-UHFFFAOYSA-N	
maleic_acid = VZCYOOQTPOCHFL-UPHRSURJSA-N
Acetic Acid = QTBSBXVTEAMEQO-UHFFFAOYSA-N
"""
list = ['XLYOFNOQVPJJNP-UHFFFAOYSA-N', 'CSCPPACGZOOCGX-UHFFFAOYSA-N', 'VZCYOOQTPOCHFL-UPHRSURJSA-N', 'QTBSBXVTEAMEQO-UHFFFAOYSA-N']
for compound in list:
    print(find_pka(compound))
    print(type(find_pka(compound)))

None
<class 'NoneType'>
20.0
<class 'float'>
1.83
<class 'float'>
4.756
<class 'float'>


In [40]:
import re
from pka_lookup import pka_lookup_pubchem
import pubchempy as pcp
import re
import json

#This code tries to find the pka value from the text found by pka_lookup

cas = "67-56-1"
text_pka = pka_lookup_pubchem(cas)
pattern = r''pKa': ''

for text in text_pka:
    match = re.search(pattern, text)

    if match:
        for value in match.groups():
            if value is not None:
                float_value = float(value)
                print("Float value:", float_value)
                break  # Break the loop once the value is found
    else:
        print("Pattern not found.")

SyntaxError: unterminated string literal (detected at line 11) (951748959.py, line 11)

In [None]:
{'Boiling Point': [{'ReferenceNumber': 2, 'Reference': ['National'], 'Value': {'StringWithMarkup': [{'String': '133 °F at 760 mmHg (NTP, 1992)'}]}}, {'ReferenceNumber': 39, 'Description': 'PEER REVIEWED', 'Reference': ['Haynes, W.M. (ed.). CRC Handbook of Chemistry and Physics. 95th Edition. CRC Press LLC, Boca Raton: FL 2014-2015, p. 3-4'], 'Value': {'StringWithMarkup': [{'String': '56.08 °C'}]}}, {'ReferenceNumber': 40, 'Reference': ['The Good Scents Company Information System'], 'Value': {'StringWithMarkup': [{'String': '56.00 to 57.00 °C. @ 760.00 mm Hg'}]}}, {'ReferenceNumber': 55, 'Value': {'StringWithMarkup': [{'String': '56 °C'}]}}, {'ReferenceNumber': 101, 'Value': {'StringWithMarkup': [{'String': '133 °F'}]}}, {'ReferenceNumber': 139, 'Value': {'StringWithMarkup': [{'String': '133 °F'}]}}], 'Vapor Pressure': [{'ReferenceNumber': 2, 'Reference': ['National Toxicology Program, Institute of Environmental Health Sciences, National Institutes of Health (NTP). 1992. National Toxicology Program Chemical Repository Database. Research Triangle Park, North Carolina.'], 'Value': {'StringWithMarkup': [{'String': '180 mmHg at 68 °F ; 270 mmHg at 86 °F (NTP, 1992)'}]}}, {'ReferenceNumber': 37, 'Value': {'StringWithMarkup': [{'String': '231.0 [mmHg]'}]}}, {'ReferenceNumber': 39, 'Description': 'PEER REVIEWED', 'Reference': ['PMID:7676461', 'Alarie Y et al; Toxicol Appl Pharmacol 134: 92-99 (1995)'], 'Value': {'StringWithMarkup': [{'String': '231 mm Hg at 25 °C'}]}}, {'ReferenceNumber': 55, 'Value': {'StringWithMarkup': [{'String': 'Vapor pressure, kPa at 20 °C: 24'}]}}, {'ReferenceNumber': 101, 'Value': {'StringWithMarkup': [{'String': '180 mmHg'}]}}, {'ReferenceNumber': 139, 'Value': {'StringWithMarkup': [{'String': '180 mmHg'}]}}]}