The following code imports all the functions and packages that the functions below need in order to work.

In [1]:
import sys
import os

# Put the sibling ``src`` directory (resolved relative to the current working
# directory) at the front of the import path. Presumably this is where the
# local modules imported further down live -- confirm against the repo layout.
# The membership check keeps re-running the cell from adding duplicates.
src_path = os.path.abspath(os.path.join(os.pardir, 'src'))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

import tkinter as tk
from tkinter import ttk
from tkinter import messagebox
import pandas as pd
from pubchemprops import get_cid_by_name, get_first_layer_props, get_second_layer_props
import urllib.error
import urllib.parse
from pka_lookup import pka_lookup_pubchem
import re
import json

### <ins>The following code is part of the get_df_properties(mixture) function.</ins>

This part of the code uses pubchemprops (`get_first_layer_props`) to easily find most of the properties. It takes as argument a list of compound names. For each compound it builds (and, here, prints) a dictionary with the following properties: 'CID', 'MolecularFormula', 'MolecularWeight', 'InChIKey', 'IUPACName', 'XLogP'.

In [6]:
compound_list = ["caffeine", "Aspartame", "Acesulfame K"]
# Uncomment the next line to try a list that contains an unknown name
#compound_list = ["Water", "Acetone", "Wrong name"]

# Properties requested for every compound. 'CID' is not requested explicitly
# but is read from the same response when the result dictionary is built.
requested_props = ['MolecularFormula', 'MolecularWeight', 'InChIKey', 'IUPACName', 'XLogP']

for compound_name in compound_list:
    # Percent-encode the trimmed name so names containing spaces
    # (e.g. "Acesulfame K") can be passed to the lookup.
    compound_name_encoded = urllib.parse.quote(compound_name.strip())

    # Only the lookup itself sits in the try block, so the handler deals
    # exclusively with HTTP failures of that call.
    try:
        first_data = get_first_layer_props(compound_name_encoded, requested_props)
    except urllib.error.HTTPError as e:
        if e.code == 404:
            print(f'{compound_name} not found on PubChem')
        else:
            print(f'An error occurred: {e}')
        # Nothing was retrieved for this compound: move on instead of
        # printing an undefined (first iteration) or stale (later
        # iterations) dictionary that belongs to another compound.
        continue

    # Same key order as before: 'CID' first, then the requested properties.
    # Properties missing from the response are stored as None.
    compound_info = {prop: first_data.get(prop) for prop in ['CID'] + requested_props}

    # The molecular weight is converted to float when present (the lookup
    # appears to deliver it as text).
    molecular_weight = compound_info['MolecularWeight']
    if molecular_weight is not None:
        compound_info['MolecularWeight'] = float(molecular_weight)

    print(compound_info)

{'CID': 2519, 'MolecularFormula': 'C8H10N4O2', 'MolecularWeight': 194.19, 'InChIKey': 'RYYVLZVUVIJVGH-UHFFFAOYSA-N', 'IUPACName': '1,3,7-trimethylpurine-2,6-dione', 'XLogP': -0.1}
{'CID': 134601, 'MolecularFormula': 'C14H18N2O5', 'MolecularWeight': 294.3, 'InChIKey': 'IAOZJIPTCAWIRG-QWRGUYRKSA-N', 'IUPACName': '(3S)-3-amino-4-[[(2S)-1-methoxy-1-oxo-3-phenylpropan-2-yl]amino]-4-oxobutanoic acid', 'XLogP': -2.7}
{'CID': 11074431, 'MolecularFormula': 'C4H4KNO4S', 'MolecularWeight': 201.24, 'InChIKey': 'WBZFUFAFFUEMEI-UHFFFAOYSA-M', 'IUPACName': 'potassium;6-methyl-2,2-dioxo-1-oxa-2lambda6-thia-3-azanidacyclohex-5-en-4-one', 'XLogP': None}


### <ins> Finding pka using Pubchem from InchiKey String</ins>

Using the InChIKey string found by the previous code, the following function **returns the first pKa found on PubChem as a string**. This value, similarly to the boiling temperature, is a lot harder to find. This script uses a file by *Khoi Van* named **pka_lookup.py**, which requests the needed dictionary of strings from PubChem. This means it takes **quite a while to find the string**, but creating a database is in scope.

From the dictionary found, this code extracts the pKa value and returns it as a string, which will be converted to a float in the function of Chrfinder.py.

In [9]:
#inchikey of caffeine
inchikey_string = 'RYYVLZVUVIJVGH-UHFFFAOYSA-N'

def find_pka(inchikey_string):
    """Return the pKa entry PubChem reports for an InChIKey, or None.

    The InChIKey is passed to ``pka_lookup_pubchem`` with the "inchikey"
    identifier type. The raw lookup result is printed, then the value stored
    under its 'pKa' key is returned unchanged (a string such as '14' in the
    recorded cell output). None is returned when the lookup yields nothing
    or the result has no 'pKa' entry.
    """
    lookup_result = pka_lookup_pubchem(inchikey_string, "inchikey")
    print(lookup_result)

    # Guard clause: no result at all, or a result without a pKa entry.
    if lookup_result is None or 'pKa' not in lookup_result:
        return None
    return lookup_result['pKa']

find_pka(inchikey_string)

{'source': 'Pubchem', 'Pubchem_CID': '2519', 'pKa': '14', 'reference': 'https://www.sigmaaldrich.com/content/dam/sigma-aldrich/docs/Sigma-Aldrich/Product_Information_Sheet/c0750pis.pdf', 'Substance_CASRN': '58-08-2', 'Canonical_SMILES': 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C', 'Isomeric_SMILES': 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C', 'InChI': 'InChI=1S/C8H10N4O2/c1-10-4-9-6-5(10)7(13)12(3)8(14)11(6)2/h4H,1-3H3', 'InChIKey': 'RYYVLZVUVIJVGH-UHFFFAOYSA-N', 'IUPAC_Name': '1,3,7-trimethylpurine-2,6-dione'}


'14'

### <ins> Finding Boiling Temperature using Pubchem from name</ins>

Using the names in compound_list, the following function **returns the mean of the Celsius and Fahrenheit boiling temperatures found on PubChem as a float**. This value, similarly to the pKa, is a lot harder to find. This script uses a file by *Maxim Shevelev* named **pubchemprops.py**, which requests the needed dictionary of strings from PubChem. This means it takes **quite a while to find the string**, but creating a database is in scope.

From each string_value in text_dict (extracted from PubChem), this code extracts all the boiling points from the dictionary and returns their mean after converting Fahrenheit to Celsius. The output is a float rounded to 2 decimals.

In [12]:
def _boiling_points_in_celsius(text):
    """Return the temperatures found in *text* as a list of Celsius floats.

    At most one '°C' value and one '°F' value are taken from the string (the
    first of each). Fahrenheit values are converted to Celsius and rounded
    to 2 decimals.
    """
    # An optional sign followed by a decimal or an integer. The sign is
    # attached to both forms, so '-42 °C' is read as -42 and not 42. The
    # lookbehind refuses to start right after a digit or a dot, so the hyphen
    # in a range such as '100.5-102.5 °C' is not mistaken for a minus sign.
    number = r'(?<![\d.])([-+]?(?:\d*\.\d+|\d+))'
    values = []

    # Celsius values are used as they are.
    match_celsius = re.search(number + r'\s?°C', text)
    if match_celsius:
        values.append(float(match_celsius.group(1)))

    # Fahrenheit values are converted to Celsius before being collected.
    match_fahrenheit = re.search(number + r'\s?°F', text)
    if match_fahrenheit:
        fahrenheit_temp = float(match_fahrenheit.group(1))
        values.append(round((fahrenheit_temp - 32) * (5/9), 2))

    return values


def find_boiling_point(name):
    """Return the mean boiling point of *name* in degrees Celsius, or None.

    The 'Boiling Point' (and 'Vapor Pressure') sections are requested through
    ``get_second_layer_props`` and the raw result is printed. Every boiling
    point entry that carries a text value is scanned for '°C' and '°F'
    temperatures; Fahrenheit values are converted to Celsius. Entries that
    only provide a bare number without a text value are ignored.

    Args:
        name: Compound name; it is converted with ``str()`` before the lookup.

    Returns:
        The mean of all temperatures found, rounded to 2 decimals, or None
        when the result has no boiling point section or no temperature could
        be parsed from it.
    """
    text_dict = get_second_layer_props(str(name), ['Boiling Point', 'Vapor Pressure'])

    print(text_dict)

    Boiling_point_values = []
    # A compound without a boiling point section yields no values instead of
    # raising KeyError.
    for item in text_dict.get('Boiling Point') or []:
        markup = item.get('Value', {}).get('StringWithMarkup')
        if not markup:
            # No text value for this entry (or an empty list): nothing to parse.
            continue
        string_value = markup[0].get('String', '')
        Boiling_point_values.extend(_boiling_points_in_celsius(string_value))

    if not Boiling_point_values:
        return None
    return round(sum(Boiling_point_values) / len(Boiling_point_values), 2)

find_boiling_point("Caffeine")

{'Boiling Point': [{'ReferenceNumber': 2, 'Reference': ['National Toxicology Program, Institute of Environmental Health Sciences, National Institutes of Health (NTP). 1992. National Toxicology Program Chemical Repository Database. Research Triangle Park, North Carolina.'], 'Value': {'StringWithMarkup': [{'String': '352 °F at 760 mmHg (sublimes) (NTP, 1992)'}]}}, {'ReferenceNumber': 24, 'Reference': ['http://www.inchem.org/documents/icsc/icsc/eics0405.htm'], 'Value': {'Number': [178]}}, {'ReferenceNumber': 51, 'Description': 'PEER REVIEWED', 'Reference': ["O'Neil, M.J. (ed.). The Merck Index - An Encyclopedia of Chemicals, Drugs, and Biologicals. Cambridge, UK:  Royal Society of Chemistry, 2013., p. 289"], 'Value': {'StringWithMarkup': [{'String': '178 °C (sublimes)'}]}}, {'ReferenceNumber': 71, 'Value': {'StringWithMarkup': [{'String': '178 °C (sublimes)'}]}}], 'Vapor Pressure': [{'ReferenceNumber': 49, 'Value': {'StringWithMarkup': [{'String': '0.00000001 [mmHg]'}]}}, {'ReferenceNumbe

177.93