In [32]:
import os

# Read the OpenAI key from the environment instead of committing it to the
# notebook. `api` is None when OPENAI_API_KEY is unset, so export the variable
# before starting the kernel.
api = os.environ.get("OPENAI_API_KEY")

# System-role message sent with every request.
system_message = """
You are an expert in NMR spectroscopy and organic chemistry.
"""

# User-turn template. The placeholders {spectrum_HNMR}, {spectrum_CNMR} and
# {chemical_formula} are filled in when the chain is invoked. The
# "### Start answer ### ... ### End answer ###" markers are what the
# answer-extraction regex later in the notebook searches for.
user_prompt = """
Here are the peaks from the HNMR spectrum: {spectrum_HNMR}, and peaks from the CNMR spectrum: {spectrum_CNMR}.
The chemical formula is {chemical_formula}. What's the molecule? 
Think step-by-step, making extensive use of a scratchpad to record your thoughts. Consider finding ways to group related peaks 
together, and keep track of the stoichiometry and the amount of unassigned H atoms as you make provisional assignments.
Please combine HNMR and CNMR to determine molecule.
Format the final answer like this - 
### Scratchpad ### <scratchpad> ### Scratchpad ###
### Start answer ### <prediction> ### End answer ###
The prediction should only contain the name of the molecule and no other text or characters.

"""

In [26]:
import csv
import os

import pandas as pd
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
# from langchain_anthropic import ChatAnthropic
# from langchain_core.output_parsers import StrOutputParser

In [33]:
def call_openAI(spec_data1, spec_data2, chem_formula, system_message=system_message, user_prompt=user_prompt, 
             api_key = api,
             model = "gpt-4-0613"):
    """Ask an OpenAI chat model to identify a molecule from its NMR peaks.

    Args:
        spec_data1: Text substituted for ``{spectrum_HNMR}`` in the prompt.
        spec_data2: Text substituted for ``{spectrum_CNMR}`` in the prompt.
        chem_formula: Text substituted for ``{chemical_formula}`` in the prompt.
        system_message: System-role message template.
        user_prompt: User-role message template containing the three
            placeholders above.
        api_key: Key passed to ``ChatOpenAI``.
        model: Model name passed to ``ChatOpenAI`` (e.g. "gpt-3.5-turbo"
            is an alternative to the default).

    Returns:
        The model's reply as a plain string (via ``StrOutputParser``).
    """
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_message),
        ("user", user_prompt),
    ])

    # One client per call is enough; constructing it twice was redundant.
    llm = ChatOpenAI(api_key=api_key, model=model)

    # prompt -> model -> string: the parser unwraps the chat message content.
    chain = prompt | llm | StrOutputParser()

    return chain.invoke({
        "spectrum_HNMR": spec_data1,
        "spectrum_CNMR": spec_data2,
        "chemical_formula": chem_formula,
    })

In [34]:
# Path to the folder containing the files
folder_path = 'csvfiles_update'


def _read_spectrum_csv(path):
    """Return the data rows of a spectrum CSV as one string.

    The first row is treated as a header and skipped. Every remaining row is
    re-joined with ", " and terminated with a newline. A file with no rows
    yields an empty string.
    """
    with open(path, 'r', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader, None)  # Skip header; the default avoids StopIteration on an empty file
        return "".join(", ".join(row) + "\n" for row in csvreader)


# Dictionary to store HNMR data, CNMR data and the chemical formula for each CAS number
cas_data = {}

# Iterate over the filenames in the folder (sorted so runs are reproducible;
# os.listdir returns entries in arbitrary order)
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('_HNMR.csv'):
        continue

    # Extract CAS number and chemical formula from the filename, which is
    # split on '_' with the CAS number first and the formula second
    parts = filename.split('_')
    cas_number = parts[0]
    chemical_formula = parts[1]

    # Find the corresponding CNMR file; skip this entry if there is none
    cnmr_filename = filename.replace('HNMR', 'CNMR')
    cnmr_path = os.path.join(folder_path, cnmr_filename)
    if not os.path.exists(cnmr_path):
        continue

    data_string_hnmr = _read_spectrum_csv(os.path.join(folder_path, filename))
    data_string_cnmr = _read_spectrum_csv(cnmr_path)

    # Add HNMR and CNMR data to the dictionary
    if cas_number in cas_data:
        cas_data[cas_number]['spectrum_HNMR'] += data_string_hnmr
        cas_data[cas_number]['spectrum_CNMR'] += data_string_cnmr
    else:
        # 'chemical_formula' must be stored here: the API loop below reads
        # data['chemical_formula'] and would otherwise raise KeyError.
        cas_data[cas_number] = {
            'spectrum_HNMR': data_string_hnmr,
            'spectrum_CNMR': data_string_cnmr,
            'chemical_formula': chemical_formula,
        }
        
# Accumulates one (cas_number, raw model reply) tuple per compound
results = []

# Query the model once for every CAS number collected in cas_data
for cas_number, data in cas_data.items():
    hnmr_peaks = data['spectrum_HNMR']
    cnmr_peaks = data['spectrum_CNMR']
    formula = data['chemical_formula']
    output = call_openAI(spec_data1=hnmr_peaks, spec_data2=cnmr_peaks, chem_formula=formula)
    results.append((cas_number, output))

# Tabulate the replies: one row per CAS number, raw reply in 'guess'
df = pd.DataFrame(data=results, columns=['cas_number', 'guess'])

In [65]:
# Display the raw model replies stored in the 'guess' column (one per CAS number).
df['guess'].values

array(['### Scratchpad ### \nThe chemical formula C7H14O2 suggests that the molecule could be an ester, as the structure COO is common for esters. \n\nLooking at the CNMR, the peak at 166.64 ppm indicates a carbonyl carbon, which is consistent with our ester hypothesis. The peak at 61.47 ppm could correspond to a carbon bonded to an oxygen atom, again suggesting an ester. The peak at 14.09 ppm is indicative of a terminal methyl group (CH3). The peak at 41.71 ppm could correspond to a carbon in a chain or a carbon bonded to a heteroatom.\n\nThe HNMR data show three different types of protons: those around 4.2 ppm (1272.5, 1265.41, 1258.29, 1251.2 Hz) are likely to be protons on a carbon next to an oxygen atom, those around 3.361 ppm (1008.29 Hz) could be protons on a carbon in the middle of a chain, and those around 1.3 ppm (392.57, 385.51, 378.39 Hz) are likely to be methyl protons.\n\nCombining all these data, the molecule could be 2-ethylbutanoic acid ethyl ester, which would have a 

In [67]:
# Extract guess answer from output
import re

# DOTALL lets the captured answer span several lines; without it a multi-line
# answer produces no match.
_ANSWER_RE = re.compile(r'### Start answer ###\s*(.*?)\s*### End answer ###', re.DOTALL)


def _extract_answer(llm_output):
    """Return the text between the answer markers, or None if they are absent.

    The model does not always follow the requested output format, so a reply
    without the markers (or a non-string cell) yields None instead of raising
    AttributeError on a failed match.
    """
    if not isinstance(llm_output, str):
        return None
    match = _ANSWER_RE.search(llm_output)
    return match.group(1) if match else None


df['guess_answer'] = df['guess'].apply(_extract_answer)
df

Unnamed: 0,cas_number,guess,guess_answer
0,105-53-3,### Scratchpad ### \nThe chemical formula C7H1...,2-ethylbutanoic acid ethyl ester
1,13475-82-6,### Scratchpad ###\n\nThe molecular formula pr...,Dodecane
2,93-58-3,### Scratchpad ### \n\nThe chemical formula pr...,p-Toluic acid
3,52-52-8,### Scratchpad ###\nLet's get started with the...,Caprolactam
4,78-78-4,### Scratchpad ### \n\nGiven the chemical form...,Neopentane
5,591-76-4,### Scratchpad ### \nThe formula C7H16 indicat...,Heptane
6,108-38-3,### Scratchpad ### \n\nThe molecular formula g...,Styrene
7,124-19-6,### Scratchpad ### \n\nGiven chemical formula ...,"The molecule is 2,2,4-trimethylpentan-3-one"
8,107-83-5,"The formula C6H14 suggests an alkane, which is...",Hexane
9,540-84-1,### Scratchpad ###\n\nThe chemical formula giv...,Octane


In [69]:
# Load the reference candidate list; the merge cell below joins on its 'CAS'
# column and renames its 'Common name' column.
df2=pd.read_csv('CandidateList_HL.csv')

In [70]:
# Left-join the guesses onto the candidate list by CAS number, then discard the
# duplicate key column and relabel the reference name as 'True answer'.
merged_df = (
    df.merge(df2, how='left', left_on='cas_number', right_on='CAS')
      .drop(columns=['CAS'])
      .rename(columns={'Common name': 'True answer'})
)

merged_df

Unnamed: 0,cas_number,guess,guess_answer,molecular formula,True answer
0,105-53-3,### Scratchpad ### \nThe chemical formula C7H1...,2-ethylbutanoic acid ethyl ester,C7H14O2,Diethyl malonate
1,13475-82-6,### Scratchpad ###\n\nThe molecular formula pr...,Dodecane,C12H26,"Heptane, 2,2,4,6,6-pentamethyl-"
2,93-58-3,### Scratchpad ### \n\nThe chemical formula pr...,p-Toluic acid,C8H8O2,"Benzoic acid, methyl ester"
3,52-52-8,### Scratchpad ###\nLet's get started with the...,Caprolactam,C6H11NO2,"Cyclopentanecarboxylic acid, 1-amino-"
4,78-78-4,### Scratchpad ### \n\nGiven the chemical form...,Neopentane,C5H12,"Butane, 2-methyl-"
5,591-76-4,### Scratchpad ### \nThe formula C7H16 indicat...,Heptane,C7H16,"Hexane, 2-methyl-"
6,108-38-3,### Scratchpad ### \n\nThe molecular formula g...,Styrene,C8H10,"Benzene, 1,3-dimethyl-"
7,124-19-6,### Scratchpad ### \n\nGiven chemical formula ...,"The molecule is 2,2,4-trimethylpentan-3-one",C9H18O,Nonanol
8,107-83-5,"The formula C6H14 suggests an alkane, which is...",Hexane,C6H14,"Pentane, 2-methyl-"
9,540-84-1,### Scratchpad ###\n\nThe chemical formula giv...,Octane,C8H18,"Pentane, 2,2,4-trimethyl-"


In [71]:
# Write the merged guess / true-answer table to disk, omitting the row index.
merged_df.to_csv('merged_df.csv', index=False)