## Use your API key here

In [77]:
import os

# OpenAI API key used as the default `api_key` by the helper functions below.
# Prefer exporting OPENAI_API_KEY in the environment so the real key never gets
# committed with the notebook; the literal is only a fallback placeholder.
api = os.environ.get("OPENAI_API_KEY", 'sk-proj-***')

## Basic Prompt Structure with input variables for data and chemical formula

In [78]:
# System-role text sent with every request; it is a plain string with no
# template placeholders. The leading and trailing newlines are kept as-is.
system_message = (
    "\n"
    "You are an expert in NMR spectroscopy and organic chemistry.\n"
)

# User-role template. `{spectrum_data}` and `{chemical_formula}` are filled in
# when the chain is invoked. The "### Start answer ###" / "### End answer ###"
# markers requested here are what the output parser searches for, so keep the
# two in sync. The final instruction sentence was previously cut off mid-word;
# it is completed here so the model receives the whole formatting rule.
# Written as adjacent literals so the trailing spaces inside the prompt are
# visible and survive editors that strip trailing whitespace.
user_prompt = (
    "\n"
    "Here are the peaks from an NMR spectrum: {spectrum_data}.\n"
    "The chemical formula is {chemical_formula}. What's the molecule? \n"
    "Think step-by-step, making extensive use of a scratchpad to record your thoughts. Consider finding ways to group related peaks \n"
    "together, and keep track of the stoichiometry and the amount of unassigned H atoms as you make provisional assignments.\n"
    "Format the final answer like this - \n"
    "### Scratchpad ### <scratchpad> ### Scratchpad ###\n"
    "### Start answer ### <prediction> ### End answer ###\n"
    "The prediction should only contain the name of the molecule and no other text or characters.\n"
    "\n"
)

## Call to OpenAI

In [80]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import json

def call_openAI(spec_data, chem_formula, system_message=system_message, user_prompt=user_prompt,
                api_key=api,
                model="gpt-4-0613"):
    """Ask an OpenAI chat model to identify a molecule from spectrum peaks.

    Args:
        spec_data: Value substituted for ``{spectrum_data}`` in the user prompt.
        chem_formula: Value substituted for ``{chemical_formula}`` in the user prompt.
        system_message: System-role message text.
        user_prompt: User-role template containing the two placeholders above.
        api_key: Key passed to ``ChatOpenAI``.
        model: Model name passed to ``ChatOpenAI``.

    Returns:
        The output of the ``prompt | llm | StrOutputParser()`` chain.

    Note:
        The defaults for ``system_message``, ``user_prompt`` and ``api_key`` are
        bound when this ``def`` executes; re-run this cell after changing the
        module-level values for the new ones to take effect.
    """
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_message),
        ("user", user_prompt),
    ])

    # Build the client once (it was previously constructed twice, with the
    # first instance thrown away).
    llm = ChatOpenAI(api_key=api_key, model=model)

    chain = prompt | llm | StrOutputParser()
    return chain.invoke({"spectrum_data": spec_data, "chemical_formula": chem_formula})


## Call to Anthropic

In [81]:
from langchain_anthropic import ChatAnthropic
from langchain_core.output_parsers import StrOutputParser

import os

# Anthropic API key read by call_claude. The original literal was missing its
# closing quote, which made the whole cell a SyntaxError. Prefer exporting
# ANTHROPIC_API_KEY in the environment; the literal is only a placeholder.
claude_api_key = os.environ.get("ANTHROPIC_API_KEY", "replace with actual key")
def call_claude(spec_data, chem_formula, system_message=system_message, user_prompt=user_prompt,
                api_key=api,
                model='claude-3-opus-20240229'):
    """Ask an Anthropic chat model to identify a molecule from spectrum peaks.

    Args:
        spec_data: Value substituted for ``{spectrum_data}`` in the user prompt.
        chem_formula: Value substituted for ``{chemical_formula}`` in the user prompt.
        system_message: System-role message text.
        user_prompt: User-role template containing the two placeholders above.
        api_key: Not used by this function. It is kept (with its original
            default) only so existing call sites keep working; the key actually
            sent to Anthropic is the module-level ``claude_api_key``.
        model: Model name passed to ``ChatAnthropic``.

    Returns:
        The output of the ``prompt | llm | StrOutputParser()`` chain.
    """
    # NOTE(review): `api_key` defaults to the OpenAI key and is ignored here.
    # Consider making it default to None and falling back to `claude_api_key`
    # so callers can actually override the Anthropic key.
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_message),
        ("user", user_prompt),
    ])

    # Only the Anthropic client is needed. The previous version first built a
    # ChatOpenAI client with a Claude model name and immediately discarded it.
    llm = ChatAnthropic(api_key=claude_api_key, model=model)

    chain = prompt | llm | StrOutputParser()
    return chain.invoke({"spectrum_data": spec_data, "chemical_formula": chem_formula})


In [82]:
import os
import csv

# Read the spectrum CSV and flatten it to plain text for the prompt.
# newline="" lets the csv module do its own line-ending handling, as its
# documentation recommends for files passed to csv.reader.
with open("LLMSpectroscopy/dummy_dataset/1H-NMR-513-81-5.csv", 'r', newline="") as csvfile:
    csvreader = csv.reader(csvfile)
    # Skip the first row (presumably the column header); the None default
    # keeps an empty file from raising StopIteration.
    next(csvreader, None)

    # One "a, b, c" line per remaining CSV row, each terminated by "\n".
    data_string = "".join(", ".join(row) + "\n" for row in csvreader)
specs_data = data_string

In [83]:
# Send the flattened spectrum text and the formula to the OpenAI helper using
# its default model; the bare name on the last line displays the raw reply.
output_gpt4 = call_openAI(spec_data=specs_data, chem_formula='C6H10')
output_gpt4


"### Scratchpad ###\n\nThe chemical formula is C6H10. This tells us that there are 6 carbon atoms and 10 hydrogen atoms in the molecule.\n\nThe NMR spectrum shows several peaks, which are likely due to different types of protons in the molecule. \n\nI'll start off by grouping related peaks together. \n\nThe peaks at 452.75, 451.94, 446.44, 445.44, 444.06, and 443.13 all have similar shifts and intensities, which suggests they may be due to the same type of proton. This could suggest a symmetrical structure.\n\nThe peaks at 172.19 and 171.75 also have similar shifts and intensities, which again suggests they are due to the same type of proton. \n\nNow, let's try to assign these peaks to the protons in the molecule.\n\nThe chemical shift of protons can give us a hint about the type of environment they are in. The peaks at 452.75, 451.94, 446.44, 445.44, 444.06, and 443.13 have high shifts, which could suggest they are in an electron-withdrawing environment, such as a carbonyl group.\n\nT

In [74]:
# Same query against the Anthropic helper with its default model; the bare
# name on the last line displays the raw reply.
output_claude = call_claude(spec_data=specs_data, chem_formula='C6H10')
output_claude

'### Scratchpad ###\nChemical formula C6H10 - benzene is ruled out (not enough H). Likely a cyclic or linear alkene.\n\nPeaks (ppm, # H):\n5.056  (1H)\n5.047  (1H)\n4.985  (1H)\n4.974  (1H)  \n4.959  (1H)\n4.948  (1H)\n1.923  (5H)\n1.918  (5H)\n\n6 H from 4.9-5.1 ppm - likely alkene protons\n10 H from 1.9-1.95 ppm - likely alkyl protons \n\nRatios:\n6H : 10H alkene to alkyl\nAlkene protons split into 6 separate peaks, alkyl into 2 (5H each)\n\nPossibilities:\n1) 1,5-hexadiene (matches 6 alkene H, 4 alkyl H - 6 too few)  \n2) 2,4-hexadiene (matches 6 alkene H, 4 alkyl H - 6 too few)\n3) methylcyclopentene (5 alkene H - 1 too few, 5 alkyl - 5 too few)\n4) methylenecyclopentane (2 alkene H - 4 too few, 8 alkyl - 2 too few)\n\nNone match perfectly. Methylcyclopentene is closest. The 6 alkene peaks could be the 5 ring protons next to the double bond, with long range coupling causing the extra splitting. The 10 alkyl protons match the 5 ring methylenes + the 5 methyl protons.\n### Scratchpad

## Output Parser

In [84]:
def extract_answer(analysis_string):
    """Return the text between the start-answer and end-answer markers.

    Args:
        analysis_string: Raw model output expected to contain
            ``### Start answer ### <prediction> ### End answer ###``.

    Returns:
        The whitespace-stripped text between the two markers. If the start
        marker is absent (or the input is empty/None) an empty string is
        returned. If only the end marker is absent, everything after the start
        marker is returned.
    """
    start_marker = "### Start answer ###"
    end_marker = "### End answer ###"

    if not analysis_string:
        return ""

    marker_pos = analysis_string.find(start_marker)
    if marker_pos == -1:
        # str.find signals "not found" with -1; adding the marker length to it
        # would yield a bogus offset and slice arbitrary text.
        return ""
    start_pos = marker_pos + len(start_marker)

    # Search for the end marker only after the start marker so an earlier
    # stray occurrence cannot produce an empty or inverted slice.
    end_pos = analysis_string.find(end_marker, start_pos)
    if end_pos == -1:
        # Truncated reply: keep the rest instead of slicing to index -1,
        # which would silently drop the final character.
        end_pos = len(analysis_string)

    return analysis_string[start_pos:end_pos].strip()

# Parse each raw reply, then show the extracted molecule names.
gpt4_answer = extract_answer(output_gpt4)
print("GPT4:", gpt4_answer)
claude_answer = extract_answer(output_claude)
print("Claude:", claude_answer)

GPT4: Cyclohexene
Claude: methylcyclopentene
