In [1]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
# from langchain_anthropic import ChatAnthropic
# from langchain_core.output_parsers import StrOutputParser
import csv
import os
import pandas as pd
import re

In [2]:
api = 'sk-proj'# use your own keys

system_message = """
You are an expert in NMR spectroscopy and organic chemistry.
"""

user_prompt = """
Here are the peaks from the HNMR spectrum: {spectrum_HNMR}, and peaks from the CNMR spectrum: {spectrum_CNMR}.
The chemical formula is {chemical_formula}. What's the molecule name? 
Think step-by-step, making extensive use of a scratchpad to record your thoughts. Consider finding ways to group related peaks 
together, and keep track of the stoichiometry and the amount of unassigned H atoms as you make provisional assignments.
Please solely look at the HNMR to determine molecule.
Format the final answer like this - 
### Scratchpad ### <scratchpad> ### Scratchpad ###
### Start answer ### <prediction> ### End answer ###
The prediction should only contain the name of the molecule and no other text
"""

In [3]:
# Path to the folder containing the files
folder_path = 'csvfiles_update'

# Dictionary to store HNMR and CNMR data for each CAS number
cas_data = {}

# Iterate over the filenames in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('_HNMR.csv'):
        # Extract CAS number and chemical formula from the filename
        parts = filename.split('_')
        cas_number = parts[0]
        chemical_formula = parts[1]
        
        # Read HNMR spectrum data from the file
        with open(os.path.join(folder_path, filename), 'r') as csvfile:
            csvreader = csv.reader(csvfile)
            next(csvreader)  # Skip header
            
            data_string_hnmr = ""
              
            for row in csvreader:
                row_string = ", ".join(row)
                data_string_hnmr += row_string + "\n"
                
        # Find corresponding CNMR file
        cnmr_filename = filename.replace('HNMR', 'CNMR')
        cnmr_path = os.path.join(folder_path, cnmr_filename)
        if not os.path.exists(cnmr_path):
            continue
            
        # Read CNMR spectrum data from the file
        with open(cnmr_path, 'r') as csvfile:
            csvreader = csv.reader(csvfile)
            next(csvreader)  # Skip header
            
            data_string_cnmr = ""
              
            for row in csvreader:
                row_string = ", ".join(row)
                data_string_cnmr += row_string + "\n"
        
       # Add HNMR and CNMR data to the dictionary
        if cas_number in cas_data:
            cas_data[cas_number]['spectrum_HNMR'] += data_string_hnmr
            cas_data[cas_number]['spectrum_CNMR'] += data_string_cnmr
        else:
            cas_data[cas_number] = {
                'spectrum_HNMR': data_string_hnmr, 
                'spectrum_CNMR': data_string_cnmr,
                'chemical_formula': chemical_formula  # Ensure chemical formula is stored
            }

# List to store the results
results = []

def call_openAI(spec_data1, spec_data2, chem_formula, system_message=system_message, user_prompt=user_prompt, 
             api_key = api,
             model = "gpt-4-0613"):
             #"gpt-3.5-turbo"):

    llm = ChatOpenAI(api_key=api_key, model = model)

    prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("user", user_prompt)])

    llm = ChatOpenAI(api_key=api_key, model=model)

    parser = StrOutputParser()
    
    chain = prompt | llm | parser

    llm_output = chain.invoke({"spectrum_HNMR": spec_data1, "spectrum_CNMR": spec_data2,  "chemical_formula": chem_formula})
    return llm_output

# Iterate over the CAS numbers and call the OpenAI API
for cas_number, data in cas_data.items():
    output = call_openAI(spec_data1=data['spectrum_HNMR'],  spec_data2=data['spectrum_CNMR'],
                         chem_formula = data['chemical_formula'])
    results.append((cas_number, output))

# Create a DataFrame from the results
df = pd.DataFrame(results, columns=['cas_number', 'guess'])


In [4]:
import re

# Define a function to extract the guess answer
def extract_guess_answer(x):
    match = re.search(r'### Start answer ###\s*(.*?)\s*### End answer ###', x)
    if match:
        return match.group(1)
    else:
        return None  # or any other fallback value you prefer

# Apply the function to create a new column 'guess_answer'
df['guess_answer'] = df['guess'].apply(extract_guess_answer)

# Display the DataFrame with the new 'guess_answer' column
print(df)

    cas_number                                              guess  \
0     105-53-3  ### Scratchpad ###\n\nThe molecule is C7H14O2,...   
1   13475-82-6  ### Scratchpad ### \n\nConsidering the chemica...   
2      93-58-3  ### Scratchpad ### \n\nThe chemical formula C8...   
3      52-52-8  ### Scratchpad ###\nThe chemical formula is C6...   
4      78-78-4  ### Scratchpad ### \nThe chemical formula C5H1...   
5     591-76-4  ### Scratchpad ### \n\nThe given formula is C7...   
6     108-38-3  ### Scratchpad ### \nThe molecular formula is ...   
7     124-19-6  ### Scratchpad ### \nThe chemical formula is C...   
8     107-83-5  ### Scratchpad ### \nThe chemical formula C6H1...   
9     540-84-1  ### Scratchpad ### \nGiven the molecular formu...   
10  22204-53-1  ### Scratchpad ### \nFirstly, taking a look at...   
11    513-81-5  ### Scratchpad ### \nThe chemical formula is C...   
12    123-72-8  ### Scratchpad ### \nLooking at the HNMR spect...   
13    620-02-0  ### Scratchpad ###

In [5]:
df['guess'].values


array(['### Scratchpad ###\n\nThe molecule is C7H14O2, which indicates that there is likely an ester or carboxylic acid functional group present due to the presence of an oxygen atom. The CNMR spectrum shows 4 unique carbon environments, meaning that there is symmetry in the molecule. The most deshielded carbon at 166.64 ppm is likely a carbonyl carbon.\n\nLooking at the HNMR spectrum, there are four distinct sets of protons. The most deshielded set at 4.242, 4.218, 4.194, and 4.171 ppm are likely the protons next to the carbonyl group. The set of protons at 3.361 ppm could be the protons next to an oxygen atom. The two sets of protons at 1.309 and 1.285 ppm are likely the protons on the rest of the carbon chain.\n\nThe molecular formula C7H14O2 also suggests that there might be a CH3 group (since the number of hydrogens is even), which could correspond to the highest peak in the HNMR spectrum at 1.285 ppm.\n\nTaking all this into account, the molecule could be an ester with a four-car

In [6]:
import re

# Define a function to extract the guess answer
def extract_guess_answer(x):
    match = re.search(r'### Start answer ###\s*(.*?)\s*### End answer ###', x)
    if match:
        return match.group(1)
    else:
        return None  # or any other fallback value you prefer

# Apply the function to create a new column 'guess_answer'
df['guess_answer'] = df['guess'].apply(extract_guess_answer)

# Display the DataFrame with the new 'guess_answer' column
print(df)


    cas_number                                              guess  \
0     105-53-3  ### Scratchpad ###\n\nThe molecule is C7H14O2,...   
1   13475-82-6  ### Scratchpad ### \n\nConsidering the chemica...   
2      93-58-3  ### Scratchpad ### \n\nThe chemical formula C8...   
3      52-52-8  ### Scratchpad ###\nThe chemical formula is C6...   
4      78-78-4  ### Scratchpad ### \nThe chemical formula C5H1...   
5     591-76-4  ### Scratchpad ### \n\nThe given formula is C7...   
6     108-38-3  ### Scratchpad ### \nThe molecular formula is ...   
7     124-19-6  ### Scratchpad ### \nThe chemical formula is C...   
8     107-83-5  ### Scratchpad ### \nThe chemical formula C6H1...   
9     540-84-1  ### Scratchpad ### \nGiven the molecular formu...   
10  22204-53-1  ### Scratchpad ### \nFirstly, taking a look at...   
11    513-81-5  ### Scratchpad ### \nThe chemical formula is C...   
12    123-72-8  ### Scratchpad ### \nLooking at the HNMR spect...   
13    620-02-0  ### Scratchpad ###

In [7]:
df2=pd.read_csv('CandidateList_HL.csv')



In [11]:
import pandas as pd

# Your existing code for merging dataframes
# Assuming df and df2 are already defined and populated with your data
merged_df = pd.merge(df, df2, how='left', left_on='cas_number', right_on='CAS')

# Drop the redundant 'CAS' column
merged_df.drop(columns=['CAS'], inplace=True)

# Rename the common name column to 'True answer'
merged_df.rename(columns={'Common name': 'True answer'}, inplace=True)

# Path to save the DataFrame to your Mac desktop
desktop_path = "/Users/username/Desktop/merged_dataframe.csv"  # Replace 'your_username' with your actual username

# Save the DataFrame to the specified path
merged_df.to_csv(desktop_path, index=False)

# Output the merged DataFrame
print(merged_df)

    cas_number                                              guess  \
0     105-53-3  ### Scratchpad ###\n\nThe molecule is C7H14O2,...   
1   13475-82-6  ### Scratchpad ### \n\nConsidering the chemica...   
2      93-58-3  ### Scratchpad ### \n\nThe chemical formula C8...   
3      52-52-8  ### Scratchpad ###\nThe chemical formula is C6...   
4      78-78-4  ### Scratchpad ### \nThe chemical formula C5H1...   
5     591-76-4  ### Scratchpad ### \n\nThe given formula is C7...   
6     108-38-3  ### Scratchpad ### \nThe molecular formula is ...   
7     124-19-6  ### Scratchpad ### \nThe chemical formula is C...   
8     107-83-5  ### Scratchpad ### \nThe chemical formula C6H1...   
9     540-84-1  ### Scratchpad ### \nGiven the molecular formu...   
10  22204-53-1  ### Scratchpad ### \nFirstly, taking a look at...   
11    513-81-5  ### Scratchpad ### \nThe chemical formula is C...   
12    123-72-8  ### Scratchpad ### \nLooking at the HNMR spect...   
13    620-02-0  ### Scratchpad ###

In [12]:
merged_df.to_csv('merged_df.csv', index=False)


In [13]:
df

Unnamed: 0,cas_number,guess,guess_answer
0,105-53-3,"### Scratchpad ###\n\nThe molecule is C7H14O2,...",Butyl propionate
1,13475-82-6,### Scratchpad ### \n\nConsidering the chemica...,Dodecane
2,93-58-3,### Scratchpad ### \n\nThe chemical formula C8...,p-Toluic acid
3,52-52-8,### Scratchpad ###\nThe chemical formula is C6...,"2,2,4-Trimethylpentanamide"
4,78-78-4,### Scratchpad ### \nThe chemical formula C5H1...,Pentane
5,591-76-4,### Scratchpad ### \n\nThe given formula is C7...,"2,2,3-trimethylbutane"
6,108-38-3,### Scratchpad ### \nThe molecular formula is ...,m-Xylene
7,124-19-6,### Scratchpad ### \nThe chemical formula is C...,Nonanol
8,107-83-5,### Scratchpad ### \nThe chemical formula C6H1...,Hexane
9,540-84-1,### Scratchpad ### \nGiven the molecular formu...,Octane
