## Data Transfom

In [1]:
import pandas as pd

# Define function to read JCAMP-DX file
def read_jcampdx_file(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    return lines

# Define function to extract data from JCAMP-DX lines
def extract_data_from_jcampdx(lines):
    data_started = False
    data = []
    for line in lines:
        if '##XYDATA' in line:
            data_started = True
            continue
        if data_started:
            if '##END=' in line:
                break
            values = line.strip().split()
            data.append(values)
        
    return data

# Define function to export data to CSV
def export_to_csv(data, csv_path):
    # Define column names
    columns = ['Wavenumber (1/cm)'] + ['Transmittance_' + str(i) for i in range(1, len(data[0]))]
    df = pd.DataFrame(data, columns=columns)
    df = df.apply(pd.to_numeric)
    df.to_csv(csv_path, index=False)
    return csv_path

# Main function
def get_csv(jcampdx_file_path, csv_export_path):
    # Read JCAMP-DX file
    jcampdx_lines = read_jcampdx_file(jcampdx_file_path)
    # Extract data
    data = extract_data_from_jcampdx(jcampdx_lines) 
    # Export to CSV
    export_to_csv(data, csv_export_path)

def transform_jdx_to_dataframe(jdx_file_path, final_csv_path):
    # Define temporary CSV export path
    csv_export_path = "temp.csv"

    # Read the JCAMP-DX file and export to CSV
    get_csv(jdx_file_path, csv_export_path)
    Wave_Transmittance = pd.read_csv(csv_export_path)

    # Extract FIRSTX, LASTX, and NPOINTS
    FIRSTX = None
    LASTX = None
    NPOINTS = None

    with open(jdx_file_path, 'r') as file:
        jdx_file = file.readlines()
        for line in jdx_file:
            if line.startswith('##FIRSTX='):
                FIRSTX = float(line.split('=')[1])
            elif line.startswith('##LASTX='):
                LASTX = float(line.split('=')[1])
            elif line.startswith('##NPOINTS='):
                NPOINTS = int(line.split('=')[1])

    # Calculate X_inc
    X_inc = (LASTX - FIRSTX) / (NPOINTS - 1)

    # Initialize new DataFrame
    new_data = {'Wavenumber (1/cm)': [], 'Transmittance': []}

    # Iterate over rows
    for index, row in Wave_Transmittance.iterrows():
        wavenumber = row['Wavenumber (1/cm)']
        for col in Wave_Transmittance.columns[1:]:
            transmittance = row[col]
            new_data['Wavenumber (1/cm)'].append(wavenumber)
            new_data['Transmittance'].append(transmittance)
            wavenumber += X_inc

    # Create transformed DataFrame
    transformed_df = pd.DataFrame(new_data)
    transformed_df.to_csv(final_csv_path, index=False)
    return final_csv_path

# Example use-case
# final_csv_path = transform_jdx_to_dataframe("79-16-3-IR.jdx", "transformed_data1.csv")


In [3]:
import os

# Specify the path to your dataset folder
dataset_folder = "dataset"

# Get a list of all files in the dataset folder
files = os.listdir(dataset_folder)

# Print the list of files
print("Files in the dataset folder:")
for file in files:
    with open(file, 'r') as f:
        content = f.read()
        if '##TITLE=Spectrum not found.\n##END=\n' not in content:
            transform_jdx_to_dataframe(file,"transformeddata/" + file + ".csv")
            pd.read_csv(file + ".csv")
            print(file + ": file_added")
            # Your transformation logic here
        else:
            pd.DataFrame().to_csv("transformeddata/" +file + ".csv")
            print(file + ':Empty')

Files in the dataset folder:
C5H11NO:Empty
C6H11NO2:Empty
C13H18O2:Empty
C6H6O2:Empty
C8H18 : file_added
C9H18O : file_added
C6H14: file_added
C4H8O:Empty
C7H14O2: file_added
C8H10O2: file_added
C7H16:Empty
C7H7NO:Empty
C6H10O: file_added
C12H26:Empty
C6H10:Empty
C14H14O3:Empty
C5H12: file_added
C8H8O2: file_added
C6H12O2: file_added
C8H10: file_added


## Use your API key here

In [77]:
api = 'sk-proj-***'

## Basic Prompt Structure with input variables for data and chemical formula

In [4]:
system_message = """
You are an expert in NMR spectroscopy and organic chemistry.
"""

user_prompt = """
Here are the peaks from an NMR spectrum: {spectrum_data}.
The chemical formula is {chemical_formula}. What's the molecule? 
Think step-by-step, making extensive use of a scratchpad to record your thoughts. Consider finding ways to group related peaks 
together, and keep track of the stoichiometry and the amount of unassigned H atoms as you make provisional assignments.
Format the final answer like this - 
### Scratchpad ### <scratchpad> ### Scratchpad ###
### Start answer ### <prediction> ### End answer ###
The prediction should only contain the name of the molecule and no other text or cha

"""

## Call to OpenAI

In [5]:
api_key = 'sk*'

In [6]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import json

def call_openAI(spec_data, chem_formula, system_message=system_message, user_prompt=user_prompt, 
             api_key = api_key,
             model = "gpt-4-0613"):
             #"gpt-3.5-turbo"):

    llm = ChatOpenAI(api_key=api_key, model = model)

    prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("user", user_prompt)])

    llm = ChatOpenAI(api_key=api_key, model=model)

    parser = StrOutputParser()
    
    chain = prompt | llm | parser

    llm_output = chain.invoke({"spectrum_data": spec_data, "chemical_formula": chem_formula})
    return llm_output


## Call to Anthropic

In [7]:
pip install langchain_anthropic

Note: you may need to restart the kernel to use updated packages.


In [9]:
from langchain_anthropic import ChatAnthropic
from langchain_core.output_parsers import StrOutputParser

claude_api_key = "sk-*"
def call_claude(spec_data, chem_formula, system_message=system_message, user_prompt=user_prompt, 
             api_key = claude_api_key,
             model = 'claude-3-opus-20240229'):
             #"gpt-3.5-turbo"):

    llm = ChatOpenAI(api_key=api_key, model = model)

    prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("user", user_prompt)])

    llm = ChatAnthropic(api_key=claude_api_key, model=model)

    parser = StrOutputParser()
    
    chain = prompt | llm | parser

    llm_output = chain.invoke({"spectrum_data": spec_data, "chemical_formula": chem_formula})
    return llm_output


In [10]:
import pandas as pd

def select_nth_data_points(data, n,output_file):
    """
    Selects one out of every `n` data points from a CSV file and writes them to another CSV file.
    
    Prameters:
        file_path (str): The path to the input CSV file.
        n (int): The interval for selecting data points (e.g., 1 out of every `n` points). Defaults to 1 (include all data points).
        output_file (str): The path to the output CSV file. Defaults to 'selected_data.csv'.
    
    Returns values:
        pd.DataFrame: A DataFrame containing the selected data points.
    """
    # Read the input CSV file
    # data = pd.read_csv(file_path)

    # Select one out of every `n` data points
    selected_data = data.iloc[::n, :]

    # Write the selected data points to a new CSV file
    selected_data.to_csv(output_file, index=False)

    return selected_data

    ##select_nth_data_points("transformed_data.csv",5)


In [11]:
def extract_answer(analysis_string):
    start_pos = analysis_string.find("### Start answer ###") + len("### Start answer ###")
    end_pos = analysis_string.find("### End answer ###")

    answer = analysis_string[start_pos:end_pos].strip()
    return answer

In [13]:
import os
import csv

# Specify the path to your transformed data folder
transformed_folder = "transformeddata"

# Get a list of all files in the transformed data folder
files = os.listdir(transformed_folder)
print("GPT4:")
for file in files:
    if not file.endswith(".csv"):
        continue
    data = pd.read_csv(file)
    select_nth_data_points(data,10,'selected.csv')
    with open('selected.csv', 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)

        data_string = ""

        for row in csvreader:
            row_string = ", ".join(row)
            data_string += row_string + "\n"
    specs_data = data_string
    output_gpt4 = call_openAI(specs_data, file.split('.')[0])
    print(file.split('.')[0], ": ",extract_answer(output_gpt4))

GPT4:
C6H12O2 :  al formula C6H12O2, the compound could be an ester or a cyclic ether. The NMR data provided does not seem to be in a typical format for an NMR spectrum. Normally, NMR data would be presented with chemical shifts (in ppm) and integration values, which represent the relative number of protons causing each peak. The numbers provided here do not seem to represent chemical shifts or integrations. 

Without the correct NMR data, it's not possible to make a definitive assignment of the structure. However, one common molecule with the formula C6H12O2 is the ester ethyl butanoate (also known as ethyl butyrate). Another possibility could be the cyclic ether 1,4-dioxane.

Please provide the correct NMR data to get a more accurate prediction.
C6H6O2 :  didn't provide the peaks from the NMR spectrum. Could you please provide them so I can help you identify the molecule
C6H14 :  ovided here is a little bit confusing, as in typical NMR spectra, the chemical shift values (in ppm) are 

In [28]:
import os
import csv

# Specify the path to your transformed data folder
transformed_folder = "transformeddata"

# Get a list of all files in the transformed data folder
files = os.listdir(transformed_folder)
print("GPT4:")
for file in files:
    if not file.endswith(".csv"):
        continue
    data = pd.read_csv(file)
    select_nth_data_points(data,10,'selected.csv')
    with open('selected.csv', 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)

        data_string = ""

        for row in csvreader:
            row_string = ", ".join(row)
            data_string += row_string + "\n"
    specs_data = data_string
    output_gpt4 = call_openAI(specs_data, file.split('.')[0])
    print(file.split('.')[0], ": ",extract_answer(output_gpt4))

GPT4:
C6H12O2 :  eems to be a list of peaks from a mass spectrometry data rather than NMR spectroscopy due to the high values and decimal point precision of the peaks. NMR peaks usually fall within a range of 0 to 10 ppm (parts per million) and are associated with the type of hydrogen or carbon atom in the molecule, which can be correlated to the structure of the molecule. 

For a NMR spectroscopy data, you would typically see the chemical shift (usually between 0 and 10), multiplicity (s for singlet, d for doublet, t for triplet, q for quartet, etc.), and integration (how many hydrogens that signal represents). The chemical shift tells us what type of environment the hydrogen or carbon is in, the multiplicity tells us how many hydrogens are on the neighboring carbons, and the integration tells us how many hydrogens are represented by that signal.

If the data was indeed NMR spectroscopy data for the compound C6H12O2, we would expect to see signals representing different types of hydro

In [33]:
# I am getting connection error for Claude but the code works
"""
import os
import csv

# Specify the path to your transformed data folder
transformed_folder = "transformeddata"

# Get a list of all files in the transformed data folder
files = os.listdir(transformed_folder)
print("Claude:")
for file in files:
    if not file.endswith(".csv"):
        continue
    data = pd.read_csv(file)
    select_nth_data_points(data,10,'selected.csv')
    with open('selected.csv', 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)

        data_string = ""

        for row in csvreader:
            row_string = ", ".join(row)
            data_string += row_string + "\n"
    specs_data = data_string
    output_claude = call_claude(specs_data, file.split('.')[0])
    print(file.split('.')[0], ": ",extract_answer(output_claude))
"""

'\nimport os\nimport csv\n\n# Specify the path to your transformed data folder\ntransformed_folder = "transformeddata"\n\n# Get a list of all files in the transformed data folder\nfiles = os.listdir(transformed_folder)\nprint("Claude:")\nfor file in files:\n    if not file.endswith(".csv"):\n        continue\n    data = pd.read_csv(file)\n    select_nth_data_points(data,10,\'selected.csv\')\n    with open(\'selected.csv\', \'r\') as csvfile:\n        csvreader = csv.reader(csvfile)\n        next(csvreader)\n\n        data_string = ""\n\n        for row in csvreader:\n            row_string = ", ".join(row)\n            data_string += row_string + "\n"\n    specs_data = data_string\n    output_claude = call_claude(specs_data, file.split(\'.\')[0])\n    print(file.split(\'.\')[0], ": ",extract_answer(output_claude))\n'

## Output Parser

In [34]:
def extract_answer(analysis_string):
    start_pos = analysis_string.find("### Start answer ###") + len("### Start answer ###")
    end_pos = analysis_string.find("### End answer ###")

    answer = analysis_string[start_pos:end_pos].strip()
    return answer

print("GPT4:", extract_answer(output_gpt4) )
# print("Claude:", extract_answer(output_claude))

GPT4: ossible for me to give an accurate prediction without having the specific values of the peaks from the NMR spectrum. NMR spectra provide essential information about the structure and environment of atoms within a molecule, but without this data, I can't make an informed prediction. Could you please provide the peak values


In [35]:
import os
import shutil

# Define paths to datasets folders
nmr_folder = "LLMSpectroscopy/csvfiles_update"
#hnmr_folder = "HNMR_files"

# Create the folder for HNMR files if it doesn't exist
#if not os.path.exists(hnmr_folder):
#    os.makedirs(hnmr_folder)

# Get the list of NMR files
nmr_files = os.listdir(nmr_folder)

# Loop over each NMR file
for nmr_file in nmr_files:
    # Check if the file ends with "HNMR.csv"
    if nmr_file.endswith("HNMR.csv"):
        print(nmr_file)
        if not nmr_file.endswith(".csv"):
            continue
        data = pd.read_csv("LLMSpectroscopy/csvfiles_update/"+nmr_file)
        select_nth_data_points(data,10,'selected.csv')
        with open('selected.csv', 'r') as csvfile:
            csvreader = csv.reader(csvfile)
            next(csvreader)

            data_string = ""

            for row in csvreader:
                row_string = ", ".join(row)
                data_string += row_string + "\n"
        specs_data = data_string
        output_gpt4 = call_openAI(specs_data, nmr_file.split('_')[1])
        print(nmr_file.split('_')[1], ": ",extract_answer(output_gpt4))
        # Move the file to the HNMR folder

105-53-3_C7H14O2_Diethyl malonate_HNMR.csv
C7H14O2 :  
13475-82-6_C12H26_Heptane, 2,2,4,6,6-pentamethyl-_HNMR.csv
C12H26 :  m unable to help with this request because it's missing the actual NMR peaks data. NMR peaks are crucial to determine the structure of the molecule. Please provide the NMR spectrum data for further assistance
93-58-3_C8H8O2_Benzoic acid, methyl ester_HNMR.csv
C8H8O2 :  Insufficient information to make a prediction.
52-52-8_C6H11NO2_Cyclopentanecarboxylic acid, 1-amino-_HNMR.csv
C6H11NO2 :  Insufficient data to predict the molecule.
78-78-4_C5H12_Butane, 2-methyl-_HNMR.csv
C5H12 :  Pentane
591-76-4_C7H16_Hexane, 2-methyl-_HNMR.csv
C7H16 :  Insufficient data for accurate prediction.
108-38-3_C8H10_Benzene, 1,3-dimethyl-_HNMR.csv
C8H10 :  Ethylbenzene
124-19-6_C9H18O _Nonanol_HNMR.csv
C9H18O  :  Nonan-1-ol
107-83-5_C6H14_Pentane, 2-methyl-_HNMR.csv
C6H14 :  2,3-dimethylbutane
540-84-1_C8H18 _Pentane, 2,2,4-trimethyl-_HNMR.csv
C8H18  :  Without the specific NMR data, 