In [None]:
import os

import pandas as pd
from openai import OpenAI
from rdkit import Chem
from rdkit.Chem import AllChem

In [None]:
# Initialize OpenAI client.
# The key is taken from the OPENAI_API_KEY environment variable so that no real
# credential has to be written into the notebook; the original placeholder is
# kept as the fallback value when the variable is not set.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY', 'YOUR_OPENAI_KEY'))

## Evaluating Molecular Structure Extraction Models Using LLMs as a Judge 

This code compares SMILES strings extracted by different models against the ground truth from the IMAGE2SMILES dataset (as presented in 'Image2SMILES: Transformer-based Molecular Optical Recognition Engine'). The workflow involves:

1. Merging the extracted SMILES with their corresponding ground truth values in a DataFrame
2. Using GPT-4o to evaluate the accuracy of the extracted SMILES by comparing them to the ground truth
3. Recording GPT-4o's binary assessments ("Yes"/"No") in a separate column
4. Calculating the frequency of correct and incorrect extractions based on these assessments

In [None]:
# Function to merge two csv files by the image name
def merge_csv_by_filename(file1, file2, output_file, suffix2='_model2'):
    """Join two CSV files on their ``Image`` column and write the result to disk.

    Args:
        file1: Path of the first CSV (the ground truth). Its ``Image`` values
            are used as-is.
        file2: Path of the second CSV (model output). The literal text
            ``.png`` is removed from its ``Image`` values before joining.
        output_file: Path the merged CSV is written to (without the index).
        suffix2: Suffix given to columns of ``file2`` whose names collide with
            columns of ``file1``; the ``file1`` side gets ``_ground_trouth``.

    Returns:
        None. The merged table is only written to ``output_file``.
    """
    # load the csv files
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # delete the .png from the Image column.
    # regex=False is required: with regex matching (the default before
    # pandas 2.0) the "." would match any character, so a name such as
    # "axpngb" would be mangled and silently drop out of the join.
    df2['Image'] = df2['Image'].str.replace('.png', '', regex=False)

    # add the ground truth to the dataframe (pd.merge defaults to an inner join,
    # so only images present in both files are kept)
    merged_df = pd.merge(df1, df2, on="Image", suffixes=('_ground_trouth', suffix2))

    # save the merged dataframe
    merged_df.to_csv(output_file, index=False)
    print(f"Die Dateien wurden erfolgreich zusammengeführt und in '{output_file}' gespeichert.")

In [None]:
# Join the ground-truth file with each model's extraction results.
# Each entry is (model results csv, merged output csv, column suffix for the model).
for model_csv, merged_csv, model_suffix in [
    ('smiles_results.csv', 'Metriken_gpt_4o_mini_smiles.csv', '_gpt_4o_mini'),
    ('smiles_results_gpt_4o.csv', 'Metriken_gpt_4o.csv', '_gpt_4o'),
    ('gemini_smiles_results.csv', 'Metriken_gemini_smiles.csv', '_gemini'),
    ('gemini_smiles_experimental_results.csv', 'Metriken_gemini_experimental_1206_smiles.csv', '_gemini_experimental_1206'),
]:
    merge_csv_by_filename('FG-SMILES-test-dataset.csv', model_csv, merged_csv, model_suffix)

### GPT-4o-mini

In [None]:
# Read the merged GPT-4o-mini table and discard the DOI and Page columns
df_gpt_4o_mini = (
    pd.read_csv('Metriken_gpt_4o_mini_smiles.csv')
    .drop(columns=['DOI', 'Page'])
)

# display the frame as the cell output
df_gpt_4o_mini

### GPT-4o

In [None]:
# Read the merged GPT-4o table and discard the DOI and Page columns
df_gpt_4o = (
    pd.read_csv('Metriken_gpt_4o.csv')
    .drop(columns=['DOI', 'Page'])
)

# display the frame as the cell output
df_gpt_4o

### Gemini

In [None]:
# Read the merged Gemini table and discard the DOI and Page columns
df_gemini = (
    pd.read_csv('Metriken_gemini_smiles.csv')
    .drop(columns=['DOI', 'Page'])
)

# display the frame as the cell output
df_gemini

### Gemini-experimental-1206

In [None]:
# Read the merged Gemini-experimental-1206 table and discard the DOI and Page columns
df_gemini_experimental_1206 = (
    pd.read_csv('Metriken_gemini_experimental_1206_smiles.csv')
    .drop(columns=['DOI', 'Page'])
)

# display the frame as the cell output
df_gemini_experimental_1206

## GPT-4o as a Judge - Evaluation of the SMILES extraction 

In [None]:
# Description of the differences between SMILES and FG-SMILES for the LLM.
# The text explains the FG-SMILES extension (pseudo-atoms for substituents /
# R-groups and the `v` marker for variable R-group positions).
# NOTE(review): none of the prompts built in the cells shown here interpolate
# `description` — confirm whether it was meant to be part of the judge prompt.
description = """SMILES notation represents molecules, while Markush structures are molecular templates. 
There is no way of representing molecular templates in standard SMILES, so we designed a modified syntax. 
We named it FG-SMILES (functional groups smiles). This is an extension of standard SMILES, where a substituent or 
R-group can be written as a single pseudo-atom. If a substituent is a functional group, FG-SMILES can be translated 
to SMILES directly by replacing corresponding pseudo-atoms. An example:

SMILES: Cc1cc(C)c(-c2ccccc2)c(-c2ccc([N+](=O)[O-])cc2)c1

FG-SMILES: [Me]c1cc([Me])c(-[Ph])c(-c2ccc([NO2])cc2)c1

FG-SMILES notation allows describing variable R-group position. We add the v symbol to denote the variable R-group 
inside an aromatic system. For example, the template c1[vR’]cccc([R2])c1 represents the template in (Figure 6). 
Formally, this notation breaks SMILES grammar because the branching atom is inside the ring, but it represents the 
case when R-group is attached not to a specific place in the ring but to the ring itself."""

### GPT-4o-mini

In [None]:
# Bind the generic name `df` (iterated by the judging loop below) to the GPT-4o-mini table
df = df_gpt_4o_mini

In [None]:
# Verdicts of the judge model, keyed by image name.
# Created here so the cell works on a fresh kernel and cannot carry over
# answers collected for another model.
results = {}

# iterate over the rows of the dataframe
for row in df.itertuples(index=True, name="Pandas"):
    image = row.Image
    smiles = row.SMILES_gpt_4o_mini      # SMILES extracted by GPT-4o-mini
    fg_smiles = row.SMILES_ground_trouth  # ground-truth column from the merge
    print(f"Index: {image}, SMILES: {smiles}, FG-SMILES: {fg_smiles}")
    
    # Prompt for the judge model
    prompt = f"""
    Compare the following SMILES and FG-SMILES. Determine whether the molecules they represent are structurally equivalent. 
    Respond only with \"Yes\" or \"No\".
    
    SMILES: {smiles}
    FG-SMILES: {fg_smiles}
    """
    
    # send prompt to OpenAI (the judge is always gpt-4o)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
    )
    
    # answer from OpenAI, with surrounding whitespace removed
    answer = response.choices[0].message.content.strip()
    
    # store the verdict; a repeated image name overwrites the earlier entry
    results[image] = answer

# show results
print(results)

In [None]:
# Turn the {image: answer} mapping into a two-column frame
df_results_gpt_4o_mini = pd.DataFrame(list(results.items()), columns=["Image", "Result"])

# attach the judge's answers to the GPT-4o-mini table via the Image column
df_gpt_4o_mini = df_gpt_4o_mini.merge(df_results_gpt_4o_mini, on="Image")
df_gpt_4o_mini

In [None]:
# Write the GPT-4o-mini table, including the judge's answers, to disk without the index
df_gpt_4o_mini.to_csv(path_or_buf="Metriken_gpt_4o_mini_smiles_results.csv", index=False)

In [None]:
# Every answer that is not exactly "Yes" (case-sensitive) is counted as "No"
df_gpt_4o_mini.loc[df_gpt_4o_mini["Result"].ne("Yes"), "Result"] = "No"

# frequency of "Yes" and "No" for GPT-4o-mini
df_gpt_4o_mini["Result"].value_counts()

### GPT-4o

In [None]:
# Bind the generic name `df` (iterated by the judging loop below) to the GPT-4o table
df = df_gpt_4o

In [None]:
# Verdicts of the judge model, keyed by image name.
# Created here so the cell works on a fresh kernel and cannot carry over
# answers collected for another model.
results = {}

# iterate over the rows of the dataframe
for row in df.itertuples(index=True, name="Pandas"):
    image = row.Image
    # The GPT-4o table was merged with the suffix '_gpt_4o', so its extracted
    # SMILES live in SMILES_gpt_4o (not in the GPT-4o-mini column).
    smiles = row.SMILES_gpt_4o
    fg_smiles = row.SMILES_ground_trouth  # ground-truth column from the merge
    print(f"Index: {image}, SMILES: {smiles}, FG-SMILES: {fg_smiles}")
    
    # Prompt for the judge model
    prompt = f"""
    Compare the following SMILES and FG-SMILES. Determine whether the molecules they represent are structurally equivalent. 
    Respond only with \"Yes\" or \"No\".
    
    SMILES: {smiles}
    FG-SMILES: {fg_smiles}
    """
    
    # send prompt to OpenAI (the judge is always gpt-4o)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
    )
    
    # answer from OpenAI, with surrounding whitespace removed
    answer = response.choices[0].message.content.strip()
    
    # store the verdict; a repeated image name overwrites the earlier entry
    results[image] = answer

# show results
print(results)

In [None]:
# Turn the {image: answer} mapping into a two-column frame
df_results_gpt_4o = pd.DataFrame(list(results.items()), columns=["Image", "Result"])

# attach the judge's answers to the GPT-4o table via the Image column
df_gpt_4o = df_gpt_4o.merge(df_results_gpt_4o, on="Image")
df_gpt_4o

In [None]:
# Write the GPT-4o table, including the judge's answers, to disk without the index
df_gpt_4o.to_csv(path_or_buf="Metriken_gpt_4o_smiles_results.csv", index=False)

In [None]:
# Every answer that is not exactly "Yes" (case-sensitive) is counted as "No"
df_gpt_4o.loc[df_gpt_4o["Result"].ne("Yes"), "Result"] = "No"

# frequency of "Yes" and "No" for GPT-4o
df_gpt_4o["Result"].value_counts()

### Gemini

In [None]:
# Bind the generic name `df` (iterated by the judging loop below) to the Gemini table
df = df_gemini

In [None]:
# Verdicts of the judge model, keyed by image name.
# Created here so the cell works on a fresh kernel and cannot carry over
# answers collected for another model.
results = {}

# iterate over the rows of the dataframe
for row in df.itertuples(index=True, name="Pandas"):
    image = row.Image
    # The Gemini table was merged with the suffix '_gemini', so its extracted
    # SMILES live in SMILES_gemini (not in the GPT-4o-mini column).
    smiles = row.SMILES_gemini
    fg_smiles = row.SMILES_ground_trouth  # ground-truth column from the merge
    print(f"Index: {image}, SMILES: {smiles}, FG-SMILES: {fg_smiles}")
    
    # Prompt for the judge model
    prompt = f"""
    Compare the following SMILES and FG-SMILES. Determine whether the molecules they represent are structurally equivalent. 
    Respond only with \"Yes\" or \"No\".
    
    SMILES: {smiles}
    FG-SMILES: {fg_smiles}
    """
    
    # send prompt to OpenAI (the judge is always gpt-4o)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
    )
    
    # answer from OpenAI, with surrounding whitespace removed
    answer = response.choices[0].message.content.strip()
    
    # store the verdict; a repeated image name overwrites the earlier entry
    results[image] = answer

# show results
print(results)

In [None]:
# Turn the {image: answer} mapping into a two-column frame
df_results_gemini = pd.DataFrame(list(results.items()), columns=["Image", "Result"])

# attach the judge's answers to the Gemini table via the Image column
df_gemini = df_gemini.merge(df_results_gemini, on="Image")

In [None]:
# Write the Gemini table, including the judge's answers, to disk without the index
df_gemini.to_csv(path_or_buf="Metriken_gemini_smiles_results.csv", index=False)

In [None]:
# Set every result that is not exactly "Yes" to "No".
# The mask must be computed from df_gemini itself; building it from the GPT-4o
# frame would relabel Gemini rows according to another model's verdicts.
df_gemini.loc[df_gemini["Result"] != "Yes", "Result"] = "No"

# count the results for Yes and No
df_gemini["Result"].value_counts()

### Gemini-experimental-1206

In [None]:
# Bind the generic name `df` (iterated by the judging loop below) to the Gemini-experimental-1206 table
df = df_gemini_experimental_1206

In [None]:
# Verdicts of the judge model, keyed by image name.
# Created here so the cell works on a fresh kernel and cannot carry over
# answers collected for another model.
results = {}

# iterate over the rows of the dataframe
for row in df.itertuples(index=True, name="Pandas"):
    image = row.Image
    # This table was merged with the suffix '_gemini_experimental_1206', so its
    # extracted SMILES live in SMILES_gemini_experimental_1206 (not in the
    # GPT-4o-mini column).
    smiles = row.SMILES_gemini_experimental_1206
    fg_smiles = row.SMILES_ground_trouth  # ground-truth column from the merge
    print(f"Index: {image}, SMILES: {smiles}, FG-SMILES: {fg_smiles}")
    
    # Prompt for the judge model
    prompt = f"""
    Compare the following SMILES and FG-SMILES. Determine whether the molecules they represent are structurally equivalent. 
    Respond only with \"Yes\" or \"No\".
    
    SMILES: {smiles}
    FG-SMILES: {fg_smiles}
    """
    
    # send prompt to OpenAI (the judge is always gpt-4o)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
    )
    
    # answer from OpenAI, with surrounding whitespace removed
    answer = response.choices[0].message.content.strip()
    
    # store the verdict; a repeated image name overwrites the earlier entry
    results[image] = answer

# show results
print(results)

In [None]:
# Turn the {image: answer} mapping into a two-column frame
df_results_gemini_experimental_1206 = pd.DataFrame(list(results.items()), columns=["Image", "Result"])

# attach the judge's answers to the Gemini-experimental-1206 table via the Image column
df_gemini_experimental_1206 = df_gemini_experimental_1206.merge(
    df_results_gemini_experimental_1206, on="Image"
)

In [None]:
# save the results to a csv file.
# The frame written here must be the Gemini-experimental-1206 table; writing
# df_gemini would store the other model's results under this file name.
df_gemini_experimental_1206.to_csv("Metriken_gemini_experimental_1206__smiles_results.csv", index=False)

In [None]:
# Every answer that is not exactly "Yes" (case-sensitive) is counted as "No"
df_gemini_experimental_1206.loc[df_gemini_experimental_1206["Result"].ne("Yes"), "Result"] = "No"

# frequency of "Yes" and "No" for Gemini-experimental-1206
df_gemini_experimental_1206["Result"].value_counts()