# HW4: LLM prompting for entity labeling
This notebook contains starter code for prompting an LLM API for the task of entity recognition. It has minimal text so you can easily copy it to **handin.py** when you submit.  Please read all the comments in the code as they contain important information.

In [None]:
# This code block just contains standard setup code for running in Python
import json
import string
import re
import time
from tqdm.auto import tqdm

# PyTorch imports
import torch
from torch.utils.data import DataLoader
import numpy as np

# Fix the random seed(s) for reproducibility
random_seed = 8942764
torch.random.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
np.random.seed(random_seed)

#!pip install ipytest
#!pip install transformers
#!pip install datasets
#!pip install evaluate
#!pip install seqeval
#!pip install ratelimit

from transformers import AutoTokenizer, BertModel, DefaultDataCollator

from datasets import load_dataset

import evaluate
from ratelimit import limits

# Translation table that deletes every character in string.punctuation (built once).
_PUNCT_DELETE_TABLE = str.maketrans('', '', string.punctuation)

# Just a helper function for efficiently removing punctuation from a string
def strip_punct(s):
    """Return `s` with every character from `string.punctuation` removed.

    `str.translate` needs a mapping from code points to replacements; passing
    `string.punctuation` itself is treated as a table indexed by ordinal, which
    leaves punctuation untouched. The table built by `str.maketrans` performs
    the intended deletion.

    Args:
        s (str): Input string.

    Returns:
        str: `s` without ASCII punctuation characters.
    """
    return s.translate(_PUNCT_DELETE_TABLE)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### GeminiAI version

In [None]:
!pip install python-gemini-api




In [None]:
import os

import google.generativeai as genai

# Configure your API key for Google's GenAI.
# The key is read from the GOOGLE_API_KEY environment variable so that no credential is
# stored in the notebook; if the variable is unset this falls back to an empty key.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

# Define the model (e.g., Gemini 1.5)
model = genai.GenerativeModel("gemini-1.5-flash")

# Create the messages similar to OpenAI's chat structure
messages = [
    {
        "role": "system",
        "content": """You will be given input text containing different types of entities that you will label.
        This is the list of entity types to label: Deity, Mythological_king, Cretaceous_dinosaur, Aquatic_mammal, Aquatic_animal, Goddess.
        Label the entities by surrounding them with tags like '<Cretaceous_dinosaur> Beipiaognathus </Cretaceous_dinosaur>'."""
    },
    {
        "role": "user",
        "content": """Text: Once paired in later myths with her Titan brother Hyperion as her husband, mild-eyed Euryphaessa, the far-shining one of the Homeric Hymn to Helios, was said to be the mother of Helios (the Sun), Selene (the Moon), and Eos (the Dawn)."""
    }
]

# Format the conversation context as input text: the messages are flattened into a single
# prompt string with one "role: content" entry per message.
input_text = "\n".join([f"{message['role']}: {message['content']}" for message in messages])

# Generate a response from the Google model
response = model.generate_content(input_text)

# Output the response text, followed by the full response object
print(response.text)
print(response)


### OpenAI version

 #### Prepare Chat Messages | Send a Request

In [None]:
!pip install openai --force-reinstall -v "openai==1.55.3"

In [None]:
import os

from openai import OpenAI

# Build the API client. The key is read from the OPENAI_API_KEY environment variable so that
# no credential is stored in the notebook; if the variable is unset this falls back to an
# empty key.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "")) #base_url="https://cmu.litellm.ai")

# Role names and the message-body key used when assembling chat messages.
USER_STR = "user"
SYSTEM_STR = "system"
MSG_STR = "content"

In [None]:
# Here is how you can use the API to prompt the OpenAI model.
# Docs: https://platform.openai.com/docs/api-reference
#
# One-shot prompt: task instructions, one worked Text/Labels example, then the text to label.
messages = [
    # Task instructions: the entity types to label and the tag format to use.
    # NOTE(review): the prompt text below contains the typo "enities"; it is sent to the model as-is.
    {'role': SYSTEM_STR, MSG_STR:
     """You will be given input text containing different types of entities that you will label.
     This is the list of entity types to label: Deity, Mythological_king, Cretaceous_dinosaur, Aquatic_mammal, Aquatic_animal, Goddess.
     Label the enities by surrounding them with tags like '<Cretaceous_dinosaur> Beipiaognathus </Cretaceous_dinosaur>'."""
     },
     # Worked example, input side.
     {'role': USER_STR, MSG_STR: """Text: Once paired in later myths with her Titan brother Hyperion as her husband, mild-eyed Euryphaessa, the far-shining one of the Homeric Hymn to Helios, was said to be the mother of Helios (the Sun), Selene (the Moon), and Eos (the Dawn)."""},
     # Worked example, labeled side. Note that it is sent under the system role.
     {'role': SYSTEM_STR, MSG_STR: """Labels: Once paired in later myths with her Titan brother <Deity> Hyperion </Deity> as her husband, mild-eyed Euryphaessa, the far-shining one of the Homeric Hymn to Helios, was said to be the mother of Helios (the Sun), <Goddess> Selene </Goddess> (the Moon), and <Goddess> Eos </Goddess> (the Dawn)."""},
     # The text to label; it ends with "Labels: ", presumably to cue the model to continue in the same format.
     {'role': USER_STR, MSG_STR: """Text: From her ideological conception, Taweret was closely grouped with (and is often indistinguishable from) several other protective hippopotamus goddesses: Ipet, Reret, and Hedjet.\nLabels: """}
]

# # This is where you provide the final prompt that we want the model to complete to give us the answer.
# message = f"""Text: From her ideological conception, Taweret was closely grouped with (and is often indistinguishable from) several other protective hippopotamus goddesses: Ipet, Reret, and Hedjet.
# Labels: """

# Send the conversation to the chat-completions endpoint. `seed` reuses the notebook-wide
# random_seed defined in the setup cell.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    temperature=0.5,
    seed=random_seed,
    messages=messages
)

# The generated text is the content of the first returned choice.
print(response.choices[0].message.content)

# You can also print out the usage, in number of tokens.
# Pricing is per input/output token, listed here: https://openai.com/pricing
print(f"Usage: {response.usage.prompt_tokens} input, {response.usage.completion_tokens} output, {response.usage.total_tokens} total tokens")

From her ideological conception, <Goddess> Taweret </Goddess> was closely grouped with (and is often indistinguishable from) several other protective <Goddess> hippopotamus </Goddess> goddesses: <Goddess> Ipet </Goddess>, <Goddess> Reret </Goddess>, and <Goddess> Hedjet </Goddess>.
Usage: 307 input, 87 output, 394 total tokens


#### Prepare data | Processing

In [None]:
# Load the train/dev/test splits from the JSONL files on the mounted Drive
from datasets import Dataset, ClassLabel, Sequence

data_splits = load_dataset(
    'json',
    data_files={
        'train': '/content/drive/MyDrive/Colab Notebooks/HW4/dinos_and_deities_train_bio.jsonl',
        'dev': '/content/drive/MyDrive/Colab Notebooks/HW4/dinos_and_deities_dev_bio_sm.jsonl',
        'test': '/content/drive/MyDrive/Colab Notebooks/HW4/dinos_and_deities_test_bio_nolabels.jsonl',
    },
)

# Build both directions of the label mapping: the labels file is whitespace-separated and a
# label's position in it is used as its integer id.
label_names_fname = "/content/drive/MyDrive/Colab Notebooks/HW4/dinos_and_deities_train_bio.jsonl.labels"
labels_int2str = []
with open(label_names_fname) as f:
    labels_int2str = f.read().split()
print(f"Labels: {labels_int2str}")
labels_str2int = {}
for i, l in enumerate(labels_int2str):
    labels_str2int[l] = i

# Collect the entity types themselves: drop the two-character "B-"/"I-" prefix and skip
# labels (such as "O") that have nothing left after the prefix is removed.
orig_labels = set()
for label in labels_str2int:
    orig_label = label[2:]
    if not orig_label:
        continue
    orig_labels.add(orig_label)
print(f"Orig labels: {orig_labels}")

# data_splits.cast_column("ner_tags", Sequence(ClassLabel(names=labels_int2str)))
print(data_splits)

In [None]:
# Let's inspect a single example
# `dev_example` is kept as a global: the few-shot prompting cell further down uses it as the query.
dev_example = data_splits['dev'][5]
# Pretty-print the full record (all fields) as indented JSON.
print(json.dumps(dev_example, indent=4))

{
    "para_index": 0,
    "title": "Hadingus",
    "doc_id": "Hadingus-0",
    "content": "Hadingus was one of the earliest legendary Danish kings according to Saxo Grammaticus' Gesta Danorum, where he has a detailed biography. Georges Dum\u00e9zil and others have argued that Hadingus was partially modelled on the god Nj\u00f6r\u00f0r.",
    "page_id": "4283756",
    "id": "Gy_0WYcB1INCf0UycBhm",
    "tokens": [
        "Hadingus",
        "was",
        "one",
        "of",
        "the",
        "earliest",
        "legendary",
        "Danish",
        "kings",
        "according",
        "to",
        "Saxo",
        "Grammaticus'",
        "Gesta",
        "Danorum,",
        "where",
        "he",
        "has",
        "a",
        "detailed",
        "biography.",
        "Georges",
        "Dum\u00e9zil",
        "and",
        "others",
        "have",
        "argued",
        "that",
        "Hadingus",
        "was",
        "partially",
        "modelled",
        "on",

In [None]:
# Ok, now let's make the prompting a bit more programmatic. First, implement a function that takes an example from
# the dataset, and converts it into a message for the model using the format we specified above.
# You might want to use the Python string "format" function to make this a bit easier, especially since
# you will be experimenting with different prompts later.
#
# TODO: implement this.
def get_message(example):
    """
    Build the user-message text for one dataset example.

    Args:
        example (dict): A single example from the dataset. Only its 'tokens'
                        entry (a list of strings) is read.

    Returns:
        str: The tokens joined by single spaces.
    """
    return " ".join(example["tokens"])

In [None]:
# Next we're going to implement a function to return the chat_history, but in order to do that we first need
# to be able to convert labeled examples from the dataset into a format that makes more sense for the model,
# in this case the HTML-style format we specified in the example. That's the task for this function: take
# an example from the dataset as input, and return a string that has tagged the text with labels in the given
# HTML-style format.
#
# TODO: implement this.
def _wrap_entity(entity_type, entity_tokens):
    """Render one entity as '<Type> tok tok </Type>'."""
    return f"<{entity_type}> {' '.join(entity_tokens)} </{entity_type}>"


def convert_bio_to_prompt(example):
    """
    Converts a labeled example from the dataset into an HTML-style formatted string
    with entities tagged according to the specified BIO labels.

    A "B-X" tag always opens a new entity of type X. An "I-X" tag extends the open
    entity when that entity also has type X; otherwise it is treated leniently as the
    start of a new entity. An entity that is still open at the end of the text is
    closed and emitted.

    Args:
        example (dict): A single example from the dataset.
                        Expected keys: 'tokens' (list of words) and 'ner_strings' (list of BIO labels).

    Returns:
        str: The text with each entity wrapped as '<Type> tokens </Type>', pieces separated
             by single spaces.
    """
    tokens = example["tokens"]
    ner_strings = example["ner_strings"]

    pieces = []          # finished output chunks: plain tokens and tagged entities
    entity_type = None   # type of the entity currently being collected, or None
    entity_tokens = []   # tokens of the entity currently being collected

    for token, tag in zip(tokens, ner_strings):
        if tag == "O":
            # Leaving an entity (if any): emit it, then emit the plain token.
            if entity_type is not None:
                pieces.append(_wrap_entity(entity_type, entity_tokens))
                entity_type, entity_tokens = None, []
            pieces.append(token)
            continue

        # Compare entity *types* (tag without its two-character prefix), so that
        # "B-X" followed by "I-X" is recognized as one entity.
        prefix, tag_type = tag[:2], tag[2:]
        if prefix == "I-" and tag_type == entity_type:
            entity_tokens.append(token)
        else:
            if entity_type is not None:
                pieces.append(_wrap_entity(entity_type, entity_tokens))
            entity_type, entity_tokens = tag_type, [token]

    # Emit an entity that runs to the end of the text.
    if entity_type is not None:
        pieces.append(_wrap_entity(entity_type, entity_tokens))

    return " ".join(pieces)

In [None]:
# Now we can write a function that takes the number of shots, dataset, list of entity types, and
# convert_bio_to_prompt function, and returns the chat_history (a list of maps) structured as in
# the example.
#
# TODO: implement this.
def get_chat_history(shots, dataset, entity_types_list, convert_bio_to_prompt_fn):
    """
    Generates a chat history formatted as a list of maps for few-shot learning.

    The history starts with a system message holding the task instructions, followed by
    one pair of messages per shot: a "user" message with the raw text and a "system"
    message with the labeled text (the same roles as the hand-written example prompt).

    Args:
        shots (int): Number of examples to include in the chat history (few-shot examples).
                     At most len(dataset) examples are used.
        dataset: Indexable collection of examples; each example needs a 'tokens' list plus
                 whatever `convert_bio_to_prompt_fn` reads.
        entity_types_list (iterable of str): Entity types to list in the system prompt.
                     A list/tuple keeps its order; a set is sorted so the prompt is the
                     same on every run.
        convert_bio_to_prompt_fn (function): Function that converts a labeled example to the
                     tagged prompt format.

    Returns:
        list: Chat history structured as a list of dictionaries with "role" and "content".
    """
    # Sets of strings iterate in an order that can change between interpreter runs,
    # which would make the prompt (and therefore the results) irreproducible.
    if isinstance(entity_types_list, (set, frozenset)):
        entity_types = sorted(entity_types_list)
    else:
        entity_types = list(entity_types_list)

    # Create the system prompt
    system_prompt = {
        "role": "system",
        "content": (
            "You will be given input text containing different types of entities that you will label.\n"
            f"This is the list of entity types to label: {', '.join(entity_types)}.\n"
            "Label the entities by surrounding them with tags like '<Entity_Type> Entity </Entity_Type>'."
        )
    }

    # Initialize the chat history with the system prompt
    chat_history = [system_prompt]

    # Add the specified number of examples (shots) from the dataset
    for i in range(min(shots, len(dataset))):
        example = dataset[i]

        # Raw text of the example
        chat_history.append({
            "role": "user",
            "content": " ".join(example["tokens"])
        })

        # Labeled version of the same text
        chat_history.append({
            "role": "system",
            "content": f"{convert_bio_to_prompt_fn(example)}"
        })

    return chat_history


In [None]:
''' Gemini
import json
import random

# Constants for roles
USER_STR = "user"
SYSTEM_STR = "system"
MSG_STR = "content"

# Example number of shots and random seed
num_shots = 20

# Functions that need to be defined beforehand
# get_chat_history, get_message, data_splits, orig_labels, convert_bio_to_prompt, dev_example
# Assuming these are already defined

# Get chat history (example function provided separately)
chat_history = get_chat_history(num_shots, data_splits['train'], orig_labels, convert_bio_to_prompt)

# Add the current user message to the chat history
message = {'role': USER_STR, MSG_STR: get_message(dev_example)}
chat_history.append(message)

# Combine the chat history into a single input prompt for Google's API
combined_prompt = "\n".join([f"{msg['role']}: {msg[MSG_STR]}" for msg in chat_history])

# Print the combined prompt for debugging
print("Combined Prompt:")
print(combined_prompt)

# Generate a response using the Google model
response = genai.GenerativeModel("gemini-1.5-flash").generate_content(combined_prompt)

# Output the generated response
print("Generated Response:")
print(response.text)
'''

In [None]:
## OpenAI
# Role names and message-body key (re-declared here with the same values as in the client-setup cell).
USER_STR = "user"
SYSTEM_STR = "system"
MSG_STR = "content"

# Now we can put all of those together to prompt the model more automagically!
# Number of labeled training examples to place in the prompt.
num_shots = 20

# Few-shot history built from the training split, followed by the dev example as the query.
chat_history = get_chat_history(num_shots, data_splits['train'], orig_labels, convert_bio_to_prompt)
message = {'role': USER_STR, MSG_STR: get_message(dev_example)}
chat_history.append(message)
# Dump the full prompt for inspection (this output grows with num_shots).
print(json.dumps(chat_history, indent=4))

response = client.chat.completions.create(
    model="gpt-4o-mini",
    temperature=0.5,
    seed=random_seed,
    messages=chat_history
)

# The generated labeling is the content of the first returned choice.
print(response.choices[0].message.content)

[
    {
        "role": "system",
        "content": "You will be given input text containing different types of entities that you will label.\nThis is the list of entity types to label: Cretaceous_dinosaur, Mythological_king, Goddess, Deity, Aquatic_mammal, Aquatic_animal.\nLabel the entities by surrounding them with tags like '<Entity_Type> Entity </Entity_Type>'."
    },
    {
        "role": "user",
        "content": "Mahakala is based on IGM 100/1033, a partial skeleton including skull bones, vertebrae, limb bones, and portions of the pelvis and shoulder girdle. Although this individual was small, comparable in size to Archaeopteryx, Caudipteryx, and Mei, it was close to adulthood. This genus can be distinguished from other paravians (dromaeosaurids, troodontids, and birds) by details of the ulna, thighbone, ilium, and tail vertebrae. Like Archaeopteryx and derived dromaeosaurids, but unlike basal troodontids and other dromaeosaurids, the middle (third) metatarsal was not compres

In [None]:
'''Function to call Gemini API
def call_api_gemini(shots, example):
    """
    Calls the Google Gemini API with the specified shots and example.

    Args:
        shots (int): Number of few-shot examples to include in the chat history.
        example (dict): Example input for the current prediction.

    Returns:
        str: The generated content from the Google Gemini API.
    """
    success = False
    #max_retries = 3  # Define maximum retries
    #retry_count = 0  # Track retry attempts

    while not success:
        try:
            # Prepare chat history with shots (convert OpenAI-style chat history to a Gemini-friendly prompt)
            chat_history = get_chat_history(shots, data_splits['train'], orig_labels, convert_bio_to_prompt)
            message = get_message(example)

            # Combine chat history and current example into a single prompt
            combined_prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in chat_history])
            combined_prompt += f"\nuser: {message}"  # Add the user query at the end

            # Call Gemini API
            response = genai.GenerativeModel("gemini-1.5-flash").generate_content(combined_prompt)

            # Return the response content
            return response.text

        except Exception as err:
            continue
            # tqdm.write(f"Caught exception: {err}")
'''

In [None]:
### OpenAI
# Now let's wrap that call in a function that takes shots and an example, calls the API and returns the response.
def call_api_openai(shots, example, max_retries=None, retry_delay=1.0, max_retry_delay=30.0):
    """
    Builds the few-shot prompt for `example`, sends it to the OpenAI chat API and returns
    the generated text.

    The prompt is built once, outside the retry loop, so an error in prompt construction
    (e.g. a missing key in `example`) is raised immediately instead of being retried forever.
    Failed API calls are reported with `tqdm.write` and retried after a delay that doubles
    on each failure up to `max_retry_delay`.

    Args:
        shots (int): Number of few-shot examples to include in the chat history.
        example (dict): Example to label; passed to `get_message`.
        max_retries (int or None): Maximum number of failed API calls to tolerate before
            re-raising the last exception. None (the default) retries indefinitely.
        retry_delay (float): Seconds to wait after the first failed call.
        max_retry_delay (float): Upper bound, in seconds, for the wait between retries.

    Returns:
        The `content` of the first choice in the API response.
    """
    chat_history = get_chat_history(shots, data_splits['train'], orig_labels, convert_bio_to_prompt)
    chat_history.append({'role': USER_STR, 'content': get_message(example)})

    failures = 0
    delay = retry_delay
    while True:
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                temperature=0.5,
                messages=chat_history
            )
            break
        except Exception as err:
            tqdm.write(f"Caught exception: {err}")
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            # Back off before retrying so a rate-limited or failing endpoint is not hammered.
            time.sleep(delay)
            delay = min(delay * 2, max_retry_delay)

    return response.choices[0].message.content

In [None]:
# Now we want to be able to evaluate the model, in order to compare it to e.g. the fine-tuned BERT model.
# In order to do this, we need to write the reverse of the convert_bio_to_prompt function, so that we can
# convert in the other direction, from the generated response in prompt format, back to bio for evaluation
# using seqeval.
#
# The input to this function is the string response from the model, and the output should be a list of
# text BIO labels corresponding to the labeling implied by the tagged output produced by the model, as
# well as the list of tokens (since the generative model could return something different than we gave it,
# and we need to handle that somehow in the eval).
#
# TODO: implement this
import re
import string

def convert_response_to_bio(response):
    """
    Converts the model-generated response with HTML-style tags back into BIO format.

    Text inside '<Type> ... </Type>' is labeled 'B-Type' for its first whitespace-separated
    token and 'I-Type' for the rest; all other text is labeled 'O'. A token that consists
    of a single punctuation character is glued onto the preceding token (its own label is
    dropped), so that e.g. '</Goddess>,' yields one token ending in ','.

    Args:
        response (str or None): The generated response from the model in HTML-style format.
            A leading 'Labels:' prefix (optionally preceded by whitespace) is ignored.
            None or an empty string yields two empty lists.

    Returns:
        tuple: A tuple `(bio_labels, tokens)` containing, in this order:
            - bio_labels (list of str): The BIO label of each token.
            - tokens (list of str): The tokens extracted from the response.
    """
    # The API can return no content at all; treat that as an empty labeling.
    if not response:
        return [], []

    # Remove the 'Labels:' prefix if it exists
    response = response.strip()
    if response.startswith('Labels:'):
        response = response[len('Labels:'):].strip()

    tokens = []
    bio_labels = []

    # Group 1 matches <tag> or </tag>; group 2 matches a run of plain text.
    tag_pattern = re.compile(r"(</?[\w\-]+>)|([^<>]+)")

    current_label = None  # entity type of the currently open tag, or None when outside any tag

    for match in tag_pattern.finditer(response):
        tag, text = match.groups()

        if tag is not None:
            # A closing tag ends the entity; an opening tag starts one named after the tag.
            current_label = None if tag.startswith("</") else tag[1:-1]
            continue

        for i, token in enumerate(text.split()):
            tokens.append(token)
            if current_label is None:
                bio_labels.append("O")
            elif i == 0:
                bio_labels.append(f"B-{current_label}")  # Start of an entity
            else:
                bio_labels.append(f"I-{current_label}")  # Continuation of the same entity

    # Re-attach stand-alone punctuation characters to the token before them.
    punctuations = set(string.punctuation)
    merged_tokens = []
    merged_bio_labels = []

    for token, label in zip(tokens, bio_labels):
        if token in punctuations and merged_tokens:
            merged_tokens[-1] += token
        else:
            merged_tokens.append(token)
            merged_bio_labels.append(label)

    return merged_bio_labels, merged_tokens


In [None]:
# Quick sanity check of convert_response_to_bio on a hand-built response that mixes
# several spacing variants around the tags.
html_str = 'From <Goddess> her</Goddess> ideological conception, <Goddess> the deity Taweret </Goddess> was closely grouped with (and is often indistinguishable from) several other protective <Aquatic_mammal>hippopotamus</Aquatic_mammal> <Goddess>goddesses </Goddess>: <Goddess> Ipet ("the Nurse")</Goddess>, <Goddess>Reret ("the Sow") </Goddess>, and <Goddess>Hedjet ("the White One")</Goddess>.'
labels, text = convert_response_to_bio(html_str)
true_labels = ['O', 'B-Goddess', 'O', 'O', 'B-Goddess', 'I-Goddess', 'I-Goddess', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Aquatic_mammal', 'B-Goddess', 'B-Goddess', 'I-Goddess', 'I-Goddess', 'B-Goddess', 'I-Goddess', 'I-Goddess', 'O', 'B-Goddess', 'I-Goddess', 'I-Goddess', 'I-Goddess']
true_text = ['From', 'her', 'ideological', 'conception,', 'the', 'deity', 'Taweret', 'was', 'closely', 'grouped', 'with', '(and', 'is', 'often', 'indistinguishable', 'from)', 'several', 'other', 'protective', 'hippopotamus', 'goddesses:', 'Ipet', '("the', 'Nurse"),', 'Reret', '("the', 'Sow"),', 'and', 'Hedjet', '("the', 'White', 'One").']
print(labels)
print(text)
# Compare the contents, not only the lengths: a length check alone would pass even if
# every label or token were wrong.
assert labels == true_labels
assert text == true_text

['O', 'B-Goddess', 'O', 'O', 'B-Goddess', 'I-Goddess', 'I-Goddess', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Aquatic_mammal', 'B-Goddess', 'B-Goddess', 'I-Goddess', 'I-Goddess', 'B-Goddess', 'I-Goddess', 'I-Goddess', 'O', 'B-Goddess', 'I-Goddess', 'I-Goddess', 'I-Goddess']
['From', 'her', 'ideological', 'conception,', 'the', 'deity', 'Taweret', 'was', 'closely', 'grouped', 'with', '(and', 'is', 'often', 'indistinguishable', 'from)', 'several', 'other', 'protective', 'hippopotamus', 'goddesses:', 'Ipet', '("the', 'Nurse"),', 'Reret', '("the', 'Sow"),', 'and', 'Hedjet', '("the', 'White', 'One").']


In [None]:

# Here's a test example you can use to validate/debug your code (note that this was constructed to simulate various
# spacing/tokenization scenarios and does not necessarily reflect "correct" labeling wrt the training data):
import ipytest
ipytest.autoconfig()
def test_convert_html_to_bio():
    """convert_response_to_bio recovers the expected labels and tokens from tagged text
    that mixes several spacing variants around the tags."""
    tagged_text = 'From <Goddess> her</Goddess> ideological conception, <Goddess> the deity Taweret </Goddess> was closely grouped with (and is often indistinguishable from) several other protective <Aquatic_mammal>hippopotamus</Aquatic_mammal> <Goddess>goddesses </Goddess>: <Goddess> Ipet ("the Nurse")</Goddess>, <Goddess>Reret ("the Sow") </Goddess>, and <Goddess>Hedjet ("the White One")</Goddess>.'
    expected_labels = [
        'O', 'B-Goddess', 'O', 'O',
        'B-Goddess', 'I-Goddess', 'I-Goddess',
        'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
        'B-Aquatic_mammal', 'B-Goddess',
        'B-Goddess', 'I-Goddess', 'I-Goddess',
        'B-Goddess', 'I-Goddess', 'I-Goddess',
        'O',
        'B-Goddess', 'I-Goddess', 'I-Goddess', 'I-Goddess',
    ]
    expected_tokens = [
        'From', 'her', 'ideological', 'conception,',
        'the', 'deity', 'Taweret',
        'was', 'closely', 'grouped', 'with', '(and', 'is', 'often', 'indistinguishable', 'from)', 'several', 'other', 'protective',
        'hippopotamus', 'goddesses:',
        'Ipet', '("the', 'Nurse"),',
        'Reret', '("the', 'Sow"),',
        'and',
        'Hedjet', '("the', 'White', 'One").',
    ]

    got_labels, got_tokens = convert_response_to_bio(tagged_text)
    print(got_labels)
    print(got_tokens)
    assert got_labels == expected_labels
    assert got_tokens == expected_tokens

def test_convert_html_to_bio_labels():
    """Same check as test_convert_html_to_bio, but the response starts with 'Labels: ',
    which must not show up in the extracted tokens."""
    tagged_text = 'Labels: From <Goddess> her</Goddess> ideological conception, <Goddess> the deity Taweret </Goddess> was closely grouped with (and is often indistinguishable from) several other protective <Aquatic_mammal>hippopotamus</Aquatic_mammal> <Goddess>goddesses </Goddess>: <Goddess> Ipet ("the Nurse")</Goddess>, <Goddess>Reret ("the Sow") </Goddess>, and <Goddess>Hedjet ("the White One")</Goddess>.'
    expected_labels = [
        'O', 'B-Goddess', 'O', 'O',
        'B-Goddess', 'I-Goddess', 'I-Goddess',
        'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
        'B-Aquatic_mammal', 'B-Goddess',
        'B-Goddess', 'I-Goddess', 'I-Goddess',
        'B-Goddess', 'I-Goddess', 'I-Goddess',
        'O',
        'B-Goddess', 'I-Goddess', 'I-Goddess', 'I-Goddess',
    ]
    expected_tokens = [
        'From', 'her', 'ideological', 'conception,',
        'the', 'deity', 'Taweret',
        'was', 'closely', 'grouped', 'with', '(and', 'is', 'often', 'indistinguishable', 'from)', 'several', 'other', 'protective',
        'hippopotamus', 'goddesses:',
        'Ipet', '("the', 'Nurse"),',
        'Reret', '("the', 'Sow"),',
        'and',
        'Hedjet', '("the', 'White', 'One").',
    ]

    got_labels, got_tokens = convert_response_to_bio(tagged_text)
    print(got_labels)
    print(got_tokens)
    assert got_labels == expected_labels
    assert got_tokens == expected_tokens

ipytest.run('-vv')  # '-vv' for increased verbosity

platform linux -- Python 3.10.12, pytest-8.3.4, pluggy-1.5.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /content
plugins: anyio-4.7.0, typeguard-4.4.1
[1mcollecting ... [0mcollected 2 items

t_9ab71cf2779f41a5858c4460f4aaf63a.py::test_convert_html_to_bio [32mPASSED[0m[33m                       [ 50%][0m
t_9ab71cf2779f41a5858c4460f4aaf63a.py::test_convert_html_to_bio_labels [32mPASSED[0m[33m                [100%][0m

../usr/local/lib/python3.10/dist-packages/_pytest/config/__init__.py:1277
    self._mark_plugins_for_rewrite(hook)



<ExitCode.OK: 0>

In [None]:
# Now we can put all of the above together to evaluate!
metric = evaluate.load("seqeval")
output_path = "test_predictions_llm.json"
def run_eval(dataset, shots):
    """
    Labels every example in `dataset` with the LLM and scores the result with the
    module-level seqeval `metric`.

    Predictions and references are collected locally and handed to `metric.compute` in a
    single call. Nothing is added to the shared `metric` object while the loop runs, so an
    interrupted run cannot leave partial examples behind that would leak into the scores
    of a later run.

    Args:
        dataset: Iterable of examples with 'tokens' and integer 'ner_tags'.
        shots (int): Number of few-shot examples to put in each prompt.

    Returns:
        The result of `metric.compute(..., zero_division=0)`.
    """
    all_predictions = []
    all_references = []

    for example in tqdm(dataset, total=len(dataset), desc="Evaluating", position=tqdm._get_free_pos()):

        # String list of labels (BIO)
        true_labels = [labels_int2str[l] for l in example['ner_tags']]
        # Some tokens may not be plain strings; join such tokens into one string.
        example["tokens"] = [t if isinstance(t, str) else " ".join(t) for t in example["tokens"]]

        example_tokens = example['tokens']

        response_text = call_api_openai(shots, example)

        # String list of predicted labels (BIO)
        predictions, generated_tokens = convert_response_to_bio(response_text)

        # Handle case where the generated text doesn't align with the input text.
        # Basically, we'll eval everything up to where the two strings start to diverge.
        # We relax this slightly by ignoring punctuation (sometimes we lose a paren or something,
        # but that's not catastrophic for eval/tokenization).
        # Just predict 'O' for anything following mismatch.
        matching_elements = [strip_punct(i) == strip_punct(j) for i, j in zip(example_tokens, generated_tokens)]

        if False in matching_elements:
            last_matching_idx = matching_elements.index(False)
        else:
            last_matching_idx = min(len(generated_tokens), len(example_tokens))

        predictions = predictions[:last_matching_idx] + ['O'] * (len(example_tokens) - last_matching_idx)
        all_predictions.append(predictions)
        all_references.append(true_labels)

    return metric.compute(predictions=all_predictions, references=all_references, zero_division=0)

In [None]:
# Run the eval on the dev set
# Number of dev examples to evaluate; any value <= 0 keeps the whole dev split.
dev_examples_to_take = 0

dev_set = data_splits['dev']
if dev_examples_to_take > 0:
    dev_set = data_splits['dev'].select(range(dev_examples_to_take))

# Sweep the number of few-shot examples and print the metrics for each setting.
# Every setting sends one API request per dev example.
for num_shots in [0, 1, 5, 10, 20, 100]:  # Test with different numbers of examples
    print(f"shots: {num_shots}")
    result = run_eval(dev_set, shots=num_shots)
    print(f"Results for {num_shots} shots: {result}")

shots: 0


Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

Results for 0 shots: {'Aquatic_animal': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 62}, 'Aquatic_mammal': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 35}, 'Cretaceous_dinosaur': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 36}, 'Deity': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 115}, 'Goddess': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 68}, 'Mythological_king': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 14}, 'amayo': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'eity': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'iety': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'oddess': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'quatic_animal': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'quatic_mammal': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'retaceous_dinosaur': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'ythological_



Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

Results for 1 shots: {'Aquatic_animal': {'precision': 0.043795620437956206, 'recall': 0.0967741935483871, 'f1': 0.06030150753768844, 'number': 62}, 'Aquatic_mammal': {'precision': 0.0989010989010989, 'recall': 0.2571428571428571, 'f1': 0.14285714285714285, 'number': 35}, 'Cretaceous_dinosaur': {'precision': 0.3582089552238806, 'recall': 0.6666666666666666, 'f1': 0.4660194174757281, 'number': 36}, 'Deity': {'precision': 0.375, 'recall': 0.3391304347826087, 'f1': 0.35616438356164376, 'number': 115}, 'God': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'Goddess': {'precision': 0.17197452229299362, 'recall': 0.39705882352941174, 'f1': 0.24, 'number': 68}, 'Mythological_king': {'precision': 0.0425531914893617, 'recall': 0.42857142857142855, 'f1': 0.07741935483870968, 'number': 14}, 'ammal': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'eity': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'oddess': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number':



Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

Results for 5 shots: {'Aquatic_animal': {'precision': 0.06382978723404255, 'recall': 0.0967741935483871, 'f1': 0.07692307692307691, 'number': 62}, 'Aquatic_mammal': {'precision': 0.15384615384615385, 'recall': 0.22857142857142856, 'f1': 0.18390804597701152, 'number': 35}, 'BG_Deity': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'BG_Mythological_king': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'Cretaceous_dinosaur': {'precision': 0.3013698630136986, 'recall': 0.6111111111111112, 'f1': 0.4036697247706422, 'number': 36}, 'Deity': {'precision': 0.4740740740740741, 'recall': 0.5565217391304348, 'f1': 0.512, 'number': 115}, 'Goddess': {'precision': 0.13692946058091288, 'recall': 0.4852941176470588, 'f1': 0.21359223300970873, 'number': 68}, 'Mythological_king': {'precision': 0.03529411764705882, 'recall': 0.21428571428571427, 'f1': 0.0606060606060606, 'number': 14}, 'eity': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'ipponosaurus': {'precision': 



Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

Results for 10 shots: {'Aquatic_animal': {'precision': 0.1111111111111111, 'recall': 0.1935483870967742, 'f1': 0.1411764705882353, 'number': 62}, 'Aquatic_mammal': {'precision': 0.13924050632911392, 'recall': 0.3142857142857143, 'f1': 0.1929824561403509, 'number': 35}, 'Cretaceous_dinosaur': {'precision': 0.3582089552238806, 'recall': 0.6666666666666666, 'f1': 0.4660194174757281, 'number': 36}, 'Deity': {'precision': 0.4397163120567376, 'recall': 0.5391304347826087, 'f1': 0.484375, 'number': 115}, 'Goddess': {'precision': 0.18716577540106952, 'recall': 0.5147058823529411, 'f1': 0.27450980392156865, 'number': 68}, 'Mythological_king': {'precision': 0.0625, 'recall': 0.42857142857142855, 'f1': 0.10909090909090909, 'number': 14}, 'overall_precision': 0.22123893805309736, 'overall_recall': 0.45454545454545453, 'overall_f1': 0.29761904761904767, 'overall_accuracy': 0.9495700335228101}
shots: 20


Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

Results for 20 shots: {'Aquatic_animal': {'precision': 0.11702127659574468, 'recall': 0.1774193548387097, 'f1': 0.14102564102564102, 'number': 62}, 'Aquatic_mammal': {'precision': 0.12727272727272726, 'recall': 0.2, 'f1': 0.15555555555555553, 'number': 35}, 'Cretaceous_dinosaur': {'precision': 0.4576271186440678, 'recall': 0.75, 'f1': 0.5684210526315788, 'number': 36}, 'Deity': {'precision': 0.45132743362831856, 'recall': 0.4434782608695652, 'f1': 0.4473684210526316, 'number': 115}, 'Goddess': {'precision': 0.20253164556962025, 'recall': 0.47058823529411764, 'f1': 0.2831858407079646, 'number': 68}, 'Mythological_king': {'precision': 0.11320754716981132, 'recall': 0.42857142857142855, 'f1': 0.1791044776119403, 'number': 14}, 'overall_precision': 0.2518796992481203, 'overall_recall': 0.40606060606060607, 'overall_f1': 0.3109048723897912, 'overall_accuracy': 0.9559830928436088}
shots: 100


Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

Results for 100 shots: {'Aquatic_animal': {'precision': 0.136986301369863, 'recall': 0.16129032258064516, 'f1': 0.14814814814814814, 'number': 62}, 'Aquatic_mammal': {'precision': 0.09333333333333334, 'recall': 0.2, 'f1': 0.1272727272727273, 'number': 35}, 'Cretaceous_dinosaur': {'precision': 0.48148148148148145, 'recall': 0.7222222222222222, 'f1': 0.5777777777777777, 'number': 36}, 'Deity': {'precision': 0.44954128440366975, 'recall': 0.4260869565217391, 'f1': 0.43750000000000006, 'number': 115}, 'Goddess': {'precision': 0.2916666666666667, 'recall': 0.5147058823529411, 'f1': 0.3723404255319149, 'number': 68}, 'Mythological_king': {'precision': 0.05, 'recall': 0.14285714285714285, 'f1': 0.07407407407407408, 'number': 14}, 'overall_precision': 0.27388535031847133, 'overall_recall': 0.39090909090909093, 'overall_f1': 0.32209737827715357, 'overall_accuracy': 0.9586794927853083}




#### Test json

In [None]:
def run_test(dataset, shots, output_filename="test_predictions_llm_baseline.json"):
    """
    Tag every sentence in `dataset` with the LLM and write the BIO predictions to a JSON file.

    No metrics are computed here: the function only produces the list-of-lists
    prediction file expected for submission.

    Args:
        dataset: Sized iterable of examples; each example must have a 'tokens' list.
        shots (int): Number of few-shot demonstrations, forwarded to call_api_openai.
        output_filename (str): Path of the JSON file to write.

    Returns:
        list[list[str]]: One list of BIO tags per sentence, each exactly as long
        as that sentence's 'tokens'.
    """
    all_predictions = []

    # tqdm chooses a free bar position on its own when `position` is not given,
    # so the private tqdm._get_free_pos() helper is not needed.
    for example in tqdm(dataset, total=len(dataset), desc="Evaluating"):
        example_tokens = example['tokens']

        response_text = call_api_openai(shots, example)

        # String list of predicted labels (BIO) plus the tokens the model echoed back
        predictions, generated_tokens = convert_response_to_bio(response_text)

        # The model may not reproduce the input exactly. Trust predictions only
        # up to the first token where the echoed text diverges from the input.
        last_matching_idx = min(len(generated_tokens), len(example_tokens))
        for idx, (expected, generated) in enumerate(zip(example_tokens, generated_tokens)):
            if strip_punct(expected) != strip_punct(generated):
                last_matching_idx = idx
                break

        # Everything after the divergence point is labeled 'O', which also pads
        # the prediction list to the length of the input sentence.
        predictions = predictions[:last_matching_idx] + ['O'] * (len(example_tokens) - last_matching_idx)
        all_predictions.append(predictions)

    # Write predictions to the JSON file
    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(all_predictions, f, indent=4)

    print(f"Predictions saved to {output_filename}")
    return all_predictions

In [None]:
# Load the test sentences: one JSON object per line.
test_data_path = "/content/drive/MyDrive/Colab Notebooks/HW4/dinos_and_deities_test_bio_nolabels.jsonl"
with open(test_data_path, "r", encoding="utf-8") as f:
    # Skip blank lines so a stray empty line does not raise JSONDecodeError.
    test_records = [json.loads(line) for line in f if line.strip()]

# Convert test data into a Hugging Face Dataset
test_data = Dataset.from_list(test_records)
print(test_data)

# Load the BIO label names; a label's position in the file is its integer id.
label_names_fname = "/content/drive/MyDrive/Colab Notebooks/HW4/dinos_and_deities_train_bio.jsonl.labels"
with open(label_names_fname, encoding="utf-8") as f:
    labels_int2str = f.read().split()
print(f"Labels: {labels_int2str}")
labels_str2int = {l: i for i, l in enumerate(labels_int2str)}

# Also create a set containing the original labels, without B- and I- tags.
# 'O' has nothing after its first two characters, so it is filtered out.
orig_labels = {label[2:] for label in labels_int2str if label[2:]}
print(f"Orig labels: {orig_labels}")


Dataset({
    features: ['para_index', 'title', 'doc_id', 'content', 'page_id', 'id', 'tokens', 'ner_strings', 'ner_tags'],
    num_rows: 303
})
Labels: ['I-Aquatic_animal', 'B-Deity', 'B-Mythological_king', 'I-Mythological_king', 'I-Cretaceous_dinosaur', 'B-Aquatic_animal', 'B-Aquatic_mammal', 'I-Goddess', 'I-Deity', 'B-Cretaceous_dinosaur', 'I-Aquatic_mammal', 'B-Goddess', 'O']
Orig labels: {'Cretaceous_dinosaur', 'Mythological_king', 'Goddess', 'Deity', 'Aquatic_mammal', 'Aquatic_animal'}
Labels in label file: ['I-Aquatic_animal', 'B-Deity', 'B-Mythological_king', 'I-Mythological_king', 'I-Cretaceous_dinosaur', 'B-Aquatic_animal', 'B-Aquatic_mammal', 'I-Goddess', 'I-Deity', 'B-Cretaceous_dinosaur', 'I-Aquatic_mammal', 'B-Goddess', 'O']
Original labels detected: {'Cretaceous_dinosaur', 'Mythological_king', 'Goddess', 'Deity', 'Aquatic_mammal', 'Aquatic_animal'}


In [None]:
# run_test writes its predictions to a JSON file and computes no metrics, so
# there is no result to report here. Every call uses the same default output
# file: pass output_filename explicitly if more than one shot count is listed.
for num_shots in [10]:  # Number of few-shot demonstrations to use
    print(f"shots: {num_shots}")
    run_test(test_data, shots=num_shots)

shots: 10


Evaluating:   0%|          | 0/303 [00:00<?, ?it/s]

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['B-Aquatic_animal', 'B-Aquatic_animal', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O

## Output for Evaluation

In the following cells, run your trained model on the test data, and produce a list of lists of tags, with one list per sentence, e.g.

```
[
    [
        "B-Aquatic_animal",
        "I-Aquatic_animal",
        "I-Aquatic_animal",
...
        "O",
        "O",
        "B-Aquatic_animal",
        "I-Aquatic_animal"
    ],
    [...]
]
```

Serialize your predictions into a file named `test_predictions_llm_baseline.json` for your initial attempt at an LLM tagger. Your expected F1 on the withheld test set should be 0.2. Then, serialize your predictions for any further experiments into a file called `test_predictions_llm_experiment.json`. This file will reflect the results that you were able to achieve with more prompt engineering.

### Experiment

In [None]:
# Ok, now let's make the prompting a bit more programmatic. First, implement a function that takes an example from
# the dataset, and converts it into a message for the model using the format we specified above.
# You might want to use the Python string "format" function to make this a bit easier, especially since
# You will be experimenting with different prompts later.
#
# TODO: implement this.
def get_message(example):
    """
    Build the text of the user turn for one dataset example.

    Args:
        example (dict): A single example from the dataset; only its 'tokens'
                        entry (a list of words) is read.

    Returns:
        str: The tokens joined by single spaces.
    """
    words = example["tokens"]
    return " ".join(words)

In [None]:
# Next we're going to implement a function to return the chat_history, but in order to do that we first need
# to be able to convert labeled examples from the dataset into a format that makes more sense for the model,
# in this case the HTML-style format we specified in the example. That's the task for this function: take
# an example from the dataset as input, and return a string that has tagged the text with labels in the given
# HTML-style format.
#
# TODO: implement this.
def _wrap_entity(entity_type, entity_tokens):
    """Render one entity span as '<Type> tok tok </Type>'."""
    return f"<{entity_type}> {' '.join(entity_tokens)} </{entity_type}>"


def convert_bio_to_prompt(example):
    """
    Converts a labeled example from the dataset into an HTML-style formatted string
    with entities tagged according to the specified BIO labels.

    A 'B-X' tag always opens a new '<X>' span. An 'I-X' tag extends the open span
    when that span has the same type X; otherwise it opens a new span. A span that
    is still open at the end of the sentence is emitted as well.

    Args:
        example (dict): A single example from the dataset.
                        Expected keys: 'tokens' (list of words) and 'ner_strings' (list of BIO labels).

    Returns:
        str: The sentence with each entity wrapped as '<Type> words </Type>', all
             pieces separated by single spaces.
    """
    pieces = []
    open_type = None   # entity type of the span being collected, if any
    open_tokens = []   # tokens collected for that span so far

    for token, tag in zip(example["tokens"], example["ner_strings"]):
        if tag == "O":
            prefix, tag_type = "O", None
        else:
            # Tags look like 'B-Deity' / 'I-Deity': one letter, a dash, the type.
            prefix, tag_type = tag[:1], tag[2:]

        # Compare entity types, not full tags, so 'B-X' followed by 'I-X' stays one span.
        if prefix == "I" and open_type is not None and tag_type == open_type:
            open_tokens.append(token)
            continue

        # Any other tag ends the span that was open.
        if open_type is not None:
            pieces.append(_wrap_entity(open_type, open_tokens))
            open_type, open_tokens = None, []

        if tag_type is None:
            pieces.append(token)  # plain, untagged word
        else:
            open_type, open_tokens = tag_type, [token]

    # Flush a span that runs to the end of the sentence.
    if open_type is not None:
        pieces.append(_wrap_entity(open_type, open_tokens))

    return " ".join(pieces).strip()

#### Prompt Formatting | Demonstration selection

In [None]:
from sentence_transformers import SentenceTransformer, util

# Initialize the embedding model once at module scope. It is shared by
# compute_train_embeddings and get_similar_examples below, which read it as a global.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Precompute embeddings for the training dataset
def compute_train_embeddings(dataset):
    """
    Embed every example of a dataset with the shared sentence-embedding model.

    Args:
        dataset (iterable): Examples, each a dictionary with a 'tokens' list of words.

    Returns:
        Whatever embedding_model.encode returns for convert_to_tensor=True
        (presumably a tensor with one row per example, in dataset order).
    """
    sentences = []
    for item in dataset:
        # Each sentence is rebuilt by joining its tokens with single spaces.
        sentences.append(" ".join(item['tokens']))
    return embedding_model.encode(sentences, convert_to_tensor=True)

# Compute the training-split embeddings once and keep them in a global;
# get_chat_history reads `train_embeddings` when it retrieves demonstrations.
train_embeddings = compute_train_embeddings(data_splits['train'])

# Show the result as a quick sanity check.
print(train_embeddings)


In [None]:
def get_similar_examples(input_text, train_data, embeddings, top_k=None):
    """
    Rank training examples by cosine similarity to a query sentence.

    Args:
        input_text (str): The query text.
        train_data: Indexable collection of training examples, aligned with `embeddings`.
        embeddings (torch.Tensor): Precomputed embeddings for the training data.
        top_k (int, optional): Keep only this many best matches. A falsy value
                               (None or 0) keeps the whole ranking.

    Returns:
        list: Training examples, most similar first.
    """
    query_vec = embedding_model.encode(input_text, convert_to_tensor=True)
    similarities = util.cos_sim(query_vec, embeddings).squeeze(0)

    # Indices of the training examples, best match first
    ranking = torch.argsort(similarities, descending=True).tolist()
    kept = ranking[:top_k] if top_k else ranking

    matches = []
    for idx in kept:
        matches.append(train_data[idx])
    return matches


def get_chat_history(shots, dataset, entity_types_list, convert_bio_to_prompt_fn):
    """
    Generates a chat history for few-shot prompting: a system prompt followed by
    retrieved (user, assistant) demonstration pairs.

    Demonstrations are the training examples most similar to the query sentence.
    They are retrieved with get_similar_examples from the module-level
    data_splits['train'] and train_embeddings.

    Args:
        shots (int): Number of demonstration pairs to include.
        dataset (sequence): Only its first element is read: dataset[0]['tokens'] is
                            used as the retrieval query. Pass [example] to retrieve
                            demonstrations for one specific input.
        entity_types_list (iterable of str): Entity types to list in the system prompt.
        convert_bio_to_prompt_fn (function): Converts a labeled example into the tagged
                                             string used as the demonstration answer.

    Returns:
        list: Chat history structured as a list of dictionaries with 'role' and 'content'.
    """
    # Sort the types so the prompt is identical from run to run even when a set is passed.
    entity_types = ', '.join(sorted(entity_types_list))

    # The in-prompt example uses the same '<Type> words </Type>' format that the
    # demonstrations use, so the model sees a single consistent convention.
    system_prompt = {
        "role": "system",
        "content": (
            "You are a highly capable NER labeling model. Your task is to tag entities in the input "
            "text by surrounding each one with HTML-style tags named after its entity type. "
            f"Entity types include: {entity_types}.\n\n"
            "Example of the tag format:\n"
            "Input: 'John Doe works at Acme Corp in New York.'\n"
            "Output: '<Person> John Doe </Person> works at <Organization> Acme Corp </Organization> "
            "in <Location> New York </Location>.'\n\n"
            "Format your output with the tags exactly as shown. Repeat every input word in its "
            "original order, and leave words that do not belong to an entity untagged."
        )
    }
    # Initialize the chat history with the system prompt
    chat_history = [system_prompt]

    if shots <= 0:
        return chat_history

    # Retrieve exactly as many neighbours as requested (previously capped at 5).
    query_text = " ".join(dataset[0]['tokens'])
    similar_examples = get_similar_examples(
        query_text, data_splits['train'], train_embeddings, top_k=shots
    )

    for demo in similar_examples[:shots]:
        # User turn: the raw sentence
        chat_history.append({"role": "user", "content": " ".join(demo['tokens'])})
        # Assistant turn: the tagged sentence. It must use the "assistant" role
        # so the model reads it as an example answer, not as another instruction.
        chat_history.append({"role": "assistant", "content": convert_bio_to_prompt_fn(demo)})

    return chat_history

In [None]:
# Precompute training embeddings
#train_embeddings = compute_train_embeddings(data_splits['train'])

def call_api_openai(shots, example, max_retries=6):
    """
    Ask the chat model to tag one example and return its raw text response.

    Relies on module-level names defined elsewhere in the notebook: client,
    USER_STR, orig_labels, get_chat_history, get_message and convert_bio_to_prompt.

    Args:
        shots (int): Number of few-shot demonstrations to put in the prompt.
        example (dict): The example to label; must have a 'tokens' list.
        max_retries (int): Maximum number of API attempts before giving up.

    Returns:
        str: The content of the model's reply ('' if the reply has no content).

    Raises:
        RuntimeError: If every attempt fails; the last API error is chained as the cause.
    """
    # Build the prompt outside the retry loop so a programming error here
    # surfaces immediately instead of being retried.
    # get_chat_history takes its retrieval query from dataset[0], so wrap the
    # current example to get demonstrations similar to *this* input.
    chat_history = get_chat_history(shots, [example], orig_labels, convert_bio_to_prompt)

    # Add the message for the current example
    chat_history.append({'role': USER_STR, 'content': get_message(example)})

    last_error = None
    for attempt in range(max_retries):
        try:
            # Call the OpenAI API
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                temperature=0.5,
                messages=chat_history
            )
            # content can be None; return '' so downstream parsing still gets a string.
            return response.choices[0].message.content or ""
        except Exception as err:
            last_error = err
            tqdm.write(f"Caught exception: {err}")
            if attempt + 1 < max_retries:
                # Exponential backoff (1s, 2s, 4s, ... capped at 30s) between retries
                time.sleep(min(2 ** attempt, 30))

    raise RuntimeError(f"OpenAI request failed after {max_retries} attempts") from last_error

In [None]:
# Now we want to be able to evaluate the model, in order to compare it to e.g. the fine-tuned BERT model.
# In order to do this, we need to write the reverse of the convert_bio_to_prompt function, so that we can
# convert in the other direction, from the generated response in prompt format, back to BIO for evaluation
# using seqeval.
#
# The input to this function is the string response from the model, and the output should be a list of
# text BIO labels corresponding to the labeling implied by the tagged output produced by the model, as
# well as the list of tokens (since the generative model could return something different than we gave it,
# and we need to handle that somehow in the eval).
#
import re
import string

def convert_response_to_bio(response):
    """
    Parse a model response written with HTML-style entity tags into BIO labels.

    Args:
        response (str): The generated response from the model, e.g.
                        'The <Deity> Zeus </Deity> ruled'.

    Returns:
        tuple: (bio_labels, tokens), in that order:
            - bio_labels (list of str): One BIO label per returned token.
            - tokens (list of str): The whitespace-separated words of the response,
              with stand-alone punctuation characters glued onto the preceding word.
    """
    # Drop a leading 'Labels:' marker if the model added one
    marker = 'Labels:'
    if response[:len(marker)] == marker:
        response = response[len(marker):].strip()

    # First alternative: an opening or closing tag; second: a run of plain text
    piece_pattern = re.compile(r"(</?[\w\-]+>)|([^<>]+)")

    labeled_words = []   # (word, label) pairs in reading order
    open_tag = None      # name of the tag we are currently inside, if any

    for piece in piece_pattern.finditer(response):
        tag, text = piece.groups()

        if tag is not None:
            # '</X>' closes the current entity, '<X>' opens one of type X
            open_tag = None if tag[1] == "/" else tag[1:-1]
            continue

        words = text.split()
        if open_tag is None:
            labeled_words.extend((word, "O") for word in words)
        else:
            # First word of the text run starts the entity, the rest continue it
            for position, word in enumerate(words):
                marker_prefix = "I-" if position else "B-"
                labeled_words.append((word, marker_prefix + open_tag))

    # A token that is a single punctuation character is appended to the previous
    # token and its own label is dropped (unless it is the very first token).
    punctuation_chars = set(string.punctuation)
    out_labels = []
    out_tokens = []
    for word, label in labeled_words:
        if out_tokens and word in punctuation_chars:
            out_tokens[-1] += word
            continue
        out_tokens.append(word)
        out_labels.append(label)

    return out_labels, out_tokens


In [None]:
# Run the evaluation
# Evaluates on the development split by calling run_eval, which is not defined
# in this part of the notebook and must already exist in the kernel.
dev_set = data_splits['dev']  # Development set

# Each list entry is one value passed to run_eval as `shots`; add more values
# to compare settings.
for num_shots in [10]:  # Test with different numbers of examples
    print(f"shots: {num_shots}")
    result = run_eval(dev_set, shots=num_shots)
    print(f"Results for {num_shots} shots: {result}")