# HW4: LLM prompting for entity labeling
This notebook contains starter code for prompting an LLM API for the task of entity recognition. It has minimal text so you can easily copy it to **handin.py** when you submit.  Please read all the comments in the code as they contain important information.

In [1]:
# This code block just contains standard setup code for running in Python
import json
import string
import re
import time
from tqdm.auto import tqdm

# PyTorch imports
import torch
from torch.utils.data import DataLoader
import numpy as np

# Fix the random seed(s) for reproducability
random_seed = 8942764
torch.random.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
np.random.seed(random_seed)

!pip install ipytest -q
!pip install transformers -q
!pip install datasets -q
!pip install evaluate -q
!pip install seqeval -q
!pip install ratelimit -q
!pip install openai -q
from openai import OpenAI

import warnings
warnings.filterwarnings('ignore')

from transformers import AutoTokenizer, BertModel, DefaultDataCollator

from datasets import load_dataset

import evaluate
from ratelimit import limits

# Just a helper function for efficiently removing punctuation from a string
def strip_punct(s):  return s.translate(string.punctuation)

In [2]:

# Use the API key that we provided.
client = OpenAI(api_key='sk-kmW_moO8leEUOX7PGxTijA', base_url="https://cmu.litellm.ai")
USER_STR = "user"
SYSTEM_STR = "system"
MSG_STR = "content"

In [3]:
# Here is how you can use the API to prompt the OpenAI model.
# Docs: https://platform.openai.com/docs/api-reference
# messages = [
#     {'role': 'system', 'content':
#      """You will be given input text containing different types of entities that you will label.
#      This is the list of entity types to label: Deity, Mythological_king, Cretaceous_dinosaur, Aquatic_mammal, Aquatic_animal, Goddess.
#      Label the enities by surrounding them with tags like '<Cretaceous_dinosaur> Beipiaognathus </Cretaceous_dinosaur>'."""
#      }, 
#      {'role': 'user', 'content': """Text: Once paired in later myths with her Titan brother Hyperion as her husband, mild-eyed Euryphaessa, the far-shining one of the Homeric Hymn to Helios, was said to be the mother of Helios (the Sun), Selene (the Moon), and Eos (the Dawn)."""},
#      {'role': 'system', 'content': """Labels: Once paired in later myths with her Titan brother <Deity> Hyperion </Deity> as her husband, mild-eyed Euryphaessa, the far-shining one of the Homeric Hymn to Helios, was said to be the mother of Helios (the Sun), <Goddess> Selene </Goddess> (the Moon), and <Goddess> Eos </Goddess> (the Dawn)."""},
#      {'role': 'user', 'content': """Text: From her ideological conception, Taweret was closely grouped with (and is often indistinguishable from) several other protective hippopotamus goddesses: Ipet, Reret, and Hedjet.\nLabels: """}
# ]

# 1. Few shots...
messages = [
    {'role': 'system', 'content':
     """Label the following text with the given entity types: Deity, Mythological_king, Cretaceous_dinosaur, Aquatic_mammal, Aquatic_animal, Goddess.
     Use tags like '<Cretaceous_dinosaur> Beipiaognathus </Cretaceous_dinosaur>'."""
     }, 
     {'role': 'user', 'content': """Text: Once paired in later myths with her Titan brother Hyperion as her husband, mild-eyed Euryphaessa, the far-shining one of the Homeric Hymn to Helios, was said to be the mother of Helios (the Sun), Selene (the Moon), and Eos (the Dawn)."""},
     {'role': 'system', 'content': """Labels: Once paired in later myths with her Titan brother <Deity> Hyperion </Deity> as her husband, mild-eyed Euryphaessa, the far-shining one of the Homeric Hymn to Helios, was said to be the mother of Helios (the Sun), <Goddess> Selene </Goddess> (the Moon), and <Goddess> Eos </Goddess> (the Dawn)."""},
     {'role': 'user', 'content': """Text: From her ideological conception, Taweret was closely grouped with (and is often indistinguishable from) several other protective hippopotamus goddesses: Ipet, Reret, and Hedjet.\nLabels: """}
]

# 2. Chain of thoughts
messages = [
    {'role': 'system', 'content':
     """Label the following text with the given entity types: Deity, Mythological_king, Cretaceous_dinosaur, Aquatic_mammal, Aquatic_animal, Goddess.
     Use tags like '<Cretaceous_dinosaur> Beipiaognathus </Cretaceous_dinosaur>'. For each label, explain the reasoning step by step."""},
    
    {'role': 'user', 'content': """Text: Once paired in later myths with her Titan brother Hyperion as her husband, mild-eyed Euryphaessa, the far-shining one of the Homeric Hymn to Helios, was said to be the mother of Helios (the Sun), Selene (the Moon), and Eos (the Dawn)."""},
    
    {'role': 'system', 'content': 
     """Reasoning: 
     - "Hyperion" is described as a Titan, which is a mythological figure, making it a <Deity>.
     - "Euryphaessa" is described as the mother of Helios (the Sun), aligning with mythological roles, making her a <Goddess>.
     - "Helios" represents the Sun in mythology, so it is labeled as <Deity>.
     - "Selene" is described as the Moon and associated with mythological figures, making her a <Goddess>.
     - "Eos" is described as the Dawn, also aligning with a mythological role, making her a <Goddess>.

     Labels: Once paired in later myths with her Titan brother <Deity> Hyperion </Deity> as her husband, mild-eyed <Goddess> Euryphaessa </Goddess>, the far-shining one of the Homeric Hymn to Helios, was said to be the mother of <Deity> Helios </Deity> (the Sun), <Goddess> Selene </Goddess> (the Moon), and <Goddess> Eos </Goddess> (the Dawn)."""},
    
    {'role': 'user', 'content': 
     """Text: From her ideological conception, Taweret was closely grouped with (and is often indistinguishable from) several other protective hippopotamus goddesses: Ipet, Reret, and Hedjet.\nLabels: """}
]

# 3. Dynamic prompt construction
messages = [
    {'role': 'system', 'content':
     """Label the following text with the given entity types: Deity, Mythological_king, Cretaceous_dinosaur, Aquatic_mammal, Aquatic_animal, Goddess.
     Use tags like '<Cretaceous_dinosaur> Beipiaognathus </Cretaceous_dinosaur>'. Select examples that closely match the context or sentence structure of the input text."""},
    
    # Dynamically selected demonstration 1
    {'role': 'user', 'content': """Text: The well-known dinosaur Tyrannosaurus rex lived during the late Cretaceous period and was one of the most famous theropods."""},
    {'role': 'system', 'content': """Labels: The well-known dinosaur <Cretaceous_dinosaur> Tyrannosaurus rex </Cretaceous_dinosaur> lived during the late Cretaceous period and was one of the most famous theropods."""},

    # Dynamically selected demonstration 2
    {'role': 'user', 'content': """Text: Neptune was considered the god of the sea in Roman mythology, often depicted with a trident."""},
    {'role': 'system', 'content': """Labels: <Deity> Neptune </Deity> was considered the god of the sea in Roman mythology, often depicted with a trident."""},

    # Dynamically selected demonstration 3
    {'role': 'user', 'content': """Text: Selene was frequently associated with the Moon and was part of Greek mythology's pantheon of deities."""},
    {'role': 'system', 'content': """Labels: <Goddess> Selene </Goddess> was frequently associated with the Moon and was part of Greek mythology's pantheon of deities."""},

    # Input text to label
    {'role': 'user', 'content': """Text: From her ideological conception, Taweret was closely grouped with (and is often indistinguishable from) several other protective hippopotamus goddesses: Ipet, Reret, and Hedjet.\nLabels: """}
]


# # This is where you provide the final prompt that we want the model to complete to give us the answer.
# message = f"""Text: From her ideological conception, Taweret was closely grouped with (and is often indistinguishable from) several other protective hippopotamus goddesses: Ipet, Reret, and Hedjet.
# Labels: """

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    temperature=0.0,
    seed=random_seed,
    messages=messages
)

print(response.choices[0].message.content)

# You can also print out the usage, in number of tokens. 
# Pricing is per input/output token, listed here: https://openai.com/pricing
print(f"Usage: {response.usage.prompt_tokens} input, {response.usage.completion_tokens} output, {response.usage.total_tokens} total tokens")

Labels: <Goddess> Taweret </Goddess> was closely grouped with (and is often indistinguishable from) several other protective hippopotamus goddesses: Ipet, Reret, and Hedjet.
Usage: 338 input, 49 output, 387 total tokens


In [4]:
# Load the dataset
from datasets import Dataset, ClassLabel, Sequence

data_splits = load_dataset('json', data_files={'train': 'dinos_and_deities_train_bio.jsonl', 'dev': 'dinos_and_deities_dev_bio_sm.jsonl', 'test': 'dinos_and_deities_test_bio_nolabels.jsonl'})

# Load dicts for mapping int labels to strings, and vice versa
label_names_fname = "dinos_and_deities_train_bio.jsonl.labels"
labels_int2str = []
with open(label_names_fname) as f:
    labels_int2str = f.read().split()
print(f"Labels: {labels_int2str}")
labels_str2int = {l: i for i, l in enumerate(labels_int2str)}

# Also create a set containing the original labels, without B- and I- tags
orig_labels = set()
for label in labels_str2int.keys():
    orig_label = label[2:]
    if orig_label:
        orig_labels.add(orig_label)
print(f"Orig labels: {orig_labels}")

data_splits.cast_column("ner_tags", Sequence(ClassLabel(names=labels_int2str)))
print(data_splits)

Labels: ['I-Aquatic_animal', 'B-Deity', 'B-Mythological_king', 'I-Mythological_king', 'I-Cretaceous_dinosaur', 'B-Aquatic_animal', 'B-Aquatic_mammal', 'I-Goddess', 'I-Deity', 'B-Cretaceous_dinosaur', 'I-Aquatic_mammal', 'B-Goddess', 'O']
Orig labels: {'Aquatic_mammal', 'Cretaceous_dinosaur', 'Deity', 'Goddess', 'Aquatic_animal', 'Mythological_king'}
DatasetDict({
    train: Dataset({
        features: ['para_index', 'title', 'doc_id', 'content', 'page_id', 'id', 'tokens', 'ner_strings', 'ner_tags'],
        num_rows: 1749
    })
    dev: Dataset({
        features: ['para_index', 'title', 'doc_id', 'content', 'page_id', 'id', 'tokens', 'ner_strings', 'ner_tags'],
        num_rows: 150
    })
    test: Dataset({
        features: ['para_index', 'title', 'doc_id', 'content', 'page_id', 'id', 'tokens', 'ner_strings', 'ner_tags'],
        num_rows: 303
    })
})


In [5]:
# Let's inspect a single example
dev_example = data_splits['dev'][5]
print(json.dumps(dev_example, indent=4))

{
    "para_index": 0,
    "title": "Hadingus",
    "doc_id": "Hadingus-0",
    "content": "Hadingus was one of the earliest legendary Danish kings according to Saxo Grammaticus' Gesta Danorum, where he has a detailed biography. Georges Dum\u00e9zil and others have argued that Hadingus was partially modelled on the god Nj\u00f6r\u00f0r.",
    "page_id": "4283756",
    "id": "Gy_0WYcB1INCf0UycBhm",
    "tokens": [
        "Hadingus",
        "was",
        "one",
        "of",
        "the",
        "earliest",
        "legendary",
        "Danish",
        "kings",
        "according",
        "to",
        "Saxo",
        "Grammaticus'",
        "Gesta",
        "Danorum,",
        "where",
        "he",
        "has",
        "a",
        "detailed",
        "biography.",
        "Georges",
        "Dum\u00e9zil",
        "and",
        "others",
        "have",
        "argued",
        "that",
        "Hadingus",
        "was",
        "partially",
        "modelled",
        "on",

In [6]:
# Ok, now let's make the prompting a bit more programmatic. First, implement a function that takes an example from
# the dataset, and converts it into a message for the model using the format we specified above. 
# You might want to use the Python string "format" function to make this a bit easier, especially since 
# You will be experimenting with different prompts later.
#
# TODO: implement this.
def get_message(example):
    """
    Convert a dataset example into the message format expected by the model.
    
    :param example: Example from the dataset, which should include 'content' and 'ner_strings' fields.
    :return: A string formatted to pass as input to the model, which includes text and its corresponding labels.
    """
    # Extract content (the text to be analyzed).
    text = example['content']
    
    # Return the formatted message to pass to the model
    message = f"Text: {text}"
    return message

In [7]:
# Next we're going to implement a function to return the chat_history, but in order to do that we first need
# to be able to convert labeled examples from the dataset into a format that makes more sense for the model,
# in this case the HTML-style format we specified in the example. That's the task for this function: take
# an example from the dataset as input, and return a string that has tagged the text with labels in the given
# HTML-style format.
# 
# TODO: implement this.
def convert_bio_to_prompt(example):
    """
    Convert the BIO-labeled text into a tagged format for model input.
    
    :param example: A dataset example with 'tokens' and 'ner_tags' fields.
    :return: A string with the text, tagged with labels in the specified format.
    """
    tokens = example['tokens']  # List of tokens from the example
    ner_tags = example['ner_tags']  # List of BIO tags corresponding to each token
    
    # Start with the text being empty
    text = ""
    
    for token, tag_id in zip(tokens, ner_tags):
        # Get the string label from the integer tag id
        tag = labels_int2str[tag_id]
        
        # If the tag starts with 'B-' or 'I-', it's an entity
        if tag.startswith('B-'):
            entity_type = tag[2:]  # Get the entity type (e.g., 'Deity', 'Cretaceous_dinosaur')
            text += f" <{entity_type}>{token}</{entity_type}>"
        elif tag.startswith('I-'):
            entity_type = tag[2:]  # Continuation of the entity
            text += f" <{entity_type}>{token}</{entity_type}>"
        else:
            text += f" {token}"
    
    return text


In [None]:
# Now we can write a function that takes the number of shots, dataset, list of entity types, and 
# convert_bio_to_prompt function, and returns the chat_history (a list of maps) structured as in 
# the example.
#
# TODO: implement this.
def get_chat_history(shots, dataset, entity_types_list, convert_bio_to_prompt_fn):
    """
    Generate a list of prompt examples for the model, using `shots` as the number of demonstration examples.
    
    :param shots: Number of examples to include for few-shot learning.
    :param dataset: The dataset to sample examples from.
    :param entity_types_list: List of entity types (e.g., ['Deity', 'Cretaceous_dinosaur']).
    :param convert_bio_to_prompt_fn: Function to convert BIO examples to a string.
    :return: List of message dictionaries for the model's chat history.
    """
    chat_history = []
    
    # First, add the system message that introduces the entity types
    chat_history.append({
        'role': 'system',
        'content': f"You will be given input text containing different types of entities that you will label.\
                     This is the list of entity types to label: Deity, Mythological_king, Cretaceous_dinosaur, Aquatic_mammal, Aquatic_animal, Goddess.\
                     Label the enities by surrounding them with tags like '<Cretaceous_dinosaur> Beipiaognathus </Cretaceous_dinosaur>'."
    })
    
    # Now, add the user messages based on the dataset examples
    for i in range(shots):
        example = dataset[i]
        print("Example:", example)
        formatted_example = convert_bio_to_prompt_fn(example)
        print("Formatted Example:", formatted_example)
        
        # Example 1: Show the text with expected labels
        chat_history.append({
            'role': 'user',
            'content': f"Text: {example['content']}\nLabels: {formatted_example}"
        })
    
    return chat_history

**Chain-of-thought Implementation**

In [None]:
# Chain-of-Thought Implementation

def get_chat_history(shots, dataset, entity_types_list, convert_bio_to_prompt_fn):
    """
    Generate a list of prompt examples for the model, using `shots` as the number of demonstration examples.
    
    :param shots: Number of examples to include for few-shot learning.
    :param dataset: The dataset to sample examples from.
    :param entity_types_list: List of entity types (e.g., ['Deity', 'Cretaceous_dinosaur']).
    :param convert_bio_to_prompt_fn: Function to convert BIO examples to a string.
    :return: List of message dictionaries for the model's chat history.
    """
    chat_history = []
    
    # First, add the system message that introduces the entity types and explains the chain of thought process
    chat_history.append({
        'role': 'system',
        'content': f"You will be given input text containing different types of entities that you will label.\
                     This is the list of entity types to label: {', '.join(entity_types_list)}.\
                     For each example, think step-by-step and provide detailed reasoning before labeling the entities.\
                     Label the entities by surrounding them with tags like '<Cretaceous_dinosaur> Beipiaognathus </Cretaceous_dinosaur>'."
    })
    
    # Now, add the user messages based on the dataset examples
    for i in range(shots):
        example = dataset[i]
        print("Example:", example)
        formatted_example = convert_bio_to_prompt_fn(example)
        print("Example:", formatted_example)
        
        # Example 1: Show the text with expected labels and include reasoning steps
        chat_history.append({
            'role': 'user',
            'content': f"Text: {example['content']}\n"
                       f"Step 1: Identify the entities in the text.\n"
                       f"Step 2: Determine the type of each entity based on the context.\n"
                       f"Step 3: Label the entities by surrounding them with the appropriate tags.\n"
                       f"Labels: {formatted_example}"
        })
    
    return chat_history

**Dynamic Prompt Implementation**

In [10]:
import random

def get_chat_history(shots, dataset, entity_types_list, convert_bio_to_prompt_fn):
    """
    Generate a dynamic prompt with contextually relevant examples for the model.
    
    :param shots: Number of examples to include for few-shot learning.
    :param dataset: The dataset to sample examples from.
    :param entity_types_list: List of entity types (e.g., ['Deity', 'Cretaceous_dinosaur']).
    :param convert_bio_to_prompt_fn: Function to convert BIO examples to a string.
    :return: List of message dictionaries for the model's chat history.
    """
    chat_history = []

    # Add the system message introducing the task
    chat_history.append({
        'role': 'system',
        'content': f"""Label the following text with the given entity types: {', '.join(entity_types_list)}.
                        Use tags like '<Cretaceous_dinosaur> Beipiaognathus </Cretaceous_dinosaur>'.
                        Select examples that closely match the context or sentence structure of the input text."""
    })

    # Select contextually relevant examples dynamically
    selected_examples = []
    for example in dataset:
        if len(selected_examples) >= shots:
            break
        # Simple heuristic: check if input text shares entity types or structure with the example
        if any(entity in example['content'] for entity in entity_types_list):
            selected_examples.append(example)
    
    # If not enough examples are found, pad with random examples
    while len(selected_examples) < shots:
        selected_examples.append(random.choice(dataset))
    
    # Add the dynamically selected examples to the chat history
    for example in selected_examples:
        formatted_example = convert_bio_to_prompt_fn(example)
        chat_history.append({
            'role': 'user',
            'content': f"Text: {example['content']}"
        })
        chat_history.append({
            'role': 'system',
            'content': f"Labels: {formatted_example}"
        })

    return chat_history

In [11]:
# Now we can put all of those together to prompt the model more automagically!
num_shots = 1

chat_history = get_chat_history(num_shots, data_splits['train'], orig_labels, convert_bio_to_prompt)
print(chat_history)

[{'role': 'system', 'content': "Label the following text with the given entity types: Aquatic_mammal, Cretaceous_dinosaur, Deity, Goddess, Aquatic_animal, Mythological_king.\n                        Use tags like '<Cretaceous_dinosaur> Beipiaognathus </Cretaceous_dinosaur>'.\n                        Select examples that closely match the context or sentence structure of the input text."}, {'role': 'user', 'content': 'Text: One of the several forms of the Hindu God Shiva, is Ardhanarishwar (literally half-female God). Here Shiva manifests himself so that the left half is Female and the right half is Male. The left represents Shakti (energy, power) in the form of Goddess Parvati (otherwise his consort) and the right half Shiva. Whereas Parvati is the cause of arousal of Kama (desires), Shiva is the killer. Shiva is pervaded by the power of Parvati and Parvati is pervaded by the power of Shiva.'}, {'role': 'system', 'content': 'Labels:  One of the several forms of the Hindu God Shiva, is 

In [12]:
# Now we can put all of those together to prompt the model more automagically!
num_shots = 20

chat_history = get_chat_history(num_shots, data_splits['train'], orig_labels, convert_bio_to_prompt)
message = {'role': 'user', 'content': get_message(dev_example)}
chat_history.append(message)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    temperature=0.0,
    seed=random_seed,
    messages=chat_history
)

print(response.choices[0].message.content)

Entity: <Mythological_king>Hadingus</Mythological_king>, <Deity>Njörðr</Deity>


In [13]:
# Now let's wrap that call in a function that takes shots and an example, calls the API and returns the response.
def call_api_openai(shots, example):
    success = False
    while not success:
        try:
            chat_history = get_chat_history(shots, data_splits['train'], orig_labels, convert_bio_to_prompt)
            message = {'role': USER_STR, 'content': get_message(example)}
            chat_history.append(message)
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                temperature=0.0,
                messages=chat_history
            )
            success = 1
        except Exception as err:
            tqdm.write(f"Caught exception: {err}")
    return response.choices[0].message.content 

In [14]:
# Now we want to be able to evaluate the model, in order to compare it to e.g. the fine-tuned BERT model.
# In order to do this, we need to write the reverse of the convert_bio_to_prompt function, so that we can
# convert in the other direction, from the generated response in prompt format, back to bio for evaluation
# using seqeval.
#
# The input to this function is the string response from the model, and the output should be a list of 
# text BIO labels corresponding to the labeling implied by the tagged output produced by the model, as 
# well as the list of tokens (since the generative model could return something different than we gave it,
# and we need to handle that somehow in the eval).
#
# TODO: implement this
import re

def convert_response_to_bio(response):
    """
    Convert a model's generated response with HTML-style tags into BIO format.
    
    :param response: The string response from the model with tagged entities in HTML format.
    :return: A tuple containing two lists:
        - bio_labels: The list of BIO labels corresponding to each token.
        - tokens: The list of tokens corresponding to each entity or non-entity.
    """
    start_labels = r"^Labels:"
    response =re.sub(start_labels, "", response).strip()
    
    tag_pattern = r"<(/?)([a-zA-Z_]+)>([^<]*)"
    punctuation_pattern = rf"^[{re.escape(string.punctuation)}]+$"
    

    # Initialize variables
    labels = []
    tokens = []

    # Split the response into tokens and tags
    for match_idx, match in enumerate(re.finditer(tag_pattern, response)):
        if match_idx == 0 and match.start() != 0:
            text = response[:match.start()].strip()
            texts = text.split(" ")
            for t in texts:
                tokens.append(t)
                labels.append("O")
        
        # Extract the tag and text
        tag, entity, text = match.groups()
        text = text.strip()
        
        # Split the text into tokens
        text_tokens = text.split(" ")
        
        text_tokens_no_punctuation = []
        
        for i, token in enumerate(text_tokens):
            if re.match(punctuation_pattern, token):
                if i == 0:
                    tokens[-1] = tokens[-1] + token
                else:
                    if len(text_tokens_no_punctuation) == 0:
                        labels[-1] = labels[-1] + token
                    else:   
                        text_tokens_no_punctuation[-1] = text_tokens_no_punctuation[-1] + token
            else:
                text_tokens_no_punctuation.append(token)
        
        # Add the tokens and labels
        for i, token in enumerate(text_tokens_no_punctuation):
            if token:
                tokens.append(token)
                if tag == "/":
                    labels.append("O")
                elif i == 0:
                    labels.append(f"B-{entity}")
                else:
                    labels.append(f"I-{entity}")
                    
    return labels, tokens

In [15]:
# Here's a test example you can use to validate/debug your code (note that this was constructed to simulate various
# spacing/tokenization scenarios and does not necessarily reflect "correct" labeling wrt the training data):
import ipytest
ipytest.autoconfig()
def test_convert_html_to_bio():
    html_str = 'From <Goddess> her</Goddess> ideological conception, <Goddess> the deity Taweret </Goddess> was closely grouped with (and is often indistinguishable from) several other protective <Aquatic_mammal>hippopotamus</Aquatic_mammal> <Goddess>goddesses </Goddess>: <Goddess> Ipet ("the Nurse")</Goddess>, <Goddess>Reret ("the Sow") </Goddess>, and <Goddess>Hedjet ("the White One")</Goddess>.'
    labels, text = convert_response_to_bio(html_str)
    true_labels = ['O', 'B-Goddess', 'O', 'O', 'B-Goddess', 'I-Goddess', 'I-Goddess', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Aquatic_mammal', 'B-Goddess', 'B-Goddess', 'I-Goddess', 'I-Goddess', 'B-Goddess', 'I-Goddess', 'I-Goddess', 'O', 'B-Goddess', 'I-Goddess', 'I-Goddess', 'I-Goddess']
    true_text = ['From', 'her', 'ideological', 'conception,', 'the', 'deity', 'Taweret', 'was', 'closely', 'grouped', 'with', '(and', 'is', 'often', 'indistinguishable', 'from)', 'several', 'other', 'protective', 'hippopotamus', 'goddesses:', 'Ipet', '("the', 'Nurse"),', 'Reret', '("the', 'Sow"),', 'and', 'Hedjet', '("the', 'White', 'One").']
    print(labels)
    print(text)
    assert labels == true_labels
    assert text == true_text

def test_convert_html_to_bio_labels():
    html_str = 'Labels: From <Goddess> her</Goddess> ideological conception, <Goddess> the deity Taweret </Goddess> was closely grouped with (and is often indistinguishable from) several other protective <Aquatic_mammal>hippopotamus</Aquatic_mammal> <Goddess>goddesses </Goddess>: <Goddess> Ipet ("the Nurse")</Goddess>, <Goddess>Reret ("the Sow") </Goddess>, and <Goddess>Hedjet ("the White One")</Goddess>.'
    labels, text = convert_response_to_bio(html_str)
    true_labels = ['O', 'B-Goddess', 'O', 'O', 'B-Goddess', 'I-Goddess', 'I-Goddess', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Aquatic_mammal', 'B-Goddess', 'B-Goddess', 'I-Goddess', 'I-Goddess', 'B-Goddess', 'I-Goddess', 'I-Goddess', 'O', 'B-Goddess', 'I-Goddess', 'I-Goddess', 'I-Goddess']
    true_text = ['From', 'her', 'ideological', 'conception,', 'the', 'deity', 'Taweret', 'was', 'closely', 'grouped', 'with', '(and', 'is', 'often', 'indistinguishable', 'from)', 'several', 'other', 'protective', 'hippopotamus', 'goddesses:', 'Ipet', '("the', 'Nurse"),', 'Reret', '("the', 'Sow"),', 'and', 'Hedjet', '("the', 'White', 'One").']
    print(labels)
    print(text)
    assert labels == true_labels
    assert text == true_text

ipytest.run('-vv')  # '-vv' for increased verbosity

platform linux -- Python 3.8.20, pytest-8.3.4, pluggy-1.5.0 -- /home/damilare1012/miniconda3/envs/NLP_Assignments/bin/python
cachedir: .pytest_cache
rootdir: /mnt/c/Users/Damilare/Desktop/CMU-Africa/Second Year/Natural Language Processing/Assignments/Assignment - 4/HW4/HW4
plugins: anyio-4.5.2, typeguard-4.3.0
[1mcollecting ... [0mcollected 2 items

t_9506146ecdae4d869e07082209da1392.py::test_convert_html_to_bio [32mPASSED[0m[33m                       [ 50%][0m
t_9506146ecdae4d869e07082209da1392.py::test_convert_html_to_bio_labels [32mPASSED[0m[33m                [100%][0m

../../../../../../../../../../../../home/damilare1012/miniconda3/envs/NLP_Assignments/lib/python3.8/site-packages/_pytest/config/__init__.py:1277
    self._mark_plugins_for_rewrite(hook)



<ExitCode.OK: 0>

In [16]:
# Now we can put all of the above together to evaluate!
metric = evaluate.load("seqeval")

def run_eval(dataset, shots):

  for example in tqdm(dataset, total=len(dataset), desc="Evaluating", position=tqdm._get_free_pos()):

      # String list of labels (BIO)
      true_labels = [labels_int2str[l] for l in example['ner_tags']]
      example_tokens = example['tokens']

      response_text = call_api_openai(shots, example)

      # String list of predicted labels (BIO)
      predictions, generated_tokens = convert_response_to_bio(response_text)

      # Handle case where the generated text doesn't align with the input text.
      # Basically, we'll eval everything up to where the two strings start to diverge.
      # We relax this slightly by ignoring punctuation (sometimes we lose a paren or something, 
      # but that's not catastrophic for eval/tokenization).
      # Just predict 'O' for anything following mismatch.
      matching_elements = [strip_punct(i) == strip_punct(j) for i, j in zip(example_tokens, generated_tokens)]

      if False in matching_elements:
         last_matching_idx = matching_elements.index(False)
      else:
         last_matching_idx = min(len(generated_tokens), len(example_tokens))

      predictions = predictions[:last_matching_idx] + ['O']*(len(example_tokens)-last_matching_idx)
      metric.add(predictions=predictions, references=true_labels)
  
  return predictions, metric.compute()

In [17]:
import csv

# Initialize the CSV file
csv_file = 'evaluation_results_dynamic_prompts.csv'
with open(csv_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    # Write the header row
    writer.writerow(['shots', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])

    # Run the eval on the dev set
    dev_examples_to_take = 0

    dev_set = data_splits['dev']
    if dev_examples_to_take > 0:
        dev_set = data_splits['dev'].select(range(dev_examples_to_take))

    # for num_shots in [0,1]:
    for num_shots in [0, 1, 5, 10, 20, 30, 40]:
        print(f"shots: {num_shots}")
        predictions, result = run_eval(dev_set, shots=num_shots)
        print(result)

        # Extract the required metrics
        overall_precision = result['overall_precision']
        overall_recall = result['overall_recall']
        overall_f1 = result['overall_f1']
        overall_accuracy = result['overall_accuracy']

        # Write the results to the CSV file
        writer.writerow([num_shots, overall_precision, overall_recall, overall_f1, overall_accuracy])

print(f"Results saved to {csv_file}")

shots: 0


Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

{'Aquatic_animal': {'precision': 0.18181818181818182, 'recall': 0.0967741935483871, 'f1': 0.12631578947368424, 'number': 62}, 'Aquatic_mammal': {'precision': 0.17647058823529413, 'recall': 0.08571428571428572, 'f1': 0.11538461538461539, 'number': 35}, 'Bambiraptor': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'Cretaceous_dinosaur': {'precision': 0.3783783783783784, 'recall': 0.3888888888888889, 'f1': 0.3835616438356164, 'number': 36}, 'Deity': {'precision': 0.35135135135135137, 'recall': 0.22608695652173913, 'f1': 0.2751322751322751, 'number': 115}, 'Dionysus': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'Goddess': {'precision': 0.125, 'recall': 0.04411764705882353, 'f1': 0.06521739130434782, 'number': 68}, 'Mythological_king': {'precision': 0.13636363636363635, 'recall': 0.21428571428571427, 'f1': 0.16666666666666663, 'number': 14}, 'overall_precision': 0.25821596244131456, 'overall_recall': 0.16666666666666666, 'overall_f1': 0.2025782688766114, 'overall_

Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

{'Aquatic_animal': {'precision': 0.2, 'recall': 0.14516129032258066, 'f1': 0.16822429906542055, 'number': 62}, 'Aquatic_deity': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'Aquatic_insect': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'Aquatic_mammal': {'precision': 0.38461538461538464, 'recall': 0.14285714285714285, 'f1': 0.20833333333333331, 'number': 35}, 'Aquatic_plant': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'Cretaceous_dinosaur': {'precision': 0.359375, 'recall': 0.6388888888888888, 'f1': 0.45999999999999996, 'number': 36}, 'Deity': {'precision': 0.3275862068965517, 'recall': 0.16521739130434782, 'f1': 0.21965317919075142, 'number': 115}, 'Goddess': {'precision': 0.2894736842105263, 'recall': 0.16176470588235295, 'f1': 0.2075471698113208, 'number': 68}, 'Hapi': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'Mythological_king': {'precision': 0.047619047619047616, 'recall': 0.14285714285714285, 'f1': 0.07142857142857142,

Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

{'Aquatic_animal': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 62}, 'Aquatic_mammal': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 35}, 'Cretaceous_dinosaur': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 36}, 'Deity': {'precision': 0.3333333333333333, 'recall': 0.008695652173913044, 'f1': 0.01694915254237288, 'number': 115}, 'Goddess': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 68}, 'Mythological_king': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 14}, 'overall_precision': 0.06666666666666667, 'overall_recall': 0.0030303030303030303, 'overall_f1': 0.0057971014492753615, 'overall_accuracy': 0.9695379682262061}
shots: 10


Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

{'Aquatic_animal': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 62}, 'Aquatic_mammal': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 35}, 'Cretaceous_dinosaur': {'precision': 1.0, 'recall': 0.027777777777777776, 'f1': 0.05405405405405406, 'number': 36}, 'Deity': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 115}, 'Goddess': {'precision': 0.3333333333333333, 'recall': 0.014705882352941176, 'f1': 0.028169014084507043, 'number': 68}, 'Mythological_king': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 14}, 'overall_precision': 0.11764705882352941, 'overall_recall': 0.006060606060606061, 'overall_f1': 0.011527377521613832, 'overall_accuracy': 0.9693922168780061}
shots: 20


Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

{'Aquatic_animal': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 62}, 'Aquatic_mammal': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 35}, 'Cretaceous_dinosaur': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 36}, 'Deity': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 115}, 'Entity': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'Goddess': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 68}, 'Hypostasis_of_the_Archons': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'Mythological_king': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 14}, 'spermaceti_organ': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.9555458387990089}
shots: 30


Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

{'Aquatic_animal': {'precision': 0.18181818181818182, 'recall': 0.03225806451612903, 'f1': 0.0547945205479452, 'number': 62}, 'Aquatic_mammal': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 35}, 'Bambiraptor': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'Cretaceous_dinosaur': {'precision': 0.5, 'recall': 0.027777777777777776, 'f1': 0.05263157894736842, 'number': 36}, 'Deity': {'precision': 0.2777777777777778, 'recall': 0.043478260869565216, 'f1': 0.07518796992481204, 'number': 115}, 'Goddess': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 68}, 'Hawaiian_Volcano_Observatory': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0}, 'Mythological_king': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 14}, 'overall_precision': 0.2222222222222222, 'overall_recall': 0.024242424242424242, 'overall_f1': 0.04371584699453552, 'overall_accuracy': 0.9674974493514065}
shots: 40


Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

{'Aquatic_animal': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 62}, 'Aquatic_mammal': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 35}, 'Cretaceous_dinosaur': {'precision': 0.5, 'recall': 0.027777777777777776, 'f1': 0.05263157894736842, 'number': 36}, 'Deity': {'precision': 0.4166666666666667, 'recall': 0.08695652173913043, 'f1': 0.14388489208633093, 'number': 115}, 'Goddess': {'precision': 1.0, 'recall': 0.029411764705882353, 'f1': 0.05714285714285715, 'number': 68}, 'Mythological_king': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 14}, 'overall_precision': 0.3939393939393939, 'overall_recall': 0.03939393939393939, 'overall_f1': 0.07162534435261707, 'overall_accuracy': 0.9704853519895059}
Results saved to evaluation_results_dynamic_prompts.csv


In [18]:
# # Run the eval on the dev set
# dev_examples_to_take = 0

# dev_set = data_splits['dev']
# if dev_examples_to_take > 0:
#     dev_set = data_splits['dev'].select(range(dev_examples_to_take))

# for num_shots in [0, 1, 5, 10, 20, 50, 100]:
#     print(f"shots: {num_shots}")
#     predictions, result = run_eval(dev_set, shots=num_shots)
#     print(result)

## Output for Evaluation

In the following cells, run your trained model on the test data, and produce a list of lists of tags, with one list per sentence, e.g. 

```
[
    [
        "B-Aquatic_animal",
        "I-Aquatic_animal",
        "I-Aquatic_animal",
...
        "O",
        "O",
        "B-Aquatic_animal",
        "I-Aquatic_animal"
    ],
    [...]
]
```

Serialize your predictions into a file named `test_predictions_llm_baseline.json` for your initial attempt at an LLM tagger. Your expected f1 on the wittheld test set should be 0.2. Then, serialize your predictions for any further experiments into a file called `test_predictions_llm_experiment.json`. This file will reflect the results that you were able to achieve with more prompt engineering.

In [19]:
# Now we can put all of the above together to evaluate!
metric = evaluate.load("seqeval")

def run_eval_test(dataset, shots):
  pred = []
  for example in tqdm(dataset, total=len(dataset), desc="Evaluating", position=tqdm._get_free_pos()):

      # String list of labels (BIO)
      true_labels = [labels_int2str[l] for l in example['ner_tags']]
      example_tokens = example['tokens']

      response_text = call_api_openai(shots, example)

      # String list of predicted labels (BIO)
      predictions, generated_tokens = convert_response_to_bio(response_text)

      # Handle case where the generated text doesn't align with the input text.
      # Basically, we'll eval everything up to where the two strings start to diverge.
      # We relax this slightly by ignoring punctuation (sometimes we lose a paren or something, 
      # but that's not catastrophic for eval/tokenization).
      # Just predict 'O' for anything following mismatch.
      matching_elements = [strip_punct(i) == strip_punct(j) for i, j in zip(example_tokens, generated_tokens)]

      if False in matching_elements:
         last_matching_idx = matching_elements.index(False)
      else:
         last_matching_idx = min(len(generated_tokens), len(example_tokens))

      predictions = predictions[:last_matching_idx] + ['O']*(len(example_tokens)-last_matching_idx)
      pred.append(predictions)


  return pred

In [20]:
dev_examples_to_take = 0

test_dataset = data_splits['test']
if dev_examples_to_take > 0:
    test_dataset = data_splits['test'].select(range(dev_examples_to_take))

for num_shots in [5]:
    print(f"shots: {num_shots}")
    predictions = run_eval_test(test_dataset, shots=num_shots)
    print(predictions)

shots: 5


Evaluating:   0%|          | 0/303 [00:00<?, ?it/s]

[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

In [21]:
len(predictions)

303

In [22]:
import json

# Save the mapped predictions to a JSON file
output_file = "test_predictions_llm_experiment.json"
with open(output_file, "w") as f:
    json.dump(predictions, f, indent=4)

print(f"{output_file}")

test_predictions_llm_experiment.json


In [23]:
import pandas as pd
import matplotlib.pyplot as plt

# Data for Few-shot
few_shot_data = {
    'Shots': [0, 1, 5, 10, 20, 30, 40],
    'Precision': [0.2235, 0.2647, 0.2079, 0.1993, 0.1898, 0.1942, 0.2000],
    'Recall': [0.0576, 0.2455, 0.1758, 0.1727, 0.2030, 0.2030, 0.2030],
    'F1': [0.0916, 0.2547, 0.1905, 0.1851, 0.1962, 0.1985, 0.2015],
    'Accuracy': [0.9665, 0.9580, 0.9559, 0.9568, 0.9468, 0.9446, 0.9350]
}

# Data for Chain-of-Thought
chain_of_thought_data = {
    'Shots': [0, 1, 5, 10, 20, 30, 40],
    'Precision': [0.2424, 0.2441, 0.2096, 0.2116, 0.1994, 0.1860, 0.1737],
    'Recall': [0.0485, 0.2515, 0.1727, 0.1879, 0.2152, 0.1939, 0.1758],
    'F1': [0.0808, 0.2478, 0.1894, 0.1990, 0.2070, 0.1899, 0.1747],
    'Accuracy': [0.9677, 0.9548, 0.9551, 0.9570, 0.9545, 0.9389, 0.9262]
}

# Data for Dynamic Prompting
dynamic_prompt_data = {
    'Shots': [0, 1, 5, 10, 20, 30, 40],
    'Precision': [0.2857, 0.2840, 0.2230, 0.2261, 0.1940, 0.1816, 0.1988],
    'Recall': [0.0424, 0.2909, 0.1939, 0.1939, 0.2152, 0.1909, 0.2000],
    'F1': [0.0739, 0.2874, 0.2075, 0.2088, 0.2040, 0.1861, 0.1994],
    'Accuracy': [0.9687, 0.9572, 0.9567, 0.9573, 0.9529, 0.9359, 0.9283]
}

# Function to plot metrics with annotations for maximum values
def plot_metrics_with_highlight(metric_name, few_shot_df, chain_of_thought_df, dynamic_prompt_df):
    plt.figure(figsize=(10, 6))

    # Plotting data
    plt.plot(few_shot_df['Shots'], few_shot_df[metric_name], label='Few-shot-Baseline', marker='o')
    plt.plot(chain_of_thought_df['Shots'], chain_of_thought_df[metric_name], label='Chain-of-Thought', marker='o')
    plt.plot(dynamic_prompt_df['Shots'], dynamic_prompt_df[metric_name], label='Dynamic Prompting', marker='o')

    # Find maximum value and corresponding shot count
    max_value = max(
        max(few_shot_df[metric_name]),
        max(chain_of_thought_df[metric_name]),
        max(dynamic_prompt_df[metric_name])
    )
    max_shot_few_shot = few_shot_df.loc[few_shot_df[metric_name].idxmax(), 'Shots']
    max_shot_chain_of_thought = chain_of_thought_df.loc[chain_of_thought_df[metric_name].idxmax(), 'Shots']
    max_shot_dynamic_prompt = dynamic_prompt_df.loc[dynamic_prompt_df[metric_name].idxmax(), 'Shots']

    # Adding horizontal and vertical lines for the maximum value
    plt.axhline(max_value, color='red', linestyle='--', label=f'Max {metric_name} = {max_value:.4f}')
    if max_value == max(few_shot_df[metric_name]):
        plt.axvline(max_shot_few_shot, color='blue', linestyle='--', label=f'Max at Shots={max_shot_few_shot}')
    elif max_value == max(chain_of_thought_df[metric_name]):
        plt.axvline(max_shot_chain_of_thought, color='green', linestyle='--', label=f'Max at Shots={max_shot_chain_of_thought}')
    elif max_value == max(dynamic_prompt_df[metric_name]):
        plt.axvline(max_shot_dynamic_prompt, color='purple', linestyle='--', label=f'Max at Shots={max_shot_dynamic_prompt}')

    # Add labels and title
    plt.title(f'Comparison of {metric_name} Across Prompting Techniques')
    plt.xlabel('Shots')
    plt.ylabel(metric_name)
    plt.legend()
    plt.grid(True)
    plt.show()

# Generate plots with highlights for Precision, Recall, F1, and Accuracy
for metric in ['Precision', 'Recall', 'F1', 'Accuracy']:
    plot_metrics_with_highlight(metric, few_shot_df, chain_of_thought_df, dynamic_prompt_df)


NameError: name 'few_shot_df' is not defined