## Approach 1: Measure quality of generated by LLM entities with their labels


~~For each pairwise matching between generated entities and gold entities the sum of cosinus similarities between their embeddings is calculated. Similarity for each entity without a pair is -1. As a metric, the maximal sum of similarities is taken and divided by max(number_of_generated_entites, number_of_gold_entities).~~

### Metric 1.1: Maximal average cosinus similarity of embeddings
The metric consists in fact of several metrics:

1) 
- a) Maximal average cosinus similarity of embeddings (brute force)
For each pairwise matching between generated entities and gold entities the sum of cosinus similarities between their embeddings is calculated. As a metric, the max sum of similarities is taken and divided by min(number_of_generated_entites, number_of_gold_entities). 

- b) Average cosinus similarity of embeddings (greedy)
For greedy matching between generated entities and gold entities the sum of cosinus similarities between their embeddings is calculated. The sum is divided by min(number_of_generated_entites, number_of_gold_entities).

2) Found entities measure
- | generated entities | / | gold entities | - 1
- <-1; 0) - LLM did not found all entities
- 0 - LLM found all entities
- (0; +oo) - LLM halucynated some entities

#### Load benchmark dataset

In [None]:
import json

with open('..\\data\\external\\benchmark\\approach_1_benchmark.json') as f:
    benchmark_dataset = json.load(f)
    benchmark_dataset = benchmark_dataset['Data']

sentences = []
perfect = []
good = []
ok = []

for element in benchmark_dataset:
    sentences.append(element['Sentence'])
    perfect.append(element['Perfect'])
    good.append(element['Good'])
    ok.append(element['Ok'])

#### Get text embeddings using Ollama

In [3]:
import ollama

EMBED_MODEL = 'nomic-embed-text'

def get_text_embedding_using_ollama(text:str):
    
    if text:

        response = ollama.embed(
            model=EMBED_MODEL,
            input=text,
        )

        return response["embeddings"]
    
    return None

#### Cosine similarity

In [4]:
import numpy as np   

def cosine_similarity(v1, v2):
    v1 = np.squeeze(np.asarray(v1))
    v2 = np.squeeze(np.asarray(v2))
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

#### Test metric

In [5]:
def test_metric(metric_fun, sentences, perfect, good, ok, verbose = False):

    n_passed = 0
    n_failed = 0

    for s, p, g, o in zip(sentences, perfect, good, ok):

        s1 = metric_fun(p, p)
        s2 = metric_fun(p, g)
        s3 = metric_fun(p, o)

        # if verbose: print(f'{p}\t{g}\t{o}')
        if s1 >= s2 >= s3:
            # if verbose: print(f'TEST PASSED\t{s1}\t{s2}\t{s3}\n')
            n_passed += 1
        else:
            if verbose: print(f'{p}\t{g}\t{o}')
            if verbose: print(f'TEST FAILED\t{s1}\t{s2}\t{s3}\n')
            n_failed += 1

    return n_passed, n_failed
    

#### Tests on sample sentences

In [11]:
healthy_1 = "Healthy."
sick_1 = "Not healthy."
sick_2 = "Ill."
unrelated_1 = "Cat."

embed_healthy_1 = get_text_embedding_using_ollama(healthy_1)
embed_sick_1 = get_text_embedding_using_ollama(sick_1)
embed_sick_2 = get_text_embedding_using_ollama(sick_2)
embed_unrelated_1 = get_text_embedding_using_ollama(unrelated_1)

print(f"Cosine similarity of healthy_1 & healthy_1: {cosine_similarity(embed_healthy_1, embed_healthy_1)}")
print(f"Cosine similarity of healthy_1 & sick_1: {cosine_similarity(embed_healthy_1, embed_sick_1)}")
print(f"Cosine similarity of healthy_1 & sick_2: {cosine_similarity(embed_healthy_1, embed_sick_2)}")
print(f"Cosine similarity of healthy_1 & unrelated_1: {cosine_similarity(embed_healthy_1, embed_unrelated_1)}")

print(f"Cosine similarity of sick_1 & sick_1: {cosine_similarity(embed_sick_1, embed_sick_1)}")
print(f"Cosine similarity of sick_1 & sick_2: {cosine_similarity(embed_sick_1, embed_sick_2)}")
print(f"Cosine similarity of sick_1 & unrelated_1: {cosine_similarity(embed_sick_1, embed_unrelated_1)}")

print(f"Cosine similarity of sick_2 & sick_2: {cosine_similarity(embed_sick_2, embed_sick_2)}")
print(f"Cosine similarity of sick_2 & unrelated_1: {cosine_similarity(embed_sick_2, embed_unrelated_1)}")

print(f"Cosine similarity of unrelated_1 & unrelated_1: {cosine_similarity(embed_unrelated_1, embed_unrelated_1)}")

Cosine similarity of healthy_1 & healthy_1: 1.0000000000000002
Cosine similarity of healthy_1 & sick_1: 0.8546183276069252
Cosine similarity of healthy_1 & sick_2: 0.5154896673187178
Cosine similarity of healthy_1 & unrelated_1: 0.452276206883047
Cosine similarity of sick_1 & sick_1: 1.0
Cosine similarity of sick_1 & sick_2: 0.5267311993602283
Cosine similarity of sick_1 & unrelated_1: 0.4003778194513241
Cosine similarity of sick_2 & sick_2: 0.9999999999999999
Cosine similarity of sick_2 & unrelated_1: 0.45925846382791224
Cosine similarity of unrelated_1 & unrelated_1: 0.9999999999999999


In [None]:
healthy_1 = "The patient is healthy and shows no signs of illness."
sick_1 = "The patient is not healthy and shows signs of illness."
sick_2 = "The patient is extremely sick and shows signs of multiple serious illnesses."
unrelated_1 = "The cat slept on the windowsill all afternoon."

embed_healthy_1 = get_text_embedding_using_ollama(healthy_1)
embed_sick_1 = get_text_embedding_using_ollama(sick_1)
embed_sick_2 = get_text_embedding_using_ollama(sick_2)
embed_unrelated_1 = get_text_embedding_using_ollama(unrelated_1)

print(f"Cosine similarity of healthy_1 & healthy_1: {cosine_similarity(embed_healthy_1, embed_healthy_1)}")
print(f"Cosine similarity of healthy_1 & sick_1: {cosine_similarity(embed_healthy_1, embed_sick_1)}")
print(f"Cosine similarity of healthy_1 & sick_2: {cosine_similarity(embed_healthy_1, embed_sick_2)}")
print(f"Cosine similarity of healthy_1 & unrelated_1: {cosine_similarity(embed_healthy_1, embed_unrelated_1)}")

print(f"Cosine similarity of sick_1 & sick_1: {cosine_similarity(embed_sick_1, embed_sick_1)}")
print(f"Cosine similarity of sick_1 & sick_2: {cosine_similarity(embed_sick_1, embed_sick_2)}")
print(f"Cosine similarity of sick_1 & unrelated_1: {cosine_similarity(embed_sick_1, embed_unrelated_1)}")

print(f"Cosine similarity of sick_2 & sick_2: {cosine_similarity(embed_sick_2, embed_sick_2)}")
print(f"Cosine similarity of sick_2 & unrelated_1: {cosine_similarity(embed_sick_2, embed_unrelated_1)}")

print(f"Cosine similarity of unrelated_1 & unrelated_1: {cosine_similarity(embed_unrelated_1, embed_unrelated_1)}")

Cosine similarity of healthy_1 & healthy_1: 1.0000000000000002
Cosine similarity of healthy_1 & sick_1: 0.8680167736583562
Cosine similarity of healthy_1 & sick_2: 0.7767000617900933
Cosine similarity of healthy_1 & unrelated_1: 0.42161666638300177
Cosine similarity of sick_1 & sick_1: 1.0
Cosine similarity of sick_1 & sick_2: 0.9112396526264309
Cosine similarity of sick_1 & unrelated_1: 0.4267955805726447
Cosine similarity of sick_2 & sick_2: 1.0000000000000002
Cosine similarity of sick_2 & unrelated_1: 0.39593371671041494
Cosine similarity of unrelated_1 & unrelated_1: 0.9999999999999999


#### 1.1.1.a Maximal average cosinus similarity of embeddings (brute force)

In [5]:
import itertools
import copy

def max_avg_cos_similarity(generated_entities, golden_entities):

    generated_entities = copy.copy(generated_entities)
    golden_entities = copy.copy(golden_entities)

    min_len = min(len(golden_entities), len(generated_entities))
    max_len = max(len(golden_entities), len(generated_entities))
    
    # Fill list with padding, to obtain two lists of the same size
    for n in range(len(golden_entities), max_len):
        golden_entities.append(None)

    for n in range(len(generated_entities), max_len):  
        generated_entities.append(None)
        
    # Get text embeddings for each entity
    golden_embeddings = [get_text_embedding_using_ollama(entity) for entity in golden_entities]
    generated_embeddings = [get_text_embedding_using_ollama(entity) for entity in generated_entities]
    

    best_permutation = generated_entities
    maximal_similarity = -1

    for permutation in list(itertools.permutations(list(range(max_len)))):

        sum_cosine_similarities = 0
        permutated_generated_embeddings = [generated_embeddings[i] for i in permutation]

        for golden_embedding, generated_embedding in zip(golden_embeddings, permutated_generated_embeddings):

            if golden_embedding and generated_embedding:
                sum_cosine_similarities += cosine_similarity(golden_embedding, generated_embedding)

        if sum_cosine_similarities/min_len > maximal_similarity:
            maximal_similarity = sum_cosine_similarities/min_len
            best_permutation = [generated_entities[i] for i in permutation]

    return maximal_similarity

In [32]:
p, f = test_metric(max_avg_cos_similarity, sentences, perfect, good, ok, verbose=True)
print(f'TESTS PASSED: {p}\t TESTS FAILED: {f}')

['severe pain in the muscles in the shoulder area']	['severe pain in the muscles']	['severe pain in the muscles', 'severe pain in the shoulder area']
TEST FAILED	1.0	0.8627380522048517	0.9574064207803092

['increased blood pressure', 'increased heart rate']	['blood pressure', 'heart rate']	['increased blood pressure and heart rate']
TEST FAILED	1.0	0.8590391771065524	0.9162281472371826

['mild shortness of breath on exertion']	['shortness of breath']	['breath on exertion']
TEST FAILED	1.0000000000000002	0.8685195001479415	0.8846911153655715

['left leg swelling', 'left leg redness']	['leg swelling', 'redness']	['leg swelling and redness']
TEST FAILED	1.0	0.8519824239361447	0.8553893403675195

['upper abdominal pain', 'upper abdominal bloating']	['abdominal pain', 'bloating']	['abdominal pain and bloating']
TEST FAILED	1.0	0.833672914990464	0.8464600754327717

['joint stiffness in the morning']	['joint stiffness']	['stiffness in morning']
TEST FAILED	1.0000000000000002	0.827328030557259

#### 1.1.1.b Average cosinus similarity of embeddings (greedy)

In [21]:
def find_max_idx(x):
    k = x.argmax()
    ncol = x.shape[1]
    return int(k/ncol), int(k%ncol)

In [None]:
import numpy as np

def avg_cos_similarity(generated_entities, golden_entities):

    generated_entities = copy.copy(generated_entities)
    golden_entities = copy.copy(golden_entities)

    min_len = min(len(golden_entities), len(generated_entities))
    max_len = max(len(golden_entities), len(generated_entities))
    
    # Fill list with padding, to obtain two lists of the same size
    for n in range(len(golden_entities), max_len):
        golden_entities.append(None)

    for n in range(len(generated_entities), max_len):  
        generated_entities.append(None)
        
    # Get text embeddings for each entity
    golden_embeddings = [get_text_embedding_using_ollama(entity) for entity in golden_entities]
    generated_embeddings = [get_text_embedding_using_ollama(entity) for entity in generated_entities]

    # Create matrix of calculated similarities between golden and generated embeddings
    try:
        arr = np.array([[ cosine_similarity(i,j) for i in golden_embeddings] for j in generated_embeddings])
    except TypeError:   # if golden_embeddings or generated_embeddings is empty
        arr = np.array([[]]) 

    greedy_similarity = -1

    while arr.any():
        max_i, max_j = find_max_idx(arr)
        greedy_similarity += arr[max_i, max_j]
        arr = np.delete(arr, max_i, axis=0)
        arr = np.delete(arr, max_j, axis=1)

    return greedy_similarity/min_len

In [33]:
p, f = test_metric(avg_cos_similarity, sentences, perfect, good, ok)
print(f'TESTS PASSED: {p}\t TESTS FAILED: {f}')

TESTS PASSED: 102	 TESTS FAILED: 10


#### 1.1.2 Found entities measure

In [None]:
def found_entities(generated_entities, golden_entities):
    # <-1; 0) - LLM did not found all entities
    # 0 - LLM found all entities
    # (0; +oo) - LLM halucynated some entities
    return len(generated_entities)/len(golden_entities) - 1

In [34]:
p, f = test_metric(found_entities, sentences, perfect, good, ok)
print(f'TESTS PASSED: {p}\t TESTS FAILED: {f}')

TESTS PASSED: 57	 TESTS FAILED: 55


## Approach 2: Measure accuracy (or other metric) of labeling task performed by LLM on each word in a sentence

### Metric 2.1: Accuracy of generated labels
For each generated label compare it to gold label and calculate number of correct generated labels, then divide it by a number of words in a sentence.

```
Mam przygotowany dataset w formacie: 
	
	{"word": "I", "start": 0, "end": 1, "label": []}
	{"word": "feel", "start": 2, "end": 6, "label": []}
	{"word": "a", "start": 7, "end": 8, "label": []}
	{"word": "bit", "start": 9, "end": 12, "label": [{"tag": "ADR", "tag number": "T1"}]}
	{"word": "drowsy", "start": 13, "end": 19, "label": [{"tag": "ADR", "tag number": "T1"}]}
	{"word": "have", "start": 22, "end": 26, "label": []}
		...
Plan działania brzmi następująco:
- podzielenie tekstu na zdania
- zczytanie przez LLM zdania
- przedstawianie LLmowi kolejnych słów z tego zdania z przygotowanego datasetu z zadaniem przypisania tagów
- sprawdzanie za pomocą accurracy czy ile tagów zostało dobrze przydzielonych
```

#### Load benchmark dataset

In [16]:
import os

word_tags_dir_path = "..\\data\\external\\benchmark\\approach_2_benchmark"
full_text_dir_path =  "..\\data\\external\\cadec\\text"
dir_list = os.listdir(word_tags_dir_path)

In [18]:
import ast

word_tags_path = f'{word_tags_dir_path}\\{dir_list[0]}'
full_text_path = f'{full_text_dir_path}\\{dir_list[0][:-6]}'

with open(word_tags_path) as f:
    word_tags = []
    for line in f:
        word_tags += [ast.literal_eval(line)]
    f.close()

with open(full_text_path) as f:
    full_text = f.read()
    f.close()

In [23]:
def make_llama_3_prompt(user, system="", assistant=""):
    system_prompt = ""
    if system:
        system_prompt = (
            f"<|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>"
        )
    
    user_prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{user}<|eot_id|>"
    assistant_prompt = f"<|start_header_id|>assistant<|end_header_id|>\n\n{assistant}<|eot_id|>" if assistant else "<|start_header_id|>assistant<|end_header_id|>\n\n"
    
    return f"<|begin_of_text|>{system_prompt}{user_prompt}{assistant_prompt}"

In [25]:
import ollama

def tag_words(text, words):
    system = "You are an english text annotator, that have 10 years of experience in medical words tagging. \n"
    system += "Consider the following text:\n"
    system += "i had a back operation 20 years ago and my time of mobility was going from bad to worse."
    system += "Consider the following words and tags:\n"

    words = [
        'i', 'had', 'a', 'back', 'operation', '20', 'years', 'ago', 
        'and', 'my', 'time', 'of', 'mobility', 'was', 'going', 'from', 'bad', 'to' 'worse']
    tags = [
        [],[],[],[],[],[],[],[],[],[],[],[],
        ['Symptom'], ['Symptom'], ['Symptom'], ['Symptom'], ['Symptom'], ['Symptom'], ['Symptom']
    ]

    for w,t in zip(words, tags):
        system += "Word: " + w + "\n"
        system += "Tags: " + str(t) + "\n"

    
    user = "Based on the above example, read and analyse the following text, looking for medical entities: " + text + "\n"
    user = "For each word in a text, list all medical tags (remember, that there are words, that are not part of any entity)"
    user += "Format the words and tags as a JSON object, i.e.\n"
    user += '{"word" : str, "tags": list(str) }.\n'
   
    user += """\
            Make sure to only return pairs of words and tags in JSON format. \
            Don't give any comments. \
            Make sure to list all words present in a text in an original order. \
            """
    system += text
    
    prompt = make_llama_3_prompt(user, system)

    # Generate the result from the model
    result = ollama.generate(model='llama3.2', prompt=prompt)

    return result

In [29]:
resp = tag_words(full_text, word_tags)

In [31]:
resp

GenerateResponse(model='llama3.2', created_at='2025-11-06T11:02:06.1995517Z', done=True, done_reason='stop', total_duration=86926254900, load_duration=35265000, prompt_eval_count=459, prompt_eval_duration=110729800, eval_count=752, eval_duration=86779707400, response='{\n    "i": ["Symptom"],\n    "had": [],\n    "a": [],\n    "back": [],\n    "operation": [],\n    "20": [],\n    "years": [],\n    "ago": [],\n    "and": [],\n    "my": [],\n    "time": [],\n    "of": [],\n    "mobility": ["Symptom"],\n    "was": ["Symptom"],\n    "going": ["Symptom"],\n    "from": ["Symptom"],\n    "bad": ["Symptom"],\n    "toworse": ["Symptom"],\n    "i": ["Symptom"],\n    "feel": [],\n    "a": [],\n    "bit": [],\n    "drowsy": [],\n    "and": [],\n    "have": [],\n    "a": [],\n    "little": [],\n    "blurred": [],\n    "vision", ["Symptom"],\n    "so": [],\n    "far": [],\n    "no": [],\n    "gastric": ["Condition"],\n    "problems": ["Condition"],\n    "I\'ve": [],\n    "been": [],\n    "on": [],\n

## 3. Other Approaches

### Precision, recall, F1-score, micro F1-scores of perfect match
A predicted entity is considered correct only when its span and type match with the gold entity. For discontinuous entity: each span should match a span of the gold entity.

#### Sources
- A Span-Based Model for Joint Overlapped and Discontiunuous Named Entity Recognition
- A Supervised Multi-Head Self-Attention Network for Nested Named Entity Recognition
- Discontinuous Named Entity Recognition as Maximal Clique Discovery
- Locate and Label - A Two-stage Indentifier for Nested Named Entity Recognition
- Nested Named Entity Recognition as Latent Lexicalized Constituency Parsing
- Neural Architectures for Named Entity Recognition
- Rethinking Boundaries End-To-End Recognition of Discontinuous Mentions with Pionter Networks
- Span-based Unified Named Entity Recognition Framework via Contrastive Learning
- Unified Named Entity Recognition as Word-Word Relation Classification

#### F1-score
[A Span-Based Model for Joint Overlapped and Discontinuous Named Entity Recognition](https://github.com/foxlf823/sodner)

In [None]:
def safe_div(num, denom):
    if denom > 0:
        return num / denom
    else:
        return 0


def compute_f1(predicted, gold, matched):
    precision = safe_div(matched, predicted)
    recall = safe_div(matched, gold)
    f1 = safe_div(2 * precision * recall, precision + recall)
    return precision, recall, f1

#### Precision, recall micro-averaged F1
[A Span-Based Model for Joint Overlapped and Discontinuous Named Entity Recognition](https://github.com/foxlf823/sodner)

In [None]:
from overrides import overrides
from typing import Optional

import torch

from allennlp.training.metrics.metric import Metric

class NERMetrics(Metric):
    """
    Computes precision, recall, and micro-averaged F1 from a list of predicted and gold labels.
    """
    def __init__(self, number_of_classes: int, none_label: int=0):
        self.number_of_classes = number_of_classes
        self.none_label = none_label
        self.reset()

    @overrides
    def __call__(self,
                 predictions: torch.Tensor,
                 gold_labels: torch.Tensor,
                 mask: Optional[torch.Tensor] = None):
        predictions = predictions.cpu()
        gold_labels = gold_labels.cpu()
        mask = mask.cpu()
        for i in range(self.number_of_classes):
            if i == self.none_label:
                continue
            self._true_positives += ((predictions==i)*(gold_labels==i)*mask.bool()).sum()
            self._false_positives += ((predictions==i)*(gold_labels!=i)*mask.bool()).sum()
            self._true_negatives += ((predictions!=i)*(gold_labels!=i)*mask.bool()).sum()
            self._false_negatives += ((predictions!=i)*(gold_labels==i)*mask.bool()).sum()

    @overrides
    def get_metric(self, reset=False):
        """
        Returns
        -------
        A tuple of the following metrics based on the accumulated count statistics:
        precision : float
        recall : float
        f1-measure : float
        """
        precision = float(self._true_positives) / (float(self._true_positives + self._false_positives) + 1e-13)
        recall = float(self._true_positives) / (float(self._true_positives + self._false_negatives) + 1e-13)
        f1_measure = 2. * ((precision * recall) / (precision + recall + 1e-13))

        # Reset counts if at end of epoch.
        if reset:
            self.reset()

        return precision, recall, f1_measure

    @overrides
    def reset(self):
        self._true_positives = 0
        self._false_positives = 0
        self._true_negatives = 0
        self._false_negatives = 0