## Load data

### Benchmark dataset

In [1]:
import json

with open('..\\data\\external\\benchmark\\benchmark.json') as f:
    benchmark_dataset = json.load(f)
    benchmark_dataset = benchmark_dataset['Data']
    f.close()

sentences = []
perfect = []
good = []
ok = []

for element in benchmark_dataset:
    sentences.append(element['Sentence'])
    perfect.append(element['Perfect'])
    good.append(element['Good'])
    ok.append(element['Ok'])

## Get text embeddings

### Using Ollama

In [2]:
import ollama

EMBED_MODEL = 'nomic-embed-text'

def get_text_embedding_using_ollama(text:str):
    
    if text:

        response = ollama.embed(
            model=EMBED_MODEL,
            input=text,
        )

        return response["embeddings"]
    
    return None

## Approach 1: Measure quality of generated by LLM entities with their labels


~~For each pairwise matching between generated entities and gold entities the sum of cosinus similarities between their embeddings is calculated. Similarity for each entity without a pair is -1. As a metric, the maximal sum of similarities is taken and divided by max(number_of_generated_entites, number_of_gold_entities).~~

### Metric 1.1: Maximal average cosinus similarity of embeddings
The metric consists in fact of several metrics:

1) 
- a) Maximal average cosinus similarity of embeddings (brute force)
For each pairwise matching between generated entities and gold entities the sum of cosinus similarities between their embeddings is calculated. As a metric, the max sum of similarities is taken and divided by min(number_of_generated_entites, number_of_gold_entities). 

- b) Average cosinus similarity of embeddings (greedy)
For greedy matching between generated entities and gold entities the sum of cosinus similarities between their embeddings is calculated. The sum is divided by min(number_of_generated_entites, number_of_gold_entities).

2) Found entities measure
- | generated entities | / | gold entities | - 1
- <-1; 0) - LLM did not found all entities
- 0 - LLM found all entities
- (0; +oo) - LLM halucynated some entities

#### Cosine similarity

In [3]:
import numpy as np   

def cosine_similarity(v1, v2):
    v1 = np.squeeze(np.asarray(v1))
    v2 = np.squeeze(np.asarray(v2))
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

#### Test metric

In [31]:
def test_metric(metric_fun, sentences, perfect, good, ok, verbose = False):

    n_passed = 0
    n_failed = 0

    for s, p, g, o in zip(sentences, perfect, good, ok):

        s1 = metric_fun(p, p)
        s2 = metric_fun(p, g)
        s3 = metric_fun(p, o)

        # if verbose: print(f'{p}\t{g}\t{o}')
        if s1 >= s2 >= s3:
            # if verbose: print(f'TEST PASSED\t{s1}\t{s2}\t{s3}\n')
            n_passed += 1
        else:
            if verbose: print(f'{p}\t{g}\t{o}')
            if verbose: print(f'TEST FAILED\t{s1}\t{s2}\t{s3}\n')
            n_failed += 1

    return n_passed, n_failed
    

#### 1.1.1.a Maximal average cosinus similarity of embeddings (brute force)

In [5]:
import itertools
import copy

def max_avg_cos_similarity(generated_entities, golden_entities):

    generated_entities = copy.copy(generated_entities)
    golden_entities = copy.copy(golden_entities)

    min_len = min(len(golden_entities), len(generated_entities))
    max_len = max(len(golden_entities), len(generated_entities))
    
    # Fill list with padding, to obtain two lists of the same size
    for n in range(len(golden_entities), max_len):
        golden_entities.append(None)

    for n in range(len(generated_entities), max_len):  
        generated_entities.append(None)
        
    # Get text embeddings for each entity
    golden_embeddings = [get_text_embedding_using_ollama(entity) for entity in golden_entities]
    generated_embeddings = [get_text_embedding_using_ollama(entity) for entity in generated_entities]
    

    best_permutation = generated_entities
    maximal_similarity = -1

    for permutation in list(itertools.permutations(list(range(max_len)))):

        sum_cosine_similarities = 0
        permutated_generated_embeddings = [generated_embeddings[i] for i in permutation]

        for golden_embedding, generated_embedding in zip(golden_embeddings, permutated_generated_embeddings):

            if golden_embedding and generated_embedding:
                sum_cosine_similarities += cosine_similarity(golden_embedding, generated_embedding)

        if sum_cosine_similarities/min_len > maximal_similarity:
            maximal_similarity = sum_cosine_similarities/min_len
            best_permutation = [generated_entities[i] for i in permutation]

    return maximal_similarity

In [32]:
p, f = test_metric(max_avg_cos_similarity, sentences, perfect, good, ok, verbose=True)
print(f'TESTS PASSED: {p}\t TESTS FAILED: {f}')

['severe pain in the muscles in the shoulder area']	['severe pain in the muscles']	['severe pain in the muscles', 'severe pain in the shoulder area']
TEST FAILED	1.0	0.8627380522048517	0.9574064207803092

['increased blood pressure', 'increased heart rate']	['blood pressure', 'heart rate']	['increased blood pressure and heart rate']
TEST FAILED	1.0	0.8590391771065524	0.9162281472371826

['mild shortness of breath on exertion']	['shortness of breath']	['breath on exertion']
TEST FAILED	1.0000000000000002	0.8685195001479415	0.8846911153655715

['left leg swelling', 'left leg redness']	['leg swelling', 'redness']	['leg swelling and redness']
TEST FAILED	1.0	0.8519824239361447	0.8553893403675195

['upper abdominal pain', 'upper abdominal bloating']	['abdominal pain', 'bloating']	['abdominal pain and bloating']
TEST FAILED	1.0	0.833672914990464	0.8464600754327717

['joint stiffness in the morning']	['joint stiffness']	['stiffness in morning']
TEST FAILED	1.0000000000000002	0.827328030557259

#### 1.1.1.b Average cosinus similarity of embeddings (greedy)

In [21]:
def find_max_idx(x):
    k = x.argmax()
    ncol = x.shape[1]
    return int(k/ncol), int(k%ncol)

In [None]:
import numpy as np

def avg_cos_similarity(generated_entities, golden_entities):

    generated_entities = copy.copy(generated_entities)
    golden_entities = copy.copy(golden_entities)

    min_len = min(len(golden_entities), len(generated_entities))
    max_len = max(len(golden_entities), len(generated_entities))
    
    # Fill list with padding, to obtain two lists of the same size
    for n in range(len(golden_entities), max_len):
        golden_entities.append(None)

    for n in range(len(generated_entities), max_len):  
        generated_entities.append(None)
        
    # Get text embeddings for each entity
    golden_embeddings = [get_text_embedding_using_ollama(entity) for entity in golden_entities]
    generated_embeddings = [get_text_embedding_using_ollama(entity) for entity in generated_entities]

    # Create matrix of calculated similarities between golden and generated embeddings
    try:
        arr = np.array([[ cosine_similarity(i,j) for i in golden_embeddings] for j in generated_embeddings])
    except TypeError:   # if golden_embeddings or generated_embeddings is empty
        arr = np.array([[]]) 

    greedy_similarity = -1

    while arr.any():
        max_i, max_j = find_max_idx(arr)
        greedy_similarity += arr[max_i, max_j]
        arr = np.delete(arr, max_i, axis=0)
        arr = np.delete(arr, max_j, axis=1)

    return greedy_similarity/min_len

In [33]:
p, f = test_metric(avg_cos_similarity, sentences, perfect, good, ok)
print(f'TESTS PASSED: {p}\t TESTS FAILED: {f}')

TESTS PASSED: 102	 TESTS FAILED: 10


#### 1.1.2 Found entities measure

In [None]:
def found_entities(generated_entities, golden_entities):
    # <-1; 0) - LLM did not found all entities
    # 0 - LLM found all entities
    # (0; +oo) - LLM halucynated some entities
    return len(generated_entities)/len(golden_entities) - 1

In [34]:
p, f = test_metric(found_entities, sentences, perfect, good, ok)
print(f'TESTS PASSED: {p}\t TESTS FAILED: {f}')

TESTS PASSED: 57	 TESTS FAILED: 55


## Approach 2: Measure accuracy (or other metric) of labeling task performed by LLM on each word in a sentence

### Metric 2.1: Accuracy of generated labels
For each generated label compare it to gold label and calculate number of correct generated labels, then divide it by a number of words in a sentence.