## Load data

### Benchmark dataset

In [None]:
import json

# Load the benchmark file and keep only the list stored under its 'Data' key.
# Forward slashes keep the relative path valid on both Windows and POSIX
# (a backslash-separated path is a single odd file name outside Windows), and
# the explicit encoding avoids depending on the platform's default codec.
with open('../data/external/benchmark/benchmark.json', encoding='utf-8') as f:
    benchmark_dataset = json.load(f)['Data']

# Parallel lists: index i of each list comes from the same benchmark element.
sentences = [element['Sentence'] for element in benchmark_dataset]
perfect = [element['Perfect'] for element in benchmark_dataset]
good = [element['Good'] for element in benchmark_dataset]
ok = [element['Ok'] for element in benchmark_dataset]

## Get text embeddings

### Using Ollama

In [33]:
import ollama

# Name of the Ollama model used for every embedding request.
EMBED_MODEL = 'nomic-embed-text'

def get_text_embedding_using_ollama(text: str):
    """Embed *text* with the Ollama model named by ``EMBED_MODEL``.

    Args:
        text: The text to embed. Any falsy value (``None``, ``''``) is
            treated as "nothing to embed".

    Returns:
        The value stored under the ``"embeddings"`` key of the
        ``ollama.embed`` response, or ``None`` when *text* is falsy.
        NOTE(review): for a single input string this is presumably a list
        holding one embedding vector -- confirm against the Ollama client.
    """
    # Guard clause: falsy input never reaches the model.
    if not text:
        return None

    result = ollama.embed(model=EMBED_MODEL, input=text)
    return result["embeddings"]

## Approach 1: Measure the quality of LLM-generated entities together with their labels

### Metric 1.1: Maximal average cosine similarity of embeddings
For each one-to-one matching between generated entities and gold entities, the sum of the cosine similarities between their embeddings is calculated. An entity without a pair contributes a similarity of -1. The metric is the maximal sum of similarities over all matchings, divided by max(number_of_generated_entities, number_of_gold_entities).

In [None]:
import numpy as np   

def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Both inputs are converted to NumPy arrays and squeezed, so a vector
    wrapped in singleton dimensions (e.g. ``[[x1, x2, ...]]``) is accepted.

    Args:
        v1: First vector (array-like).
        v2: Second vector (array-like) with the same number of elements.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has
        zero norm (the similarity is undefined there; returning 0.0 avoids
        a NaN result and a NumPy division warning).
    """
    v1 = np.squeeze(np.asarray(v1))
    v2 = np.squeeze(np.asarray(v2))

    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        # A zero vector has no direction; treat it as unrelated to anything.
        return 0.0

    return np.dot(v1, v2) / norm_product

In [34]:
import itertools
import copy

def max_avg_cos_similarity(generated_entities, golden_entities):
    """Score generated entities against golden entities via their best pairing.

    Every one-to-one pairing between the two entity lists is tried. A pair of
    two embedded entities contributes the cosine similarity of their
    embeddings; a pair in which either side is missing (the shorter list is
    padded) or has no embedding contributes -1. The result is the largest
    pairing sum divided by the length of the longer list.

    All pairings are enumerated, so the running time grows factorially with
    the length of the longer list; intended for short entity lists.

    Args:
        generated_entities: Entity strings to evaluate. Not modified.
        golden_entities: Reference entity strings. Not modified.

    Returns:
        The maximal average similarity. When both inputs are empty there is
        nothing to compare and the lists agree trivially, so 1.0 is returned
        instead of dividing by zero.
    """
    generated_entities = list(generated_entities)
    golden_entities = list(golden_entities)

    max_len = max(len(golden_entities), len(generated_entities))
    if max_len == 0:
        return 1.0

    def _embed_and_pad(entities):
        # Embed each entity, then pad with None up to max_len so both sides
        # have the same number of slots.
        embeddings = [get_text_embedding_using_ollama(entity) for entity in entities]
        return embeddings + [None] * (max_len - len(embeddings))

    def _is_usable(embedding):
        # Explicit None/length test: plain truthiness is ambiguous for
        # array-like embeddings.
        return embedding is not None and len(embedding) > 0

    golden_embeddings = _embed_and_pad(golden_entities)
    generated_embeddings = _embed_and_pad(generated_entities)

    # Pairwise scores do not depend on the permutation, so compute each one
    # once instead of once per permutation.
    pair_scores = [
        [
            cosine_similarity(golden_embedding, generated_embedding)
            if _is_usable(golden_embedding) and _is_usable(generated_embedding)
            else -1
            for generated_embedding in generated_embeddings
        ]
        for golden_embedding in golden_embeddings
    ]

    maximal_similarity = -1

    # permutation[i] is the index of the generated entity paired with golden
    # entity i; iterate lazily rather than materializing all n! tuples.
    for permutation in itertools.permutations(range(max_len)):
        sum_cosine_similarities = 0
        for golden_index, generated_index in enumerate(permutation):
            sum_cosine_similarities += pair_scores[golden_index][generated_index]

        average_similarity = sum_cosine_similarities / max_len
        if average_similarity > maximal_similarity:
            maximal_similarity = average_similarity

    return maximal_similarity

In [36]:
# Sanity check of the metric on the benchmark: scoring the 'Perfect' entities
# against themselves, against 'Good' and against 'Ok' should give
# non-increasing scores.
for s, p, g, o in zip(sentences, perfect, good, ok):

    # One score per candidate list, evaluated in the order perfect, good, ok.
    s1, s2, s3 = (max_avg_cos_similarity(p, candidate) for candidate in (p, g, o))

    print(f'{p}\t{g}\t{o}')
    if not (s1 >= s2 >= s3):
        print(f'TEST FAILED\t{s1}\t{s2}\t{s3}\n')
    else:
        print(f'TEST PASSED\t{s1}\t{s2}\t{s3}\n')

['decreased testosterone', 'decreased libido']	['decreased testosterone', 'libido']	['decreased testosterone and libido']
TEST PASSED	1.0	0.9246157485696923	-0.05010950778365991

['muscle pain', 'joint pain']	['muscle', 'joint pain']	['muscle/joint pain']
TEST PASSED	1.0	0.9038013963778003	-0.050650728647168186

['abdominal gas', 'abdominal cramps', 'abdominal discomfort']	['abdominal gas', 'abdominal cramps', 'abdominal pain']	['abdominal gas', 'cramps', 'discomfort']
TEST PASSED	0.9999999999999999	0.9756371152298943	0.8851080272343022

['pain unbearable']	['unbearable pain']	['pain']
TEST PASSED	0.9999999999999999	0.9676638020044335	0.8091187031941701

['severe colon cramping', 'severe uterine cramping']	['colon cramping', 'uterine cramping']	['colon', 'uterine cramping']
TEST PASSED	1.0000000000000002	0.9514653925553156	0.8506972454751643

['feet swelling', 'ankles swelling']	['feet', 'ankles']	['foot', 'ankle']
TEST PASSED	1.0	0.7885804512105015	0.7480446611985454

['body swelling'

## Approach 2: Measure the accuracy (or another metric) of the labeling task performed by the LLM on each word in a sentence

### Metric 2.1: Accuracy of generated labels
Compare each generated label to its gold label, count the number of correct generated labels, and divide that count by the number of words in the sentence.