In [1]:
import evaluate
from evaluate import load
import tensorflow_hub as hub
from scipy.spatial import distance
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util

2024-03-05 10:12:07.215456: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the BERTScore, BLEU and ROUGE metric objects, all through the same
# `evaluate.load` entry point.
bertscore = evaluate.load("bertscore")
bleu = evaluate.load('bleu')
rouge = evaluate.load('rouge')

In [3]:
# Reference answers: only the first 10 rows of the CSV's "train" split are loaded.
references = load_dataset(
    'csv',
    data_files=r'/Users/adrianfolge/Documents/lokal:skole/Master/data/synthetic_data/question_with_answers.csv',
    split="train[:10]",
)
# Model answers: no split is requested here, so the result is indexed by
# split name ("train") when the text column is extracted.
# NOTE(review): both paths are absolute and machine-specific; they have to be
# edited before this cell can run anywhere else.
predictions = load_dataset(
    'csv',
    data_files=r'/Users/adrianfolge/Documents/lokal:skole/Master/data/answers_from_model.csv',
)

In [4]:
# Reduce both datasets to the single column that is scored: the "Answer"
# column of the references and the "Text" column of the predictions' "train"
# split.
# NOTE: this cell rebinds its own inputs, so it can only be run once per
# kernel session; re-run the loading cell before running it again.
references = references["Answer"]
predictions = predictions["train"]["Text"]

# Every metric below pairs predictions[i] with references[i]. The references
# were capped at 10 rows when loaded while the predictions were not, so fail
# here with a clear message instead of deep inside a metric call.
if len(predictions) != len(references):
    raise ValueError(
        f"Got {len(predictions)} predictions but {len(references)} references; "
        "they must be aligned one-to-one."
    )

In [5]:
# Score the model answers against the reference answers with each metric.
# NOTE(review): lang="nb" is presumably Norwegian Bokmål - confirm that the
# metric's language-to-model mapping handles it as intended.
bert_score = bertscore.compute(
    predictions=predictions,
    references=references,
    lang="nb",
)
# max_order=2: BLEU is computed from unigram and bigram precisions only.
bleu_score = bleu.compute(
    predictions=predictions,
    references=references,
    max_order=2,
)
rouge_score = rouge.compute(
    predictions=predictions,
    references=references,
)



In [6]:
# BERTScore reports one value per prediction/reference pair under each key;
# collapse every list to its arithmetic mean.
avg_precision, avg_recall, avg_f1 = (
    sum(bert_score[metric_key]) / len(bert_score[metric_key])
    for metric_key in ('precision', 'recall', 'f1')
)

BLEU SCORES
{'bleu': 0.04627694342127828, 'precisions': [0.5238095238095238, 0.2732919254658385], 'brevity_penalty': 0.12231073353103171, 'length_ratio': 0.32245681381957775, 'translation_length': 168, 'reference_length': 521}
ROUGE SCORES
{'rouge1': 0.1904796827059792, 'rouge2': 0.12125085942689337, 'rougeL': 0.1689650527576143, 'rougeLsum': 0.17168244406196215}
BERT SCORES
Average Precision: 0.5430774986743927
Average Recall: 0.44924169182777407
Average F1 Score: 0.4900475561618805


In [8]:
## SAS encoder score
# Sentence embeddings come from the Universal Sentence Encoder (v4) module
# hosted on TF Hub.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print("module %s loaded" % module_url)


def embed(input):
    """Run the loaded TF Hub module on `input` and return its output.

    `input` is passed straight to the module-level `model`; the loop below
    always calls it with a one-element list of strings. The parameter name
    shadows the built-in `input` and is kept only for compatibility.
    """
    return model(input)


# Compare each prediction with the reference at the same position.
list_of_similarity_scores = []
for i in range(len(predictions)):
    # Each call embeds a single sentence; row 0 of the result is its vector.
    prediction_vector = embed([predictions[i]])[0, :]
    reference_vector = embed([references[i]])[0, :]
    # scipy returns the cosine *distance*, so 1 - distance is the similarity.
    similarity_score = 1 - distance.cosine(prediction_vector, reference_vector)
    list_of_similarity_scores.append(similarity_score)
    print(f'\nPrediction: {predictions[i]}\nReference: {references[i]}\nSimilarity Score = {similarity_score} ')

average_score = sum(list_of_similarity_scores) / len(list_of_similarity_scores)
print("Average similarity score:", average_score)


Prediction: 26.8.2021
Reference: Datoen for vedtaket av Kommunedelplan for sentrum av bystyret var 26.8.2021.
Similarity Score = 0.16227787733078003 

Prediction: Ønsker å legge et nødvendig faglig og legalt grunnlag for at Kristiansund sentrum skal videreutvikles som et attraktivt og framtidsrettet regionbysentrum med et pulserende folkeliv.
Reference: Hovedintensjonene i planen beskrevet i dokumentet inkluderer å videreutvikle Kristiansund sentrum som et attraktivt og framtidsrettet regionbysentrum med et pulserende folkeliv. Dette inkluderer positiv utvikling av handels- og servicevirksomhet, god parkerings- og trafikksituasjon, hensyn til gjenreisningsarkitekturens formuttrykk, god framkommelighet og trygghet for alle grupper, et mangfold av kulturaktiviteter, ønske om flere boligprosjekter i sentrum, stimulering til flere arbeids- og skoleplasser i sentrum, og utvikling av sentrum til en regional attraktiv, tett og urban bolig-, handels- og kulturby med gode bymessige kvaliteter.

In [9]:
## SAS transformer score
# NOTE(review): this rebinds the global `model` that `embed` (SAS encoder
# cell) reads, so after this cell runs `embed` no longer uses the TF Hub module.
model = SentenceTransformer(
    'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
)

def SAS(preds, refs):
    """Return the mean cosine similarity between paired sentence embeddings.

    Both sequences are encoded with the module-level `model`, and the i-th
    prediction embedding is compared with the i-th reference embedding.

    Args:
        preds: Sequence of predicted answer strings.
        refs: Sequence of reference answer strings, aligned one-to-one
            with `preds`.

    Returns:
        The average of the pairwise cosine similarities, as a float.

    Raises:
        ValueError: If the inputs are empty or differ in length. Previously
            extra references were silently ignored, missing ones raised
            IndexError, and empty input raised ZeroDivisionError.
    """
    if len(preds) != len(refs):
        raise ValueError(
            f"SAS needs one reference per prediction, got {len(preds)} "
            f"predictions and {len(refs)} references."
        )
    if len(preds) == 0:
        raise ValueError("SAS needs at least one prediction/reference pair.")

    embeddings_preds = model.encode(preds)
    embeddings_refs = model.encode(refs)
    # pytorch_cos_sim returns a 1x1 matrix for a single pair of vectors;
    # [0][0].item() unwraps it to a plain Python number.
    similarities = [
        util.pytorch_cos_sim(pred_embedding, ref_embedding)[0][0].item()
        for pred_embedding, ref_embedding in zip(embeddings_preds, embeddings_refs)
    ]
    return sum(similarities) / len(similarities)

In [10]:
# Summary report: each heading is followed by its score on the next line.
print("BLEU SCORES", bleu_score, sep="\n")
print("ROUGE SCORES", rouge_score, sep="\n")

print("BERT SCORES")
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1 Score:", avg_f1)

print("Average SAS encoder Score:", average_score)
# The transformer-based score is computed only now, after everything above
# has already been printed.
sas_transformer_score = SAS(predictions, references)
print("Average SAS transformer Score:", sas_transformer_score)

0.5249271161854268
