In [1]:
import evaluate
from evaluate import load
import tensorflow_hub as hub
from scipy.spatial import distance
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
import pandas as pd

2024-03-07 12:57:21.682122: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the three evaluation metrics by name. `load` (imported directly above)
# and `evaluate.load` refer to the same function, so one spelling is used here.
bertscore, bleu, rouge = (
    evaluate.load(metric_name) for metric_name in ("bertscore", "bleu", "rouge")
)

In [3]:
# CSV locations are kept in named constants so the load calls stay readable.
REFERENCES_CSV = r'/Users/adrianfolge/Documents/lokal:skole/Master/data/synthetic_data/question_with_answers.csv'
PREDICTIONS_CSV = r'/Users/adrianfolge/Documents/lokal:skole/Master/data/Results/Faiss_answers_from_model.csv'

# Only the first 10 rows of the reference file are requested (split="train[:10]").
references = load_dataset('csv', data_files=REFERENCES_CSV, split="train[:10]")
# No split is requested here, so the result is later indexed by split name ("train").
# NOTE(review): the references are capped at 10 rows but the predictions are not --
# confirm the predictions file holds exactly the matching 10 answers.
predictions = load_dataset('csv', data_files=PREDICTIONS_CSV)

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# Pull out the plain text columns that the metrics operate on.
# NOTE: this rebinds `references`/`predictions` from datasets to their text
# columns, so re-running this cell requires re-running the loading cell first.
references = references["Answer"]
predictions = predictions["train"]["Text"]

# The metrics pair predictions[i] with references[i]; fail early with a clear
# message instead of an IndexError (or silently ignored rows) further down.
if len(predictions) != len(references):
    raise ValueError(
        f"Got {len(predictions)} predictions but {len(references)} references; "
        "both inputs must contain the same number of rows."
    )

In [5]:
# The same prediction/reference pairing is passed to every metric.
paired_texts = {"predictions": predictions, "references": references}

# BERTScore with language code "nb"; the result is indexed below by
# 'precision', 'recall' and 'f1'.
bert_score = bertscore.compute(**paired_texts, lang="nb")
# BLEU limited to n-grams up to order 2 (max_order=2).
bleu_score = bleu.compute(**paired_texts, max_order=2)
rouge_score = rouge.compute(**paired_texts)



In [6]:
def _mean(values):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)


# Collapse each BERTScore component list into a single average.
avg_precision, avg_recall, avg_f1 = (
    _mean(bert_score[component]) for component in ("precision", "recall", "f1")
)

In [7]:
## SAS encoder score
# Semantic answer similarity using the Universal Sentence Encoder from TF Hub.
# NOTE(review): the scored texts are Norwegian (see lang="nb" for BERTScore);
# confirm this encoder version is suitable for non-English input.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Bound to a dedicated name rather than the generic `model`, so another cell
# rebinding `model` cannot change what `embed` calls when cells are re-run.
use_encoder = hub.load(module_url)
print("module %s loaded" % module_url)


def embed(input):
    """Embed a batch of sentences with the Universal Sentence Encoder.

    Args:
        input: list of strings to embed. The parameter name is kept for
            backward compatibility even though it shadows the built-in.

    Returns:
        The encoder output for the batch; the loop below indexes it as a 2-D
        array with one row per input sentence.
    """
    return use_encoder(input)


# Pairs are matched by position, so both sequences must have the same length.
if len(predictions) != len(references):
    raise ValueError(
        f"Got {len(predictions)} predictions but {len(references)} references; "
        "cannot pair them for similarity scoring."
    )

list_of_similarity_scores = []
for prediction, reference in zip(predictions, references):
    # `distance.cosine` is a distance, so 1 - distance gives the similarity.
    similarity_score = 1 - distance.cosine(embed([prediction])[0, :], embed([reference])[0, :])
    list_of_similarity_scores.append(similarity_score)
    print(f'\nPrediction: {prediction}\nReference: {reference}\nSimilarity Score = {similarity_score} ')
average_score = sum(list_of_similarity_scores) / len(list_of_similarity_scores)
print("Average similarity score:", average_score)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded

Prediction: 22. mai 2019
Reference: Datoen for vedtaket av Kommunedelplan for sentrum av bystyret var 26.8.2021.
Similarity Score = -0.07438260920255124 

Prediction: 2.2: Formålet med planen er å opprettholde et nærmest naturlig, menneskelig og livsbevisst miljø ved å redusere eksterne innvirkninger, minimere unngåelige kilder til skadelige emisjoner og bevare arven av ressursene.
Reference: Hovedintensjonene i planen beskrevet i dokumentet inkluderer å videreutvikle Kristiansund sentrum som et attraktivt og framtidsrettet regionbysentrum med et pulserende folkeliv. Dette inkluderer positiv utvikling av handels- og servicevirksomhet, god parkerings- og trafikksituasjon, hensyn til gjenreisningsarkitekturens formuttrykk, god framkommelighet og trygghet for alle grupper, et mangfold av kulturaktiviteter, ønske om flere boligprosjekter i sentrum, stimulering til flere arbeids- og skoleplasser i sentrum, og utvikling av 

In [8]:
## SAS transformer score
# Dedicated name (instead of the generic `model`) so this encoder and any other
# object called `model` in the notebook cannot overwrite each other when cells
# are re-run out of order.
sas_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')


def SAS(preds, refs):
    """Return the average semantic answer similarity between aligned texts.

    Each prediction is paired with the reference at the same position; both are
    encoded with the sentence-transformer model and compared by cosine
    similarity.

    Args:
        preds: sequence of predicted answer strings.
        refs: sequence of reference answer strings, aligned with `preds`.

    Returns:
        The mean of the pairwise cosine similarities as a float.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    if len(preds) != len(refs):
        raise ValueError(
            f"Got {len(preds)} predictions but {len(refs)} references; "
            "SAS needs one reference per prediction."
        )
    if len(preds) == 0:
        raise ValueError("SAS needs at least one prediction/reference pair.")

    embeddings_preds = sas_model.encode(preds)
    embeddings_refs = sas_model.encode(refs)
    # pytorch_cos_sim returns a 1x1 matrix for a single pair; take its only entry.
    similarities = [
        util.pytorch_cos_sim(pred_vec, ref_vec)[0][0].item()
        for pred_vec, ref_vec in zip(embeddings_preds, embeddings_refs)
    ]
    return sum(similarities) / len(similarities)

In [9]:
# Metrics whose result is a whole dict: a heading line followed by the raw result.
for heading, result in (("BLEU SCORES", bleu_score), ("ROUGE SCORES", rouge_score)):
    print(heading)
    print(result)

# Scalar averages: one "label value" line each.
print("BERT SCORES")
for label, value in (
    ("Average Precision:", avg_precision),
    ("Average Recall:", avg_recall),
    ("Average F1 Score:", avg_f1),
    ("Average SAS encoder Score:", average_score),
):
    print(label, value)
# Kept as the final statement so SAS is evaluated only after the lines above are printed.
print("Average SAS transformer Score:", SAS(predictions, references))

BLEU SCORES
{'bleu': 0.04495810242411833, 'precisions': [0.25666666666666665, 0.03436426116838488], 'brevity_penalty': 0.47870694881990544, 'length_ratio': 0.5758157389635317, 'translation_length': 300, 'reference_length': 521}
ROUGE SCORES
{'rouge1': 0.12675491648924164, 'rouge2': 0.02527569930443326, 'rougeL': 0.0951712728652703, 'rougeLsum': 0.09405389441010323}
BERT SCORES
Average Precision: 0.6248170912265778
Average Recall: 0.5889408946037292
Average F1 Score: 0.605554848909378
Average SAS encoder Score: 0.5417058977824144
Average SAS transformer Score: 0.5880291670560837


In [10]:
# Row labels and their values, listed in the same order.
metric_names = [
    "BLEU Score",
    "ROUGE Score",
    "Average Precision",
    "Average Recall",
    "Average F1 Score",
    "Average SAS encoder Score",
    "Average SAS transformer Score",
]
# NOTE(review): bleu_score and rouge_score are the full result dicts, so those
# two rows hold a dict rather than a single number -- confirm this is intended.
metric_scores = [
    bleu_score,
    rouge_score,
    avg_precision,
    avg_recall,
    avg_f1,
    average_score,
    SAS(predictions, references),
]
data = {"Metric": metric_names, "Score": metric_scores}

# Two-column table: metric name and its score.
df = pd.DataFrame(data)

# Destination of the evaluation summary.
file_path = "/Users/adrianfolge/Documents/lokal:skole/Master/data/Evaluation_scores/Faiss_evaluation_scores.csv"

# Write without the index column.
df.to_csv(file_path, index=False)

print("Data has been written to", file_path)

Data has been written to /Users/adrianfolge/Documents/lokal:skole/Master/data/Evaluation_scores/Faiss_evaluation_scores.csv
