In [1]:
from collections import Counter
import numpy as np
import re 

def normalize_answer(s):
    # Lower case, remove punctuation, and whitespaces ### PLEASE CHECK If logic correct / desired ######## 
    return ' '.join(re.sub(r'[^A-Za-z0-9]', ' ', re.sub(r'\b(a|an|the)\b', ' ', s.lower())).split())

def metrics(annotation_answers, rag_answers):
    f1_scores, exact_matches, recall_scores = [], [], []
    
    for pred, truth in zip(annotation_answers, rag_answers):
        pred_tokens, truth_tokens = normalize_answer(pred).split(), normalize_answer(truth).split()
        common_tokens = Counter(pred_tokens) & Counter(truth_tokens)
        num_common = sum(common_tokens.values())
        
        prec = (num_common / len(pred_tokens))
        rec = (num_common / len(truth_tokens)) 

        f1_scores.append( 0 if prec+rec == 0 else (2*(prec*rec)/(prec + rec)))
        exact_matches.append(int(pred_tokens == truth_tokens))
        recall_scores.append(rec if truth_tokens else 0)
    
    return np.mean(f1_scores), np.mean(exact_matches), np.mean(recall_scores)

In [2]:
"""
Citation: 
    The use of chatgpt for the normalize answer and for using the counter function only
"""

'\nCitation: \n    The use of chatgpt for the normalize answer and for using the counter function only\n'

In [3]:
def getFileData(filename):
    f = open(filename, "r")
    data = f.readlines()
    f.close()
    data = [i.strip() for i in data]
    return data

In [28]:
# GPT4All Embeddings + Llama2

refrence_answers = getFileData("SubmissionData/test/reference_answers.txt")
system_generated_answers = getFileData("SubmissionData/system_outputs/system_output_1.txt")

metrics(refrence_answers, system_generated_answers)

(0.23105094889028244, 0.0, 0.17675869722203136)

In [27]:
# all-miniLM-l6-v2 embeddings + LLama2

refrence_answers = getFileData("SubmissionData/test/reference_answers.txt")
system_generated_answers = getFileData("SubmissionData/system_outputs/system_output_2.txt")

metrics(refrence_answers, system_generated_answers)

(0.3569133369324437, 0.0379746835443038, 0.33798993223464463)

In [26]:
# mistral embeddings + LLama2

refrence_answers = getFileData("SubmissionData/test/reference_answers.txt")
system_generated_answers = getFileData("SubmissionData/system_outputs/system_output_3.txt")

metrics(refrence_answers, system_generated_answers)

(0.37367435697686446, 0.02531645569620253, 0.3185538110816663)

In [25]:
# all-miniLM-l6-v2 embeddings + Dragon Mistral

refrence_answers = getFileData("SubmissionData/test/reference_answers.txt")
system_generated_answers = getFileData("SubmissionData/system_outputs/system_output_5.txt")

system_generated_answers = [i for i in system_generated_answers if i != ""]
metrics(refrence_answers, system_generated_answers)

(0.04237111803781401, 0.0, 0.04669124425295719)

In [24]:
refrence_answers = getFileData("SubmissionData/test/reference_answers.txt")
system_generated_answers = getFileData("SubmissionData/system_outputs/miniLM_mistral.txt")

system_generated_answers = [i for i in system_generated_answers if i != ""]
metrics(refrence_answers, system_generated_answers)

(0.03734369574591945, 0.0, 0.04852857655033772)

In [23]:
refrence_answers = getFileData("SubmissionData/test/reference_answers.txt")
system_generated_answers = getFileData("SubmissionData/system_outputs/colbert.txt")

system_generated_answers = [i for i in system_generated_answers if i != ""]
metrics(refrence_answers, system_generated_answers)

(0.4142937759054067, 0.05063291139240506, 0.3685443983028275)