In [2]:
from collections import Counter
import numpy as np
import re 

def normalize_answer(s):
    # Lower case, remove punctuation, and whitespaces ### PLEASE CHECK If logic correct / desired ######## 
    return ' '.join(re.sub(r'[^A-Za-z0-9]', ' ', re.sub(r'\b(a|an|the)\b', ' ', s.lower())).split())

def metrics(annotation_answers, rag_answers):
    f1_scores, exact_matches, recall_scores = [], [], []
    
    for pred, truth in zip(annotation_answers, rag_answers):
        pred_tokens, truth_tokens = normalize_answer(pred).split(), normalize_answer(truth).split()
        common_tokens = Counter(pred_tokens) & Counter(truth_tokens)
        num_common = sum(common_tokens.values())
        
        prec = (num_common / len(pred_tokens))
        rec = (num_common / len(truth_tokens)) 

        f1_scores.append( 0 if prec+rec == 0 else (2*(prec*rec)/(prec + rec)))
        exact_matches.append(int(pred_tokens == truth_tokens))
        recall_scores.append(rec if truth_tokens else 0)
    
    return np.mean(f1_scores), np.mean(exact_matches), np.mean(recall_scores)

In [3]:
#MODIFIED
import numpy as np

def metrics(annotation_answers, rag_answers):
    f1_scores, exact_matches, recall_scores = [], [], []
    
    for pred, truth in zip(annotation_answers, rag_answers):
        pred_tokens, truth_tokens = normalize_answer(pred).split(), normalize_answer(truth).split()

        # Avoid division by zero by checking if pred_tokens or truth_tokens are empty
        if len(pred_tokens) == 0 or len(truth_tokens) == 0:
            prec = 0
            rec = 0
        else:
            common_tokens = Counter(pred_tokens) & Counter(truth_tokens)
            num_common = sum(common_tokens.values())
            prec = (num_common / len(pred_tokens))
            rec = (num_common / len(truth_tokens))

        f1_scores.append(0 if prec + rec == 0 else (2 * (prec * rec) / (prec + rec)))
        exact_matches.append(int(pred_tokens == truth_tokens))
        recall_scores.append(rec if truth_tokens else 0)

    return np.mean(f1_scores), np.mean(exact_matches), np.mean(recall_scores)


In [3]:
"""
Citation: 
    The use of chatgpt for the normalize answer and for using the counter function only
"""

'\nCitation: \n    The use of chatgpt for the normalize answer and for using the counter function only\n'

In [4]:
def getFileData(filename):
    f = open(filename, "r")
    data = f.readlines()
    f.close()
    data = [i.strip() for i in data]
    return data

In [5]:
# Reference annotations
refrence_answers = getFileData("SubmissionData/test/reference_answers.txt")

In [35]:
system_generated_answers = getFileData("SubmissionData/system_outputs/system_output_1.txt")
print("gpt4all", metrics(refrence_answers, system_generated_answers))

system_generated_answers = getFileData("SubmissionData/system_outputs/test_miniLM.txt")
print("miniLM", metrics(refrence_answers, system_generated_answers))

system_generated_answers = getFileData("SubmissionData/system_outputs/test_llmEmbed.txt")
print("llmEmbed", metrics(refrence_answers, system_generated_answers))

system_generated_answers = getFileData("SubmissionData/system_outputs/MistralEmbeddingsTest.txt")
print("mistral",  metrics(refrence_answers, system_generated_answers))

system_generated_answers = getFileData("SubmissionData/system_outputs/test_colbert.txt")
print("colbert", metrics(refrence_answers, system_generated_answers))

system_generated_answers = getFileData("SubmissionData/system_outputs/test_rerank.txt")
print("rerank", metrics(refrence_answers, system_generated_answers))

system_generated_answers = getFileData("SubmissionData/system_outputs/final_2cluster_test.txt")
print("2cluster", metrics(refrence_answers, system_generated_answers))

system_generated_answers = getFileData("SubmissionData/system_outputs/4clusterTESTSET.txt")
print("4cluster w unlimiformer", metrics(refrence_answers, system_generated_answers))



gpt4all (0.23105094889028244, 0.0, 0.17675869722203136)
miniLM (0.3491102398471225, 0.0379746835443038, 0.3037977502482182)
llmEmbed (0.35181070114959034, 0.02531645569620253, 0.3247909502855802)
mistral (0.28001888388798374, 0.012658227848101266, 0.2474462005017682)
colbert (0.4184921453162676, 0.05063291139240506, 0.3744027543652713)
rerank (0.36428925570940424, 0.0379746835443038, 0.3278893985135896)
2cluster (0.43919092859505904, 0.05063291139240506, 0.39470423587580006)
4cluster + unlimiformer (0.43477073539220035, 0.0759493670886076, 0.365626385280889)
