In [1]:
from collections import Counter
import numpy as np
import re 

def normalize_answer(s):
    """Return a canonical, space-separated form of an answer string.

    Steps, in order:
      1. lower-case the text;
      2. replace the stand-alone words "a", "an" and "the" with a space;
      3. replace every character that is not an ASCII letter or digit with a
         space (so punctuation and non-ASCII letters act as separators);
      4. collapse runs of whitespace and trim both ends.

    Articles are removed *before* punctuation, so an article that is only
    delimited by punctuation (e.g. the "a" in "a.m.") is dropped as well.
    """
    lowered = s.lower()
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', lowered)
    alnum_only = re.sub(r'[^A-Za-z0-9]', ' ', without_articles)
    tokens = alnum_only.split()
    return ' '.join(tokens)

def metrics(annotation_answers, rag_answers):
    """Compute mean token-level F1, exact match and recall over answer pairs.

    Entries are paired positionally with ``zip``, so surplus entries in the
    longer list are ignored. Each answer is normalized with
    ``normalize_answer`` and split into tokens before comparison.

    Args:
        annotation_answers: Answer strings; their token count is the
            denominator of the per-pair precision term.
        rag_answers: Answer strings; their token count is the denominator of
            the per-pair recall term (the third returned value).

    Returns:
        Tuple ``(mean_f1, mean_exact_match, mean_recall)``.
    """
    f1_scores, exact_matches, recall_scores = [], [], []

    for pred, truth in zip(annotation_answers, rag_answers):
        pred_tokens = normalize_answer(pred).split()
        truth_tokens = normalize_answer(truth).split()

        # Multiset intersection: each shared token counts min(occurrences).
        common_tokens = Counter(pred_tokens) & Counter(truth_tokens)
        num_common = sum(common_tokens.values())

        # An answer that normalizes to nothing (empty, or only articles and
        # punctuation) has no tokens; score it 0 instead of dividing by zero.
        prec = num_common / len(pred_tokens) if pred_tokens else 0
        rec = num_common / len(truth_tokens) if truth_tokens else 0

        f1_scores.append(0 if prec + rec == 0 else (2 * (prec * rec) / (prec + rec)))
        exact_matches.append(int(pred_tokens == truth_tokens))
        recall_scores.append(rec)

    return np.mean(f1_scores), np.mean(exact_matches), np.mean(recall_scores)

In [2]:
#MODIFIED
import numpy as np

def metrics(annotation_answers, rag_answers):
    """Average token-overlap F1, exact match and recall across answer pairs.

    The two lists are walked in lockstep with ``zip`` (extra entries in the
    longer list are ignored). Every answer goes through ``normalize_answer``
    and is split on whitespace before scoring.

    For each pair, with ``overlap`` the multiset intersection size:
      * precision term = overlap / token count of the ``annotation_answers`` entry
      * recall term    = overlap / token count of the ``rag_answers`` entry
    Both terms are 0 when either side has no tokens. Because the recall
    denominator comes from the *second* argument, swapping the arguments
    swaps which answer recall is measured against.

    Returns:
        Tuple ``(mean_f1, mean_exact_match, mean_recall)``.
    """
    f1_per_pair = []
    em_per_pair = []
    recall_per_pair = []

    for first_text, second_text in zip(annotation_answers, rag_answers):
        first_tokens = normalize_answer(first_text).split()
        second_tokens = normalize_answer(second_text).split()

        if first_tokens and second_tokens:
            overlap = sum((Counter(first_tokens) & Counter(second_tokens)).values())
            prec = overlap / len(first_tokens)
            rec = overlap / len(second_tokens)
        else:
            # Nothing to compare against on at least one side.
            prec = 0
            rec = 0

        if prec + rec == 0:
            f1 = 0
        else:
            f1 = 2 * (prec * rec) / (prec + rec)

        f1_per_pair.append(f1)
        em_per_pair.append(int(first_tokens == second_tokens))
        # rec is already 0 whenever the second answer has no tokens.
        recall_per_pair.append(rec)

    return np.mean(f1_per_pair), np.mean(em_per_pair), np.mean(recall_per_pair)


In [3]:
"""
Citation:
    ChatGPT was used only for the normalize_answer function and for the use of collections.Counter.
"""

'\nCitation:\n    ChatGPT was used only for the normalize_answer function and for the use of collections.Counter.\n'

In [4]:
def getFileData(filename):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        filename: Path of the file to read (opened in text mode with the
            platform default encoding).

    Returns:
        List with one string per line of the file, each passed through
        ``str.strip()``; blank lines are kept as empty strings.
    """
    # The context manager closes the handle even if reading raises.
    with open(filename, "r") as handle:
        return [line.strip() for line in handle]

In [5]:
import os
# Sub-folders of Data/ that are scanned for an annotation.txt file.
directories = ["Web Scholar PDFs", "About Scottie", "Buggy News", "academic_calendars", "history_of_cmu", "history_of_scs", "Kiltie Band", "lti_faculty", "lti_programs", "program_handbooks", "Tartan Facts", "courses"]


# Maps question text -> label read from the annotation files
# (getPairsByType groups answers by this label).
questions = {}
for directory in directories:
    directory_path = os.path.join("Data", directory)
    # os.listdir still raises if a listed folder is missing, so a wrong
    # path is not silently skipped.
    if "annotation.txt" not in os.listdir(directory_path):
        continue

    # The context manager closes the handle even if reading raises.
    with open(os.path.join(directory_path, "annotation.txt")) as annotation_file:
        lines = annotation_file.readlines()

    # Records span 5 lines: the question is on line 0 and its label on
    # line 3 of each record. The first 3 characters of both lines are a
    # prefix that is dropped.
    question_lines = lines[0::5]
    label_lines = lines[3::5]

    # zip pairs only complete records, so a trailing blank line or a
    # truncated final record no longer raises IndexError.
    for question_line, label_line in zip(question_lines, label_lines):
        questions[question_line[3:].strip()] = label_line[3:].strip()

In [31]:

# Test-set questions, one stripped line per question. Reuses getFileData so
# the file handle is managed in one place instead of a manual open/close here.
ref_qs = getFileData("SubmissionData/test/questions.txt")

def getPairsByType(ref_qs, annotation_answers, rag_answers, question_types=None):
    """Group answers by the label of the question they belong to.

    Args:
        ref_qs: Question strings. Position ``k`` must correspond to position
            ``k`` in both answer lists.
        annotation_answers: Answers aligned with ``ref_qs``; collected at
            index 1 of each group.
        rag_answers: Answers aligned with ``ref_qs``; collected at index 0 of
            each group.
        question_types: Optional mapping of question text -> label. When
            omitted, the notebook-level ``questions`` dict is used, which is
            what the function always read before this parameter existed.

    Returns:
        Dict mapping each label to ``[rag_list, annotation_list]``. Questions
        that are absent from the mapping are skipped.

    Raises:
        IndexError: If an answer list is shorter than the position of a
            question that is present in the mapping.
    """
    if question_types is None:
        # Backward-compatible default: fall back to the notebook global.
        question_types = questions

    pairs = {}
    for index, question in enumerate(ref_qs):
        if question not in question_types:
            continue
        # setdefault creates the two empty lists the first time a label is seen.
        rag_group, annotation_group = pairs.setdefault(question_types[question], [[], []])
        rag_group.append(rag_answers[index])
        annotation_group.append(annotation_answers[index])

    return pairs


def getRecallByTypes(s, questions):
    """Report a per-label recall figure, scaled by how often the label occurs.

    Args:
        s: Mapping of label -> two-element sequence. ``s[label][0]`` and
            ``s[label][1]`` are handed to ``metrics`` as its first and second
            argument, in that order.
        questions: Mapping whose *values* are labels; used only to count how
            many entries carry each label.

    Returns:
        List of ``(label, value)`` tuples ordered by ``int(label)``, where
        ``value`` is the mean recall returned by ``metrics`` for that label
        divided by the number of ``questions`` entries with that label.

    Raises:
        KeyError: If a label in ``s`` never occurs among ``questions`` values,
            or if ``str(int(label))`` differs from ``label``.
        ValueError: If a label cannot be parsed by ``int``.
    """
    # How many entries of `questions` carry each label.
    label_counts = {}
    for label in questions.values():
        label_counts[label] = label_counts.get(label, 0) + 1

    # NOTE(review): metrics already returns a mean over the pairs, so dividing
    # it by the label count yields very small numbers - confirm this scaling
    # is intended.
    scaled = {}
    for label, group in s.items():
        mean_recall = metrics(group[0], group[1])[2]
        scaled[label] = mean_recall / label_counts[label]

    # Labels are sorted numerically, then looked up again by their string form.
    numeric_labels = sorted(int(label) for label in scaled)
    return [(str(number), scaled[str(number)]) for number in numeric_labels]



In [32]:
# Reference annotations: one whitespace-stripped answer per line of the file.
# The spelling `refrence_answers` is used by later cells, so it is kept as is.
refrence_answers = getFileData("SubmissionData/test/reference_answers.txt")

In [33]:
# (printed label, output file) for every system variant, in report order.
system_outputs = [
    ("gpt4all", "SubmissionData/system_outputs/system_output_1.txt"),
    ("miniLM", "SubmissionData/system_outputs/test_miniLM.txt"),
    ("llmEmbed", "SubmissionData/system_outputs/test_llmEmbed.txt"),
    ("mistral", "SubmissionData/system_outputs/MistralEmbeddingsTest.txt"),
    ("colbert", "SubmissionData/system_outputs/test_colbert.txt"),
    ("rerank", "SubmissionData/system_outputs/test_rerank.txt"),
    ("2cluster", "SubmissionData/system_outputs/final_2cluster_test.txt"),
    ("4cluster w unlimiformer", "SubmissionData/system_outputs/4clusterTESTSET.txt"),
]

# One loop replaces eight copy-pasted stanzas; the printed lines and their
# order are unchanged. `system_generated_answers` is left holding the last
# system's answers, as it was after the original sequence of statements.
for system_label, output_path in system_outputs:
    system_generated_answers = getFileData(output_path)
    # First line: the (F1, exact match, recall) tuple returned by metrics.
    print(system_label, metrics(refrence_answers, system_generated_answers))
    # Second line: the per-label list returned by getRecallByTypes.
    print(getRecallByTypes(getPairsByType(ref_qs, refrence_answers, system_generated_answers), questions))




gpt4all (0.23105094889028244, 0.0, 0.17675869722203136)
[('1', 0.0018382995127181171), ('2', 0.00916793020360879), ('3', 0.035416666666666666), ('5', 0.0125), ('6', 0.00866663154798748), ('7', 0.015467416982568497), ('8', 0.06060606060606061), ('9', 0.03857142857142857), ('10', 0.010476190476190476)]
miniLM (0.3491102398471225, 0.0379746835443038, 0.3037977502482182)
[('1', 0.0021602286891866497), ('2', 0.010318801513390045), ('3', 0.032873931623931624), ('5', 0.0875), ('6', 0.011183836042593105), ('7', 0.015078131744798412), ('8', 0.24242424242424243), ('9', 0.060238095238095236), ('10', 0.018857142857142854)]
llmEmbed (0.35181070114959034, 0.02531645569620253, 0.3247909502855802)
[('1', 0.0018977138610412312), ('2', 0.00896156990506245), ('3', 0.046057692307692306), ('5', 0.075), ('6', 0.018457426649517047), ('7', 0.016519115003963487), ('8', 0.27272727272727276), ('9', 0.04047619047619048), ('10', 0.018857142857142854)]
mistral (0.28001888388798374, 0.012658227848101266, 0.247446200