# Define evaluator class

In [12]:
from sentence_transformers import SentenceTransformer  
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity 
import json

class EmbeddingEvaluator:
    """
    Evaluate a sentence-embedding model on question -> answer retrieval.

    For every dataset all answers are pooled into one candidate list, each
    question retrieves its ``k`` most cosine-similar candidates, and MRR,
    Precision@k and Recall@k are averaged over the questions.
    """

    def __init__(self, embedding_model_name, k=5):
        """
        Initialize the evaluator with the specified embedding model and k value for top-k retrieval.
        :param embedding_model_name: Name of the pre-trained embedding model.
        :param k: Number of top passages to retrieve (must be >= 1).
        :raises ValueError: If ``k`` is smaller than 1.
        """
        # Fail fast, before the (potentially slow) model load: k == 0 would
        # later divide by zero and a negative k would slice the wrong columns.
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.embedding_model_name = embedding_model_name
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.k = k
        self.metrics_store = []

    @staticmethod
    def _read_json(file_path):
        """Parse the JSON file at ``file_path`` (read as UTF-8) and return its content."""
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)

    @staticmethod
    def _to_numpy(embeddings):
        """
        Return ``embeddings`` as a NumPy array.

        Objects exposing ``detach`` (torch tensors) are detached and moved to
        the CPU first, because a tensor living on a GPU cannot be converted to
        NumPy implicitly.
        """
        if hasattr(embeddings, "detach"):
            return embeddings.detach().cpu().numpy()
        if isinstance(embeddings, (list, tuple)) and embeddings and hasattr(embeddings[0], "detach"):
            return np.stack([row.detach().cpu().numpy() for row in embeddings])
        return np.asarray(embeddings)

    def load_dataset(self, file_path):
        """
        Load dataset from JSON file and return questions and answers.

        The file is read as a mapping whose values each carry a ``"question"``
        and an ``"answers"`` list. Answers of all entries are concatenated, in
        file order, into one flat candidate list.
        :param file_path: Path to the JSON dataset file.
        :return: Tuple of (questions, answers)
        """
        data = self._read_json(file_path)

        questions = []
        answers = []
        for entry in data.values():
            questions.append(entry["question"])
            answers.extend(entry["answers"])

        return questions, answers

    def generate_embeddings(self, texts):
        """
        Generate embeddings for a list of texts using the pre-trained embedding model.
        :param texts: List of text strings.
        :return: Embeddings as tensors.
        """
        return self.embedding_model.encode(texts, convert_to_tensor=True)

    def retrieve_top_k(self, question_embeddings, answer_embeddings):
        """
        Retrieve the top-k most relevant answers based on cosine similarity.
        :param question_embeddings: Question embeddings (tensor or array-like), one row per question.
        :param answer_embeddings: Answer embeddings (tensor or array-like), one row per answer.
        :return: Array of top-k answer indices for each question, best match first.
        """
        # scikit-learn works on NumPy data, so tensors are converted explicitly.
        cosine_sim = cosine_similarity(
            self._to_numpy(question_embeddings),
            self._to_numpy(answer_embeddings),
        )
        # argsort is ascending: reverse each row, then keep the first k columns.
        top_k_indices = np.argsort(cosine_sim, axis=1)[:, ::-1][:, :self.k]
        return top_k_indices

    def compute_metrics(self, dataset_name, relevant_indices, top_k_indices):
        """
        Compute MRR, Precision@k, and Recall@k for the current dataset.

        The result is also appended to ``self.metrics_store``.
        :param dataset_name: Name or path of the dataset being evaluated.
        :param relevant_indices: List of relevant indices (correct answers) for each question.
        :param top_k_indices: List of top-k retrieved indices for each question.
        :return: Tuple of MRR, Precision@k, and Recall@k.
        """
        mrr = self.mean_reciprocal_rank(relevant_indices, top_k_indices)
        precision = self.precision_at_k(relevant_indices, top_k_indices)
        recall = self.recall_at_k(relevant_indices, top_k_indices)

        # Store the metrics for this dataset
        self.metrics_store.append({
            "Model": self.embedding_model_name,
            "Dataset": dataset_name,
            "MRR": mrr,
            "Precision@k": precision,
            "Recall@k": recall
        })

        return mrr, precision, recall

    def mean_reciprocal_rank(self, relevant_indices, top_k_indices):
        """
        Average of the reciprocal ranks of the first correct answer.

        A question with no correct answer among its retrieved indices
        contributes 0. Returns 0.0 when there are no questions.
        """
        if len(relevant_indices) == 0:
            return 0.0

        mrr_total = 0.0
        for i, relevant in enumerate(relevant_indices):
            relevant_set = set(relevant)
            for rank, retrieved in enumerate(top_k_indices[i]):
                if int(retrieved) in relevant_set:
                    mrr_total += 1.0 / (rank + 1)
                    break
        return mrr_total / len(relevant_indices)

    def precision_at_k(self, relevant_indices, top_k_indices):
        """
        Proportion of retrieved answers in the top k results that are actually correct

        The denominator is always ``self.k``. Returns 0.0 when there are no questions.
        """
        if len(relevant_indices) == 0:
            return 0.0

        precision_total = 0.0
        for i, relevant in enumerate(relevant_indices):
            retrieved_set = {int(idx) for idx in top_k_indices[i][:self.k]}
            precision_total += len(retrieved_set.intersection(relevant)) / self.k
        return precision_total / len(relevant_indices)

    def recall_at_k(self, relevant_indices, top_k_indices):
        """
        Proportion of correct answers retrieved in the top k results

        A question without any relevant answer contributes 0 instead of
        dividing by zero. Returns 0.0 when there are no questions.
        """
        if len(relevant_indices) == 0:
            return 0.0

        recall_total = 0.0
        for i, relevant in enumerate(relevant_indices):
            relevant_set = set(relevant)
            if not relevant_set:
                # Nothing to recall for this question; it still counts in the mean.
                continue
            retrieved_set = {int(idx) for idx in top_k_indices[i][:self.k]}
            recall_total += len(retrieved_set.intersection(relevant_set)) / len(relevant_set)
        return recall_total / len(relevant_indices)

    def evaluate(self, dataset_file_paths):
        """
        Evaluate the embedding model on multiple datasets and compute metrics for each.

        One summary line per dataset is printed; the metrics are also kept in
        ``self.metrics_store`` (see ``get_metrics_summary``).
        :param dataset_file_paths: List of dataset file paths.
        """
        for dataset_file_path in dataset_file_paths:
            questions, answers = self.load_dataset(dataset_file_path)

            # Generate embeddings for questions and answers
            question_embeddings = self.generate_embeddings(questions)
            answer_embeddings = self.generate_embeddings(answers)

            # Retrieve the top-k results for each question
            top_k_indices = self.retrieve_top_k(question_embeddings, answer_embeddings)

            # Positions of each question's own answers inside the pooled answer list
            relevant_indices = self.get_relevant_indices(dataset_file_path, answers)

            # Compute, store and report the evaluation metrics for the current dataset
            mrr, precision, recall = self.compute_metrics(dataset_file_path, relevant_indices, top_k_indices)
            print(f"Dataset: {dataset_file_path}, MRR: {mrr:.3f}, Precision@{self.k}: {precision:.3f}, Recall@{self.k}: {recall:.3f}")

    def get_relevant_indices(self, dataset_file_path, answers):
        """
        Get the relevant indices (correct answers) for each question.

        Each answer text is mapped to the position of its first occurrence in
        ``answers``, so identical answer strings share one index.
        :param dataset_file_path: Path to the dataset file.
        :param answers: List of all possible answers.
        :return: List of relevant indices for each question.
        :raises ValueError: If an answer of the dataset is missing from ``answers``.
        """
        data = self._read_json(dataset_file_path)

        # One pass over the pool instead of a linear ``list.index`` scan per answer.
        first_position = {}
        for position, answer in enumerate(answers):
            first_position.setdefault(answer, position)

        relevant_indices = []
        for entry in data.values():
            try:
                relevant = [first_position[ans] for ans in entry["answers"]]
            except KeyError as missing:
                # Same exception type ``list.index`` raises for an unknown value.
                raise ValueError(f"{missing.args[0]!r} is not in list") from None
            relevant_indices.append(relevant)

        return relevant_indices

    def get_metrics_summary(self):
        """
        Retrieve the stored metrics across all evaluated datasets.
        :return: List of metric dictionaries for each dataset.
        """
        return self.metrics_store


In [16]:
# Evaluator usage function
def evaluate_model(model_name, datasets, k=5):
    """
    Evaluate one embedding model on every dataset and return its metrics.
    :param model_name: Name of the pre-trained embedding model to load.
    :param datasets: List of dataset file paths to evaluate on.
    :param k: Number of top answers retrieved per question (defaults to 5).
    :return: List of metric dictionaries, one per evaluated dataset.
    """
    # Initialize the evaluator with the requested pre-trained model
    evaluator = EmbeddingEvaluator(model_name, k=k)

    # Evaluate the model on all datasets (prints one summary line per dataset)
    evaluator.evaluate(datasets)

    # Retrieve stored metrics
    return evaluator.get_metrics_summary()


# Evaluate embedding models

In [10]:
# Dataset files passed to evaluate_model for each embedding model.
datasets = [
    "bioasq_test_set.json",
    "pubmedqa_test_set.json",
]

## General model

In [17]:
# Run the evaluation for the BAAI/bge-base-en-v1.5 model on `datasets`.
metrics_general = evaluate_model(
    model_name="BAAI/bge-base-en-v1.5",
    datasets=datasets,
)

Dataset: bioasq_test_set.json, MRR: 0.928, Precision@5: 0.722, Recall@5: 0.549
Dataset: pubmedqa_test_set.json, MRR: 0.925, Precision@5: 0.192, Recall@5: 0.958


## PubMed model

In [18]:
# Run the evaluation for the neuml/pubmedbert-base-embeddings model on `datasets`.
metrics_pubmed = evaluate_model(
    model_name="neuml/pubmedbert-base-embeddings",
    datasets=datasets,
)

Dataset: bioasq_test_set.json, MRR: 0.809, Precision@5: 0.590, Recall@5: 0.420
Dataset: pubmedqa_test_set.json, MRR: 0.936, Precision@5: 0.195, Recall@5: 0.973
