# Define evaluator class

In [8]:
from sentence_transformers import SentenceTransformer
from embeddings_training import ContrastiveAutoencoder
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity 
import json
import torch

class EmbeddingEvaluator:
    """Evaluate an embedding model on question/answer retrieval datasets.

    For every dataset the evaluator embeds all questions and all answers,
    ranks the answers for each question by cosine similarity, and reports
    MRR, Precision@k and Recall@k. The results of every evaluated dataset are
    accumulated in ``metrics_store``.
    """

    _MODEL_TYPES = ("custom", "pretrained")

    def __init__(self, model_type, model_path=None, embedding_model_name=None, k=5):
        """
        Initialize the evaluator with the specified model type.
        :param model_type: Either 'custom' for a trained model or 'pretrained' for a SentenceTransformer model.
        :param model_path: Path to the trained model (for custom model).
        :param embedding_model_name: Name of the pre-trained SentenceTransformer model (for pretrained model).
        :param k: Number of top passages to retrieve.
        :raises ValueError: If model_type is not one of the supported values or k is smaller than 1.
        """
        # Fail early with a clear message instead of hitting an AttributeError
        # on `self.model` further down.
        if model_type not in self._MODEL_TYPES:
            raise ValueError(
                f"Unknown model_type {model_type!r}; expected one of {self._MODEL_TYPES}"
            )
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k!r}")

        self.model_type = model_type
        self.k = k
        self.metrics_store = []
        # Always defined so that reporting code never trips over a missing attribute.
        self.embedding_model_name = embedding_model_name

        # Load the appropriate model based on model type
        if model_type == "custom":
            self.model = ContrastiveAutoencoder.from_pretrained(model_path)  # Load custom model
        else:
            self.model = SentenceTransformer(embedding_model_name)  # Load SentenceTransformer model

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)

    @staticmethod
    def _read_entries(file_path):
        """Return the entries (the values of the top-level JSON object) in file order."""
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return list(data.values())

    @staticmethod
    def _to_numpy(embeddings):
        """Return embeddings as a NumPy array, whatever device or container they came in."""
        if isinstance(embeddings, torch.Tensor):
            # detach() drops any autograd graph, cpu() is a no-op for CPU tensors.
            return embeddings.detach().cpu().numpy()
        return np.asarray(embeddings)

    def load_dataset(self, file_path):
        """
        Load dataset from JSON file and return questions and answers.

        The file is expected to hold a JSON object whose values each carry a
        "question" string and an "answers" list. The answers of all entries are
        concatenated into one flat candidate pool.
        :param file_path: Path to the JSON dataset file.
        :return: Tuple of (questions, answers)
        """
        questions = []
        answers = []

        for entry in self._read_entries(file_path):
            questions.append(entry["question"])
            answers.extend(entry["answers"])

        return questions, answers

    def generate_embeddings(self, texts):
        """
        Generate embeddings for a list of texts using the pre-trained embedding model or custom model.
        :param texts: List of text strings.
        :return: Whatever the model's ``encode`` returns; a tensor for the pretrained model
                 (``convert_to_tensor=True``). The custom model's return type is not visible
                 here -- presumably a tensor or array; verify against ContrastiveAutoencoder.
        :raises ValueError: If the evaluator holds an unsupported model type.
        """
        if self.model_type == "custom":
            return self.model.encode(texts)  # Custom model's encode method
        if self.model_type == "pretrained":
            return self.model.encode(texts, convert_to_tensor=True)  # SentenceTransformer's encode method
        raise ValueError(f"Unknown model_type {self.model_type!r}")

    def retrieve_top_k(self, question_embeddings, answer_embeddings):
        """
        Retrieve the top-k most relevant answers based on cosine similarity.
        :param question_embeddings: Question embeddings (tensor on any device, or array-like).
        :param answer_embeddings: Answer embeddings (tensor on any device, or array-like).
        :return: Array of top-k answer indices for each question, best match first.
        """
        # scikit-learn works on NumPy data, so normalise both inputs regardless
        # of whether they are GPU tensors, CPU tensors or already arrays.
        question_embeddings = self._to_numpy(question_embeddings)
        answer_embeddings = self._to_numpy(answer_embeddings)

        cosine_sim = cosine_similarity(question_embeddings, answer_embeddings)
        # argsort is ascending; reverse each row to get the most similar first.
        top_k_indices = np.argsort(cosine_sim, axis=1)[:, ::-1][:, :self.k]
        return top_k_indices

    def compute_metrics(self, dataset_name, relevant_indices, top_k_indices):
        """
        Compute MRR, Precision@k, and Recall@k for the current dataset.

        The result is also appended to ``metrics_store``.
        :param dataset_name: Name or path of the dataset being evaluated.
        :param relevant_indices: List of relevant indices (correct answers) for each question.
        :param top_k_indices: List of top-k retrieved indices for each question.
        :return: Tuple of MRR, Precision@k, and Recall@k.
        """
        mrr = self.mean_reciprocal_rank(relevant_indices, top_k_indices)
        precision = self.precision_at_k(relevant_indices, top_k_indices)
        recall = self.recall_at_k(relevant_indices, top_k_indices)

        # Store the metrics for this dataset
        self.metrics_store.append({
            "Model": self.model_type if self.model_type == "custom" else self.embedding_model_name,
            "Dataset": dataset_name,
            "MRR": mrr,
            "Precision@k": precision,
            "Recall@k": recall
        })

        return mrr, precision, recall

    def mean_reciprocal_rank(self, relevant_indices, top_k_indices):
        """
        Mean over questions of 1/rank of the first relevant retrieved answer.

        Questions without a relevant answer among the retrieved ones contribute 0.
        :return: The MRR, or 0.0 when there are no questions.
        """
        if len(relevant_indices) == 0:
            return 0.0

        mrr_total = 0.0
        for i, relevant in enumerate(relevant_indices):
            relevant_set = set(relevant)
            for rank, retrieved in enumerate(top_k_indices[i]):
                if retrieved in relevant_set:
                    mrr_total += 1.0 / (rank + 1)
                    break
        return mrr_total / len(relevant_indices)

    def precision_at_k(self, relevant_indices, top_k_indices):
        """
        Mean over questions of (relevant answers among the top k) / k.
        :return: Precision@k, or 0.0 when there are no questions.
        """
        if len(relevant_indices) == 0:
            return 0.0

        precision_total = 0.0
        for i, relevant in enumerate(relevant_indices):
            retrieved_set = set(top_k_indices[i][:self.k])
            relevant_set = set(relevant)
            precision_total += len(retrieved_set.intersection(relevant_set)) / self.k
        return precision_total / len(relevant_indices)

    def recall_at_k(self, relevant_indices, top_k_indices):
        """
        Mean over questions of (relevant answers among the top k) / (all relevant answers).

        A question without any relevant answer contributes 0 instead of
        raising ZeroDivisionError.
        :return: Recall@k, or 0.0 when there are no questions.
        """
        if len(relevant_indices) == 0:
            return 0.0

        recall_total = 0.0
        for i, relevant in enumerate(relevant_indices):
            relevant_set = set(relevant)
            if not relevant_set:
                continue
            retrieved_set = set(top_k_indices[i][:self.k])
            recall_total += len(retrieved_set.intersection(relevant_set)) / len(relevant_set)
        return recall_total / len(relevant_indices)

    def evaluate(self, dataset_file_paths):
        """
        Evaluate the embedding model on multiple datasets and compute metrics for each.

        One summary line per dataset is printed; the metrics are also kept in
        ``metrics_store``.
        :param dataset_file_paths: List of dataset file paths.
        """
        for dataset_file_path in dataset_file_paths:
            questions, answers = self.load_dataset(dataset_file_path)

            # Generate embeddings for questions and answers
            question_embeddings = self.generate_embeddings(questions)
            answer_embeddings = self.generate_embeddings(answers)

            # Retrieve the top-k results for each question
            top_k_indices = self.retrieve_top_k(question_embeddings, answer_embeddings)

            # Map every question to the pool indices of its own answers
            relevant_indices = self.get_relevant_indices(dataset_file_path, answers)

            # Compute, store and report the evaluation metrics for the current dataset
            mrr, precision, recall = self.compute_metrics(dataset_file_path, relevant_indices, top_k_indices)
            print(f"Dataset: {dataset_file_path}, MRR: {mrr:.3f}, Precision@{self.k}: {precision:.3f}, Recall@{self.k}: {recall:.3f}")

    def get_relevant_indices(self, dataset_file_path, answers):
        """
        Get the relevant indices (correct answers) for each question.
        :param dataset_file_path: Path to the dataset file.
        :param answers: List of all possible answers (assumed to be hashable, i.e. strings).
        :return: List of relevant indices for each question.
        :raises ValueError: If an answer of the dataset is missing from ``answers``.
        """
        # One pass to build the lookup instead of a linear `list.index` scan per
        # answer. setdefault keeps the FIRST position of a text, exactly what
        # `list.index` would return.
        # NOTE(review): when the same answer text occurs several times in the
        # pool, only its first index counts as relevant, so a retrieved
        # duplicate is scored as a miss -- confirm this is intended.
        first_position = {}
        for position, answer in enumerate(answers):
            first_position.setdefault(answer, position)

        relevant_indices = []
        for entry in self._read_entries(dataset_file_path):
            try:
                relevant = [first_position[ans] for ans in entry["answers"]]
            except KeyError as err:
                # Keep the exception type `list.index` raised for a missing answer.
                raise ValueError(f"{err.args[0]!r} is not in list") from None
            relevant_indices.append(relevant)

        return relevant_indices

    def get_metrics_summary(self):
        """
        Retrieve the stored metrics across all evaluated datasets.
        :return: List of metric dictionaries for each dataset.
        """
        return self.metrics_store


In [2]:
# Evaluator usage function
def evaluate_model(datasets, model_type, model_path=None, embedding_model_name=None, k=5):
    """
    Build an EmbeddingEvaluator, run it over all datasets and return its metrics.
    :param datasets: List of dataset file paths to evaluate on.
    :param model_type: Model type forwarded to EmbeddingEvaluator ('custom' or 'pretrained').
    :param model_path: Path to the trained model (for custom model).
    :param embedding_model_name: Name of the pre-trained model (for pretrained model).
    :param k: Number of top passages to retrieve.
    :return: The evaluator's stored metrics, one dictionary per dataset.
    """
    runner = EmbeddingEvaluator(
        model_type,
        model_path=model_path,
        embedding_model_name=embedding_model_name,
        k=k,
    )

    # Evaluating prints one summary line per dataset and fills the metric store.
    runner.evaluate(datasets)

    return runner.get_metrics_summary()


# Evaluate embedding models

In [3]:
# Test sets every model below is evaluated on.
datasets = [
    "/home/mcorradi/researchdrive/BetterEmbedData/bioasq_test_set.json",
    "/home/mcorradi/researchdrive/BetterEmbedData/pubmedqa_test_set.json",
]

## General model

In [4]:
# General-purpose pretrained embedding model.
metrics_general = evaluate_model(
    datasets,
    model_type="pretrained",
    embedding_model_name="BAAI/bge-base-en-v1.5",
)

Dataset: /home/mcorradi/researchdrive/BetterEmbedData/bioasq_test_set.json, MRR: 0.932, Precision@5: 0.721, Recall@5: 0.549
Dataset: /home/mcorradi/researchdrive/BetterEmbedData/pubmedqa_test_set.json, MRR: 0.925, Precision@5: 0.192, Recall@5: 0.958


## PubMed model

In [None]:
# Pretrained PubMed embedding model.
metrics_pubmed = evaluate_model(
    datasets,
    model_type="pretrained",
    embedding_model_name="neuml/pubmedbert-base-embeddings",
)

## BetterEmbed model

In [9]:
# Custom model loaded from the local 'bioqa_contrastive_2' checkpoint.
metrics_better = evaluate_model(
    datasets,
    model_type="custom",
    model_path="bioqa_contrastive_2",
)

Some weights of the model checkpoint at bioqa_contrastive_2 were not used when initializing ContrastiveAutoencoder: ['embedding_model.0.auto_model.embeddings.LayerNorm.bias', 'embedding_model.0.auto_model.embeddings.LayerNorm.weight', 'embedding_model.0.auto_model.embeddings.position_embeddings.weight', 'embedding_model.0.auto_model.embeddings.token_type_embeddings.weight', 'embedding_model.0.auto_model.embeddings.word_embeddings.weight', 'embedding_model.0.auto_model.encoder.layer.0.attention.output.LayerNorm.bias', 'embedding_model.0.auto_model.encoder.layer.0.attention.output.LayerNorm.weight', 'embedding_model.0.auto_model.encoder.layer.0.attention.output.dense.bias', 'embedding_model.0.auto_model.encoder.layer.0.attention.output.dense.weight', 'embedding_model.0.auto_model.encoder.layer.0.attention.self.key.bias', 'embedding_model.0.auto_model.encoder.layer.0.attention.self.key.weight', 'embedding_model.0.auto_model.encoder.layer.0.attention.self.query.bias', 'embedding_model.0.au

Dataset: /home/mcorradi/researchdrive/BetterEmbedData/bioasq_test_set.json, MRR: 0.424, Precision@5: 0.254, Recall@5: 0.141
Dataset: /home/mcorradi/researchdrive/BetterEmbedData/pubmedqa_test_set.json, MRR: 0.475, Precision@5: 0.121, Recall@5: 0.606


In [None]:
# Load the custom checkpoint directly, outside of the evaluator.
model = ContrastiveAutoencoder.from_pretrained(
    "bioqa_contrastive_2"
)