# Install libs

## General libs

In [None]:
!pip install --user ir_datasets
!pip install joblib
!pip install dill

## Text processing libs

In [None]:
!pip install nltk

import nltk
nltk.download()

!pip install contractions

## TF-IDF libs

In [None]:
!pip install scikit-learn
!pip install numpy

## Embedding libs

In [None]:
!pip install chromadb
!pip install sentence_transformers
!pip install torch

## BM25 libs

In [None]:
!pip install rank_bm25

## Plotting libs

In [None]:
!pip install --user matplotlib

# Evaluation

## Loader

In [None]:
# Dataset configurations
# Registry of the supported corpora, keyed by the short name the loader
# functions accept. Each entry carries:
#   name                - label for the corpus (not read by the loaders in this notebook)
#   description         - human-readable summary
#   ir_datasets_id      - id passed to ir_datasets.load() to obtain the documents
#   ir_datasets_test_id - id passed to ir_datasets.load() to obtain queries and qrels
DATASETS = {
    "antique": {
        "name": "antique",
        "description": "Question-answer dataset with natural questions from real users",
        "ir_datasets_id": "antique",
        "ir_datasets_test_id": "antique/test/non-offensive"
    },
    "quora": {
        "name": "beir/quora",
        "description": "Quora question pairs dataset from the BEIR benchmark",
        "ir_datasets_id": "beir/quora",
        "ir_datasets_test_id": "beir/quora/dev"
    },
    "webis": {
        "name": "beir/webis-touche2020/v2",
        "description": "Webis Touché 2020 (v2) dataset from the BEIR benchmark",
        "ir_datasets_id": "beir/webis-touche2020/v2",
        "ir_datasets_test_id": "beir/webis-touche2020/v2"
    },
    # NOTE(review): documents are taken from the "dev" split but queries/qrels
    # from the "test" split. Confirm both splits share one corpus; otherwise the
    # qrels will reference doc_ids that are not among the loaded documents.
    "recreation": {
        "name": "lotte/recreation/dev",
        "description": "LOTTE Recreation domain, development split",
        "ir_datasets_id": "lotte/recreation/dev/forum",
        "ir_datasets_test_id": "lotte/recreation/test/forum"
    },
    "wikir": {
        "name": "wikir/en1k",
        "description": "Wiki-Retrieval English 1K dataset",
        "ir_datasets_id": "wikir/en1k",
        "ir_datasets_test_id": "wikir/en1k/test"
    },
    # NOTE(review): verify that the "/test" id below is actually registered in
    # ir_datasets before relying on this entry.
    "clinical": {
        "name": "clinicaltrials/2021/trec-ct-2021",
        "description": "ClinicalTrials TREC-CT 2021 dataset",
        "ir_datasets_id": "clinicaltrials/2021/trec-ct-2021",
        "ir_datasets_test_id": "clinicaltrials/2021/trec-ct-2021/test"
    }
}

# Default dataset to use if none specified
# NOTE(review): no loader shown in this notebook reads this constant.
DEFAULT_DATASET = "antique"

In [None]:
import ir_datasets

from typing import TypeAlias

from collections import namedtuple

# Record shapes that appear in the loaders' type hints in this notebook.
# The loaders return whatever records ir_datasets yields.
# NOTE(review): presumably these fields mirror the ir_datasets records of the
# configured corpora - confirm per dataset.
Doc = namedtuple('Doc', ['doc_id', 'text'])
Query = namedtuple('Query', ['query_id', 'text'])
Qrel = namedtuple('Qrel', ['query_id', 'doc_id', 'relevance', 'iteration'])

def load_dataset(name: str) -> list[Doc]:
    """Materialise every document of the corpus registered under ``name``.

    Args:
        name: Key into ``DATASETS``.

    Returns:
        All records produced by the corpus' ``docs_iter()``, as a list.

    Raises:
        KeyError: If ``name`` is not a key of ``DATASETS``.
    """
    corpus_id = DATASETS[name]['ir_datasets_id']
    return list(ir_datasets.load(corpus_id).docs_iter())

def load_queries_and_qrels(name: str) -> tuple[list[Query], list[Qrel]]:
    """Load the evaluation queries and relevance judgements for ``name``.

    Args:
        name: Key into ``DATASETS``; its ``ir_datasets_test_id`` is loaded.

    Returns:
        ``(queries, qrels)``, each fully materialised as a list.

    Raises:
        KeyError: If ``name`` is not a key of ``DATASETS``.
    """
    evaluation_split = ir_datasets.load(DATASETS[name]['ir_datasets_test_id'])
    return list(evaluation_split.queries_iter()), list(evaluation_split.qrels_iter())

def load_dataset_with_queries(name: str) -> tuple[list[Doc], list[Query], list[Qrel]]:
    """Load documents, queries and relevance judgements for ``name`` in one call.

    Delegates to ``load_dataset`` and ``load_queries_and_qrels`` so the three
    loaders cannot drift apart.

    Args:
        name: Key into ``DATASETS``.

    Returns:
        ``(docs, queries, qrels)``, each fully materialised as a list.

    Raises:
        KeyError: If ``name`` is not a key of ``DATASETS``.
    """
    if name not in DATASETS:
        # Same exception type as the plain dict lookup, but with a usable message.
        raise KeyError(f"Unknown dataset {name!r}; expected one of {sorted(DATASETS)}")

    docs = load_dataset(name)
    queries, qrels = load_queries_and_qrels(name)
    return docs, queries, qrels

## Text Processing

In [None]:
import re
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
import contractions

class TextPreprocessor:
    """Shared text normaliser used for both documents and queries.

    Pipeline applied by :meth:`preprocess_text`: expand contractions,
    lowercase, strip everything except ASCII letters and whitespace,
    POS-aware lemmatisation, stopword removal, tokenisation.

    NOTE(review): the persisted indexes loaded elsewhere in this notebook were
    presumably built with this exact pipeline; changing a step or its order
    would require rebuilding them - confirm before editing.
    """

    # Shared across all instances; building them is comparatively expensive.
    __lemmatizer__ = WordNetLemmatizer()
    __stop_words__ = set(stopwords.words('english'))
    __instance__ = None

    @staticmethod
    def getInstance():
        """Return the process-wide instance, creating it on first use."""
        if TextPreprocessor.__instance__ is None:
            TextPreprocessor.__instance__ = TextPreprocessor()
        return TextPreprocessor.__instance__

    def __clean_text__(self, text):
        """
        Expand contractions, lowercase, and drop every character that is not
        an ASCII letter or whitespace (digits and punctuation are removed,
        not replaced by a space).
        """
        # Expand contractions (e.g. "don't" -> "do not")
        text = contractions.fix(text)

        # Convert to lowercase
        text = text.lower()

        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        return text

    def __remove_stopwords__(self, text):
        """
        Remove English stopwords from text and return the remaining words
        joined by single spaces.
        """
        words = word_tokenize(text)
        filtered_words = [word for word in words if word not in self.__stop_words__]
        return ' '.join(filtered_words)

    def __get_wordnet_pos__(self, tag_parameter):
        """
        Map a POS tag to the WordNet POS constant, keyed on the tag's first
        letter; anything unrecognised is treated as a noun.
        """
        tag = tag_parameter[0].upper()
        tag_dict = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV
            }
        return tag_dict.get(tag, wordnet.NOUN)

    def __lemmatize_text__(self, text):
        """
        Lemmatize every word using its POS tag and return the lemmas joined
        by single spaces.
        """
        # Tokenize into words
        words = word_tokenize(text)

        # POS tagging
        pos_tags = pos_tag(words)

        lemmatized_words = [self.__lemmatizer__.lemmatize(word, pos=self.__get_wordnet_pos__(tag)) for word, tag in pos_tags]

        return ' '.join(lemmatized_words)

    def preprocess_text(self, text):
        """
        Apply the full preprocessing pipeline.

        Args:
            text: Raw text. Anything that is not a ``str`` yields no tokens.

        Returns:
            list[str]: The normalised tokens (possibly empty).
        """
        if not isinstance(text, str):
            # Keep the return type uniform: callers always receive a token list.
            return []

        # Apply all preprocessing steps
        text = self.__clean_text__(text)
        text = self.__lemmatize_text__(text)
        text = self.__remove_stopwords__(text)

        return word_tokenize(text.strip())


## Inverted Index

In [None]:
from collections import defaultdict

class InvertedIndex:
    """In-memory inverted index: term -> set of ids of documents containing it.

    Attributes:
        index: ``defaultdict(set)`` mapping each term to its posting set.
        doc_lengths: ``defaultdict(int)`` mapping doc_id to its token count.
        N: Number of distinct documents indexed.
    """

    def __init__(self):
        self.index = defaultdict(set)        # term -> {doc_id}
        self.doc_lengths = defaultdict(int)  # doc_id -> total terms
        self.N = 0                           # total documents

    def add_document(self, doc_id, tokens):
        """Index ``tokens`` under ``doc_id``.

        Adding an id that is already present replaces the previous entry: the
        old postings are dropped and ``N`` is not incremented again.

        Args:
            doc_id: Hashable document identifier.
            tokens: Sized collection of terms (``len(tokens)`` is recorded).
        """
        if doc_id in self.doc_lengths:
            # Re-indexing: purge stale postings so removed terms stop matching.
            for postings in self.index.values():
                postings.discard(doc_id)
        else:
            self.N += 1

        self.doc_lengths[doc_id] = len(tokens)
        for token in tokens:
            self.index[token].add(doc_id)

    def get_documents_sharing_terms_with_query(self, query_tokens):
        """
        Returns a list of the distinct doc_ids that share at least one term
        with the query. The order of the list is unspecified; the list is
        empty when no term matches.
        """
        related_docs = set()

        for token in query_tokens:
            # .get() avoids creating empty postings for unseen terms.
            related_docs.update(self.index.get(token, ()))

        return list(related_docs)


## Retrievers

In [None]:
import math

def calc_dcg(relevance, rank):
    """Discounted gain of one result: ``(2**relevance - 1) / log2(rank + 1)``.

    Uses the conventional base-2 discount, so a result at rank 1 contributes
    its full gain. The nDCG ratio does not depend on the base, but the raw
    DCG/iDCG values do.

    Args:
        relevance: Graded relevance of the result.
        rank: 1-based position of the result in the ranking.

    Returns:
        float: The discounted gain.

    Raises:
        ValueError: If ``rank`` is smaller than 1.
    """
    if rank < 1:
        raise ValueError(f"rank must be >= 1, got {rank}")
    return ((2 ** relevance) - 1) / math.log2(rank + 1)

class Retriever:
    """Common interface of the retrieval back-ends plus shared ranking metrics.

    Sub-classes implement :meth:`search`. The ``evaluate*`` methods rely only
    on the first element of every result tuple being the document id. All
    metrics are reported as percentages (0-100) averaged over every query in
    ``queries``; a query with no relevant hit (or no judgements) scores 0, and
    an empty ``queries`` list yields 0.
    """

    def search(self, dataset_name: str, query: str, top_k: int = 10, with_index: bool = True) -> list[tuple[str, float, str]]:
        """Return up to ``top_k`` ``(doc_id, score, text)`` tuples, best first."""
        raise NotImplementedError()

    @staticmethod
    def _relevance_by_query(qrels):
        """Index judgements as ``{query_id: {doc_id: relevance}}`` (last one wins)."""
        lookup = {}
        for qrel in qrels:
            lookup.setdefault(qrel.query_id, {})[qrel.doc_id] = qrel.relevance
        return lookup

    def evaluateNDCG(self, dataset_name, queries, qrels, docs, K = 10, print_more = False):
        """Mean nDCG@K over ``queries``, as a percentage.

        Args:
            dataset_name: Forwarded to :meth:`search`.
            queries: Records with ``query_id`` and ``text``.
            qrels: Records with ``query_id``, ``doc_id`` and ``relevance``.
            docs: Corpus records; only read when ``print_more`` is set.
            K: Cut-off for both the ranking and the ideal ranking.
            print_more: Print per-query diagnostics.

        Returns:
            float: Mean nDCG@K times 100; queries whose ideal DCG is 0 count as 0.
        """
        # Group the judgements once instead of rescanning them for every query.
        qrels_by_query = {}
        for qrel in qrels:
            qrels_by_query.setdefault(qrel.query_id, []).append(qrel)

        if print_more:
            preprocess_text = TextPreprocessor.getInstance().preprocess_text
            docs_by_id = {}
            for doc in docs:
                docs_by_id.setdefault(doc.doc_id, doc)

        nDCG = []

        for query_number, query in enumerate(queries, start=1):
            if print_more:
                print(f"Query: {query.text}")
                print(f"Query: {preprocess_text(query.text)}")

            # Search; never score more than K results.
            results = self.search(dataset_name, query.text, K, True)[:K]
            if print_more:
                for position, res in enumerate(results):
                    print(f"Result #{position} {res[1]}: {res[2]}")
                    print(f"Result #{position} {res[1]}: {preprocess_text(res[2])}")

            # Judgements for this query, most relevant first.
            relevant_qrels = sorted(qrels_by_query.get(query.query_id, []), key=lambda x: x.relevance, reverse=True)
            if print_more:
                for position, qrel in enumerate(relevant_qrels[:K]):
                    doc = docs_by_id.get(qrel.doc_id)
                    if doc is None:
                        continue  # judged document is not part of the loaded corpus
                    print(f"Qrel #{position} {qrel.relevance}: {doc.text}")
                    print(f"Qrel #{position} {qrel.relevance}: {preprocess_text(doc.text)}")

            # doc_id -> gain; with duplicate judgements the highest relevance wins.
            gain_of = {}
            for qrel in relevant_qrels:
                gain_of.setdefault(qrel.doc_id, qrel.relevance)

            res = sum(calc_dcg(gain_of.get(doc[0], 0), rank) for rank, doc in enumerate(results, start=1))
            ires = sum(calc_dcg(qrel.relevance, rank) for rank, qrel in enumerate(relevant_qrels[:K], start=1))

            # No achievable gain (no positive judgements): score 0 instead of dividing by zero.
            query_ndcg = res / ires if ires > 0 else 0.0

            if print_more:
                print("")
                print(f"query: {query_number}/{len(queries)}")
                print(f"DCG: {res}")
                print(f"iDCG: {ires}")
                print(f"nDCG: {query_ndcg*100}%")
            nDCG.append(query_ndcg)
            if print_more:
                print(f"Average nDCG: {sum(nDCG)/len(nDCG)*100}%")

        nDCG = sum(nDCG)/len(nDCG)*100 if nDCG else 0.0

        if print_more:
            print(f"Final Average nDCG: {nDCG}%")

        return nDCG

    def evaluateMRR(self, dataset_name, queries, qrels, K = 100, print_more = False):
        """Mean reciprocal rank of the first relevant result, as a percentage.

        A result is relevant when its judged relevance is greater than 0.

        Args:
            dataset_name: Forwarded to :meth:`search`.
            queries: Records with ``query_id`` and ``text``.
            qrels: Records with ``query_id``, ``doc_id`` and ``relevance``.
            K: Number of results requested per query.
            print_more: Print the running mean after every query.

        Returns:
            float: MRR times 100; queries without a relevant hit contribute 0.
        """
        cleaned_qrels = self._relevance_by_query(qrels)

        MRR = []

        for query_number, query in enumerate(queries, start=1):
            results = self.search(dataset_name, query.text, K, True)
            judged = cleaned_qrels.get(query.query_id, {})

            reciprocal_rank = 0.0
            for rank, res in enumerate(results, start=1):
                if judged.get(res[0], 0) > 0:
                    reciprocal_rank = 1 / rank
                    break  # only the FIRST relevant result counts

            MRR.append(reciprocal_rank)

            if print_more:
                print()
                print(f"Query: {query_number}/{len(queries)}")
                print(f"Current MRR: {sum(MRR) / len(MRR) * 100}")

        MRR = sum(MRR) / len(MRR) * 100 if MRR else 0.0
        if print_more:
            print(f"MRR: {MRR}%")
        return MRR

    def evaluateMAP(self, dataset_name, queries, qrels,docs, K = 10, print_more = False):
        """Mean average precision over the top ``K`` results, as a percentage.

        Per query, precision is accumulated at every relevant hit (judged
        relevance > 0) and divided by the number of relevant hits retrieved.

        Args:
            dataset_name: Forwarded to :meth:`search`.
            queries: Records with ``query_id`` and ``text``.
            qrels: Records with ``query_id``, ``doc_id`` and ``relevance``.
            docs: Corpus records; only read when ``print_more`` is set.
            K: Number of results requested per query.
            print_more: Print per-query diagnostics.

        Returns:
            float: MAP times 100; queries without a relevant hit contribute 0.
        """
        cleaned_qrels = self._relevance_by_query(qrels)

        MAP = []

        for query_number, query in enumerate(queries, start=1):
            if print_more:
                print()
                print(f'Query: {query_number}/{len(queries)}')
                print(query.text)

            results = self.search(dataset_name, query.text, K, True)
            judged = cleaned_qrels.get(query.query_id, {})

            if print_more:
                result_ids = {res[0] for res in results}
                print([res[0] for res in results])
                print([f"{doc_id}: {relevance}" for doc_id, relevance in judged.items()])
                print("results")
                for doc in docs:
                    if doc.doc_id in result_ids:
                        print(f"{doc.doc_id} {doc.text}")
                print("qrels")
                for doc in docs:
                    if doc.doc_id in judged:
                        print(f"{doc.doc_id} {doc.text}")

            relevant_num = 0
            precision_sum = 0
            for rank, res in enumerate(results, start=1):
                if judged.get(res[0], 0) > 0:
                    relevant_num += 1
                    precision_sum += relevant_num / rank
            if print_more:
                print(precision_sum)

            # Every query counts: dropping the misses would inflate the mean.
            MAP.append(precision_sum / relevant_num if relevant_num > 0 else 0.0)

            if print_more:
                print(f'MAP = {sum(MAP) / len(MAP) * 100}')

        MAP = sum(MAP) / len(MAP) * 100 if MAP else 0
        if print_more:
            print(f'MAP={MAP}%')
        return MAP


In [None]:
import time
import dill
import joblib
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class TFIDF_online(Retriever):
    """TF-IDF retriever ranking documents by cosine similarity to the query.

    The fitted vectorizer, the document matrix and the inverted index are read
    from ``../data/<dataset_name>/`` on first use and cached per dataset at
    class level.

    NOTE: joblib and dill unpickle arbitrary Python objects; only load
    artefacts that were produced by a trusted pipeline.
    """

    __tfidfInstance__ : dict[str, list] = {}              # dataset -> [docs, vectorizer, matrix]
    __invertedIndex__ : dict[str, InvertedIndex] = {}     # dataset -> inverted index

    @staticmethod
    def __loadInstance__(dataset_name : str):
        """Load (once) and return ``[docs, vectorizer, docs_tfidf_matrix]``."""
        if dataset_name not in TFIDF_online.__tfidfInstance__:

            # Load the model and the documents
            docs = load_dataset(dataset_name)
            vectorizer = joblib.load(f"../data/{dataset_name}/tfidf_vectorizer.joblib")
            docs_tfidf_matrix = joblib.load(f"../data/{dataset_name}/tfidf_matrix.joblib")

            TFIDF_online.__tfidfInstance__[dataset_name] = [docs,vectorizer,docs_tfidf_matrix]
        return TFIDF_online.__tfidfInstance__[dataset_name]

    @staticmethod
    def __loadInvertedIndex__(dataset_name : str):
        """Load (once) and return the inverted index of ``dataset_name``."""
        if dataset_name not in TFIDF_online.__invertedIndex__:
            with open(f"../data/{dataset_name}/inverted_index.dill", "rb") as f:
                # Copy the stored state into an instance of the class defined
                # in this notebook.
                inverted_index = InvertedIndex()
                ii = dill.load(f)
                inverted_index.index = ii.index
                inverted_index.doc_lengths = ii.doc_lengths
                inverted_index.N = ii.N
                TFIDF_online.__invertedIndex__[dataset_name] = inverted_index
        return TFIDF_online.__invertedIndex__[dataset_name]

    def search(self, dataset_name, query, top_k = 10, with_index = True):
        """Return the ``top_k`` documents most similar to ``query``.

        Args:
            dataset_name: Key into ``DATASETS``.
            query: Raw query text; it is handed to the fitted vectorizer as is.
            top_k: Maximum number of results.
            with_index: Score only documents that share a term with the
                query (looked up in the inverted index) instead of the
                whole corpus.

        Returns:
            list of ``(doc_id, cosine_similarity, snippet)`` tuples, best
            first; empty when the inverted index yields no candidate.
        """
        docs, vectorizer, docs_tfidf_matrix = TFIDF_online.__loadInstance__(dataset_name)

        query_vec = vectorizer.transform([query])

        candidate_indices = None
        if with_index:
            # The index is only read (and loaded) on this path.
            inverted_index = TFIDF_online.__loadInvertedIndex__(dataset_name)
            tokenized_query = TextPreprocessor.getInstance().preprocess_text(query)
            # NOTE(review): the ids stored in the index are used as row numbers
            # of the TF-IDF matrix and as positions in ``docs`` - confirm the
            # offline build stores positions rather than corpus doc_ids.
            candidate_indices = inverted_index.get_documents_sharing_terms_with_query(tokenized_query)
            if not candidate_indices:
                # cosine_similarity rejects a matrix with zero rows.
                return []
            docs_tfidf_matrix = docs_tfidf_matrix[candidate_indices]

        cosine_sim = cosine_similarity(query_vec, docs_tfidf_matrix).flatten()

        ranked_indices = np.argsort(cosine_sim)[::-1]

        # Prepare structured results
        results = []
        for i in ranked_indices[:top_k]:
            # Map the position in the (possibly filtered) matrix back to the corpus.
            original_doc_idx = candidate_indices[i] if with_index else int(i)

            doc = docs[original_doc_idx]
            snippet = doc.text[:40] + "..." if len(doc.text) > 40 else doc.text
            results.append((doc.doc_id, float(cosine_sim[i]), snippet))
        return results


In [None]:
import math
import chromadb
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch

class Embedding_online(Retriever):
    """Dense retriever built on a SentenceTransformer model.

    Two search paths exist: a ChromaDB collection (``with_index=True``) and a
    brute-force cosine scan over ``bert_embeddings.npy``. The model, the
    embeddings, the collections and the corpus are cached at class level.

    NOTE(review): row ``i`` of ``bert_embeddings.npy`` is assumed to belong to
    ``load_dataset(name)[i]`` - confirm against the offline build.
    """

    __embeddingInstance__ : dict[str, np.ndarray] = {}    # dataset -> embedding matrix
    __collection_instance__: dict = {}                    # dataset -> ChromaDB collection
    __docs__ : dict[str, list] = {}                       # dataset -> corpus records
    __docRows__ : dict[str, dict] = {}                    # dataset -> {doc_id: row number}
    __modelInstance__  = None

    @staticmethod
    def __loadModelInstance__():
        """Load (once) and return the sentence-embedding model."""
        if Embedding_online.__modelInstance__ is None:
            Embedding_online.__modelInstance__ = SentenceTransformer("../data/models/all-MiniLM-L6-v2") 
        return Embedding_online.__modelInstance__

    @staticmethod
    def __loadInstance__(dataset_name : str):
        """Load (once) and return the document embedding matrix."""
        if dataset_name not in Embedding_online.__embeddingInstance__:
            with open(f"../data/{dataset_name}/bert_embeddings.npy", "rb") as f:
                Embedding_online.__embeddingInstance__[dataset_name] = np.load(f)
        return Embedding_online.__embeddingInstance__[dataset_name]

    @staticmethod
    def __loadDocs__(dataset_name : str):
        """Load (once) and return the corpus; reloading it per query is very slow."""
        if dataset_name not in Embedding_online.__docs__:
            Embedding_online.__docs__[dataset_name] = load_dataset(dataset_name)
        return Embedding_online.__docs__[dataset_name]

    @staticmethod
    def __loadDocRows__(dataset_name : str):
        """Build (once) and return the ``{doc_id: row number}`` lookup."""
        if dataset_name not in Embedding_online.__docRows__:
            docs = Embedding_online.__loadDocs__(dataset_name)
            Embedding_online.__docRows__[dataset_name] = {doc.doc_id: row for row, doc in enumerate(docs)}
        return Embedding_online.__docRows__[dataset_name]

    @staticmethod
    def __get_collection__(dataset_name: str):
        """Open (once) and return the ChromaDB collection ``<dataset_name>_embeddings``."""
        if dataset_name not in Embedding_online.__collection_instance__:
            print(f"Connecting to ChromaDB and getting collection: {dataset_name}_embeddings...")
            client = chromadb.PersistentClient(path="chroma_db")
            Embedding_online.__collection_instance__[dataset_name] = client.get_collection(name=f"{dataset_name}_embeddings")
        return Embedding_online.__collection_instance__[dataset_name]

    @staticmethod
    def __encode_query__(query: str):
        """Embed the preprocessed query as ONE sentence.

        ``preprocess_text`` returns a token list; passing that list to
        ``encode`` would embed every token separately, so the tokens are
        re-joined first.
        NOTE(review): assumes the stored document embeddings were produced
        from preprocessed text as well - confirm against the offline build.
        """
        model = Embedding_online.__loadModelInstance__()
        tokens = TextPreprocessor.getInstance().preprocess_text(query)
        return model.encode(" ".join(tokens))

    def search(self, dataset_name: str, query: str, top_k: int = 10, with_index: bool = True):
        """Return up to ``top_k`` ``(doc_id, score, snippet)`` tuples, best first.

        ``with_index`` selects the ChromaDB collection; otherwise the stored
        embedding matrix is scanned.
        """
        if with_index:
            return self.embedding_vectors_search(dataset_name, query, top_k)
        else:
            return self.embedding_search(dataset_name, query, top_k)

    def embedding_search(self, dataset_name: str, query: str, top_k: int):
        """Brute-force cosine search over the stored embedding matrix."""
        document_embeddings =  Embedding_online.__loadInstance__(dataset_name)
        docs = Embedding_online.__loadDocs__(dataset_name)
        query_embedding = Embedding_online.__encode_query__(query)

        # as_tensor avoids copying the whole matrix on every query.
        cos_scores = util.cos_sim(torch.as_tensor(query_embedding), torch.as_tensor(document_embeddings))[0]

        # topk raises when k exceeds the number of documents.
        k = min(top_k, cos_scores.shape[0])
        if k <= 0:
            return []
        top_results = torch.topk(cos_scores, k=k)

        results = []
        for score, idx in zip(top_results[0], top_results[1]):
            doc = docs[int(idx)]
            results.append((doc.doc_id, score.item(), doc.text[:100] + "..."))
        return results

    def embedding_vectors_search(self, dataset_name: str, query: str, top_k: int):
        """Nearest-neighbour search in the ChromaDB collection."""
        collection = Embedding_online.__get_collection__(dataset_name)
        query_embedding = Embedding_online.__encode_query__(query)

        search_results = collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=top_k
        )
        results = []
        ids = search_results['ids'][0]
        distances = search_results['distances'][0]
        metadatas = search_results['metadatas'][0]
        for doc_id, score, meta in zip(ids, distances, metadatas):
            # NOTE(review): presumably the collection uses cosine distance, so
            # 1 - distance is a similarity - confirm the collection's metric.
            similarity_score = 1 - score
            text = (meta or {}).get('text', '')[:100] + "..." 
            results.append((doc_id, similarity_score, text))
        return results

    def embedding_rerank(self, dataset_name: str, query: str, doc_ids: list) -> list[tuple[str, float]]:
        """
        Re-rank the given documents by cosine similarity to the raw query.

        Args:
            dataset_name: Key into ``DATASETS``.
            query: Query text; encoded without preprocessing.
            doc_ids: Ids of the candidate documents; unknown ids are ignored.

        Returns:
            ``(doc_id, score)`` tuples sorted by descending score; empty when
            no candidate id is part of the corpus.
        """
        docs_list = Embedding_online.__loadDocs__(dataset_name)
        rows_by_id = Embedding_online.__loadDocRows__(dataset_name)
        document_embeddings =  Embedding_online.__loadInstance__(dataset_name)
        model = Embedding_online.__loadModelInstance__()

        # Rows of the candidates, de-duplicated and in corpus order.
        valid_indices = sorted({rows_by_id[doc_id] for doc_id in doc_ids if doc_id in rows_by_id})

        if not valid_indices:
            return []

        candidate_embeddings = document_embeddings[valid_indices]

        query_embedding = model.encode(query, convert_to_tensor=True)
        cosine_scores = util.cos_sim(query_embedding, candidate_embeddings)[0]

        # Pair each score with the document ID (not the whole record).
        reranked_results = [
            (docs_list[row].doc_id, score.item())
            for row, score in zip(valid_indices, cosine_scores)
        ]

        return sorted(reranked_results, key=lambda item: item[1], reverse=True)

In [None]:
import math
import os
import dill
from rank_bm25 import BM25Okapi

class BM25_online(Retriever):
    """BM25 retriever over a pre-built ``BM25Okapi`` model.

    The model, the inverted index and the corpus are read on first use and
    cached per dataset at class level.
    NOTE: dill unpickles arbitrary Python objects; only load trusted files.
    """

    __bm25instance__ : dict[str, BM25Okapi] = {}
    __invertedIndex__ : dict[str, InvertedIndex] = {}
    __docs__ : dict[str, list[str, str]] = {}

    @staticmethod
    def __loadInstance__(dataset_name : str):
        """Load (once) and return the BM25 model of ``dataset_name``."""
        models = BM25_online.__bm25instance__
        if dataset_name not in models:
            model_path = os.path.join(os.path.abspath(''), f"../data/{dataset_name}/bm25_model.dill")
            with open(model_path, "rb") as f:
                models[dataset_name] = dill.load(f) 
        return models[dataset_name]

    @staticmethod
    def __loadInvertedIndex__(dataset_name : str):
        """Load (once) and return the inverted index of ``dataset_name``."""
        indexes = BM25_online.__invertedIndex__
        if dataset_name not in indexes:
            with open(f"../data/{dataset_name}/inverted_index.dill", "rb") as f:
                # Copy the stored state into an instance of the class defined
                # in this notebook.
                fresh_index = InvertedIndex()
                stored = dill.load(f)
                fresh_index.index = stored.index
                fresh_index.doc_lengths = stored.doc_lengths
                fresh_index.N = stored.N
                indexes[dataset_name] = fresh_index
        return indexes[dataset_name]

    @staticmethod
    def __loadDocs__(dataset_name : str):
        """Load (once) and return the corpus of ``dataset_name``."""
        corpora = BM25_online.__docs__
        if dataset_name not in corpora:
            corpora[dataset_name] = load_dataset(dataset_name)
        return corpora[dataset_name]

    def search(self, dataset_name: str, query: str, top_k: int = 10, with_inverted_index: bool = True) -> list[tuple[str, float, str]]:
        """Return up to ``top_k`` ``(doc_id, bm25_score, full_text)`` tuples, best first.

        With ``with_inverted_index`` only documents sharing a term with the
        query are scored; otherwise the whole corpus is scored.
        """
        bm25 = BM25_online.__loadInstance__(dataset_name)
        docs = BM25_online.__loadDocs__(dataset_name)
        if with_inverted_index:
            inverted_index = BM25_online.__loadInvertedIndex__(dataset_name)

        query_tokens = TextPreprocessor.getInstance().preprocess_text(query)

        # Build (corpus position, score) pairs for whichever path is active.
        if with_inverted_index:
            candidate_rows = inverted_index.get_documents_sharing_terms_with_query(query_tokens)
            candidate_scores = bm25.get_batch_scores(query_tokens, candidate_rows)
            scored_rows = zip(candidate_rows, candidate_scores)
        else:
            scored_rows = enumerate(bm25.get_scores(query_tokens))

        # Stable sort: ties keep the order in which they were produced.
        best_rows = sorted(scored_rows, key=lambda pair: pair[1], reverse=True)[:top_k]

        return [(docs[row].doc_id, score, docs[row].text) for row, score in best_rows]


In [None]:
class Hybrid_online(Retriever):
    """Two-stage retriever: lexical fusion (TF-IDF + BM25), then embedding re-rank."""

    def __normalize_scores__(self, ranked_list: list) -> list:
        """
        Min-max normalises the scores of ``(doc_id, score, text)`` tuples to
        [0, 1] and returns ``(doc_id, normalized_score)`` pairs. When all
        scores are equal every document gets 1.0; an empty list stays empty.
        """
        scores = [score for doc_id, score, text in ranked_list]
        if not scores:
            return []
        
        min_score = min(scores)
        max_score = max(scores)
        
        if max_score == min_score:
            return [(doc_id, 1.0) for doc_id, score, text in ranked_list]
        
        normalized_list = []
        for doc_id, score, text in ranked_list:
            normalized_score = (score - min_score) / (max_score - min_score)
            normalized_list.append((doc_id, normalized_score))
        return normalized_list

    def search(self, dataset_name: str, query: str, top_k: int = 10, with_index: bool = True) -> list[tuple[str, float, str]]:
        """
        Performs a complex hybrid search.

        Stage 1 takes the ``top_k`` results of TF-IDF and of BM25, min-max
        normalises each list and fuses them with equal weights to obtain
        the candidate pool. Stage 2 re-ranks that pool with the embedding
        model.

        Args:
            dataset_name: Key into ``DATASETS``.
            query: Raw query text.
            top_k: Maximum number of results (also the per-retriever pool size).
            with_index: Forwarded to the two lexical retrievers.

        Returns:
            Up to ``top_k`` ``(doc_id, embedding_score, text)`` tuples, best
            first; ``text`` is whatever the lexical stage returned for that
            document (empty if unavailable).
        """
        print("\nPerforming complex hybrid search (Stage 1: Fusion)...")

        tfidf_service = TFIDF_online()
        bm25_service = BM25_online()
        embedding_service = Embedding_online()

        # ==========================================================================
        #  STAGE 1: Parallel Fusion of TF-IDF and BM25
        # ==========================================================================

        tfidf_results = tfidf_service.search(dataset_name, query, top_k, with_index)
        bm25_results = bm25_service.search(dataset_name, query, top_k, with_index)

        # Remember a text per candidate so the final tuples honour the
        # (doc_id, score, text) contract of Retriever.search.
        text_of = {}
        for doc_id, _score, text in list(tfidf_results) + list(bm25_results):
            text_of.setdefault(str(doc_id), text)

        # --- Normalize and Fuse the lexical results ---
        norm_tfidf = self.__normalize_scores__(tfidf_results)
        norm_bm25 = self.__normalize_scores__(bm25_results)

        fused_scores = {}
        tfidf_weight = 0.5
        bm25_weight = 0.5

        for doc_id, score in norm_tfidf:
            fused_scores[str(doc_id)] = score * tfidf_weight

        for doc_id, score in norm_bm25:
            doc_id_str = str(doc_id)
            fused_scores[doc_id_str] = fused_scores.get(doc_id_str, 0.0) + score * bm25_weight

        if not fused_scores:
            return []

        # --- Create the final candidate list from Stage 1 ---
        candidate_list = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
        candidate_doc_ids = [doc_id for doc_id, score in candidate_list]

        # ==========================================================================
        #  STAGE 2: Serial Re-ranking with Embedding Model
        # ==========================================================================

        reranked = embedding_service.embedding_rerank(dataset_name, query, candidate_doc_ids)

        final_list = []
        for doc_ref, score in reranked[:top_k]:
            # Accept either a bare id or a record carrying a ``doc_id`` attribute.
            doc_id = getattr(doc_ref, "doc_id", doc_ref)
            final_list.append((doc_id, score, text_of.get(str(doc_id), "")))

        return final_list

## run main

### Antique

In [None]:
# Select the corpus for the cells of this section. The globals dataset_name,
# docs, queries and qrels are overwritten again by the Quora section below,
# so the Antique evaluation cells must run before that section is executed.
dataset_name = 'antique'

docs, queries, qrels = load_dataset_with_queries(dataset_name)

### Antique tf-idf

In [None]:
# TF-IDF on the dataset loaded above: MAP, MRR, then nDCG, each timed separately.
print("", "Calculating MAP for tf-idf", sep="\n")
st = time.time()
antique_tfidf_MAP = TFIDF_online().evaluateMAP(dataset_name, queries, qrels, docs)
print(f"tf-idf MAP= {antique_tfidf_MAP}", f"This took {time.time() - st}s", "", sep="\n")

print("", "Calculating MRR for tf-idf", sep="\n")
st = time.time()
antique_tfidf_MRR = TFIDF_online().evaluateMRR(dataset_name, queries, qrels)
print(f"tf-idf MRR= {antique_tfidf_MRR}", f"This took {time.time() - st}s", "", sep="\n")

print("", "Calculating nDCG for tf-idf", sep="\n")
st = time.time()
antique_tfidf_NDCG = TFIDF_online().evaluateNDCG(dataset_name, queries, qrels, docs)
print(f"tf-idf nDCG= {antique_tfidf_NDCG}", f"This took {time.time() - st}s", "", sep="\n")

### Antique Embedding

In [None]:
# Embedding retriever on the dataset loaded above: MAP, MRR, then nDCG, each timed separately.
print("", "Calculating MAP for embedding", sep="\n")
st = time.time()
antique_embedding_MAP = Embedding_online().evaluateMAP(dataset_name, queries, qrels, docs)
print(f"embedding MAP= {antique_embedding_MAP}", f"This took {time.time() - st}s", "", sep="\n")

print("", "Calculating MRR for embedding", sep="\n")
st = time.time()
antique_embedding_MRR = Embedding_online().evaluateMRR(dataset_name, queries, qrels)
print(f"embedding MRR= {antique_embedding_MRR}", f"This took {time.time() - st}s", "", sep="\n")

print("", "Calculating nDCG for embedding", sep="\n")
st = time.time()
antique_embedding_NDCG = Embedding_online().evaluateNDCG(dataset_name, queries, qrels, docs)
print(f"embedding nDCG= {antique_embedding_NDCG}", f"This took {time.time() - st}s", "", sep="\n")

### Antique bm25

In [None]:
# BM25 on the dataset loaded above: MAP, MRR, then nDCG, each timed separately.
print("", "Calculating MAP for bm25", sep="\n")
st = time.time()
antique_bm25_MAP = BM25_online().evaluateMAP(dataset_name, queries, qrels, docs)
print(f"bm25 MAP= {antique_bm25_MAP}", f"This took {time.time() - st}s", "", sep="\n")

print("", "Calculating MRR for bm25", sep="\n")
st = time.time()
antique_bm25_MRR = BM25_online().evaluateMRR(dataset_name, queries, qrels)
print(f"bm25 MRR= {antique_bm25_MRR}", f"This took {time.time() - st}s", "", sep="\n")

print("", "Calculating nDCG for bm25", sep="\n")
st = time.time()
antique_bm25_NDCG = BM25_online().evaluateNDCG(dataset_name, queries, qrels, docs)
print(f"bm25 nDCG= {antique_bm25_NDCG}", f"This took {time.time() - st}s", "", sep="\n")

### Antique hybrid

In [None]:
# Hybrid retriever on the dataset loaded above: MAP, MRR, then nDCG, each timed separately.
print("", "Calculating MAP for hybrid", sep="\n")
st = time.time()
antique_hybrid_MAP = Hybrid_online().evaluateMAP(dataset_name, queries, qrels, docs)
print(f"hybrid MAP= {antique_hybrid_MAP}", f"This took {time.time() - st}s", "", sep="\n")

print("", "Calculating MRR for hybrid", sep="\n")
st = time.time()
antique_hybrid_MRR = Hybrid_online().evaluateMRR(dataset_name, queries, qrels)
print(f"hybrid MRR= {antique_hybrid_MRR}", f"This took {time.time() - st}s", "", sep="\n")

print("", "Calculating nDCG for hybrid", sep="\n")
st = time.time()
antique_hybrid_NDCG = Hybrid_online().evaluateNDCG(dataset_name, queries, qrels, docs)
print(f"hybrid nDCG= {antique_hybrid_NDCG}", f"This took {time.time() - st}s", "", sep="\n")

### Quora

In [None]:
# Switch the shared globals (dataset_name, docs, queries, qrels) to Quora.
# Every evaluation cell executed after this point reads these values.
dataset_name = 'quora'

docs, queries, qrels = load_dataset_with_queries(dataset_name)

### Quora tf-idf

In [None]:
# TF-IDF on the dataset loaded above: MAP, MRR, then nDCG, each timed separately.
print("", "Calculating MAP for tf-idf", sep="\n")
st = time.time()
quora_tfidf_MAP = TFIDF_online().evaluateMAP(dataset_name, queries, qrels, docs)
print(f"tf-idf MAP= {quora_tfidf_MAP}", f"This took {time.time() - st}s", "", sep="\n")

print("", "Calculating MRR for tf-idf", sep="\n")
st = time.time()
quora_tfidf_MRR = TFIDF_online().evaluateMRR(dataset_name, queries, qrels)
print(f"tf-idf MRR= {quora_tfidf_MRR}", f"This took {time.time() - st}s", "", sep="\n")

print("", "Calculating nDCG for tf-idf", sep="\n")
st = time.time()
quora_tfidf_NDCG = TFIDF_online().evaluateNDCG(dataset_name, queries, qrels, docs)
print(f"tf-idf nDCG= {quora_tfidf_NDCG}", f"This took {time.time() - st}s", "", sep="\n")

### Quora Embedding

In [None]:
# Embedding retriever on the dataset loaded above: MAP, MRR, then nDCG, each timed separately.
print("", "Calculating MAP for Embedding", sep="\n")
st = time.time()
quora_embedding_MAP = Embedding_online().evaluateMAP(dataset_name, queries, qrels, docs)
print(f"Embedding MAP= {quora_embedding_MAP}", f"This took {time.time() - st}s", "", sep="\n")

print("", "Calculating MRR for Embedding", sep="\n")
st = time.time()
quora_embedding_MRR = Embedding_online().evaluateMRR(dataset_name, queries, qrels)
print(f"Embedding MRR= {quora_embedding_MRR}", f"This took {time.time() - st}s", "", sep="\n")

print("", "Calculating nDCG for Embedding", sep="\n")
st = time.time()
quora_embedding_NDCG = Embedding_online().evaluateNDCG(dataset_name, queries, qrels, docs)
print(f"Embedding nDCG= {quora_embedding_NDCG}", f"This took {time.time() - st}s", "", sep="\n")

### Quora bm25

In [None]:
# BM25 on the dataset loaded above: MAP, MRR, then nDCG, each timed separately.
print("", "Calculating MAP for bm25", sep="\n")
st = time.time()
quora_bm25_MAP = BM25_online().evaluateMAP(dataset_name, queries, qrels, docs)
print(f"bm25 MAP= {quora_bm25_MAP}", f"This took {time.time() - st}s", "", sep="\n")

print("", "Calculating MRR for bm25", sep="\n")
st = time.time()
quora_bm25_MRR = BM25_online().evaluateMRR(dataset_name, queries, qrels)
print(f"bm25 MRR= {quora_bm25_MRR}", f"This took {time.time() - st}s", "", sep="\n")

print("", "Calculating nDCG for bm25", sep="\n")
st = time.time()
quora_bm25_NDCG = BM25_online().evaluateNDCG(dataset_name, queries, qrels, docs)
print(f"bm25 nDCG= {quora_bm25_NDCG}", f"This took {time.time() - st}s", "", sep="\n")

### Quora Hybrid

In [None]:

# Hybrid retriever on the dataset loaded above: MAP, MRR, then nDCG, each timed separately.
print("", "Calculating MAP for Hybrid", sep="\n")
st = time.time()
quora_hybrid_MAP = Hybrid_online().evaluateMAP(dataset_name, queries, qrels, docs)
print(f"Hybrid MAP= {quora_hybrid_MAP}", f"This took {time.time() - st}s", "", sep="\n")

print("", "Calculating MRR for Hybrid", sep="\n")
st = time.time()
quora_hybrid_MRR = Hybrid_online().evaluateMRR(dataset_name, queries, qrels)
print(f"Hybrid MRR= {quora_hybrid_MRR}", f"This took {time.time() - st}s", "", sep="\n")

print("", "Calculating nDCG for Hybrid", sep="\n")
st = time.time()
quora_hybrid_NDCG = Hybrid_online().evaluateNDCG(dataset_name, queries, qrels, docs)
print(f"Hybrid nDCG= {quora_hybrid_NDCG}", f"This took {time.time() - st}s", "", sep="\n")

### Collecting results

In [None]:
# Metric matrices consumed by the plotting section.
# Rows:    MRR, MAP, nDCG
# Columns: tf-idf, embedding, bm25, hybrid
antique_values = [
[antique_tfidf_MRR, antique_embedding_MRR, antique_bm25_MRR, antique_hybrid_MRR],
[antique_tfidf_MAP, antique_embedding_MAP, antique_bm25_MAP, antique_hybrid_MAP],
[antique_tfidf_NDCG, antique_embedding_NDCG, antique_bm25_NDCG, antique_hybrid_NDCG],
]

# Same layout as antique_values.
quora_values = [
[quora_tfidf_MRR, quora_embedding_MRR, quora_bm25_MRR, quora_hybrid_MRR],
[quora_tfidf_MAP, quora_embedding_MAP, quora_bm25_MAP, quora_hybrid_MAP],
[quora_tfidf_NDCG, quora_embedding_NDCG, quora_bm25_NDCG, quora_hybrid_NDCG],
]

# Plotting

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Axis labels for the bubble chart. The order of ``methods`` must match the
# column order of antique_values / quora_values (tf-idf, embedding, bm25,
# hybrid) and the order of ``metrics`` their row order (MRR, MAP, nDCG);
# otherwise the bubbles are drawn under the wrong label.
methods = ["tf-idf", "embedding", "bm25", "hybrid"]
metrics = ["MRR", "MAP", "nDCG"]
x_pos = np.arange(len(methods))
y_pos = np.arange(len(metrics))

def draw_plot(dataset_name, values):
    """Draw a bubble chart of evaluation metrics per retrieval method.

    Args:
        dataset_name: Used as the figure title.
        values: ``len(metrics)`` rows by ``len(methods)`` columns of metric
            values in percent (0-100). Nested lists and arrays are accepted.

    Raises:
        ValueError: If ``values`` does not have the expected shape.
    """
    # Nested lists support neither ``values[i] * 3000`` (it repeats the list)
    # nor ``values[i, j]``, so work on an array.
    scores = np.asarray(values, dtype=float)
    expected_shape = (len(metrics), len(methods))
    if scores.shape != expected_shape:
        raise ValueError(f"values must have shape {expected_shape}, got {scores.shape}")

    fig, ax = plt.subplots(figsize = (10, 6))

    for i, metric in enumerate(metrics):
        # The metrics are percentages: a score of 100 maps to a marker area of 3000.
        ax.scatter(x_pos, np.full(len(methods), y_pos[i]), s = scores[i] / 100 * 3000, alpha = 0.6, label = metric)

    for i in range(len(metrics)):
        for j in range(len(methods)):
            ax.annotate(f"{scores[i, j]:.2f}",
                        (x_pos[j], y_pos[i]),
                        ha = 'center', va = 'center',
                        fontsize = 10
                        )

    ax.set_xticks(x_pos)
    ax.set_xticklabels(methods)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(metrics)

    # Leave room around the outermost bubbles.
    ax.set_xlim(-0.5, len(methods) - 0.5)
    ax.set_ylim(-0.5, len(metrics) - 0.5)

    ax.set_title(dataset_name, pad = 20)
    ax.set_xlabel("Retrieval Methods")
    ax.set_ylabel("Evaluation Metrics")

    ax.grid(True, linestyle = '--', alpha = 0.7)
    ax.legend(bbox_to_anchor = (1.05, 1), loc = 'upper left')

    # Lay out after the legend exists so it is taken into account.
    fig.tight_layout()

    plt.show()

In [None]:
# Bubble chart of the Antique metrics gathered in the "Collecting results" cell.
draw_plot("antique", antique_values)

In [None]:
# Bubble chart of the Quora metrics gathered in the "Collecting results" cell.
draw_plot("quora", quora_values)