### 1. Setup and Imports 

This section initializes the environment, sets up necessary constants, and imports required libraries and modules.

In [5]:
import re
import os
import nltk
import string
import csv
import numpy as np
import pandas as pd
from typing import List
from numpy import ndarray
from abc import ABC, abstractmethod
from typing import Optional
from typing import  Dict
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from database.mongo_helper import MongoDBConnection
from collections import defaultdict
from gensim.models import Word2Vec
from common.constants import Locations
from common.file_utilities import FileUtilities
from database.chroma_helper import ChromaHelper
from text_processors.antique_text_processor import AntiqueTextProcessor
from tabulate import tabulate
from overrides import overrides
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, PorterStemmer, ne_chunk
from nltk.corpus import wordnet
from spellchecker import SpellChecker


load_dotenv()
WIKIPEDIA_DATASET_PATH = os.getenv('WIKIPEDIA_DATASET_PATH')
RECALL_PRECISION_THRESHOLD = int(os.getenv('RECALL_PRECISION_THRESHOLD', 10))


### 2. Define Evaluation Metrics Calculators

in this section we are creating the routines used to calculate evaluations later for MAP, MRR, Recall@10, precision@10

In [6]:
class MetricCalculator(ABC):
    @abstractmethod
    def calculate(self, query_id: str, retrieved_docs: List[str], qrels: Dict[str, Dict[str, int]], k: Optional[int] = None) -> float:
        pass

class AveragePrecisionCalculator(MetricCalculator):
    def calculate(self, query_id: str, retrieved_docs: List[str], qrels: Dict[str, Dict[str, int]], k: Optional[int] = None) -> float:
        if query_id not in qrels:
            return 0.0

        relevant_docs = qrels[query_id]
        num_retrieved_relevant_docs = 0
        sum_precisions = 0.0
        num_relevant_docs = 0

        for i, doc_id in enumerate(retrieved_docs, start=1):
            if doc_id in relevant_docs and relevant_docs[doc_id] > 0:
                num_retrieved_relevant_docs += 1
                num_relevant_docs += 1
                precision_at_i = num_retrieved_relevant_docs / i
                sum_precisions += precision_at_i

        average_precision = 0 if num_relevant_docs == 0 else sum_precisions / num_relevant_docs
        return average_precision

class PrecisionCalculator(MetricCalculator):
    def calculate(self, query_id: str, retrieved_docs: List[str], qrels: Dict[str, Dict[str, int]], k: Optional[int] = None) -> float:
        if not retrieved_docs:
            return 0.0

        relevant_docs = qrels.get(query_id, {})
        relevant_retrieved = sum(1 for doc_id in retrieved_docs[:k] if doc_id in relevant_docs)

        if not relevant_retrieved:
            return 0.0
        return relevant_retrieved / min(len(retrieved_docs), k)

class RecallCalculator(MetricCalculator):
    def calculate(self, query_id: str, retrieved_docs: List[str], qrels: Dict[str, Dict[str, int]], k: Optional[int] = None) -> float:
        relevant_docs = qrels.get(query_id, {})
        relevant_retrieved = sum(1 for doc_id in retrieved_docs[:k] if doc_id in relevant_docs)
        total_relevant = sum(relevant_docs.values())
        return relevant_retrieved / total_relevant if total_relevant > 0 else 0

class ReciprocalRankCalculator(MetricCalculator):
    def calculate(self, query_id: str, retrieved_docs: List[str], qrels: Dict[str, Dict[str, int]], k: Optional[int] = None) -> float:
        relevant_docs = qrels.get(query_id, {})

        if k is not None:
            retrieved_docs = retrieved_docs[:k]

        for i, doc in enumerate(retrieved_docs, start=1):
            doc_id = doc['doc_id']
            if doc_id in relevant_docs.keys() and relevant_docs[doc_id] > 0:
                return 1.0 / i
        return 0.0


### 3. Define Evaluation Manager
EvaluationManager is an extra class responsible for evaluating the qrels & queries passed for a set of (matcher, metrics)

In [7]:
class EvaluationManager:
    def __init__(self, metric_calculators: List[MetricCalculator], matcher):
        self.metric_calculators = metric_calculators
        self.matcher = matcher

    def evaluate(self, queries: Dict[str, str], qrels: Dict[str, Dict[str, int]], k: Optional[int] = None) -> Dict[str, Dict[str, float]]:
        evaluation_results = {}

        for query_id, query_text in queries.items():
            retrieved_docs = self.matcher.match(query_text)[:k]
            metrics_results = {}
            for metric_calculator in self.metric_calculators:
                metric_name = metric_calculator.__class__.__name__
                if metric_name in ["AveragePrecisionCalculator", "RecallCalculator", "PrecisionCalculator"]:
                    retrieved_doc_ids = [doc_info['doc_id'] for doc_info in retrieved_docs]
                    metric_value = metric_calculator.calculate(query_id, retrieved_doc_ids, qrels, k=k)
                else:
                    metric_value = metric_calculator.calculate(query_id, retrieved_docs, qrels)
                metrics_results[metric_name] = metric_value

            evaluation_results[query_id] = metrics_results

        return evaluation_results


### 4. Load Data
in this section we are loading the trained model `load_as_dict`, qrels, and queries.

In [8]:
class DatasetReader:
    def __init__(self, file_path: str):
        self.file_path = file_path

    @abstractmethod
    def load_as_dict(self) -> dict:
        pass

    @abstractmethod
    def read_queries(self) -> dict:
        pass

    @abstractmethod
    def read_qrels(self) -> defaultdict:
        pass
    
class WikipediaReader(DatasetReader, ABC):

    @overrides
    def load_as_dict(self) -> dict:
        key_value_pairs = {}

        df = pd.read_csv(self.file_path)

        for index, row in df.iterrows():
            key = str(row['id_right'])
            value = str(row['text_right'])

            key_value_pairs[key] = value

        return key_value_pairs

    @overrides
    def read_queries(self) -> dict:
        queries_path = os.environ.get('WIKIPEDIA_QUERIES_PATH', '../data/wikipedia/queries.csv')
        queries = {}
        with open(queries_path, newline='') as csvfile:
            reader = csv.reader(csvfile)
            next(reader)  # Skip header
            for row in reader:
                queries[row[0]] = row[1]
        return queries

    @overrides
    def read_qrels(self) -> defaultdict:
        qrels_path = os.environ.get('WIKIPEDIA_QRELS_PATH', '../data/wikipedia/qrels')
        qrels = defaultdict(dict)

        with open(qrels_path, 'r') as f:
            for line in f:
                parts = line.strip().split('\t')
                query_id, _, doc_id, relevance = parts
                qrels[query_id][doc_id] = int(relevance)

        return qrels
    
def load_wikipedia_data():
    reader = WikipediaReader(WIKIPEDIA_DATASET_PATH)
    qrels = reader.read_qrels()
    queries = reader.read_queries()
    return qrels, queries

qrels, queries = load_wikipedia_data()

### 5. Text Processing
text processing steps used for training the model and preparing queries

In [9]:

def get_wordnet_pos(tag_parameter):
    tag = tag_parameter[0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


class BaseTextProcessor:
    def __init__(self) -> None:
        self.stop_words = set(stopwords.words('english'))
        self.spell_checker = SpellChecker(distance=4)
        self.lemmatizer = WordNetLemmatizer()
        self.tokenizer = word_tokenize
        self.stemmer = PorterStemmer()
        self.pos_tagger = pos_tag

    def process(self, text) -> List[str]:
        pass

    def process_query(self, query: str) -> List[str]:
        pass

    def _word_tokenizer(self, text: str) -> List[str]:
        tokens = self.tokenizer(text)
        return tokens

    @staticmethod
    def _lowercase_tokens(tokens: List[str]) -> List[str]:
        return [str(np.char.lower(token)) for token in tokens]

    def _filter_stop_words(self, tokens: List[str]) -> List[str]:
        return [token for token in tokens if token not in self.stop_words and len(token) > 1]

    @staticmethod
    def _remove_registered_markers(tokens: List[str]) -> List[str]:
        return [re.sub(r'\u00AE', '', token) for token in tokens if token is not None]

    @staticmethod
    def _strip_punctuation(tokens: List[str]) -> List[str]:
        return [
            token.translate(str.maketrans('', '', string.punctuation))
            for token in tokens if token is not None
        ]

    @staticmethod
    def _eliminate_whitespaces(tokens: List[str]) -> List[str]:
        return [re.sub(r'_', ' ', token) for token in tokens if token is not None]

    @staticmethod
    def _remove_apostrophes(tokens: List[str]) -> List[str]:
        return [str(np.char.replace(token, "'", " ")) for token in tokens if token is not None]

    def _apply_stemming(self, tokens: List[str]) -> List[str]:
        return [self.stemmer.stem(token) for token in tokens]

    @staticmethod
    def _normalize_abbreviations(tokens: List[str]) -> List[str]:
        resolved_terms = {}
        for token in tokens:

            if len(token) >= 2:
                synsets = wordnet.synsets(token)
                if synsets:
                    resolved_term = synsets[0].lemmas()[0].name()
                    resolved_terms[token] = resolved_term

        for abbreviation, resolved_term in resolved_terms.items():
            for i in range(len(tokens)):
                if tokens[i] == abbreviation:
                    tokens[i] = resolved_term
                    break

        return tokens

    def _lemmatize_tokens(self, tokens: List[str]) -> List[str]:
        lemmatizer = self.lemmatizer
        pos_tags = self.pos_tagger(tokens)
        lemmatized_tokens = [lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag)) for token, tag in pos_tags]
        return lemmatized_tokens

    def _spell_check(self, tokens: List[str]) -> List[str]:
        return [self.spell_checker.correction(word) if isinstance(word, str) else word for word in tokens if
                word is not None]
    @staticmethod
    def get_tokens_as_string(tokens: List[str]) -> str:
        return ' '.join(tokens)

    def _apply_named_entity_recognition(self, tokens: List[str]) -> List[str]:
        tagged_tokens = self.pos_tagger(tokens)
        ner_tags = ne_chunk(tagged_tokens)
        entities = []

        def extract_entity_text(chunk):
            if isinstance(chunk, nltk.tree.Tree):
                return ' '.join([extract_entity_text(c) for c in chunk])
            else:
                return chunk[0]

        for chunk in ner_tags:
            if hasattr(chunk, 'label') and chunk.label() == 'NE':
                entity = extract_entity_text(chunk.leaves())
                entities.append(str(entity))
            else:
                entity = chunk[0] if isinstance(chunk, nltk.tree.Tree) else chunk[0][0]
                entities.append(str(entity))

        return entities

class WikipediaTextProcessor(BaseTextProcessor):
    @overrides
    def process(self, text) -> List[str]:
        tokens = self._word_tokenizer(text)
        tokens = self._lowercase_tokens(tokens)
        tokens = self._strip_punctuation(tokens)
        tokens = self._remove_apostrophes(tokens)
        tokens = self._filter_stop_words(tokens)
        tokens = self._remove_registered_markers(tokens)
        tokens = self._lemmatize_tokens(tokens)
        tokens = self._normalize_abbreviations(tokens)
        #tokens = self._spell_check(tokens)
        tokens = self._lowercase_tokens(tokens)
        tokens = self._eliminate_whitespaces(tokens)
        return tokens


### 6. Matching
matcher classes are used to match a query to a set of documents, it calculates the similarity between query and document using `cos_similarity`

In [10]:
class QueryMatcher:
    def __init__(self, model_name: str):
        matrix_path: str = Locations.generate_matrix_path(model_name)
        self.matrix = FileUtilities.load_file(matrix_path)

        model_path: str = Locations.generate_model_path(model_name)
        self.model: TfidfVectorizer = FileUtilities.load_file(model_path)

        self.threshold = float(os.environ.get('SIMILARITY_THRESHOLD', 0.1))

        self.db_collection = MongoDBConnection.get_instance().get_collection(model_name)

    def __vectorize_query(self, query: str):
        return self.model.transform([query])

    def match(self, query: str):
        query_vector = self.__vectorize_query(query)

        cos_similarities = cosine_similarity(self.matrix, query_vector)

        sorted_indices = np.argsort(cos_similarities, axis=0)[::-1].flatten()

        matching_docs_indices = []
        for i in sorted_indices:
            if cos_similarities[i].item() >= self.threshold:
                matching_docs_indices.append(i.item() + 1)

        matching_results = list(self.db_collection.find({"index": {"$in": matching_docs_indices}}))

        return sorted(
            matching_results,
            key=lambda x: matching_docs_indices.index(x['index']),
            reverse=False
        )
    

class WikipediaMatcher(QueryMatcher):
    def __init__(self):
        super().__init__(Locations.WIKIPEDIA_COLLECTION_NAME)

### 7. Results (Without Embedding)

In [11]:
matcher = WikipediaMatcher()

metric_calculators = [
    AveragePrecisionCalculator(),
    PrecisionCalculator(),
    RecallCalculator(),
    ReciprocalRankCalculator()
]

evaluation_manager = EvaluationManager(metric_calculators, matcher)
evaluation_results_no_embedding = evaluation_manager.evaluate(queries, qrels, RECALL_PRECISION_THRESHOLD)

def calculate_average_metrics(evaluation_results):
    total_map = sum(metrics_results["AveragePrecisionCalculator"] for metrics_results in evaluation_results.values())
    total_mrr = sum(metrics_results["ReciprocalRankCalculator"] for metrics_results in evaluation_results.values())
    total_queries = len(evaluation_results)

    average_map = total_map / total_queries if total_queries > 0 else 0.0
    average_mrr = total_mrr / total_queries if total_queries > 0 else 0.0

    return average_map, average_mrr

average_map_no_embedding, average_mrr_no_embedding = calculate_average_metrics(evaluation_results_no_embedding)
print(f"MAP without embedding: {average_map_no_embedding}")
print(f"MRR without embedding: {average_mrr_no_embedding}")

table = []
for query_id, metrics in evaluation_results_no_embedding.items():
    row = [
        query_id,
        f"{metrics['PrecisionCalculator']}",
        f"{metrics['RecallCalculator']:.6f}"
    ]
    table.append(row)

print("\nDetailed Results Without Embedding:")
print(tabulate(table, headers=["Query ID", f'Precision@{RECALL_PRECISION_THRESHOLD}', f'Recall@{RECALL_PRECISION_THRESHOLD}'], tablefmt="pretty"))

MAP without embedding: 0.5479969576719579
MRR without embedding: 0.6204920634920635

Detailed Results Without Embedding:
+----------+--------------+-----------+
| Query ID | Precision@10 | Recall@10 |
+----------+--------------+-----------+
|  158491  |   0.000000   | 0.000000  |
|   5728   |   0.100000   | 0.062500  |
|  13554   |   0.200000   | 0.250000  |
|  32674   |   0.700000   | 0.875000  |
|  406391  |   0.300000   | 0.333333  |
|   5115   |   0.100000   | 0.111111  |
|  15469   |   0.200000   | 0.080000  |
|  62953   |   0.200000   | 0.285714  |
|  152444  |   0.100000   | 0.100000  |
|  104086  |   0.600000   | 0.545455  |
|  145194  |   0.100000   | 0.142857  |
|  73752   |   0.200000   | 0.285714  |
| 1368508  |   0.200000   | 0.047619  |
|  11534   |   0.400000   | 0.190476  |
|  83078   |   0.000000   | 0.000000  |
|  25174   |   0.200000   | 0.025641  |
|  265104  |   0.500000   | 0.294118  |
|  139082  |   0.100000   | 0.125000  |
|  37743   |   0.200000   | 0.285714  |

# 8. Improving Results With Embedding
new matcher class for embedding models, the matcher class will load embedding model on initialization

In [12]:
class BaseEmbeddingMatcher:

    def __init__(self, model_name: str, text_processor: BaseTextProcessor):
        self.vector_collection = ChromaHelper.get_instance().get_or_create_collection(model_name)
        self.vector_size = int(os.environ.get("VECTOR_SIZE", 500))
        self.model: Word2Vec = self.__load_model(model_name)
        self.text_processor = text_processor
        self.model_name = model_name

    def match(self, text: str, top: int = 5000):
        processed_query: List[str] = self.text_processor.process(text)
        query_embeddings: List = self.vectorize_query(processed_query).tolist()

        result = self.vector_collection.query(
            query_embeddings=query_embeddings,
            n_results=top,
        )

        transformed_results = []
        ids = result.get('ids', [[]])[0]
        documents = result.get('documents', [[]])[0]
        distances = result.get('distances', [[]])[0]

        for doc_id, doc_content, doc_similarity in zip(ids, documents, distances):
            transformed_results.append({
                'doc_id': doc_id,
                'doc_content': doc_content,
                'similarity': doc_similarity,
            })

        return transformed_results

    def vectorize_query(self, query_words: list[str]) -> ndarray:

        query_vectors = [self.model.wv[word] for word in query_words if word in self.model.wv ]

        if query_vectors:
            query_vec = np.mean(query_vectors, axis=0)
        else:
            query_vec = np.zeros(self.vector_size)

        return query_vec

    @staticmethod
    def __load_model(model_name: str):
        return FileUtilities.load_file(
            file_path=Locations.generate_embeddings_model_path(model_name)
        )



class WikipediaEmbeddingMatcher(BaseEmbeddingMatcher):

    def __init__(self):
        super().__init__(
            model_name='wikipedia',
            text_processor=WikipediaTextProcessor(),
        )

### 9. Results (With Embedding)
- ~?%+ MAP 
- ~?%+ MRR

In [13]:

matcher = WikipediaEmbeddingMatcher()
evaluation_manager = EvaluationManager(metric_calculators, matcher)
evaluation_results_with_embedding = evaluation_manager.evaluate(queries, qrels, RECALL_PRECISION_THRESHOLD)

average_map_with_embedding, average_mrr_with_embedding = calculate_average_metrics(evaluation_results_with_embedding)
print(f"MAP with embedding: {average_map_with_embedding:.6f}")
print(f"MRR with embedding: {average_mrr_with_embedding:.6f}")

table = []
for query_id, metrics in evaluation_results_with_embedding.items():
    row = [
        query_id,
        f"{metrics['PrecisionCalculator']}",
        f"{metrics['RecallCalculator']:.6f}"
    ]
    table.append(row)

print("\nDetailed Results With Embedding:")
print(tabulate(table, headers=["Query ID", f'Precision@{RECALL_PRECISION_THRESHOLD}', f'Recall@{RECALL_PRECISION_THRESHOLD}'], tablefmt="pretty"))

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\anasr\\Desktop\\search-engine\\common\\..\\engines\\wikipedia/embeddings_model\\wikipedia_embedding_model.pkl'