In [1]:
import heapq
from typing import List, Tuple
from io import BufferedReader

import import_ipynb
import sys
sys.path.append('../')  # Go up two folders to the project root

from structures.DocumentIndex import DocumentIndex
from structures.Lexicon import Lexicon
from structures.PostingListHandler import Posting_List_Reader
from query_processing.Scoring import Scoring
from building_data_structures.CollectionStatistics import Collection_statistics
from structures.InvertedIndex import Posting

importing Jupyter notebook from C:\Users\gabri\Documents\GitHub\query_processing\..\structures\DocumentIndex.ipynb
importing Jupyter notebook from C:\Users\gabri\Documents\GitHub\query_processing\..\utilities\General_Utilities.ipynb
importing Jupyter notebook from C:\Users\gabri\Documents\GitHub\query_processing\..\structures\DocumentIndexRow.ipynb
importing Jupyter notebook from C:\Users\gabri\Documents\GitHub\query_processing\..\structures\Lexicon.ipynb
importing Jupyter notebook from C:\Users\gabri\Documents\GitHub\query_processing\..\structures\LexiconRow.ipynb
importing Jupyter notebook from C:\Users\gabri\Documents\GitHub\query_processing\..\building_data_structures\CollectionStatistics.ipynb
importing Jupyter notebook from C:\Users\gabri\Documents\GitHub\query_processing\..\structures\PostingListHandler.ipynb
importing Jupyter notebook from C:\Users\gabri\Documents\GitHub\query_processing\..\structures\InvertedIndex.ipynb
importing Jupyter notebook from C:\Users\gabri\Documents\

In [2]:
# Inverted index: directory and the binary files stored inside it.
DIR_INVERTED_INDEX = "../building_data_structures/INV_INDEX"
PATH_FINAL_DOC_IDS = "doc_ids.bin"
PATH_FINAL_FREQ = "freq.bin"
PATH_FINAL_BLOCK_DESCRIPTOR = "block_descriptors.bin"

# Lexicon: directory and file name.
DIR_LEXICON = "../building_data_structures/LEXICON"
PATH_FINAL_LEXICON = "lexicon.bin"

# Document index directory; the collection statistics file is read from it.
DIR_DOC_INDEX = "../building_data_structures/DOC_INDEX"
PATH_COLLECTION_STATISTICS = "collection_statistics.bin"

In [3]:
# TODO: k should be stored as a class attribute instead of being passed as a parameter to the methods.

class DAAT():
    """
    Document-at-a-time (DAAT) query processor.

    For each query, one posting-list reader is opened per query term found in the
    lexicon. The readers are walked in parallel, always processing the smallest
    current doc_id next. Every candidate document is scored through ``Scoring`` and
    the k best ``(score, doc_id)`` pairs are kept in a min-heap.
    """
    # Project types are quoted so the annotations are not evaluated when the class is created.
    file_DocIds: BufferedReader
    file_Freq: BufferedReader
    file_blocks: BufferedReader
    file_lexicon: BufferedReader
    posting_readers: List["Posting_List_Reader"]
    top_k_documents: List[Tuple[float, int]]

    # Names of the attributes holding the file handles managed by this class.
    _FILE_ATTRIBUTES = ("file_DocIds", "file_Freq", "file_blocks", "file_lexicon")

    def __init__(self):
        """Loads the lexicon and the collection statistics and builds the scorer."""
        self.lexicon = Lexicon()
        self.collection_statistics = Collection_statistics(DIR_DOC_INDEX+"/"+PATH_COLLECTION_STATISTICS)
        self.collection_statistics.read_binary_mode()
        self.scorer = Scoring(self.collection_statistics)
        # TODO: evaluate opening all the posting files once here and closing them at
        # program exit, instead of once per query, to save time.
        # self.open_all_posting_lists()

    def open_all_posting_lists(self) -> None:
        """
        Opens the inverted-index files, the lexicon file and the scorer's files.

        If any of them cannot be opened, the handles opened so far are closed
        before the exception is re-raised, so no file is leaked.
        """
        try:
            self.file_DocIds = open(DIR_INVERTED_INDEX+"/"+PATH_FINAL_DOC_IDS, 'rb')
            self.file_Freq = open(DIR_INVERTED_INDEX+"/"+PATH_FINAL_FREQ, 'rb')
            self.file_blocks = open(DIR_INVERTED_INDEX+"/"+PATH_FINAL_BLOCK_DESCRIPTOR, 'rb')
            self.file_lexicon = open(DIR_LEXICON+"/"+PATH_FINAL_LEXICON, 'rb')

            self.scorer.open_files()
        except Exception:
            self.close_all_posting_lists()
            raise

    def reset_lists(self) -> None:
        """Clears the per-query state (posting readers and result heap)."""
        # This list will contain the readers over the posting lists of all query terms
        self.posting_readers = []
        # This list (used as a min-heap) will contain the k most relevant documents
        self.top_k_documents = []

    def close_all_posting_lists(self):
        """
        Closes every file opened by open_all_posting_lists.

        Safe to call when some (or all) files were never opened, and safe to call
        more than once: closing an already closed file is a no-op.
        """
        for attribute in self._FILE_ATTRIBUTES:
            handle = getattr(self, attribute, None)
            if handle is not None:
                handle.close()

    def scoreQuery(self, k: int, choice_function: str, tokens: List[str], isConjunctive: bool) -> List[Tuple[float, int]]:
        """
        Scores a query and returns the top-k documents.

        Args:
            k (int): The number of top documents to retrieve.
            choice_function (str): The scoring function to use.
            tokens (List[str]): List of query tokens.
            isConjunctive (bool): Whether the query is conjunctive (a document must
                appear in the posting list of every term that has a reader).

        Returns:
            List[Tuple[float, int]]: (score, doc_id) pairs of the top-k documents,
            in heap order (not sorted by score).

        Note:
            Errors raised while walking the posting lists are reported with print()
            and the documents scored so far are returned. Errors raised while
            opening the files or building the readers propagate to the caller.
        """
        self.open_all_posting_lists()
        try:
            self.reset_lists()
            self.initialize_posting_lists(tokens)
            try:
                self._process_postings(k, choice_function, isConjunctive)
            except Exception as e:
                print(f"Error during execution: {e}")
        finally:
            # Close the files exactly once, after the whole query has been processed:
            # the readers may still need them while the loop is running.
            self.close_all_posting_lists()
            self.scorer.close_files()

        return self.top_k_documents

    def _process_postings(self, k: int, choice_function: str, isConjunctive: bool) -> None:
        """Walks all posting readers document by document and fills the top-k heap."""
        while True:
            exhausted, postings = self.all_lists_exhausted()

            # No reader has a posting left: nothing more to process
            if exhausted:
                return

            # In a conjunctive query no further document can match once one list is finished
            if isConjunctive and any(posting is None for posting in postings):
                return

            # The next document to process is the smallest current doc_id
            docToProcess = min(posting.doc_id for posting in postings if posting is not None)

            matching = [
                (reader, posting)
                for reader, posting in zip(self.posting_readers, postings)
                if posting is not None and posting.doc_id == docToProcess
            ]

            # Sum the frequencies before advancing, so the values belong to docToProcess
            term_freq = sum(posting.frequency for _, posting in matching)

            for reader, _ in matching:
                self._advance(reader)

            # Conjunctive query: skip documents that are missing from at least one list
            if isConjunctive and len(matching) < len(self.posting_readers):
                continue

            self.update_heap(choice_function, docToProcess, term_freq, k)

    @staticmethod
    def _advance(reader: "Posting_List_Reader") -> None:
        """
        Moves a reader to its next posting, treating the end of the list as a normal event.

        After the end is reached the reader is expected to return None from
        get_current_posting(), which is what all_lists_exhausted checks.
        """
        try:
            next(reader)
        except StopIteration:
            pass

    def initialize_posting_lists(self,tokens: List[str]) -> None:
        """
        Initializes posting lists for the given tokens.

        Tokens without a lexicon entry are skipped. Every reader is then moved onto
        its first posting.

        Args:
            tokens (List[str]): List of query tokens.
        """
        for token in tokens:
            term_lexicon_row = self.lexicon.get_entry(token)

            if term_lexicon_row is not None:
                self.posting_readers.append(Posting_List_Reader(term_lexicon_row, False, self.file_DocIds, self.file_Freq, self.file_blocks))

        for reader in self.posting_readers:
            self._advance(reader)

    def update_heap(self,choice_function: str, docToProcess: int, term_freq: int, k: int) -> None:
        """
        Updates the priority queue (heap) with the latest document score.

        Args:
            choice_function (str): The scoring function to use.
            docToProcess (int): Document ID to process.
            term_freq (int): Term frequency in the document.
            k (int): The number of top documents to retrieve.
        """
        doc_score = self.scorer.choose_scoring_function(choice_function, docToProcess, term_freq)

        # Add the element to the priority queue
        heapq.heappush(self.top_k_documents, (doc_score, docToProcess))

        # Keep the priority queue of size k by discarding the lowest-scored entry
        if len(self.top_k_documents) > k:
            heapq.heappop(self.top_k_documents)

    def all_lists_exhausted(self) -> Tuple[bool, List["Posting"]]:
        """
        Checks if all posting lists are exhausted.

        Returns:
            Tuple[bool, List[Posting]]: Tuple containing a boolean indicating whether all lists are exhausted
            and a list with the current posting of each reader (None for a reader with no current posting).
        """
        # Read the current posting of each posting list
        current_docs = [reader.get_current_posting() for reader in self.posting_readers]

        # Check if all readers have reached the end of the list
        return all(doc is None for doc in current_docs), current_docs

    def min_doc(self) -> Tuple[int, int]:
        """
        Retrieves the minimum document ID and its frequency among the current documents in all posting lists.

        Returns:
            Tuple[int, int]: Tuple containing the minimum document ID and its frequency,
            or (-1, -1) when every posting list is exhausted.
        """
        end, current_docs = self.all_lists_exhausted()

        # Check if all readers have reached the end of the list
        if end:
            return -1,-1

        # Fetch only not null documents
        valid_docs = [doc for doc in current_docs if doc is not None]

        # Retrieve the document with min doc_id
        min_doc = min(valid_docs, key=lambda x: x.doc_id)

        # Return the minimum doc_id and its frequency
        return min_doc.doc_id, min_doc.frequency

In [4]:
# daat = DAAT()
# my_list = ["and", "cat"]
# daat.scoreQuery(3, "bm25", my_list , True)

[(1.1948436837130192, 21), (1.1948436837130192, 51)]

In [5]:
####################################################
# TODO: evaluate whether a class like this would make execution slower or faster,
# as an alternative to keeping the top_k_documents list plus a separate k parameter.
####################################################

# class MinHeap:
#     def __init__(self, k: int):
#         self.heap = []
#         self.k = k

#     def push(self, item: Tuple[float, int]) -> None:
#         heapq.heappush(self.heap, item)
#         if len(self.heap) > self.k:
#             heapq.heappop(self.heap)