In [4]:
import math
import struct
import sys
from typing import BinaryIO

import import_ipynb

sys.path.append('../')
import structures.DocumentIndex as doc_index

In [5]:
class LexiconRow:
    """One fixed-size lexicon entry (a term and its statistics/offsets).

    A row can be serialised to / deserialised from a binary file with the
    layout described by STR_SIZE_LEXICON_ROW. The fields are packed in this
    order: term, dft, max_tf, idft, maxTFIDF, docidOffset, frequencyOffset,
    blockOffset, docidSize, frequencySize, numBlocks.
    """

    # Number of characters the term is truncated / space-padded to, and the
    # number of bytes reserved for it in the binary layout.
    MAX_TERM_LENGTH=30
    # struct layout: 30-byte term, 2 ints, 2 floats, 3 long longs, 3 ints.
    STR_SIZE_LEXICON_ROW='30s 2i 2f 3q 3i'
    # Size in bytes of one packed row.
    SIZE_LEXICON_ROW=struct.calcsize(STR_SIZE_LEXICON_ROW)

    def __init__(self, term: str, dft: int, max_tf: int=0, bm25dl:float=0, BM25Tf:float=0,
                 docidOffset:int=0,frequencyOffset:int=0,blockOffset:int=0,docidSize:int=0,frequencySize:int=0,numBlocks:int=1):
        """Build a lexicon row.

        Args:
            term: the term; truncated and right-padded with spaces to MAX_TERM_LENGTH characters
            dft: document frequency of the term
            max_tf: max term frequency
            bm25dl: accepted for interface compatibility; not stored by this class
            BM25Tf: accepted for interface compatibility; not stored by this class
            docidOffset: stored as-is and written in the first 'q' field
            frequencyOffset: stored as-is and written in the second 'q' field
            blockOffset: stored as-is and written in the third 'q' field
            docidSize: stored as-is and written in the first trailing 'i' field
            frequencySize: stored as-is and written in the second trailing 'i' field
            numBlocks: stored as-is and written in the last 'i' field
        """
        # Slicing is a no-op for short terms, so no length test is needed.
        self.term = term[:self.MAX_TERM_LENGTH].ljust(self.MAX_TERM_LENGTH)

        # Document frequency of the term
        self.dft = dft

        # Max term frequency
        self.max_tf = max_tf

        # Inverse of document frequency of the term.
        # Not computed by the constructor: it starts at 0.
        self.idft = 0

        # Max tfidf. Not computed by the constructor: it starts at 0.
        self.maxTFIDF = 0

        self.docidOffset=docidOffset
        self.frequencyOffset=frequencyOffset
        self.blockOffset=blockOffset

        self.docidSize=docidSize
        self.frequencySize=frequencySize
        self.numBlocks=numBlocks

    def to_string(self):
        """This function returns a string representation of a LexiconRow.

        Returns:
            a human readable string representation of the Lexicon Row
            (term, dft, idft, max_tf and maxTFIDF separated by spaces)
        """
        fields = (self.term, self.dft, self.idft, self.max_tf, self.maxTFIDF)
        return ' '.join(str(field) for field in fields)

    def _encoded_term(self) -> bytes:
        """Return the UTF-8 bytes of the term, fitting in MAX_TERM_LENGTH bytes.

        If the encoded term is too long it is cut on a character boundary (a
        plain byte cut could split a multi-byte character and make the row
        undecodable when read back) and re-padded with spaces.
        """
        raw = self.term.encode('utf-8')
        if len(raw) > self.MAX_TERM_LENGTH:
            # errors='ignore' drops only the incomplete trailing character.
            kept = raw[:self.MAX_TERM_LENGTH].decode('utf-8', errors='ignore')
            raw = kept.encode('utf-8').ljust(self.MAX_TERM_LENGTH, b' ')
        return raw

    def write_lexicon_row_on_disk_to_opened_file(self,file:BinaryIO,offset:int=0):
        """This function writes on a specific position of an opened file a lexicon row information.

           Args:
               file: the file to store the lexicon row
               offset: the position inside the file to store the lexicon row
           Returns:
               the new offset free position after writing on the file
        """
        file.seek(offset)

        binary_data = struct.pack(self.STR_SIZE_LEXICON_ROW,
                                  self._encoded_term(),
                                  self.dft, self.max_tf,
                                  self.idft, self.maxTFIDF,
                                  self.docidOffset, self.frequencyOffset, self.blockOffset,
                                  self.docidSize, self.frequencySize, self.numBlocks)
        file.write(binary_data)

        return self.SIZE_LEXICON_ROW+offset

    def read_lexicon_row_on_disk_from_opened_file(self,file:BinaryIO,offset:int):
        """This function reads a lexicon row information in a specific position from an opened file.

        On success every field of this instance is overwritten with the values read.

        Args:
            file: the file to read a lexicon row
            offset: the position inside the file to read the lexicon row

        Returns:
            the offset position after reading, or None if nothing could be
            read at that position (e.g. end of file)
        """
        file.seek(offset)
        raw = file.read(self.SIZE_LEXICON_ROW)

        if not raw:
            return None

        try:
            (term, dft, max_tf, idft, maxTFIDF,
             docidOffset, frequencyOffset, blockOffset,
             docidSize, frequencySize, numBlocks) = struct.unpack(self.STR_SIZE_LEXICON_ROW, raw)
        except struct.error as e:
            # A short/corrupted record is reported but not fatal: the instance
            # is left untouched and the caller still gets the next offset.
            print(f"Error unpacking data: {e}")
            return offset+self.SIZE_LEXICON_ROW

        self.term=term.decode('utf-8')
        self.dft=dft
        self.idft=idft
        self.max_tf=max_tf
        self.maxTFIDF=maxTFIDF
        self.docidOffset=docidOffset
        self.frequencyOffset=frequencyOffset
        self.blockOffset=blockOffset
        self.docidSize=docidSize
        self.frequencySize=frequencySize
        self.numBlocks=numBlocks

        return offset+self.SIZE_LEXICON_ROW

    #USED FOR DEBUGGING

    def write_lexicon_row_on_disk(self,file_path:str,offset:int=0):
        """This function opens a file and writes on a specific position a lexicon row information.
            This is used for debug and tests.

            NOTE(review): the file is opened in append mode ('ab'); on some
            platforms append-mode writes always go to the end of the file
            regardless of the seek position, so `offset` may not be honoured
            - confirm against the callers before relying on it.

            Args:
               file_path: the file to store the lexicon row
               offset: the position inside the file to store the lexicon row
            Returns:
                the new offset free position after writing

        """
        with open(file_path, 'ab') as file:
            return self.write_lexicon_row_on_disk_to_opened_file(file,offset)

    def read_lexicon_row_on_disk(self,file_path:str,offset:int):
        """This function opens a file and reads in a specific position a lexicon row information.
            This is used for debug and tests.

            Args:
               file_path: the file to read a lexicon row
               offset: the position inside the file to read the lexicon row
            Returns:
                the offset position after reading, or None if nothing could be read
        """
        with open(file_path, 'rb') as file:
            return self.read_lexicon_row_on_disk_from_opened_file(file,offset)

    @staticmethod
    def compute_avgDL(doc_index: "doc_index.DocumentIndex") -> float:
        """
        Compute the average document length based on the document index (doc_index).

        Args:
            doc_index: An object exposing `total_document_length` and
                `number_of_documents` (a DocumentIndex).

        Returns:
            total_document_length / number_of_documents, or 0.0 when there are no documents.

        Raises:
            ValueError: if doc_index does not expose the two attributes above.
        """
        # The parameter name shadows the module alias `doc_index`, so an
        # isinstance check against `doc_index.DocumentIndex` cannot be written
        # here; validate the attributes that are actually used instead.
        try:
            total_length = doc_index.total_document_length
            documents = doc_index.number_of_documents
        except AttributeError as err:
            raise ValueError("Invalid parameters.") from err

        if documents == 0:
            return 0.0

        return total_length / documents

    @staticmethod
    def compute_IDFT(number_of_documents: int, dft:int) -> float:
        """
        Compute the inverse document frequency for a term as the natural
        logarithm of number_of_documents / dft.

        Args:
            number_of_documents: The number of documents in the collection.
            dft: The document frequency of the term.

        Returns:
            The inverse document frequency, or 0.0 when either argument is not positive.

        Raises:
            ValueError: if either argument is not an integer.
        """
        if not isinstance(number_of_documents, int) or not isinstance(dft, int):
            raise ValueError("Invalid parameters.")

        # Zero must be rejected too: dft == 0 would divide by zero and
        # number_of_documents == 0 would ask for log(0).
        if dft <= 0 or number_of_documents <= 0:
            return 0.0

        return math.log(number_of_documents/dft)

    @staticmethod
    def compute_TFIDF(tf: int, idf: float) -> float:
        """
        Compute the TF-IDF value based on the term frequency (tf) and inverse document frequency (idf).

        Args:
            tf: An integer representing the term frequency.
            idf: A number representing the inverse document frequency.

        Returns:
            (1 + ln(tf)) * idf, or 0.0 when tf is not positive.

        Raises:
            ValueError: if tf is not an integer or idf is not a number.
        """
        if not isinstance(tf, int) or not isinstance(idf, (int, float)):
            raise ValueError("Invalid parameters.")

        # log(0) is undefined, so tf == 0 is handled like a negative tf.
        if tf <= 0:
            return 0.0

        return (1 + math.log(tf)) * idf

    @staticmethod
    def compute_max_term_frequency(postings_list: str) -> int:
        """
        Given a postings list of a term, compute the maximum term frequency.

        Args:
            postings_list: A string of whitespace-separated elements with
                colon-separated values; the integer after the first colon of
                each element is the value compared.

        Returns:
            The largest value found, or 0 if the postings list has no elements.

        Raises:
            ValueError: if postings_list is not a string.
        """
        if not isinstance(postings_list, str):
            raise ValueError("Invalid postings list.")

        # default=0 covers both the empty and the whitespace-only string.
        return max((int(item.split(':')[1]) for item in postings_list.split()), default=0)

    