In [1]:
import os
import shutil
import struct

import math
from collections import defaultdict, Counter
from typing import List, TextIO, BinaryIO

import import_ipynb
import sys
sys.path.append('../')  # Go up two folders to the project root

import structures.LexiconRow as lex_row
import structures.DocumentIndex as doc_index
import utilities.General_Utilities as util
import building_data_structures.CollectionStatistics as coll_stat

importing Jupyter notebook from C:\Users\gabri\Documents\GitHub\structures\..\structures\LexiconRow.ipynb
importing Jupyter notebook from C:\Users\gabri\Documents\GitHub\structures\..\structures\DocumentIndex.ipynb
importing Jupyter notebook from C:\Users\gabri\Documents\GitHub\structures\..\utilities\General_Utilities.ipynb
importing Jupyter notebook from C:\Users\gabri\Documents\GitHub\structures\..\structures\DocumentIndexRow.ipynb
importing Jupyter notebook from C:\Users\gabri\Documents\GitHub\structures\..\building_data_structures\CollectionStatistics.ipynb


In [4]:
'''
def create_lexicon(file_input_path: str, file_output_path: str, DIR_FOLDER: str, file_extension: str, block_size: int, document_index: doc_index.DocumentIndex) -> int:
    """
    Function returns a file with one row for each distinct term in the corpus. Rows are composed by:
    term, document frequency, inverse document frequency, term upper bound
    Each values is separated by a comma.

    Args:
        file_input_path: file that contains the inverted index
        file_output_path: file that will contains the result
        DIR_FOLDER: folder that will contains the output file
        file_extension: extension of the file
        block_size: dimension of rows in main memory
    """
    # Check if the input file path exists and is a file
    if not file_input_path or not os.path.exists(file_input_path) or not os.path.isfile(file_input_path):
        raise ValueError("Invalid file_input_path.")

    # Check if the output folder path exists
    if not file_output_path:
        raise ValueError("Invalid file_output_path.")

    # Check if DIR_FOLDER is a non-empty string
    if not DIR_FOLDER or not isinstance(DIR_FOLDER, str):
        raise ValueError("Invalid DIR_FOLDER.")

    # Check if the file extension is a non-empty string
    if not file_extension or not isinstance(file_extension, str):
        raise ValueError("Invalid file_extension.")

    # Check that block_size is a positive integer
    if not isinstance(block_size, int) or block_size <= 0:
        raise ValueError("Invalid block_size. Must be a positive integer.")

    # Check that document_index is an instance of DocumentIndex
    if not document_index or not isinstance(document_index, doc_index.DocumentIndex):
        raise ValueError("Invalid document_index. Must be an instance of DocumentIndex.")
        
    try:
        lexicon = Lexicon()
        create_folder(DIR_FOLDER)
        nr_block = 0
        if os.path.exists(DIR_FOLDER + file_output_path + str(nr_block) + file_extension):
                os.remove(DIR_FOLDER + file_output_path + str(nr_block) + file_extension)
            
        with open(file_input_path, 'r') as file:
            for line in file:
                # term sarà qualcosa tipo "ciao", invece la postings list sarà 3:2 3:3 ecc
                elements = line.split()
                term = elements[0]          
                postings_list = ' '.join(elements[1:])
                
                # il dft si trova facendo la split su spazi e punti e virgola di tutta la posting list
                dft = len(postings_list.split())

                # la term frequency massima
                max_tf = compute_max_term_frequency(postings_list)

                if (sys.getsizeof(lexicon.get_structure()) > block_size):  #Free memory available
                    write_to_block(DIR_FOLDER + file_output_path + str(nr_block) + file_extension, lexicon.get_structure())
                    lexicon.clear_structure()
                    nr_block=nr_block + 1 

                lexicon.add_term(term, dft, document_index, max_tf)

            #Finally, saving the last remaing block.       
            if (not lexicon.is_empty()):   
                write_to_block(DIR_FOLDER + file_output_path + str(nr_block) + file_extension, lexicon.get_structure())

            return 0                
    except IOError as e:
        print(f"Error reading from {file_input_path}: {e}")
        return -1
'''
     

class Lexicon(util.Singleton):
    def __init__(self):
        self._vocabulary = defaultdict(lex_row.LexiconRow) # oppure "dictionary"??

    def add_term(self, term: str, dft: int, document_index: doc_index.DocumentIndex, maxTf: int) -> None:
        """Adds a document to the lexicon."""
        if not isinstance(term, str) or not isinstance(dft, int) or not isinstance(document_index, doc_index.DocumentIndex) or not isinstance(maxTf, int):
            raise ValueError("There's an error in parameter's type.")
            
        # Append new row to the lexicon
        if (self.get_terms(term)==None):
            self._vocabulary[term]=[]
        self._vocabulary[term] = lex_row.LexiconRow(term, dft, document_index, maxTf)
             
    def get_terms(self, term: str) -> lex_row.LexiconRow:
        """Fetches a row to the lexicon"""
        if not isinstance(term, str):
            raise ValueError("Term must be a string.")
            
        if (term in self._vocabulary):
            return self._vocabulary[term]
        return None
    
    def is_empty(self)->bool:
        """Check if there is no term in the lexicon."""
        return len(self.get_term())==0
    
    def get_term(self) -> List[str]:
        """Returns all unique terms in the lexicon."""
        return self._vocabulary.keys() 
    
    def clear_structure(self):
        """ It clears the lexicon data structure."""
        self._vocabulary.clear()
    
    def get_structure(self):
        """Returns the lexicon data structure."""
        return self._vocabulary 

    def find_entry(self,term: str) -> lex_row.LexiconRow:
        """Perform binary search to find a lexicon entry for a given term.

        Args:
            term: The term to search for in the lexicon.
    
        Returns:
            The LexiconRow object if the term is found, otherwise None.
        """
        entry = lex_row.LexiconRow("",0)  
        start = 0 
        
        # "end" is equal (at the beginning) to the total number of distinct terms in the lexicon
        collectionStatistics = coll_stat.Collection_statistics("Collection_statistics.txt")
        collectionStatistics.read_statistics()
        end = collectionStatistics.num_distinct_terms - 1  
    
        while start <= end:
            mid = start + (end - start) // 2
    
            # Get entry from disk

            # Forse bisogna mettere come attributo della classe lexicon un "lexicon_path" (dopo averlo costruito) con il percorso del file finale?
            # In questo modo qui basterebbe passare self.lexicon_path invece di "LEXICON/lexicon.bin"
            lexicon_path = os.path.join("..", "building_data_structures", "Lexicon", "lexicon.bin")
            with open(lexicon_path, 'rb') as file:
                entry.read_lexicon_row_on_disk_from_opened_file(file, mid * entry.SIZE_LEXICON_ROW)
            key = entry.term.strip()
            print(key)
            print(mid)
            
            # Check if the search was successful
            if key == term:
                return entry
    
            # Update search portion parameters
            if term > key:
                start = mid + 1
            else:
                end = mid - 1
    
        return None

    def get_entry(self, term: str) -> lex_row.LexiconRow:
        entry = self.get_terms(term) # check if term is in cache
        if entry is not None:
            return entry
            
        entry = find_entry(term)
        
        if entry is not None:         # add to cache
            self.add_term(term)
        
        return entry

In [5]:
#lexicon = Lexicon()
#print(lexicon.find_entry("happiness"))

like
104
early
51
have
77
for
64
golden
70
great
73
has
75
happiness
74
<structures.LexiconRow.LexiconRow object at 0x0000029891FBC1D0>
