In [1]:
import sys
import shutil
import os
import math

from typing import List, Dict, Tuple, Union, Any, Callable
from typing import TextIO, BinaryIO
from collections import Counter, defaultdict,OrderedDict


import import_ipynb
sys.path.append('../')  # Go up one folder, to the project root, so that the sibling packages can be imported

from structures.InvertedIndex import Posting,InvertedIndex
from structures.Lexicon import Lexicon
from structures.LexiconRow import LexiconRow
from structures.DocumentIndex import DocumentIndex
from structures.DocumentIndexRow import DocumentIndexRow
from structures.BlockDescriptor import BlockDescriptorBuilder,BlockDescriptor

importing Jupyter notebook from C:\Users\Davide\IR\Progetto\building_data_structures\..\structures\InvertedIndex.ipynb
importing Jupyter notebook from C:\Users\Davide\IR\Progetto\building_data_structures\..\structures\Lexicon.ipynb
importing Jupyter notebook from C:\Users\Davide\IR\Progetto\building_data_structures\..\structures\LexiconRow.ipynb
importing Jupyter notebook from C:\Users\Davide\IR\Progetto\building_data_structures\..\structures\DocumentIndex.ipynb
importing Jupyter notebook from C:\Users\Davide\IR\Progetto\building_data_structures\..\utilities\General_Utilities.ipynb
importing Jupyter notebook from C:\Users\Davide\IR\Progetto\building_data_structures\..\structures\DocumentIndexRow.ipynb
importing Jupyter notebook from C:\Users\Davide\IR\Progetto\building_data_structures\..\utilities\Compression.ipynb
importing Jupyter notebook from C:\Users\Davide\IR\Progetto\building_data_structures\..\structures\BlockDescriptor.ipynb


In [2]:
# Constants

# Tags that distinguish the two kinds of posting data.
TYPE_DOC_ID="type_doc_id"
TYPE_FREQ="type_freq"

# Folders holding the temporary blocks written during the SPIMI phase.
DIR_TEMP_FOLDER="TEMP"
DIR_TEMP_DOC_ID="DOC_ID_TEMP"
DIR_TEMP_FREQ="FREQ_TEMP"
DIR_TEMP_LEXICON="LEXICON_TEMP"

# Folders holding the final data structures.
DIR_LEXICON="LEXICON"
DIR_INVERTED_INDEX="INV_INDEX"
# DIR_DOC_INDEX used to be assigned twice ("DOCUMENT_INDEX", then "Document_index");
# only the second assignment was ever effective, so it is the one kept here.
DIR_DOC_INDEX="Document_index"

# File names of the final data structures.
PATH_FINAL_LEXICON="lexicon.bin"
PATH_FINAL_DOC_IDS="doc_ids.bin"
PATH_FINAL_FREQ="freq.bin"
PATH_FINAL_BLOCK_DESCRIPTOR="block_descriptors.bin"

# Human-readable dump of the inverted index, written only in debug mode.
PATH_FINAL_INVERTED_INDEX_DEBUG="inverted_index.txt"


In [3]:
class IndexBuilder:

    debug_mode:bool
    compression_mode:bool
    
    
    #For writing the final result
    file_Final_Lexicon:BinaryIO
    file_Final_DocIds:BinaryIO
    file_Final_Freq:BinaryIO
        
    file_Final_Block_Descriptor:BinaryIO
        
    file_Final_InvertedIndex_Debug:TextIO
        
    #For merging operation
    input_lex_temp_files:List[BinaryIO]
    input_doc_id_temp_files:List[BinaryIO]
    input_freq_temp_files:List[BinaryIO]
        
    b_d_b:BlockDescriptorBuilder     
        
    def __init__(self,debug_mode:bool, compression_mode:bool)->None:
        """
        Set up a new IndexBuilder and report the selected options on standard output.

        Args:
            debug_mode: when true, human-readable debug files are written next to the binary ones.
            compression_mode: when true, posting lists are saved in compressed form.
        """
        print("Index Builder Costructor")

        # Behaviour switches chosen by the caller.
        self.debug_mode, self.compression_mode = debug_mode, compression_mode

        # Handles of the temporary block files; they are filled when the merge phase opens them.
        self.input_lex_temp_files, self.input_doc_id_temp_files, self.input_freq_temp_files = [], [], []

        # The builder is created with a placeholder path in this version.
        self.b_d_b = BlockDescriptorBuilder("PATH_DA_DEFINIRE")

        print("Using: ")
        print(f"Debug Mode :{debug_mode!s}")
        print(f"Compression Mode :{compression_mode!s}")
        print("\n\n")
              
        
        
    def build_in_memory_index(self,list_of_documents:list)->InvertedIndex:
        """Build a whole Inverted Index in main memory (RAM) from a list of documents and return it.

           !! THIS METHOD IS NOT USED TO BUILD THE REAL INDEX !!

           It is a shortcut for the test phase, to check quickly whether the produced output is correct.

        Args:
            list_of_documents: strings, each made of a numeric doc id followed by the document text.

        Returns:
            The in-memory InvertedIndex with one posting added per distinct term of each document.
        """
        in_memory_index = InvertedIndex()

        for raw_document in list_of_documents:
            tokens = raw_document.split()
            document_id = int(tokens[0])
            # Term counts of the lower-cased text; the already pre-processed content should be used here directly.
            term_counts = Counter(' '.join(tokens[1:]).lower().split())
            for term in term_counts:
                in_memory_index.add_posting(term, document_id, term_counts[term])

        return in_memory_index
        
    
    def init_spimi(self)->None:
        """ Prepare a clean environment for the SPIMI phase.

            The temporary folder (with its doc-id, frequency and lexicon sub-folders) and the
            document index folder are deleted if they already exist and then created again empty.
        """
        # Temporary area: drop whatever a previous run left behind, then rebuild the folder tree.
        if os.path.exists(DIR_TEMP_FOLDER):
            shutil.rmtree(DIR_TEMP_FOLDER)

        os.makedirs(DIR_TEMP_FOLDER)
        for sub_folder in (DIR_TEMP_DOC_ID, DIR_TEMP_FREQ, DIR_TEMP_LEXICON):
            os.makedirs(DIR_TEMP_FOLDER+"/"+sub_folder)

        # Document index folder: same treatment.
        if os.path.exists(DIR_DOC_INDEX):
            shutil.rmtree(DIR_DOC_INDEX)

        os.makedirs(DIR_DOC_INDEX)
        
        
    def init_index_merging(self)->None:
        """ Prepare a clean environment for building the final data structures.

            The lexicon and inverted index folders are deleted if present and created again empty;
            in debug mode the previous human-readable inverted index dump is removed as well.
        """
        output_folders = (DIR_LEXICON, DIR_INVERTED_INDEX)

        # First remove every stale folder, then create them all again.
        for folder in output_folders:
            if os.path.exists(folder):
                shutil.rmtree(folder)

        for folder in output_folders:
            os.makedirs(folder)

        if self.debug_mode and os.path.exists(PATH_FINAL_INVERTED_INDEX_DEBUG):
            os.remove(PATH_FINAL_INVERTED_INDEX_DEBUG)
    
    
    def __open_files_for_merging_operation(self)->None:
        """ Open every file needed by the merging operation.

            All temporary lexicon, doc-id and frequency blocks are opened for binary reading and kept
            in three lists that are used in parallel: position i of each list must refer to the same
            block. The final lexicon, doc-id, frequency and block descriptor files are opened for
            binary appending and, in debug mode, so is the human-readable inverted index dump.

            Raises:
                OSError: if a folder cannot be listed or a file cannot be opened. The handles opened
                    up to that point stay registered on the instance, so they can still be closed.
        """

        def sorted_block_paths(sub_folder):
            """Return the paths of the block files inside a temporary sub-folder, in block order."""
            folder = DIR_TEMP_FOLDER+"/"+sub_folder
            # os.listdir() gives no ordering guarantee: sort every listing with the same key so the
            # three lists stay aligned. Sorting by (length, name) puts "block_nr_2" before "block_nr_10".
            names = sorted(os.listdir(folder), key=lambda name: (len(name), name))
            return [folder+"/"+name for name in names]

        file_lex_temp_paths = sorted_block_paths(DIR_TEMP_LEXICON)
        file_doc_id_temp_paths = sorted_block_paths(DIR_TEMP_DOC_ID)
        file_freq_temp_paths = sorted_block_paths(DIR_TEMP_FREQ)

        # Register each handle as soon as it is opened (all blocks are open in parallel), so that
        # nothing is lost if a later open() fails.
        self.input_lex_temp_files = []
        self.input_doc_id_temp_files = []
        self.input_freq_temp_files = []
        for handles, paths in ((self.input_lex_temp_files, file_lex_temp_paths),
                               (self.input_doc_id_temp_files, file_doc_id_temp_paths),
                               (self.input_freq_temp_files, file_freq_temp_paths)):
            for path in paths:
                handles.append(open(path, 'rb'))

        self.file_Final_Lexicon=open(DIR_LEXICON+"/"+PATH_FINAL_LEXICON, 'ab')
        self.file_Final_DocIds=open(DIR_INVERTED_INDEX+"/"+PATH_FINAL_DOC_IDS, 'ab')
        self.file_Final_Freq=open(DIR_INVERTED_INDEX+"/"+PATH_FINAL_FREQ, 'ab')
        self.file_Final_Block_Descriptor=open(DIR_INVERTED_INDEX+"/"+PATH_FINAL_BLOCK_DESCRIPTOR, 'ab')

        if (self.debug_mode):
            self.file_Final_InvertedIndex_Debug=open(PATH_FINAL_INVERTED_INDEX_DEBUG,'a')
        
        
    def __close_files_for_merging_operation(self)->None:
        """ This function is used to close all the opened resource needed for the creation of the data structures
            in merging operation.
        """
        
        for file in self.input_lex_temp_files:
            file.close()  

        for file in self.input_doc_id_temp_files:
            file.close()  

        for file in self.input_freq_temp_files:
            file.close()  

        self.file_Final_Lexicon.close()
        self.file_Final_DocIds.close()
        self.file_Final_Freq.close()
        self.file_Final_Block_Descriptor.close()

        if (self.debug_mode):
            self.file_Final_InvertedIndex_Debug.close()
        
    def __check_all_blocks_are_read(self,offset_lexicon_terms:List):
        """ This functions checks if the all the blocks opened in parallel are read or not.
            The condition is matched when the list contains all None elements. 
        Args:
            offset_lexicon_terms: a list of offset
        
        Returns:
            True if the list contains all None elements.
        """
        #print (offset_lexicon_terms)
        return sum(1 if element is None else 0 for element in offset_lexicon_terms) == len(offset_lexicon_terms)
    
    def __find_min_term(self,lexicon_temp_terms:List,offset_lex_temp:List):
        """ This function checks and returns the minimum term (lexicographically) among blocks 
             at the current reading offset.
             If a offset_lex_temp[i] contains None means that the i block is completely read.
             
         Args:
             lexicon_temp_terms: the list of current lexicon element (each position is a different block)
             offset_lexicon_terms: the list of current lexicon element position inside the file (each position is a different block)
         Return:
             a string representing the current min term or None if all blocks are read
         
        """
    
        if not lexicon_temp_terms:
            return None  # Return None for an empty list
    
        min_term=None

        for index,lex_elem in enumerate(lexicon_temp_terms):
            if(offset_lex_temp[index]!=None):
                if (min_term==None):
                    min_term=lexicon_temp_terms[index].term

                if (lex_elem.term<min_term):
                    min_term=lex_elem.term
    
        return min_term
    
    
    def __save_postings_and_block_descriptor(self,new_term:bool,
                                             min_term:str,
                                             merged_posting_list:List[Posting],
                                             current_offset_doc_ids:int,current_offset_freq:int,
                                             current_offset_block_descriptor:int,
                                             block_descriptor:BlockDescriptor):
        """
        Store a posting list in the final doc-id and frequency files and append the block
        descriptor that describes it. It exists to avoid repeating the same steps at every
        point of the merge where a group of postings is flushed.

        Args:
            new_term: forwarded to the debug writer; it marks the first group written for a term.
            min_term: the term the postings belong to (used only by the debug writer).
            merged_posting_list: the postings to store; it must contain at least one element.
            current_offset_doc_ids: offset at which the doc ids are written.
            current_offset_freq: offset at which the frequencies are written.
            current_offset_block_descriptor: offset at which the block descriptor is written.
            block_descriptor: descriptor updated in place with posting count, byte sizes and doc-id range.

        Returns:
            A tuple (doc ids offset, frequencies offset, block descriptor offset) with the
            offsets returned by the write calls.
        """
        # Remember where this group starts: the byte sizes are the distance covered by the write.
        doc_ids_start, freq_start = current_offset_doc_ids, current_offset_freq

        # Write the postings on disk.
        current_offset_doc_ids, current_offset_freq = InvertedIndex.write_to_files_a_posting_list(
            merged_posting_list,
            self.compression_mode,
            self.file_Final_DocIds,
            self.file_Final_Freq,
            doc_ids_start,
            freq_start)

        block_descriptor.nr_postings += len(merged_posting_list)
        block_descriptor.doc_ids_bytes_size = current_offset_doc_ids - doc_ids_start
        block_descriptor.freq_bytes_size = current_offset_freq - freq_start

        # The doc-id range is taken from the first and last posting of the list.
        block_descriptor.min_doc_id = merged_posting_list[0].doc_id
        block_descriptor.max_doc_id = merged_posting_list[-1].doc_id

        # Write the block descriptor.
        current_offset_block_descriptor = block_descriptor.write_block_descriptor_on_disk_to_opened_file(
            self.file_Final_Block_Descriptor,
            current_offset_block_descriptor)

        if self.debug_mode:
            InvertedIndex.write_to_file_a_posting_list_debug_mode(
                self.file_Final_InvertedIndex_Debug, min_term, merged_posting_list, new_term)

        return current_offset_doc_ids, current_offset_freq, current_offset_block_descriptor
        
    
    
    def single_pass_in_memory_indexing(self,list_of_documents:list,inv_index_block_size: int=2200,doc_index_block_size: int=2200,debug_mode:bool=False)-> None:
        """
        Run the SPIMI phase: read the documents once and write the inverted index to disk in
        temporary blocks, emptying the in-memory structure each time a block is written.

        Args:
            list_of_documents: strings, each made of a numeric doc id followed by the document text.
            inv_index_block_size: threshold compared with sys.getsizeof() of the in-memory index
                structure; above it the current block is written to disk.
            doc_index_block_size: same kind of threshold for the in-memory document index.
            debug_mode: accepted for compatibility but not read by this method, which looks at
                self.debug_mode instead.

        NOTE(review): the document index is written to disk only in debug mode, yet it is cleared
        whenever the threshold is exceeded - confirm this is intended.
        """
        ind = InvertedIndex()
        document_index = DocumentIndex()

        self.init_spimi()

        def dump_block(block_nr:int)->None:
            """Write the in-memory index as temporary block number block_nr (plus its debug dump)."""
            block_name = "/block_nr_"+str(block_nr)
            ind.write_to_block_all_index_in_memory(DIR_TEMP_FOLDER+"/"+DIR_TEMP_LEXICON+block_name,
                                                   DIR_TEMP_FOLDER+"/"+DIR_TEMP_DOC_ID+block_name,
                                                   DIR_TEMP_FOLDER+"/"+DIR_TEMP_FREQ+block_name)
            if self.debug_mode:
                ind.write_to_block_debug_mode(DIR_TEMP_FOLDER+"/inv_index_"+str(block_nr)+".txt")

        def dump_document_index_if_debugging()->None:
            """Write the in-memory document index to its text file, only in debug mode."""
            if self.debug_mode:
                document_index.write_document_index_to_file(DIR_DOC_INDEX+"/document_index.txt", document_index.get_structure())

        nr_block = 0

        for doc in list_of_documents:
            # Split the doc id from the content of the real document.
            tokens = doc.split()
            doc_id = int(tokens[0])
            text = ' '.join(tokens[1:])

            if sys.getsizeof(document_index.get_structure()) > doc_index_block_size:
                dump_document_index_if_debugging()
                document_index.clear_structure()

            document_index.add_document(doc_id, text)

            # Term counts of the document; the content is expected to be already pre-processed.
            for term, freq in Counter(text.lower().split()).items():
                if sys.getsizeof(ind.get_structure()) > inv_index_block_size:
                    # The in-memory structure is over the threshold: flush it and start a new block.
                    dump_block(nr_block)
                    ind.clear_structure()
                    nr_block += 1

                ind.add_posting(term, doc_id, freq)

        if not document_index.is_empty():
            dump_document_index_if_debugging()

        # Finally, save the last remaining block.
        if not ind.is_empty():
            dump_block(nr_block)
        
    
    
        
    def index_merging(self)-> None:
        """
        Merge the temporary SPIMI blocks into the final lexicon, posting files and block descriptors.

        Terms are processed in increasing order: at each step the smallest term among the lexicon
        rows currently read from the open blocks is selected, the postings of every block sitting
        on that term are read in chunks and merged, and they are written out in groups of at most
        `nr_of_postings_per_block_descriptor` postings, each group with its own block descriptor.
        One lexicon row per term is then appended to the final lexicon; its numBlocks field holds
        the number of block descriptors actually written for the term.

        The output folders are recreated at the start. Every opened file is closed at the end,
        also when an exception interrupts the merge (the exception still propagates).
        """
        self.init_index_merging()

        try:

            self.__open_files_for_merging_operation()

            #Initialization of empty lexicon row elements for each block.
            lexicon_temp_elems=[LexiconRow("",0) for _ in self.input_lex_temp_files]

            #Start reading the first element in the lexicon of each block and saving the offset of each read.
            offset_lex_temp=[row.read_lexicon_row_on_disk_from_opened_file(self.input_lex_temp_files[index],0) for index,row in enumerate(lexicon_temp_elems)]

            current_offset_lexicon=0
            current_offset_doc_ids=0
            current_offset_freq=0
            current_offset_block_descriptor=0

            #Placeholder value, to be replaced by one derived from the block descriptor builder.
            nr_of_postings_per_block_descriptor=3

            #Go on until every opened block is completely read.
            while (not self.__check_all_blocks_are_read(offset_lex_temp)):

                #Find the minimum term among the opened blocks.
                min_term=self.__find_min_term(lexicon_temp_elems,offset_lex_temp)

                #Blocks contributing to this term: they must still have an unread row (an exhausted
                #block keeps its last row in memory and must not be counted again) and sit on min_term.
                matching_blocks=[index for index,lex_elem in enumerate(lexicon_temp_elems)
                                 if offset_lex_temp[index] is not None and lex_elem.term==min_term]
                tot_posting=sum(lexicon_temp_elems[index].dft for index in matching_blocks)

                #True until the first group of postings of this term is written; the debug writer uses it to start a new line.
                new_term=True

                print("\n")
                print("Min termine corrente: "+min_term+ " nr. postings: "+str(tot_posting))

                #New term to add definitively.
                new_Lexicon_Def=LexiconRow(min_term,tot_posting)
                new_Lexicon_Def.docidOffset=current_offset_doc_ids
                new_Lexicon_Def.frequencyOffset=current_offset_freq
                new_Lexicon_Def.blockOffset=current_offset_block_descriptor

                #Number of block descriptors written for this term, stored in the lexicon row at the end.
                written_block_descriptors=0

                #Initialization of empty block descriptor.
                block_descriptor=BlockDescriptor(0,current_offset_doc_ids,current_offset_freq,0,0,0,0)

                merged_posting_list=[]
                #Number of postings that can still be loaded before the current block descriptor is full.
                readable_postings=nr_of_postings_per_block_descriptor-len(merged_posting_list)

                print("INIZIO: readable_postings:"+str(readable_postings))

                for index in matching_blocks:
                    lex_term=lexicon_temp_elems[index]
                    print("Blocco: index:"+str(index))

                    #Nr of postings of this term in this block file.
                    posting_to_be_read=lex_term.dft

                    doc_id_block_offset=lex_term.docidOffset
                    freq_block_offset=lex_term.frequencyOffset

                    #There are still some postings related to this min_term to be read in this block.
                    while(posting_to_be_read>0):
                        print("posting to be read: "+str(posting_to_be_read))
                        if (posting_to_be_read<readable_postings):
                            #The rest of the posting list of the term in this block fits in one read.
                            print("posting_to_be_read<readable_postings ")
                            nr_eff_readable=posting_to_be_read
                            posting_to_be_read=0
                        else:
                            #Read only what still fits before the block descriptor is full.
                            print("posting_to_be_read>=readable_postings ")
                            nr_eff_readable=readable_postings
                            print("nr_eff_readable: "+str(nr_eff_readable))
                            posting_to_be_read-=readable_postings

                        postingList,doc_id_block_offset,freq_block_offset=InvertedIndex.read_from_files_a_posting_list(
                                                                            self.input_doc_id_temp_files[index],self.input_freq_temp_files[index],
                                                                            False,
                                                                            doc_id_block_offset,freq_block_offset,
                                                                            nr_eff_readable)
                        print(postingList)
                        #Combine the postings just read with those read previously.
                        merged_posting_list=InvertedIndex.merge_posting_lists(merged_posting_list,postingList)

                        readable_postings=nr_of_postings_per_block_descriptor-len(merged_posting_list)
                        print("readable_postings:"+str(readable_postings))

                        if(readable_postings==0):

                            #Write the complete group to disk (with possible compression) and the related block descriptor.
                            current_offset_doc_ids,current_offset_freq,current_offset_block_descriptor= self.__save_postings_and_block_descriptor(new_term,min_term,
                                                                      merged_posting_list,
                                                                      current_offset_doc_ids,current_offset_freq,
                                                                      current_offset_block_descriptor,
                                                                      block_descriptor)
                            written_block_descriptors+=1
                            #From now on the groups written for this term are continuations.
                            new_term=False
                            #Re-set the data structures for a new block descriptor.
                            merged_posting_list.clear()
                            block_descriptor=BlockDescriptor(0,current_offset_doc_ids,current_offset_freq,0,0,0,0)
                            readable_postings=nr_of_postings_per_block_descriptor

                    #Read the next lexicon term related to this block file.
                    offset_lex_temp[index]=lex_term.read_lexicon_row_on_disk_from_opened_file(self.input_lex_temp_files[index],offset_lex_temp[index])

                if (len(merged_posting_list)>0):
                    print("FINE SCRITTURA ULTIMA MERGED LIST e ultimo descrittore blocco.")

                    #Write the remaining postings to disk and the related block descriptor.
                    current_offset_doc_ids,current_offset_freq,current_offset_block_descriptor= self.__save_postings_and_block_descriptor(new_term,min_term,merged_posting_list,
                                                                          current_offset_doc_ids,current_offset_freq,
                                                                          current_offset_block_descriptor,
                                                                          block_descriptor)
                    written_block_descriptors+=1

                #TODO: the final metrics for query processing (and any other information useful for
                #query execution, e.g. for skipping) are meant to be computed at this point.

                new_Lexicon_Def.numBlocks=written_block_descriptors
                new_Lexicon_Def.docidSize=(current_offset_doc_ids-new_Lexicon_Def.docidOffset)
                new_Lexicon_Def.frequencySize=(current_offset_freq-new_Lexicon_Def.frequencyOffset)

                #Save all the information related to the term just elaborated to disk.
                current_offset_lexicon=new_Lexicon_Def.write_lexicon_row_on_disk_to_opened_file(self.file_Final_Lexicon,current_offset_lexicon)

            print("END METHOD!")

        finally:
            #Be sure to close all the files opened in parallel.
            self.__close_files_for_merging_operation()
      

# Example of usage

In [4]:
# Small sample collection used to exercise the index builder.
# Each entry is one document: a numeric doc id followed by the document text.
tot_doc=[
    "0     The pen is on the table",
    "1     The day is very sunny",
    "2     Goodmoring new article",
    "3     A cat is faster then a dog",
    "4     How are you",
    "5     A boy is a man with low age",
    "6     Lake Ontario is one of the biggest lake in the world",
    "7     English is worst than Italian",
    "8     Spiderman is the best superhero in Marvel universe",
    "9     Last night I saw a Netflix series",
    "10    A penny for your thoughts",
    "11    Actions speak louder than words",
    "12    All that glitters is not gold",
    "13    Beauty is in the eye of the beholder",
    "14    Birds of a feather flock together",
    "15    Cleanliness is next to godliness",
    "16    Don't count your chickens before they hatch",
    "17    Every people cloud has a silver lining people",
    "18    Fool me once shame on you fool me twice shame on me",
    "19    Honesty is the best policy.",
    "20    If the shoe fits, wear it",
    "21    It's a piece of cake",
    "22    Jump on the bandwagon",
    "23    Keep your chin up",
    "24    Let the cat out of the bag",
    "25    Make a long story short",
    "26    Necessity is the mother of invention",
    "27    Once in a blue moon",
    "28    Practice makes perfect",
    "29    Read between the lines",
    "30    The early bird catches people the worm",
    "31    The pen is mightier than the sword",
    "32    There's no smoke without fire",
    "33    To each his own",
    "34    Two heads are better than one",
    "35    You can't have your cake and eat it too",
    "36    A watched pot never boils",
    "37    Beggars can't be choosers",
    "38    Better late than never",
    "39    Calm before the storm",
    "40    Curiosity killed the cat",
    "41    Every dog has its day",
    "42    Great minds think alike",
    "43    Hope for the best prepare for the worst",
    "44    Ignorance is bliss.",
    "45    It's the last straw that breaks the camel's back",
    "46    Laugh and the world laughs with you weep and you weep alone",
    "47    Money can't buy happiness",
    "48    No news is good news",
    "49    Out of sight out of mind",
    "50    People who live in glass houses shouldn't throw stones",
    "51    Rome wasn't built in a day",
    "52    Silence is golden",
    "53    The apple doesn't fall far from the tree",
    "54    The more, the merrier",
    "55    There's no place like home",
    "56    Two wrongs don't make a right",
    "57    When in Rome do as the Romans do",
    "58    You reap what you sow",
    "59    People people people"
]


#indexBuilder=IndexBuilder()
#indexBuilder.build_block_sort_base_indexing(tot_doc,"complete_inverted_index",2220,False,False)

In [8]:
# Build the index for the sample collection with debug output and without compression.
indexBuilder=IndexBuilder(debug_mode=True,compression_mode=False)
#invIndex=indexBuilder.build_in_memory_index(tot_doc)

# The SPIMI phase must run before the merge: index_merging() reads the temporary blocks
# from the TEMP folder, so on a clean workspace the merge alone fails.
indexBuilder.single_pass_in_memory_indexing(tot_doc,2220,2220)
indexBuilder.index_merging()

Index Builder Costructor
Using: 
Debug Mode :True
Compression Mode :False





Min termine corrente: a                              nr. postings: 12
INIZIO: readable_postings:3
Blocco: index:0
posting to be read: 2
posting_to_be_read<readable_postings 
[Posting(doc_id=3, frequency=2), Posting(doc_id=5, frequency=2)]
readable_postings:1
Blocco: index:1
posting to be read: 3
posting_to_be_read>=readable_postings 
nr_eff_readable: 1
[Posting(doc_id=9, frequency=1)]
readable_postings:0
COMPRESSION_MODE:False
posting to be read: 2
posting_to_be_read<readable_postings 
[Posting(doc_id=9, frequency=1), Posting(doc_id=10, frequency=1)]
readable_postings:1
Blocco: index:2
posting to be read: 4
posting_to_be_read>=readable_postings 
nr_eff_readable: 1
[Posting(doc_id=17, frequency=1)]
readable_postings:0
COMPRESSION_MODE:False
posting to be read: 3
posting_to_be_read>=readable_postings 
nr_eff_readable: 3
[Posting(doc_id=17, frequency=1), Posting(doc_id=21, frequency=1), Posting(doc_id=25, frequ

In [24]:
b=BlockDescriptor()

# Read into `b` the block descriptor stored at byte offset 36 of the final block descriptor file.
# The `with` block releases the file even if the read raises.
with open(DIR_INVERTED_INDEX+"/"+PATH_FINAL_BLOCK_DESCRIPTOR, 'rb') as fileBlockDescr:
    letto=b.read_block_descriptor_on_disk_from_opened_file(fileBlockDescr,36)

# Value returned by the read call (presumably the offset following the descriptor - TODO confirm).
print(letto)
# fileFinalFreq=open(PATH_FINAL_FREQ, 'rb') 
# posting=InvertedIndex.read_from_files_a_posting_list(11,fileFinalDocIds,fileFinalFreq,0,0)
# print(posting)
# #(nr_postings:int,fileDocIds,fileFreq,offsetDocIds:int,offsetFreq:int):
# fileFinalDocIds.close()
# fileFinalFreq.close()

72


In [29]:
# Show the offset_freqs attribute of the block descriptor `b`.
b.offset_freqs


12

In [6]:
# ind = InvertedIndex()
# ind.read_from_block_all_index_in_memory("lexicon.bin","doc_ids.bin","freq.bin")

In [7]:
# ind.get_postings("a".ljust(30))

[Posting(doc_id=3, frequency=2),
 Posting(doc_id=5, frequency=2),
 Posting(doc_id=9, frequency=1),
 Posting(doc_id=10, frequency=1),
 Posting(doc_id=14, frequency=1),
 Posting(doc_id=17, frequency=1),
 Posting(doc_id=21, frequency=1),
 Posting(doc_id=25, frequency=1),
 Posting(doc_id=27, frequency=1),
 Posting(doc_id=36, frequency=1),
 Posting(doc_id=51, frequency=1),
 Posting(doc_id=56, frequency=1)]

In [11]:
# Read back the first 11 postings from the final doc-id and frequency files.
# The files live in the inverted index folder, where the merging step writes them, and the
# arguments follow the order used by IndexBuilder.index_merging():
# (doc ids file, freq file, compression flag, doc ids offset, freq offset, number of postings).
with open(DIR_INVERTED_INDEX+"/"+PATH_FINAL_DOC_IDS, 'rb') as fileFinalDocIds, \
     open(DIR_INVERTED_INDEX+"/"+PATH_FINAL_FREQ, 'rb') as fileFinalFreq:
    posting=InvertedIndex.read_from_files_a_posting_list(fileFinalDocIds,fileFinalFreq,False,0,0,11)

# The call returns the posting list together with the two offsets reached by the read.
print(posting)


([Posting(doc_id=3, frequency=2), Posting(doc_id=5, frequency=2), Posting(doc_id=9, frequency=1), Posting(doc_id=10, frequency=1), Posting(doc_id=14, frequency=1), Posting(doc_id=17, frequency=1), Posting(doc_id=21, frequency=1), Posting(doc_id=25, frequency=1), Posting(doc_id=27, frequency=1), Posting(doc_id=36, frequency=1), Posting(doc_id=51, frequency=1)], 44, 44)


In [55]:
# #To test a block
# invIndex=InvertedIndex()
# invIndex.read_to_block_all_index("TEMP",0)
# invIndex.get_postings("a")

In [22]:
# DIR_FOLDER="TEMP"
# b=a.read_row_on_disk(DIR_FOLDER+"/LEXICON_TEMP/lex_nr_0",0)

<__main__.LexiconRow object at 0x000001BAA0099370>


In [10]:
# Scratch check: `+` on two lists builds a new list holding the elements of both.
a = list(range(1, 11))
b = [1, 22]
a + b

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 22]

In [None]:
#TODO: callback method to read the input file
#TODO: hook up the compression and adapt the functions accordingly
#TODO: look into how blocks, skipping, etc. work
#TODO: write tests

In [11]:
# Scratch check: clear() empties the list `a` in place.
a.clear()

In [12]:
# Display the current content of `a`.
a

[]

In [None]:
def index_merging(self,debug_mode:bool=False,compression_mode:bool=False)-> None:
    """Merge the temporary per-block indexes into the final lexicon / doc-id / frequency files.

    While at least one temporary lexicon still has rows to read, the term chosen by
    ``self.__find_min_term`` is processed: the posting list of every temporary block
    whose current lexicon row holds that term is read, and its postings are appended
    to the final doc-id and frequency files in chunks, so that each BlockDescriptor
    covers at most ``nr_of_postings_per_block`` postings. One lexicon row per term is
    then written to the final lexicon file.

    Work in progress (kept as in the original draft):
      * completed block descriptors are only printed, not yet persisted;
      * ``numBlocks`` and ``nr_of_postings_per_block`` are hard-coded placeholders;
      * ``docidSize`` / ``frequencySize`` count postings, not bytes.

    NOTE(review): in this cell the function is defined at module level, where Python
    does not mangle the ``self.__...`` names used below; presumably it is meant to
    live inside the index-builder class — confirm before relying on it as-is.

    Args:
        debug_mode: accepted but not read here; the debug dump is gated on
            ``self.debug_mode`` instead. NOTE(review): confirm which one is intended.
        compression_mode: forwarded unchanged to
            ``InvertedIndex.write_to_files_a_posting_list``.

    Returns:
        None.

    Raises:
        Any exception raised by the helpers propagates unchanged, after the files
        have been closed by ``self.__close_files_for_merging_operation``.
    """
    self.init_index_merging()

    try:
        self.__open_files_for_merging_operation()

        # One (initially empty) lexicon row per temporary block.
        lexicon_temp_elems=[LexiconRow("",0) for _ in range(len(self.input_lex_temp_files))]

        # Read the first lexicon row of every block and remember the offset returned by each read.
        offset_lex_temp=[
            row.read_lexicon_row_on_disk_from_opened_file(self.input_lex_temp_files[index],0)
            for index,row in enumerate(lexicon_temp_elems)
        ]

        current_offset_lexicon=0
        current_offset_doc_ids=0
        current_offset_freq=0
        current_offset_block_descriptor=0

        while not self.__check_all_blocks_are_read(offset_lex_temp):

            min_term=self.__find_min_term(lexicon_temp_elems,offset_lex_temp)
            tot_posting=sum(lex_elem.dft for lex_elem in lexicon_temp_elems if lex_elem.term==min_term)

            # True only for the first block contributing to this term; the debug writer uses it
            # to know when a new term starts.
            new_term=True

            print("Min termine corrente: "+min_term+ " nr. postings: "+str(tot_posting))

            # Definitive lexicon row for the term being merged.
            new_Lexicon_Def=LexiconRow(min_term,tot_posting)
            new_Lexicon_Def.docidOffset=current_offset_doc_ids
            new_Lexicon_Def.frequencyOffset=current_offset_freq

            # NOTE(review): hard-coded placeholders; the intended expressions are kept in the trailing comments.
            new_Lexicon_Def.numBlocks=2#self.b_d_b.get_number_of_blocks(tot_posting)
            nr_of_postings_per_block=3#math.ceil(tot_posting/new_Lexicon_Def.numBlocks)

            new_Lexicon_Def.blockOffset=current_offset_block_descriptor

            # Free posting slots left in the descriptor currently being filled.
            dim_block_available=nr_of_postings_per_block
            block_descriptor=BlockDescriptor(0,current_offset_doc_ids,current_offset_freq,0,0,0,0)

            for index,lex_term in enumerate(lexicon_temp_elems):
                if lex_term.term!=min_term:
                    continue

                postingList,_,_=InvertedIndex.read_from_files_a_posting_list(
                    lex_term.dft,
                    self.input_doc_id_temp_files[index],
                    self.input_freq_temp_files[index],
                    lex_term.docidOffset,
                    lex_term.frequencyOffset)

                # Index of the first posting of this list not yet written to the final files.
                curr_posting_position=0

                while curr_posting_position<len(postingList):
                    # Take as many postings as fit in the current descriptor; the slice is
                    # automatically capped at the end of the list.
                    chunk=postingList[curr_posting_position:curr_posting_position+dim_block_available]

                    # The first chunk stored in a fresh descriptor defines its minimum doc id.
                    # (Testing min_doc_id==0 would misfire if a block legitimately starts at doc id 0.)
                    if dim_block_available==nr_of_postings_per_block:
                        block_descriptor.min_doc_id=chunk[0].doc_id

                    block_descriptor.nr_postings+=len(chunk)

                    # Offsets before the write, used to measure how many bytes the chunk took.
                    # Fix: the frequency baseline must be the frequency offset, not the doc-id one.
                    saved_offset_doc_ids=current_offset_doc_ids
                    saved_offset_freq=current_offset_freq

                    current_offset_doc_ids,current_offset_freq=InvertedIndex.write_to_files_a_posting_list(
                        chunk,compression_mode,
                        self.file_Final_DocIds,self.file_Final_Freq,
                        current_offset_doc_ids,current_offset_freq)

                    block_descriptor.doc_ids_bytes_size+=(current_offset_doc_ids-saved_offset_doc_ids)
                    block_descriptor.freq_bytes_size+=(current_offset_freq-saved_offset_freq)

                    # Keep max_doc_id aligned with the last posting written, so a trailing
                    # partially filled descriptor is consistent too.
                    block_descriptor.max_doc_id=chunk[-1].doc_id

                    curr_posting_position+=len(chunk)
                    dim_block_available-=len(chunk)

                    if dim_block_available==0:
                        # The descriptor is full: report it and start a new one at the current offsets.
                        print (block_descriptor.min_doc_id,block_descriptor.max_doc_id,block_descriptor.doc_ids_bytes_size,block_descriptor.freq_bytes_size)

                        # TODO: persist the completed block descriptor on disk.
                        block_descriptor=BlockDescriptor(0,current_offset_doc_ids,current_offset_freq,0,0,0,0)
                        dim_block_available=nr_of_postings_per_block

                # TODO: compute the remaining per-term metrics.

                # This part must be finished!!
                if (self.debug_mode):
                    InvertedIndex.write_to_file_a_posting_list_debug_mode(self.file_Final_InvertedIndex_Debug,min_term, postingList, new_term)

                # This part must be finished!! (sizes are counted in postings, not bytes)
                new_Lexicon_Def.docidSize+=len(postingList)
                new_Lexicon_Def.frequencySize+=len(postingList)

                # Advance this block to its next lexicon row.
                offset_lex_temp[index]=lex_term.read_lexicon_row_on_disk_from_opened_file(self.input_lex_temp_files[index],offset_lex_temp[index])

                new_term=False

            # TODO: if dim_block_available!=nr_of_postings_per_block a partially filled
            # descriptor is still pending here and must be persisted as well.
            # TODO: compute the final query metrics and the block descriptors used for skipping.

            current_offset_lexicon=new_Lexicon_Def.write_lexicon_row_on_disk_to_opened_file(self.file_Final_Lexicon,current_offset_lexicon)

        print("END METHOD!")

    finally:
        # Always close every file opened for the merge, even when an error occurred.
        self.__close_files_for_merging_operation()
      