In [1]:
import ipytest
import import_ipynb

import sys
sys.path.append('../../')  # Go up two folders to the project root


from building_data_structures.IndexBuilder import Posting, InvertedIndex, IndexBuilder 

importing Jupyter notebook from C:\Users\Davide\IR\Progetto\tests\Building_Data_Structures\../..\building_data_structures\IndexBuilder.ipynb


# Tests

In [2]:
import ipytest

# Set up ipytest for this kernel before any of the %%ipytest cells below run.
# NOTE(review): ipytest is already imported in the first cell, so the import
# above is redundant (harmless); presumably kept so this cell runs standalone.
ipytest.autoconfig()

In [3]:
%%ipytest

#Test InvertedIndex and Posting datastructures

def test_inverted_index_data_structure_and_methods():
    """Exercise the InvertedIndex API used by the index builder.

    Covers, in order: adding and retrieving postings, lookup of an unknown
    term, is_empty / clear_structure, get_structure, and get_terms.
    """
    ind = InvertedIndex()
    ind.add_posting("term", 1, 1)
    ind.add_posting("term", 2, 4)

    # Existing term: postings are returned in insertion order with their payloads.
    postings = ind.get_postings("term")
    assert len(postings) == 2
    assert postings[0].doc_id == 1
    assert postings[0].payload == 1
    assert postings[1].doc_id == 2
    assert postings[1].payload == 4

    # Unknown term: the lookup yields None.
    assert ind.get_postings("xyx") is None

    # is_empty / clear_structure: clearing drops every term added so far.
    assert not ind.is_empty()
    ind.clear_structure()
    assert ind.is_empty()
    assert ind.get_postings("term") is None

    # get_structure: the raw mapping exposes the same posting as get_postings.
    # Each field is asserted separately (and against the inserted values) so a
    # failure points at the exact field instead of one compound expression.
    ind.add_posting("term", 57, 4)
    structure = ind.get_structure()
    via_getter = ind.get_postings("term")[0]
    via_structure = structure["term"][0]
    assert via_getter.doc_id == 57
    assert via_getter.payload == 4
    assert via_structure.doc_id == via_getter.doc_id
    assert via_structure.payload == via_getter.payload

    # Vocabulary: every distinct term is listed once, regardless of how many
    # postings it has. A fresh index keeps this part independent of the above.
    vocab_index = InvertedIndex()
    vocab_index.add_posting("term1", 1)
    vocab_index.add_posting("term2", 1)
    vocab_index.add_posting("term3", 2)
    vocab_index.add_posting("term2", 3)
    assert set(vocab_index.get_terms()) == {"term1", "term2", "term3"}
    
def test_posting_data_structure():
    """Check that a Posting exposes the doc_id and payload it was created with.

    Both creation paths are covered: the constructor and Posting.from_string.
    """
    # Direct construction: arguments are given as (doc_id, payload).
    built = Posting(4, 5)
    assert built.doc_id == 4
    assert built.payload == 5

    # Parsing the "doc_id:payload" text form must give numeric fields.
    parsed = Posting.from_string("1:45")
    assert parsed.doc_id == 1
    assert parsed.payload == 45

[32m.[0m[32m.[0m[32m                                                                                           [100%][0m
[32m[32m[1m2 passed[0m[32m in 0.01s[0m[0m


In [5]:
%%ipytest

# Shared builder instance used by test_index_building below; it is created
# once, when the cell body is executed.
indexBuilder=IndexBuilder()

# Small fixed corpus of 15 lowercase, punctuation-free sentences.
# The assertions in test_index_building are written against doc_ids equal to
# the 0-based list position noted on the right, and payloads equal to the
# number of times the term occurs in that sentence. Reordering or editing the
# sentences therefore requires updating those assertions.
test_documents=[
    "this is a random sentence without punctuation",  # 0
    "python is a versatile programming language",  # 1  (only sentence with "python")
    "the quick brown fox jumps over the lazy dog",  # 2
    "coding is a creative and logical process",  # 3
    "sunsets are a beautiful sight to behold",  # 4
    "coffee is a popular beverage around the world",  # 5
    "music has the power to evoke emotions",  # 6
    "books transport readers to different worlds",  # 7
    "kindness and compassion make the world better",  # 8
    "the moonlight reflects on the calm lake in the night the vision is awesome",  # 9  ("the" x4)
    "nature provides solace and tranquility",  # 10
    "imagination knows no boundaries",  # 11
    "friendship is a treasure worth cherishing",  # 12
    "happiness is found in simple moments",  # 13
    "laughter is contagious and brings joy is better for all"  # 14 ("is" x2)
]


def _assert_expected_postings(index):
    """Assert that `index` holds the postings expected for `test_documents`.

    Shared by the in-memory and the on-disk checks so both are held to exactly
    the same expectations. Each field is asserted on its own line so that a
    failure reports precisely which doc_id or payload is wrong.
    """
    # "is" occurs in 8 sentences; the last one (doc 14) contains it twice.
    is_postings = index.get_postings("is")
    assert len(is_postings) == 8
    assert is_postings[2].doc_id == 3
    assert is_postings[2].payload == 1
    assert is_postings[7].doc_id == 14
    assert is_postings[7].payload == 2

    # "python" occurs once, in doc 1 only.
    python_postings = index.get_postings("python")
    assert len(python_postings) == 1
    assert python_postings[0].doc_id == 1
    assert python_postings[0].payload == 1

    # "the" occurs in 5 sentences; doc 9 contains it four times.
    the_postings = index.get_postings("the")
    assert len(the_postings) == 5
    assert the_postings[4].doc_id == 9
    assert the_postings[4].payload == 4


def _load_index_from_file(path):
    """Rebuild an InvertedIndex from a text file written by the index builder.

    Each line is read as a term followed by whitespace-separated
    "doc_id:freq" postings. A line without a term (e.g. a blank line) makes
    the unpacking below raise, so a malformed file fails the test.
    """
    loaded = InvertedIndex()
    with open(path, "r") as file:
        for line in file:
            # Split once: first field is the term, the rest are its postings.
            term, *posting_strs = line.split()
            for posting_str in posting_strs:
                doc_id, freq = posting_str.split(":")
                loaded.add_posting(term, int(doc_id), int(freq))
    return loaded


def test_index_building():
    """Check both index construction paths against the same expectations.

    First the in-memory index is verified; then, for several block sizes, the
    blocked sort-based build writes its result to disk, which is read back
    into memory and verified with the same assertions.
    """
    # In-memory index.
    index = indexBuilder.build_in_memory_index(test_documents)
    _assert_expected_postings(index)

    # Blocked sort-based indexing, repeated for different block sizes.
    for i in range(1, 6):
        output_name = "complete_inverted_index_TEST" + str(i)
        # NOTE(review): the third argument is presumably the block size, and the
        # trailing False/True flags are passed through unchanged; confirm their
        # meaning against IndexBuilder.build_block_sort_base_indexing.
        indexBuilder.build_block_sort_base_indexing(test_documents, output_name, 500 * i, False, True)

        # The collection is tiny, so the whole output file can be loaded back
        # into memory and checked like the in-memory index.
        ind_read_from_disk = _load_index_from_file(output_name + indexBuilder.OUTPUT_FILE_FORMAT)
        _assert_expected_postings(ind_read_from_disk)
    


Index Builder costructor
[32m.[0m[32m                                                                                            [100%][0m
[32m[32m[1m1 passed[0m[32m in 0.12s[0m[0m
