In [1]:
import math
import ipytest
# import_ipynb must be imported before the project import below: the import log
# printed by this cell shows Lexicon being loaded from a .ipynb file.
import import_ipynb
import sys
# NOTE(review): this is a relative path, so it is resolved against the current
# working directory -- assumes the notebook is run from its own folder.
sys.path.append('../../')  # Go up two folders to the project root

from building_data_structures.Lexicon import DocumentIndex, DocumentIndexRow, Lexicon, LexiconRow, create_lexicon

# Apply ipytest's default configuration so the %%ipytest cells below can run.
ipytest.autoconfig()

importing Jupyter notebook from C:\Users\gabri\Documents\GitHub\MultimediaProject\tests\Building_Data_Structures\../..\building_data_structures\Lexicon.ipynb


In [4]:
%%ipytest

def test_lexicon_structure():
    """Check insertion, lookup, input validation and clearing of a Lexicon.

    A DocumentIndex holding two documents is built first because ``add_term``
    receives it; the expected ``idft`` values are ln(number_of_documents / dft).
    Each invalid call must raise ``ValueError`` -- the test fails explicitly if
    the call returns normally instead of silently passing.
    """
    voc = Lexicon()

    # Create a document index with exactly 2 documents.
    doc = DocumentIndex()
    doc.clear_structure()
    doc.add_document(1, "Information retrieval system")
    doc.add_document(2, "Information retrieval project")

    # Term "Information": present in both documents -> idft = ln(2 / 2) = 0.
    voc.add_term("Information", 2, doc, 1)
    information_row = voc.get_terms("Information")
    assert information_row.term == "Information"
    assert information_row.dft == 2
    assert information_row.max_tf == 1
    assert information_row.idft == 0

    # Term "system": present in one document -> idft = ln(2 / 1) ~= 0.69.
    voc.add_term("system", 1, doc, 1)
    system_row = voc.get_terms("system")
    assert system_row.term == "system"
    assert system_row.dft == 1
    assert system_row.max_tf == 1
    assert doc.number_of_documents == 2
    assert round(system_row.idft, 2) == 0.69

    # A non-string term must be rejected.
    try:
        voc.add_term(4, 2, doc, 1)
    except ValueError as e:
        assert str(e) == "There's an error in parameter's type."
    else:
        raise AssertionError("add_term accepted a non-string term without raising ValueError")

    # A non-numeric dft must be rejected.
    try:
        voc.add_term("Retrieval", "Hello", doc, 1)
    except ValueError as e:
        assert str(e) == "There's an error in parameter's type."
    else:
        raise AssertionError("add_term accepted a non-numeric dft without raising ValueError")

    # Looking up a non-string term must be rejected.
    try:
        voc.get_terms(4567)
    except ValueError as e:
        assert str(e) == "Term must be a string."
    else:
        raise AssertionError("get_terms accepted a non-string term without raising ValueError")

    # The raw structure must expose the same row that get_terms returns.
    voc_aux = voc.get_structure()
    assert voc.get_terms("Information") == voc_aux["Information"]

    # Clearing the lexicon must leave it empty.
    assert not voc.is_empty()
    voc.clear_structure()
    assert voc.is_empty()

[32m.[0m[32m.[0m[32m                                                                                           [100%][0m
[32m[32m[1m2 passed[0m[32m in 0.01s[0m[0m


In [5]:
%%ipytest

# Tests for create_lexicon.
# NOTE: these checks run as plain top-level statements when the cell executes;
# they are not inside a test_* function, which is why pytest reports
# "no tests ran" for this cell.
# valid_document_index stays at module level because the next cell reads it.
valid_document_index = DocumentIndex()

# Case 1: valid parameters
assert create_lexicon("complete_inverted_index.txt", "lexicon", "Lexicon/", ".txt", 2200, valid_document_index) != -1

# Cases 2-7: each argument tuple has exactly one invalid parameter and must make
# create_lexicon raise ValueError with the paired message.
invalid_cases = [
    # Case 2: invalid file_input_path (empty string)
    ("empty file_input_path",
     ("", "output/folder", "documents", ".txt", 10, valid_document_index),
     "Invalid file_input_path."),
    # Case 3: invalid file_output_path (empty string)
    ("empty file_output_path",
     ("complete_inverted_index.txt", "", "Lexicon/", ".txt", 2200, valid_document_index),
     "Invalid file_output_path."),
    # Case 4: invalid DIR_FOLDER (empty string)
    ("empty DIR_FOLDER",
     ("complete_inverted_index.txt", "lexicon", "", ".txt", 2200, valid_document_index),
     "Invalid DIR_FOLDER."),
    # Case 5: invalid file_extension (empty string)
    ("empty file_extension",
     ("complete_inverted_index.txt", "lexicon", "Lexicon/", "", 2200, valid_document_index),
     "Invalid file_extension."),
    # Case 6: invalid block_size (negative integer)
    ("negative block_size",
     ("complete_inverted_index.txt", "lexicon", "Lexicon/", ".txt", -5, valid_document_index),
     "Invalid block_size. Must be a positive integer."),
    # Case 7: invalid document_index (not an instance of DocumentIndex)
    ("document_index of the wrong type",
     ("complete_inverted_index.txt", "lexicon", "Lexicon/", ".txt", 2200, "invalid_document_index"),
     "Invalid document_index. Must be an instance of DocumentIndex."),
]

for case_name, case_args, expected_message in invalid_cases:
    try:
        create_lexicon(*case_args)
    except ValueError as e:
        assert str(e) == expected_message, case_name
    else:
        # Without this branch a call that raises nothing would pass unnoticed.
        raise AssertionError(f"create_lexicon did not raise ValueError for {case_name}")



[33m[33mno tests ran[0m[33m in 0.01s[0m[0m


In [6]:
%%ipytest

from building_data_structures.Lexicon import compute_max_term_frequency, compute_IDFT, compute_TFIDF, compute_avgDL, compute_BM25_term
# Tests for the ranking metrics.
# NOTE(review): valid_document_index is created in the previous cell; the checks
# below assume it holds the two documents listed in the BM25 section --
# presumably DocumentIndex shares or persists its state; confirm.


def expect_value_error(expected_message, func, *args):
    """Call ``func(*args)`` and require a ValueError carrying ``expected_message``.

    Raises AssertionError when the call returns normally, so a missing
    validation can no longer pass unnoticed.
    """
    try:
        func(*args)
    except ValueError as e:
        assert str(e) == expected_message
    else:
        raise AssertionError(f"{func.__name__}{args!r} did not raise ValueError")


# --- compute_max_term_frequency -------------------------------------------
postings_list1 = "3:2 5:2 9:1 10:1 14:1 17:1 21:1 25:1 27:1 36:1 51:1 56:1"
postings_list2 = "3:10 5:2 9:3 10:1 14:1 17:2 21:1 25:1 27:1 36:1 51:1 56:9"
postings_list3 = "3:2 5:2 9:5 10:1 14:1 17:1 21:5 25:1 27:6 36:1 51:8 56:1"

assert compute_max_term_frequency(postings_list1) == 2
assert compute_max_term_frequency(postings_list2) == 10
assert compute_max_term_frequency(postings_list3) == 8
assert compute_max_term_frequency("") == 0

expect_value_error("Invalid postings list.", compute_max_term_frequency, 45)

# --- compute_IDFT ----------------------------------------------------------
# Floating-point results are compared with math.isclose rather than ==.
number_of_documents = valid_document_index.number_of_documents
assert number_of_documents == 2
assert math.isclose(compute_IDFT(number_of_documents, 3), -0.40546510810816444)  # ln(2/3)
assert math.isclose(compute_IDFT(number_of_documents, 8), -1.3862943611198906)   # ln(2/8)
assert math.isclose(compute_IDFT(number_of_documents, 1), 0.6931471805599453)    # ln(2)
assert compute_IDFT(number_of_documents, -5) == 0

expect_value_error("Invalid parameters.", compute_IDFT, "invalid document index", 3)
expect_value_error("Invalid parameters.", compute_IDFT, number_of_documents, "hello")

# --- compute_TFIDF ---------------------------------------------------------
assert math.isclose(compute_TFIDF(3, -0.40546510810816444), -0.8509140585019376)  # (1 + ln 3) * -0.40546510810816444
assert math.isclose(compute_TFIDF(2, 0.6931471805599453), 1.1736001944781467)     # (1 + ln 2) * 0.6931471805599453
assert compute_TFIDF(-5, 0.245543234545434) == 0

expect_value_error("Invalid parameters.", compute_TFIDF, 2, "hello")

# --- compute_BM25_term -----------------------------------------------------
# The document index in this case contains:
# (1, "Information retrieval system")
# (2, "Information retrieval project")
# Compute one single contribution to the BM25 sum, for a query made of the
# single term "system". "system" appears in 1 document, so its dft is 1.
idf = compute_IDFT(number_of_documents, 1)  # previously undefined in this cell (NameError on a fresh kernel)
log_tf = (1 + math.log(1))  # "system" occurs exactly once in the document with doc_id = 1
avgDL = compute_avgDL(valid_document_index)  # the average document length is 3
doc_len = valid_document_index.get_document(1).document_length  # that document has 3 terms
assert math.isclose(idf, 0.6931471805599453)
assert avgDL == 3
assert log_tf == 1
assert doc_len == 3

# NOTE(review): 1.6 and 0.75 are presumably the k1 and b parameters hard-coded
# inside compute_BM25_term -- confirm against its implementation.
BM25_K1 = 1.6
BM25_B = 0.75
expected_bm25 = (idf * log_tf) / (log_tf + BM25_K1 * ((1 - BM25_B) + BM25_B * (doc_len / avgDL)))
assert math.isclose(expected_bm25, 0.2665950694461328)
assert math.isclose(compute_BM25_term(valid_document_index, 1, idf, 1), 0.2665950694461328)

# --- compute_avgDL ---------------------------------------------------------
assert valid_document_index.total_document_length == 6
assert valid_document_index.number_of_documents == 2

assert compute_avgDL(valid_document_index) == 3

expect_value_error("Invalid parameters.", compute_avgDL, "hello")


[33m[33mno tests ran[0m[33m in 0.00s[0m[0m
