# Tests file

In this notebook we run performance and consistency tests.

In [16]:
#import sys
#!conda install --yes --prefix {sys.prefix} -c conda-forge gensim

import time
import pickle
import Globals.globals as glob
import SearchAlgorithms.searchAlgorithms as algo
from Tokenization.tokenizer import createListOfTokens, replaceWordsByStem, replaceWordsByLemma, removeStopWords
from QueryMaker.queryShell import processQueryString
from DocumentServer import documentServer
from Tokenization.TokenizationCpp import tokenizer as tokenizerCpp
from IFConstruction import ifConstructor

# Folder containing the dataset, relative to this notebook.
datasetFoldername = "../latimes"
# Make the document server read from the same dataset folder.
documentServer.foldername = datasetFoldername
# Load the docID -> content mapping (presumably required before documents can be served — TODO confirm).
glob.loadDocID2Content()

## Consistency tests

### 1. Impact of the word score on the top 10 documents

Setting : 
    - search algorithm : naive,
    - inverted file : no stemming and no lemmatization,
    - query processing : no stemming, no lemmatization, no word embedding.
    - query : "Chocolate and internet"
    
Variable parameter : **word score ∈ {<number of occurrences\>, <tf * idf>}**
 
In this section, we compute the top 10 results with the naive algorithm, using an inverted file built without any stemming or lemmatization, and with no word embedding applied to the query.
We think "Chocolate and internet" is a relevant query to test the word score, since there is a significant difference between the number of occurrences of "chocolate" and "internet" in the dataset, as shown below.

In [2]:
# Every search in this consistency test goes through the naive algorithm.
searchAlgorithm = algo.naiveAlgo

# Process the raw query string with all optional steps switched off.
query = processQueryString(
    "Chocolate and internet",
    stemming=False,
    lemmatization=False,
    embedding=False,
)
print(query)

[('chocolate', 3), ('internet', 3)]


Note : because no word embedding is used, the reader must ignore the weights paired with the words. The weights are not used by the naive algorithm anyway.

* **First test : word score = number of occurrences** 

In [3]:
# Vocabulary and inverted file whose word score is the raw number of occurrences.
vocabulary_filename = "Globals/nostemm_nolemm_notfidf/vocabulary.dict"
IF_filename = "Globals/nostemm_nolemm_notfidf/IF.dict"
glob.loadVocabulary(vocabulary_filename, IF_filename)

# Posting lists of the two query terms.
choco_PL = glob.voc2PostingList("chocolate")
internet_PL = glob.voc2PostingList("internet")

# Number of entries in each posting list.
print(f"len(choco_PL) : {len(choco_PL)}")
print(f"len(internet_PL) : {len(internet_PL)}")

# First four (docID, score) entries of each posting list.
print(f"list(choco_PL.items())[:4] : {list(choco_PL.items())[:4]}")
print(f"list(internet_PL.items())[:4] : {list(internet_PL.items())[:4]}")

len(choco_PL) : 723
len(internet_PL) : 4
list(choco_PL.items())[:4] : [('321713', 38), ('145821', 27), ('321712', 25), ('111', 24)]
list(internet_PL.items())[:4] : [('85032', 8), ('85141', 6), ('105932', 1), ('254071', 1)]


Note : we can observe that "chocolate" appears in more documents, and with a higher number of occurrences in each document.

In [4]:
# Run the selected search algorithm, then fetch the content of the returned documents.
result = searchAlgorithm(query)
content_result = documentServer.serveDocuments(result)

# Print the metadata of every returned document, preceded by its rank,
# and close the listing with a final separator.
separator = "----------------------------------"
for rank, doc_id in enumerate(content_result.keys(), start=1):
    print(rank, separator)
    print(content_result[doc_id]["metadata"])
print(separator)

1 ----------------------------------
DOCID : 321713

DATE : December 13, 1990, Thursday, Home Edition 

SECTION : Food; Part H; Page 20; Column 1 

HEADLINE : GOOD COOKING: MAKE YOUR HOLIDAY INDULGENCE BITTERSWEET 

2 ----------------------------------
DOCID : 145821

DATE : December 8, 1989, Friday, Orange County Edition 

SECTION : Orange County Life; Part N; Page 11; Column 1 

HEADLINE : SHE FINDS SWEET SUCCESS WITH CHOCOLATES 

3 ----------------------------------
DOCID : 321712

DATE : December 13, 1990, Thursday, Home Edition 

SECTION : Food; Part H; Page 20; Column 1 

HEADLINE : BACK TO BASICS: DON'T BE AFRAID: IT'S SIMPLY PERFECT CHOCOLATE 

4 ----------------------------------
DOCID : 111

DATE : January 1, 1989, Sunday, Home Edition 

SECTION : Opinion; Part 5; Page 5; Column 1; Op-Ed Desk 

HEADLINE : LITTLE CHOCOLATE DOUGHNUTS TELL THE TALE OF THE U.S. TRADE CRISIS 

5 ----------------------------------
DOCID : 196334

DATE : March 29, 1990, Thursday, Home Edition 

SECT

Note : if we look at the headlines of the top 10 results we can clearly see that they all are related to "chocolate".

* **Second test : word score = tf * idf = (1 + log(number of occurrences)) * log(total number of documents / (1 + length of posting list))** 

In [5]:
# Vocabulary and inverted file whose word score is tf * idf.
vocabulary_filename = "Globals/nostemm_nolemm_tf_idf/vocabulary.dict"
IF_filename = "Globals/nostemm_nolemm_tf_idf/IF.dict"
glob.loadVocabulary(vocabulary_filename, IF_filename)

# Posting lists of the two query terms.
choco_PL = glob.voc2PostingList("chocolate")
internet_PL = glob.voc2PostingList("internet")

# Number of entries in each posting list.
print(f"len(choco_PL) : {len(choco_PL)}")
print(f"len(internet_PL) : {len(internet_PL)}")

# First four (docID, score) entries of each posting list.
print(f"list(choco_PL.items())[:4] : {list(choco_PL.items())[:4]}")
print(f"list(internet_PL.items())[:4] : {list(internet_PL.items())[:4]}")

len(choco_PL) : 724
len(internet_PL) : 4
list(choco_PL.items())[:4] : [('321713', 24.139), ('145821', 22.36), ('321712', 21.959), ('111', 21.747)]
list(internet_PL.items())[:4] : [('85032', 32.037), ('85141', 29.044), ('105932', 10.403), ('254071', 10.403)]


Note : we can observe that, even though "chocolate" appears in more documents and with a higher number of occurrences in each document (as seen before), the new score computation makes "internet" reach higher scores than "chocolate" in some documents.

In [6]:
# Run the selected search algorithm, then fetch the content of the returned documents.
result = searchAlgorithm(query)
content_result = documentServer.serveDocuments(result)

# Print the metadata of every returned document, preceded by its rank,
# and close the listing with a final separator.
separator = "----------------------------------"
for rank, doc_id in enumerate(content_result.keys(), start=1):
    print(rank, separator)
    print(content_result[doc_id]["metadata"])
print(separator)

1 ----------------------------------
DOCID : 85032

DATE : July 21, 1989, Friday, Home Edition 

SECTION : Part 1; Page 14; Column 5; National Desk 

HEADLINE : COMPUTER NETWORK SEEN AS STILL VULNERABLE TO VIRUSES 

2 ----------------------------------
DOCID : 85141

DATE : July 21, 1989, Friday, Orange County Edition 

SECTION : Business; Part 4; Page 3; Column 5; Financial Desk 

HEADLINE : PLAN SOUGHT TO KEEP 'VIRUSES' FROM A COMPUTER NETWORK 

3 ----------------------------------
DOCID : 321713

DATE : December 13, 1990, Thursday, Home Edition 

SECTION : Food; Part H; Page 20; Column 1 

HEADLINE : GOOD COOKING: MAKE YOUR HOLIDAY INDULGENCE BITTERSWEET 

4 ----------------------------------
DOCID : 145821

DATE : December 8, 1989, Friday, Orange County Edition 

SECTION : Orange County Life; Part N; Page 11; Column 1 

HEADLINE : SHE FINDS SWEET SUCCESS WITH CHOCOLATES 

5 ----------------------------------
DOCID : 321712

DATE : December 13, 1990, Thursday, Home Edition 

SECTION

Note : now, if we look at the headlines of the top 10 results, both "chocolate" and "internet" seem to be represented in the results. However, it is not obvious why documents related to "internet" should be better than the ones related to "chocolate". This behavior is due to the naive algorithm.

**The tf * idf score proves more relevant than a simple word occurrence counter, since it allows rarer words to be considered by the algorithm and lowers the importance of common words. From now on, our tests will only use this score.**

### 2. Impact of the search algorithm on the top 10 documents

In this section, we use neither stemming/lemmatization nor word embedding. The tf * idf score has been chosen as the token score.
We also use the query "Chocolate and internet" for each algorithm.

Firstly, the naive algorithm was already run in the previous section.
The results were :
....
....
....
....
....
....
....
....
....

We compute the same query with the fagin algorithm.

We compute the same query with the threshold algorithm.

The ????? algorithm seems to be the best.

### Impact of stemming, lemmatization and word embedding

In this section, we will use the fagin algorithm with tf/idf scores on the query : "Chocolate and feet".

If we don't use stemming, lemmatization or word embedding we obtain the same results as before:
DETAILS THE RESULTS

We will now add stemming processing on the inverted file and on the user query.

In [9]:
def applyFaginOnQuery(processedQuery):
    """Run the Fagin algorithm on a processed query and print the metadata of each hit.

    Args:
        processedQuery: the query as returned by ``processQueryString``
            (the notebook outputs show a list of ``(term, weight)`` tuples).

    Returns:
        None. Output goes to standard output: a ranked listing of document
        metadata, or ``no result`` when the algorithm returns a falsy value.
    """
    queryResult = algo.faginAlgo(processedQuery)

    # Nothing matched: report it and stop early.
    if not queryResult:
        print("no result\n")
        return

    returnedDocuments = documentServer.serveDocuments(queryResult)
    separator = "----------------------------------"

    print("\n")
    print("results:\n")
    # Each document is framed by a ranked header line and a closing separator.
    for rank, doc in enumerate(returnedDocuments.keys(), start=1):
        print(rank, separator)
        print(returnedDocuments[doc]["metadata"])
        print(separator)

In [11]:
# Switch to the vocabulary and inverted file stored in the "stemm_nolemm_tfidf" folder.
glob.loadVocabulary(
    "./Globals/stemm_nolemm_tfidf/vocabulary.dict",
    "./Globals/stemm_nolemm_tfidf/IF.dict",
)

query = "Chocolate and feet"

# Stem the query terms before searching.
processedQuery = processQueryString(query, stemming=True)
print(processedQuery)

# Rank the documents with the Fagin algorithm and print them.
applyFaginOnQuery(processedQuery)

[('chocol', 3), ('feet', 3)]
2
[('110992', 47.751000000000005)]
[('110992', 47.751000000000005), ('247462', 35.199000000000005)]
[('110992', 47.751000000000005), ('247462', 35.199000000000005), ('103552', 32.817)]
[('110992', 47.751000000000005), ('247462', 35.199000000000005), ('103552', 32.817), ('30071', 35.199000000000005)]
[('110992', 47.751000000000005), ('247462', 35.199000000000005), ('103552', 32.817), ('30071', 35.199000000000005), ('134434', 32.817)]
[('110992', 47.751000000000005), ('247462', 35.199000000000005), ('103552', 32.817), ('30071', 35.199000000000005), ('134434', 32.817), ('323491', 38.556000000000004)]
[('110992', 47.751000000000005), ('247462', 35.199000000000005), ('103552', 32.817), ('30071', 35.199000000000005), ('134434', 32.817), ('323491', 38.556000000000004), ('53702', 35.199000000000005)]
[('110992', 47.751000000000005), ('247462', 35.199000000000005), ('103552', 32.817), ('30071', 35.199000000000005), ('134434', 32.817), ('323491', 38.556000000000004),

Result obtained:

    The vocabulary set has a size of  234118

    [('chocol', 3), ('feet', 3)]
    
    Top 10 :


We will now add the lemmatization procedure to tokens in the inverted file and in the query.

In [13]:
# Switch to the vocabulary and inverted file stored in the "stemm_lemm_tfidf" folder.
glob.loadVocabulary(
    "./Globals/stemm_lemm_tfidf/vocabulary.dict",
    "./Globals/stemm_lemm_tfidf/IF.dict",
)

query = "Chocolate and feet"

# Lemmatize the query terms before searching.
processedQuery = processQueryString(query, lemmatization=True)
print(processedQuery)

# Rank the documents with the Fagin algorithm and print them.
applyFaginOnQuery(processedQuery)

[('chocol', 3), ('foot', 3)]
2
[('103552', 33.873000000000005)]
[('103552', 33.873000000000005), ('207671', 44.583000000000006)]
[('103552', 33.873000000000005), ('207671', 44.583000000000006), ('134434', 32.910000000000004)]
[('103552', 33.873000000000005), ('207671', 44.583000000000006), ('134434', 32.910000000000004), ('247462', 36.105000000000004)]
[('103552', 33.873000000000005), ('207671', 44.583000000000006), ('134434', 32.910000000000004), ('247462', 36.105000000000004), ('110992', 42.480000000000004)]
[('103552', 33.873000000000005), ('207671', 44.583000000000006), ('134434', 32.910000000000004), ('247462', 36.105000000000004), ('110992', 42.480000000000004), ('295563', 32.910000000000004)]
[('103552', 33.873000000000005), ('207671', 44.583000000000006), ('134434', 32.910000000000004), ('247462', 36.105000000000004), ('110992', 42.480000000000004), ('295563', 32.910000000000004), ('207562', 48.744)]
[('103552', 33.873000000000005), ('207671', 44.583000000000006), ('134434', 32

Finally, we will extend the query with 3 synonyms for each token using word embedding.

In [17]:
glob.loadVocabulary("./Globals/stemm_lemm_tfidf/vocabulary.dict","./Globals/stemm_lemm_tfidf/IF.dict")

# Load the pickled word-embedding model. The context manager closes the file
# even when unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load a model file you produced yourself.
with open('./Globals/embeddingModel', 'rb') as embeddingFile:
    model = pickle.load(embeddingFile)

query = "Chocolate and feet"

# Apply lemmatization on the query and extend it with 3 synonyms per token taken from the embedding model
processedQuery = processQueryString(query,lemmatization = True, embedding = True, embeddingModel = model, nbOfSynonyms = 3)
print(processedQuery)

# Apply fagin algorithm
applyFaginOnQuery(processedQuery)

[('chocol', 3), ('foot', 3), ('caramel', 1), ('eclair', 1), ('cake', 1), ('inch', 1), ('mile', 1), ('diamet', 1)]
8


results:

1 ----------------------------------
DOCID : 229922

DATE : June 7, 1990, Thursday, Home Edition 

SECTION : Food; Part H; Page 1; Column 3 

HEADLINE : WINE COUNTRY CHEFS; 
</P>
<P>
THREE YEARS AGO ONLY SIX WINERIES IN NAPA AND SONOMA HAD RESIDENT CHEFS. TODAY 
THE NUMBER HAS TRIPLED. THESE ARE RISING STARS. 

----------------------------------
2 ----------------------------------
DOCID : 141533

DATE : November 30, 1989, Thursday, Home Edition 

SECTION : Food; Part H; Page 2; Column 1 

HEADLINE : EASY-TO-MAKE GIFTS THAT ARE STRAIGHT FROM THE HEART AND THE KITCHEN; 
</P>
<P>
HOLIDAYS: FOODS THAT FREEZE WELL OR DON'T REQUIRE REFRIGERATION MAKE THE SAFEST 
EDIBLE PRESENTS. 

----------------------------------
3 ----------------------------------
DOCID : 179442

DATE : February 22, 1990, Thursday, Home Edition 

SECTION : Food; Part H; Page 2; Column 1 

HEADL

CONCLUSION ON STEM LEM EMBEDDING

## Performance tests

### 1. Time to build and query the inverted file

In this section, we will use neither stemming/lemmatization nor word embedding.

Firstly, we will build the inverted file over the whole dataset in RAM and request one posting list from it.

In [19]:
import sys
import cProfile

Then, we will build the inverted file in memory and request one posting list.

In [None]:
# Tokenizer over the dataset folder, with lemmatization and stemming both disabled.
tokenizer_ = tokenizerCpp.Tokenizer(datasetFoldername, lemmatization_ = False, stemming_ = False)
# Set runSize such that the total number of documents in the dataset (~130 000)
# divided by runSize is less than the number of files your machine allows to be
# open simultaneously (usually 1024).
runSize_ = 10000

# Profile the disk-based construction of the inverted file with tf * idf scores.
# The statement is passed as a string, so it resolves tokenizer_ and runSize_ by name when it runs.
cProfile.run("ifConstructor.constructIF_diskBased(tokenizer_, runSize = runSize_, score_tf_idf = True)")

![alt text](images/run10000profile.png "Title")

Note : cProfile gives us a CPU time of 1123 seconds, which is roughly 19 minutes.

![alt text](images/run10000.png "Title")

Note : from the system monitor we can see that the program reaches 583 MB of RAM usage.  

In [None]:
# Same profiling run with a much smaller run size (130 instead of 10000).
runSize_ = 130

# The profiled statement reads tokenizer_ and runSize_ by name, so tokenizer_ must already be defined by an earlier cell.
cProfile.run("ifConstructor.constructIF_diskBased(tokenizer_, runSize = runSize_, score_tf_idf = True)")

...

CONCLUSION

### Time to run algorithm

In this section, we will use neither stemming/lemmatization nor word embedding. We will also use the query "Chocolate and internet" for all algorithms.

We compute the naive algorithm on this query.

We compute the fagin algorithm on the query.

CONCLUSION

### Time to run algorithm

In this section, we will use neither stemming/lemmatization nor word embedding. We will also use the query "Chocolate and internet" for all algorithms.

We compute the naive algorithm on this query.

We compute the fagin algorithm on the query.

AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA

In [16]:
def testTime(queries):
    start_time = time.time()
    for query in queries:
        algo.naiveAlgo(query)
    print("--- %s naiveAlgo seconds ---" % (time.time() - start_time))
    start_time = time.time()
    for query in queries:
        algo.faginAlgo(query)
    print("--- %s faginAlgo seconds ---" % (time.time() - start_time))
    start_time = time.time()
    for query in queries:
        algo.threshold(query)
    print("--- %s threshold seconds ---" % (time.time() - start_time))


In [49]:
# Load the vocabulary and inverted file from the "nostemm_nolemm_tf_idf" folder for the timing tests.
glob.loadVocabulary("./Globals/nostemm_nolemm_tf_idf/vocabulary.dict","./Globals/nostemm_nolemm_tf_idf/IF.dict")


In [50]:
# Query batches for the timing tests. Every query is a list of (term, weight) tuples.

# A single one-term query.
oneWord = [[("daylight", 3)]]

# Three one-term queries made of nonsense tokens.
notExist = [
    [("fdadfdfewf", 3)],
    [("114rf4434", 3)],
    [("jdifjoiq2323", 3)],
]

# A mix of one-term and two-term queries.
queries = [
    [("love", 3), ("chocolate", 3)],
    [("january", 3)],
    [("narrow", 3)],
    [("today", 3), ("tomorrow", 3)],
]

# Same as `queries`, with the term "and" added to the two-term queries.
queries1 = [
    [("love", 3), ("and", 3), ("chocolate", 3)],
    [("january", 3)],
    [("narrow", 3)],
    [("today", 3), ("and", 3), ("tomorrow", 3)],
]



We run the three algorithms on words that do not exist in the vocabulary:

In [57]:
# Time the three algorithms on the `notExist` query batch.
testTime(notExist)

--- 4.124641418457031e-05 naiveAlgo seconds ---
--- 0.0005621910095214844 faginAlgo seconds ---
--- 0.00039196014404296875 threshold seconds ---


Result obtained:

--- 4.124641418457031e-05 naiveAlgo seconds ---  
--- 0.0005621910095214844 faginAlgo seconds ---  
--- 0.00039196014404296875 threshold seconds ---

We compute the three algos with one word 

In [1]:
# Time the three algorithms on the `oneWord` query batch.
testTime(oneWord)

NameError: name 'testTime' is not defined

--- 0.002106189727783203 naiveAlgo seconds ---
--- 0.004480123519897461 faginAlgo seconds ---
--- 0.0021691322326660156 threshold seconds ---

We run the three algorithms on random words.
Remark: we notice that the Fagin algorithm is quite slow because it needs to go through every posting list.

In [59]:
# Time the three algorithms on the `queries` batch.
testTime(queries)

--- 0.45010828971862793 naiveAlgo seconds ---
--- 8.133760929107666 faginAlgo seconds ---
--- 0.44022607803344727 threshold seconds ---


--- 0.45010828971862793 naiveAlgo seconds ---  
--- 8.133760929107666 faginAlgo seconds ---  
--- 0.44022607803344727 threshold seconds ---