## Retrievability Experiment

Retrievability experiment and analysis of TREC678 corpus

Steps -
* Generate query set Q containing unigram and bigram queries from the corpus
* Perform retrieval for all queries q $\in$ Q. If document d is present within the cutoff rank c, then increment r(d) by 1.

In [None]:
# query set Q is divided into unigram queries and bigram queries
# unigram queries are the corpus vocabulary terms that have tf>=5
# vocab terms and their tf from lucene index
# filter tf on the fly and add terms into a list

# For bigram queries, use corpus doc generator class from doc2vec training code
# From each doc, take bigrams and try adding them to a dictionary
# if key found, add 1 to its value; if key not found, then add key into dict with value=1
# this dict is bigram,tf pair
# sort dict by tf in reverse order
# take the first 2 million bigrams and make a list of these bigrams

# write query set on disk for record

# run loop over unigram and bigram queries
# do BM25 retrieval of top c(=100) ranks for each query
# iterate over top c docs and try adding them into r(d) dict
# if key found, add 1 to its value; if key not found, then add key into dict with value=1
# this dict is docid,occurrenceCount
# sort dict by value in ascending order

# write r(d) dict on disk for record

# plot Lorenz curve
# compute Gini Coefficient G


In [1]:
import lucene
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.index import MultiTerms
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.search.similarities import LMJelinekMercerSimilarity
from org.apache.lucene.search.similarities import LMDirichletSimilarity
from org.apache.lucene.analysis.en import EnglishAnalyzer
from java.io import File

from org.apache.lucene.search import BooleanQuery
from org.apache.lucene.search import BooleanClause
from org.apache.lucene.search import TermQuery
from org.apache.lucene.search import BoostQuery
from org.apache.lucene.index import Term

from org.apache.lucene.util import BytesRefIterator

In [2]:
# Boot the Java VM that PyLucene runs on; the call returns a jcc.JCCEnv handle.
# run this again if VM is not initialized already
lucene.initVM()

<jcc.JCCEnv at 0x7f6f63b941f0>

In [3]:
# Name of the Lucene index field whose terms are scanned for unigram queries
FIELDNAME = 'CONTENT'

# Open a reader on the on-disk Lucene index of the corpus
index_path = '../index/'
_index_root = File(index_path).toPath()
directory = FSDirectory.open(_index_root)
indexReader = DirectoryReader.open(directory)

### Query Generation

In [5]:
# Unigram queries: every term of FIELDNAME in the index whose total term
# frequency (indexReader.totalTermFreq) is at least 5.
terms = MultiTerms.getTerms(indexReader, FIELDNAME)
iterator = terms.iterator()

unigram_queries = [
    term_str
    # each term is converted to a Python string as soon as it is pulled from the enumeration
    for term_str in (ref.utf8ToString() for ref in BytesRefIterator.cast_(iterator))
    if indexReader.totalTermFreq(Term(FIELDNAME, term_str)) >= 5
]

print(len(unigram_queries))

228501


In [6]:
# Persist the unigram query set: one query per line, newline-terminated
with open('./unigram_queries.txt', 'w') as f:
    f.write('\n'.join(unigram_queries) + '\n')

In [7]:
import smart_open

class MyCorpus:
    """Iterable over the tag-stripped text of every <DOC> in a directory of files.

    Each file is read line by line. The lines between a ``<DOC>`` line and the
    next ``</DOC>`` line (excluding the ``<DOCNO>`` line) form one document;
    iterating the corpus yields the cleaned text of one document at a time.

    Attributes:
        dirPath: corpus directory passed to the constructor.
        fileNames: entries returned by ``os.listdir(dirPath)``.
        filePaths: ``f'{dirPath}/{fileName}'`` for every entry.
        docCount: running count of processed documents. It is never reset,
            so it keeps growing if the corpus is iterated more than once.
    """

    def __init__(self, corpusDirectory):
        """Record the corpus directory and list the files it contains."""
        import os
        # Corpus documents directory path
        self.dirPath = corpusDirectory
        self.fileNames = os.listdir(self.dirPath)
        self.filePaths = [f'{self.dirPath}/{f}' for f in self.fileNames]
        self.docCount = 0

    def __iter__(self):
        """Yield the cleaned content of each document, file after file."""
        import re
        # Non-greedy tag matcher; '.' does not match newlines, so a tag that
        # spans several lines is left in place.
        tag_exp = re.compile('<.*?>')
        # Compiled once here instead of being re-parsed for every DOCNO line.
        docno_exp = re.compile('<DOCNO>(.+?)</DOCNO>')

        def cleanTag(rawDoc):
            """Return rawDoc with every match of tag_exp removed."""
            return tag_exp.sub('', rawDoc)

        def process(oneDoc):
            """Count one more document and return its tag-free text."""
            self.docCount += 1
            return cleanTag(oneDoc)

        # this function needs to be called for each of the files in the directory
        def processFile(filePath):
            """Parse one file; return (docCounts, contents) as index-aligned lists."""
            with smart_open.open(filePath, 'r', encoding='ISO-8859-1') as f:
                inDoc = False
                docid = ""
                docLines = []       # lines of the document currently being read
                # All documents of this file are collected before anything is
                # yielded; docCounts, docids and contents are index-aligned.
                docCounts, docids, contents = [], [], []
                for line in f:
                    if inDoc:
                        if line.startswith("<DOCNO>"):
                            m = docno_exp.search(line)
                            # A DOCNO line without its closing tag on the same
                            # line no longer raises AttributeError.
                            if m is not None:
                                docid = m.group(1)
                            continue
                        elif line.strip() == "</DOC>":
                            inDoc = False
                            # join once per document instead of growing a string line by line
                            contents.append(process(''.join(docLines)))
                            docCounts.append(self.docCount)
                            docids.append(docid.strip())
                            docLines = []
                            docid = ""      # do not carry an id over to the next document
                        else:
                            docLines.append(line)

                    elif line.strip() == "<DOC>":
                        inDoc = True
                # docids is collected but deliberately not returned
                return docCounts, contents

        for filePath in self.filePaths:
            _, contents = processFile(filePath)
            yield from contents

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.regexp import blankline_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter

# Corpus documents directory path
dirPath = '../../TREC_67/documents_robust04/'
trec_corpus = MyCorpus(corpusDirectory=dirPath)

# (token, token) tuple -> number of occurrences seen so far
bigram_counter = Counter()

# NLTK English stopwords plus "'s" (presumably because word_tokenize emits it
# as a separate token -- confirm against the tokenizer output)
stop_words = stopwords.words('english')
stop_words.append("'s")
stopwords_set = set(stop_words)

count_doc = 0
for count_doc, doc in enumerate(trec_corpus, start=1):
    # blank-line blocks -> sentences -> word tokens
    for paragraph in blankline_tokenize(doc):
        for sentence in sent_tokenize(paragraph):
            # drop stopwords and tokens that contain no alphanumeric character
            kept = [
                tok for tok in word_tokenize(sentence)
                if tok.lower() not in stopwords_set
                and any(ch.isalnum() for ch in tok)
            ]
            # bigrams never cross a sentence boundary
            bigram_counter.update(ngrams(kept, 2))
    # progress log every 10000 documents
    if count_doc % 10000 == 0:
        print(f'upto #doc = {count_doc}')

print(f'\nNumber of docs covered during bigram sampling = {count_doc}')

# Keep bigrams seen at least 20 times, most frequent first, capped at 2 million
limit = 2000000
bigram_list = [
    (bigram, freq)
    for bigram, freq in bigram_counter.most_common()
    if freq >= 20
][:limit]
bigram_queries = [' '.join(bigram) for bigram, _ in bigram_list]

upto #doc = 10000
upto #doc = 20000
upto #doc = 30000
upto #doc = 40000
upto #doc = 50000
upto #doc = 60000
upto #doc = 70000
upto #doc = 80000
upto #doc = 90000
upto #doc = 100000
upto #doc = 110000
upto #doc = 120000
upto #doc = 130000
upto #doc = 140000
upto #doc = 150000
upto #doc = 160000
upto #doc = 170000
upto #doc = 180000
upto #doc = 190000
upto #doc = 200000
upto #doc = 210000
upto #doc = 220000
upto #doc = 230000
upto #doc = 240000
upto #doc = 250000
upto #doc = 260000
upto #doc = 270000
upto #doc = 280000
upto #doc = 290000
upto #doc = 300000
upto #doc = 310000
upto #doc = 320000
upto #doc = 330000
upto #doc = 340000
upto #doc = 350000
upto #doc = 360000
upto #doc = 370000
upto #doc = 380000
upto #doc = 390000
upto #doc = 400000
upto #doc = 410000
upto #doc = 420000
upto #doc = 430000
upto #doc = 440000
upto #doc = 450000
upto #doc = 460000
upto #doc = 470000
upto #doc = 480000
upto #doc = 490000
upto #doc = 500000
upto #doc = 510000
upto #doc = 520000

Number of docs cover

In [11]:
# Persist the bigram query set: one query per line, newline-terminated
with open('./bigram_queries.txt', 'w') as f:
    f.write('\n'.join(bigram_queries) + '\n')

In [12]:
# Number of bigram queries kept after the frequency filter and the 2-million cap
len(bigram_queries)

797452

### Retrievals and estimating Document Retrievability

In [9]:
from collections import Counter
import pickle

# setting up the searcher
analyzer = EnglishAnalyzer()    # used same analyzer as indexer
searcher = IndexSearcher(DirectoryReader.open(directory))

model = 'bm25'
k1 = 0.8; b = 0.4
similarityModel = BM25Similarity(k1,b)
# setting the similarity model
searcher.setSimilarity(similarityModel)

r_d = Counter()     # lucene docid -> number of queries that returned it within the cutoff

# A single parser is reused for every query instead of building two per call.
query_parser = QueryParser('CONTENT', analyzer)


def retrieve(query, c=100):
    """Run one query and credit every document returned in its top c hits.

    Args:
        query: raw query text; it is escaped before parsing, so query-syntax
            characters in it are treated literally.
        c: rank cutoff, i.e. the maximum number of hits requested.

    Side effects:
        Adds 1 to r_d[docid] for every returned Lucene docid.
    """
    # A blank line in a query file would otherwise reach the parser and abort
    # the whole multi-hour run; skip it instead.
    if not query.strip():
        return

    q = query_parser.parse(query_parser.escape(query))
    scoreDocs = searcher.search(q, c).scoreDocs
    r_d.update(scoreDoc.doc for scoreDoc in scoreDocs)


def _count_lines(path):
    """Return the number of lines in the text file at path."""
    with open(path) as f:
        return sum(1 for _ in f)


def _percent(done, total):
    """Return done as a percentage of total (100.0 when there is nothing to do)."""
    return done * 100 / total if total else 100.0


def _run_query_file(path, label, total):
    """Call retrieve() on every line of path and return how many lines were read.

    A progress line prefixed with label is printed every 10000 queries.
    """
    done = 0
    with open(path) as f:
        for line in f:
            # rstrip('\n') rather than line[:-1]: a final line without a
            # trailing newline must not lose its last character
            retrieve(line.rstrip('\n'))

            done += 1
            if done % 10000 == 0:
                print(f'{label} queries progress... {_percent(done, total): .2f}%')
    return done


# Totals are counted from the query files so the percentages stay correct when
# the query sets are regenerated.
print('Unigram queries run starts...')
len_unigram_queries = _count_lines('./unigram_queries.txt')
i = _run_query_file('./unigram_queries.txt', 'Unigram', len_unigram_queries)
print(f'Run completed with unigram queries progress... {_percent(i, len_unigram_queries): .2f}%\n')

print('Bigram queries run starts...')
len_bigram_queries = _count_lines('./bigram_queries.txt')
i = _run_query_file('./bigram_queries.txt', 'Bigram', len_bigram_queries)
print(f'Run completed with bigram queries progress... {_percent(i, len_bigram_queries): .2f}%')

# write r(d) to disk for record
with open('./rd.pickle', 'wb') as f:
    pickle.dump(r_d, f)

Unigram queries run starts...
Unigram queries progress...  4.38%
Unigram queries progress...  8.75%
Unigram queries progress...  13.13%
Unigram queries progress...  17.51%
Unigram queries progress...  21.88%
Unigram queries progress...  26.26%
Unigram queries progress...  30.63%
Unigram queries progress...  35.01%
Unigram queries progress...  39.39%
Unigram queries progress...  43.76%
Unigram queries progress...  48.14%
Unigram queries progress...  52.52%
Unigram queries progress...  56.89%
Unigram queries progress...  61.27%
Unigram queries progress...  65.65%
Unigram queries progress...  70.02%
Unigram queries progress...  74.40%
Unigram queries progress...  78.77%
Unigram queries progress...  83.15%
Unigram queries progress...  87.53%
Unigram queries progress...  91.90%
Unigram queries progress...  96.28%
Run completed with unigram queries progress...  100.00%

Bigram queries run starts...
Bigram queries progress...  1.25%
Bigram queries progress...  2.51%
Bigram queries progress...

In [10]:
# Re-print the bigram completion line from the values of `i` and
# `len_bigram_queries` left behind by the retrieval cell.
print(f'Run completed with bigram queries progress... {i*100/len_bigram_queries: .2f}%')

Run completed with bigram queries progress...  100.00%


In [12]:
# Read the retrievability counts back from the pickle written by the retrieval run
with open('./rd.pickle', mode='rb') as f:
    r_d_load = pickle.load(f)

In [14]:
# Sanity check: show the type of the object reloaded from the pickle
# r_d_load.most_common(10)
type(r_d_load)

collections.Counter

In [11]:
# Ten most frequently retrieved documents as (lucene docid, count) pairs
r_d.most_common(10)

[(250940, 12719),
 (290694, 7146),
 (290697, 4076),
 (290693, 3562),
 (262627, 3441),
 (70208, 3112),
 (398846, 3017),
 (266912, 2930),
 (150269, 2820),
 (87389, 2765)]

In [20]:
# setting up the searcher
# NOTE(review): this rebinds the notebook-global `searcher` to a new IndexSearcher
# on which setSimilarity() is never called. Any retrieve() call made after this
# cell therefore no longer uses BM25Similarity(k1=0.8, b=0.4) unless the
# retrieval cell is run again.
analyzer = EnglishAnalyzer()    # used same analyzer as indexer
searcher = IndexSearcher(DirectoryReader.open(directory))

def ret(query, c=10):
    """Print the top c hits for query and return their stored 'DOCID' values.

    Args:
        query: query text, passed to QueryParser without escaping, so query
            syntax characters in it are interpreted by the parser.
        c: maximum number of hits to fetch.

    Returns:
        List with the 'DOCID' field of each hit, in rank order.
    """
    q = QueryParser('CONTENT', analyzer).parse(query)

    # getting the top c search results using the searcher
    scoreDocs = searcher.search(q, c).scoreDocs

    docids = []
    for scoreDoc in scoreDocs:
        print(scoreDoc)
        docids.append(searcher.doc(scoreDoc.doc).get('DOCID'))
    # previously the collected ids were built and then discarded
    return docids

ret('United States')

doc=210200 score=1.9868847 shardIndex=0
doc=19831 score=1.9738262 shardIndex=0
doc=515136 score=1.9668773 shardIndex=0
doc=130628 score=1.961197 shardIndex=0
doc=286465 score=1.9611716 shardIndex=0
doc=516459 score=1.9607632 shardIndex=0
doc=216507 score=1.9568355 shardIndex=0
doc=216494 score=1.955946 shardIndex=0
doc=12444 score=1.9508271 shardIndex=0
doc=286457 score=1.950824 shardIndex=0


In [19]:
# Map a Lucene-internal doc number (here 0) to the value of its stored 'DOCID' field
searcher.doc(0).get('DOCID')

'FBIS3-38143'