## Retrievability Experiment

Retrievability experiment and analysis of TREC678 corpus

Steps -
* Generate query set Q containing unigram and bigram queries from the corpus
* Perform retrieval for all queries q $\in$ Q. If document d is present within the cutoff rank, then increment r(d) by 1.

In [None]:
# query set Q is divided into unigram queries and bigram queries
# unigram queries are the corpus vocabulary terms that have tf>=5
# vocab terms and their tf from lucene index
# filter tf on the fly and add terms into a list

# For bigram queries, use corpus doc generator class from doc2vec training code
# From each doc, take bigrams and try adding them to a dictionary
# if key found, add 1 to its value; if key not found, then add key into dict with value=1
# this dict is bigram,tf pair
# sort dict by tf in reverse order
# take the first 2 million bigrams and make a list of these bigrams

# write query set on disk for record

# run loop over unigram and bigram queries
# do BM25 retrieval of top c(=100) ranks for each query
# iterate over top c docs and try adding them into r(d) dict
# if key found, add 1 to its value; if key not found, then add key into dict with value=1
# this dict is docid,occurrenceCount
# sort dict by value in ascending order

# write r(d) dict on disk for record

# plot Lorenz curve
# compute Gini Coefficient G


In [1]:
import lucene
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.index import MultiTerms
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.search.similarities import LMJelinekMercerSimilarity
from org.apache.lucene.search.similarities import LMDirichletSimilarity
from org.apache.lucene.analysis.en import EnglishAnalyzer
from java.io import File

from org.apache.lucene.search import BooleanQuery
from org.apache.lucene.search import BooleanClause
from org.apache.lucene.search import TermQuery
from org.apache.lucene.search import BoostQuery
from org.apache.lucene.index import Term

from org.apache.lucene.util import BytesRefIterator

In [2]:
# Start the Java VM that the PyLucene classes imported above run in.
# run this again if VM is not initialized already
lucene.initVM()

<jcc.JCCEnv at 0x7f2067b1d350>

In [3]:
# Name of the Lucene index field that the vocabulary is read from
FIELDNAME = 'CONTENT'

# Open the on-disk Lucene index of the corpus for reading
index_path = '../../Dwaipayan sir/3. Rocchio & RM3/index/'
directory = FSDirectory.open(
    File(index_path).toPath()
)
indexReader = DirectoryReader.open(directory)

### Query Generation

In [4]:
# Walk the vocabulary of FIELDNAME and keep, as unigram queries, the terms
# whose total term frequency (indexReader.totalTermFreq) is at least 5.
terms = MultiTerms.getTerms(indexReader, FIELDNAME)
iterator = terms.iterator()

unigram_queries = []
for term in BytesRefIterator.cast_(iterator):
    term_str = term.utf8ToString()
    tf = indexReader.totalTermFreq(Term(FIELDNAME, term_str))
    if tf < 5:
        continue    # too rare to be used as a query
    unigram_queries.append(term_str)

print(len(unigram_queries))

228501


In [27]:
# persist the unigram query set: one query per line, newline-terminated
with open('./unigram_queries.txt', 'w') as out:
    print('\n'.join(unigram_queries), file=out)

In [6]:
import smart_open

class MyCorpus:
    """Iterable over the tag-stripped text of every document in a corpus directory.

    Every entry of ``corpusDirectory`` is read as a file (via ``smart_open``,
    ISO-8859-1) in which a document starts on a line equal to ``<DOC>`` and
    ends on a line equal to ``</DOC>``.  Iterating yields one string per
    document: the lines between the two delimiters, excluding lines that start
    with ``<DOCNO>``, with every ``<...>`` tag removed.

    Attributes:
        dirPath: the corpus directory given to the constructor.
        fileNames: sorted names of the entries in ``dirPath``.
        filePaths: ``f'{dirPath}/{name}'`` for each entry of ``fileNames``.
        docCount: running count of documents parsed so far; it is not reset
            when the corpus is iterated again.
    """

    def __init__(self, corpusDirectory):
        import os
        # Corpus documents directory path
        self.dirPath = corpusDirectory
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # traversal order (and hence docCount numbering) reproducible.
        self.fileNames = sorted(os.listdir(self.dirPath))
        self.filePaths = [f'{self.dirPath}/{f}' for f in self.fileNames]
        self.docCount = 0

    def __iter__(self):
        import re
        tag_exp = re.compile('<.*?>')

        def processFile(filePath):
            """Return the cleaned text of every complete document in one file."""
            contents = []
            with smart_open.open(filePath, 'r', encoding='ISO-8859-1') as f:
                inDoc = False
                docLines = []       # lines of the document currently being read
                for line in f:
                    if not inDoc:
                        # everything outside <DOC> ... </DOC> is ignored
                        inDoc = line.strip() == "<DOC>"
                        continue
                    if line.startswith("<DOCNO>"):
                        # the document id is not part of the yielded text
                        continue
                    if line.strip() == "</DOC>":
                        inDoc = False
                        self.docCount += 1
                        contents.append(tag_exp.sub('', ''.join(docLines)))
                        docLines = []
                    else:
                        docLines.append(line)
            # a document left unterminated at end of file is dropped
            return contents

        # each file is parsed completely (and closed) before its documents are yielded
        for filePath in self.filePaths:
            yield from processFile(filePath)

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.regexp import blankline_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter

# Corpus documents directory path
dirPath = '../../Dwaipayan sir/3. Rocchio & RM3/trec678_corpus/documents'
trec_corpus = MyCorpus(corpusDirectory=dirPath)

# NLTK English stopwords plus the possessive marker, held in a set for lookups
stop_words = stopwords.words('english')
stop_words.append("'s")
stopwords_set = set(stop_words)

bigram_counter = Counter()      # bigram (pair of tokens) -> frequency

count_doc = 0
for count_doc, doc in enumerate(trec_corpus, start=1):
    # blankline tokenization, then sentence tokenization, then word tokenization
    for block in blankline_tokenize(doc):
        for sentence in sent_tokenize(block):
            # drop stopwords and tokens that contain no alphanumeric character
            kept = [
                tok for tok in word_tokenize(sentence)
                if tok.lower() not in stopwords_set
                and any(ch.isalnum() for ch in tok)
            ]
            # bigrams are sampled one sentence at a time
            bigram_counter.update(ngrams(kept, 2))
    # progress logging
    if count_doc % 10000 == 0:
        print(f'upto #doc = {count_doc}')

print(f'\nNumber of docs covered during bigram sampling = {count_doc}')

limit = 2000000     # 2 million -> #bigram limit
# keep bigrams seen at least 20 times, most frequent first, capped at `limit`
frequent = [pair for pair in bigram_counter.items() if pair[1] >= 20]
frequent.sort(key=lambda pair: pair[1], reverse=True)
bigram_list = frequent[:limit]
bigram_queries = [' '.join(bigram) for bigram, _ in bigram_list]

In [None]:
# persist the bigram query set: one query per line, newline-terminated
with open('./bigram_queries.txt', 'w') as out:
    print('\n'.join(bigram_queries), file=out)

### Retrievals and estimating Document Retrievability

In [None]:
from collections import Counter

# Searcher over the corpus index; queries go through EnglishAnalyzer
# (used same analyzer as indexer)
searcher = IndexSearcher(DirectoryReader.open(directory))
analyzer = EnglishAnalyzer()

# Rank with BM25 using the parameters below
model = 'bm25'
k1, b = 0.8, 0.4
similarityModel = BM25Similarity(k1, b)
searcher.setSimilarity(similarityModel)

# r(d): DOCID -> number of times the document was retrieved so far
r_d = Counter()


def retrieve(query, c=100):
    """Run one query and credit every document in its top `c` results.

    The query text is treated as plain terms: characters that the classic
    Lucene query syntax would interpret as operators are escaped before
    parsing, so vocabulary terms and bigrams containing them neither raise a
    parse error nor change the meaning of the query.

    Args:
        query: raw query text (a single term, or terms separated by spaces).
        c: cutoff rank; only the top `c` hits are counted.

    Side effects:
        Adds 1 in the module-level counter `r_d` to the 'DOCID' value of each
        retrieved document.
    """
    # a blank line in the query file carries no terms to search for
    if not query.strip():
        return

    q = QueryParser(FIELDNAME, analyzer).parse(QueryParser.escape(query))

    # getting the top c search results using the searcher
    scoreDocs = searcher.search(q, c).scoreDocs

    r_d.update(
        searcher.doc(scoreDoc.doc).get('DOCID') for scoreDoc in scoreDocs
    )


from itertools import islice


def _run_queries(path, limit):
    """Pass the first `limit` lines of the query file `path` to retrieve().

    Only the line terminator is removed from each line, so a final line that
    has no trailing newline keeps its last character.
    """
    with open(path) as f:
        for line in islice(f, limit):
            retrieve(line.rstrip('\n'))


# only a prefix of each query file is run: 1000 unigram and 10 bigram queries
_run_queries('./unigram_queries.txt', 1000)
_run_queries('./bigram_queries.txt', 10)

In [30]:
# the 10 DOCIDs with the highest counts in r_d
r_d.most_common(10)

[('FR940527-1-00176', 228),
 ('FR940505-1-00477', 110),
 ('FR940505-1-00478', 103),
 ('FR941116-0-00016', 94),
 ('FR940803-1-00006', 94),
 ('FR940527-1-00157', 80),
 ('FR940919-0-00127', 62),
 ('FR940919-0-00064', 60),
 ('FBIS3-42440', 57),
 ('FR940527-1-00159', 57)]

In [31]:
def ret(query):
    """Debug helper: print each of the top 10 hits returned for `query`.

    The 'DOCID' of every hit is also looked up and collected locally, but the
    collected list is not returned.
    """
    parsed = QueryParser('CONTENT', analyzer).parse(query)

    # top c search results from the searcher
    c = 10
    hits = searcher.search(parsed, c).scoreDocs

    docids = []
    for hit in hits:
        print(hit)
        docids.append(searcher.doc(hit.doc).get('DOCID'))


ret('United States')

doc=229601 score=2.0238037 shardIndex=0
doc=19831 score=2.0236964 shardIndex=0
doc=210200 score=2.013215 shardIndex=0
doc=286457 score=2.010394 shardIndex=0
doc=286465 score=2.0103028 shardIndex=0
doc=12444 score=2.0081882 shardIndex=0
doc=130628 score=2.00703 shardIndex=0
doc=68729 score=2.006886 shardIndex=0
doc=6789 score=2.003496 shardIndex=0
doc=43671 score=2.0031536 shardIndex=0


In [52]:
# 'DOCID' field of the document the searcher returns for number 7368
searcher.doc(7368).get('DOCID')

'FBIS3-1'

In [38]:
class MyCorpus2:
    """Iterable over ``(docCount, docid)`` for every document in a corpus directory.

    Every entry of ``corpusDirectory`` is read as a file (via ``smart_open``,
    ISO-8859-1) in which a document starts on a line equal to ``<DOC>`` and
    ends on a line equal to ``</DOC>``.  For each complete document the
    iterator yields a tuple of:

    * the value of the running counter ``docCount`` after counting that
      document (so the first document parsed gets 1), and
    * the stripped text found between ``<DOCNO>`` and ``</DOCNO>`` inside the
      document, or ``""`` when the document has no ``<DOCNO>`` line.

    Attributes:
        dirPath: the corpus directory given to the constructor.
        fileNames: sorted names of the entries in ``dirPath``.
        filePaths: ``f'{dirPath}/{name}'`` for each entry of ``fileNames``.
        docCount: running count of documents parsed so far; it is not reset
            when the corpus is iterated again.
    """

    def __init__(self, corpusDirectory):
        import os
        # Corpus documents directory path
        self.dirPath = corpusDirectory
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # traversal order (and hence docCount numbering) reproducible.
        self.fileNames = sorted(os.listdir(self.dirPath))
        self.filePaths = [f'{self.dirPath}/{f}' for f in self.fileNames]
        self.docCount = 0

    def __iter__(self):
        import re
        docno_exp = re.compile('<DOCNO>(.+?)</DOCNO>')

        def processFile(filePath):
            """Return the (docCount, docid) pair of every complete document in one file."""
            numbered = []
            with smart_open.open(filePath, 'r', encoding='ISO-8859-1') as f:
                inDoc = False
                docid = ""
                for line in f:
                    if not inDoc:
                        # everything outside <DOC> ... </DOC> is ignored
                        inDoc = line.strip() == "<DOC>"
                        continue
                    if line.startswith("<DOCNO>"):
                        # expects the opening and closing DOCNO tags on this same line
                        m = docno_exp.search(line)
                        docid = m.group(1)
                    elif line.strip() == "</DOC>":
                        inDoc = False
                        self.docCount += 1
                        numbered.append((self.docCount, docid.strip()))
                        # do not let this id leak into a following document
                        # that has no <DOCNO> line of its own
                        docid = ""
                    # other lines are document text, which is not needed here
            # a document left unterminated at end of file is dropped
            return numbered

        # each file is parsed completely (and closed) before its documents are yielded
        for filePath in self.filePaths:
            yield from processFile(filePath)

In [51]:
# Walk the corpus until the document number produced by MyCorpus2 maps, in the
# Lucene index, to the document whose DOCID is 'FBIS3-1'; print both ids there.
dirPath = '../../Dwaipayan sir/3. Rocchio & RM3/trec678_corpus/documents'
corpus = MyCorpus2(corpusDirectory=dirPath)

print('docCount\tdocid\tdocid_lucene')
i = 0
for docCount, docid in corpus:
    i += 1
    if i % 10000 == 0:
        print(i)    # progress
    # NOTE(review): docCount is used directly as the Lucene document number,
    # and MyCorpus2 numbers its first document 1 - confirm the intended offset.
    lucene_docid = searcher.doc(docCount).get('DOCID')
    if lucene_docid == 'FBIS3-1':
        print(f"{docCount}\t{docid}\t{lucene_docid}")
        break

docCount	docid	docid_lucene
7368	FBIS3-7689	FBIS3-1
