## Rocchio & Retrieval

Given a Lucene index with term vectors stored, the following retriever with Rocchio pseudo-relevance feedback (PRF) can be run.

In this version 3 (v3) file, query boosting has been added, and the BM25 weighting scheme has been implemented and integrated into the main lmjm_rocchio function. The MAP scores for both TFIDF and BM25 weighting come out above the baseline.

In [1]:
# Path of the XML topics (queries) file that is parsed in the next cell.
topicFilePath = './trec6.xml'  # 50 queries

In [2]:
import xml.etree.ElementTree as ET

# Parse the topics file; `topics` is the XML root element, and the cells
# below iterate over it to reach each topic's 'num' and 'title' children.
tree = ET.parse(topicFilePath)
topics = tree.getroot()

In [3]:
import lucene
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.search.similarities import LMJelinekMercerSimilarity
from org.apache.lucene.search.similarities import LMDirichletSimilarity
from org.apache.lucene.analysis.en import EnglishAnalyzer
from java.io import File

from org.apache.lucene.search import BooleanQuery
from org.apache.lucene.search import BooleanClause
from org.apache.lucene.search import TermQuery
from org.apache.lucene.search import BoostQuery
from org.apache.lucene.index import Term

In [4]:
# Initialise the Java VM through the lucene module (the cell output shows
# the returned JCCEnv object).
# run this again if VM is not initialized already
lucene.initVM()

<jcc.JCCEnv at 0x7fb028340d50>

In [5]:
# Open the on-disk index under ./index/ and keep a module-level reader.
# `indexReader` is what the weighting helpers in this notebook query for
# numDocs(), docFreq() and per-document term vectors.
index_path = './index/'
directory = FSDirectory.open(File(index_path).toPath())
indexReader = DirectoryReader.open(directory)

### Rocchio

In [30]:
import math

# Average query length (number of parsed query terms) over all topics.
# Used as the length-normalisation denominator in BM25_query().
analyzer = EnglishAnalyzer()
queryKeywordsField = 'title'     # other fields are 'desc' and 'narr'

# One parser serves every topic; there is no need to rebuild it per query.
title_parser = QueryParser('CONTENT', analyzer)

# Assumes every whitespace-separated token of query.toString() starts with
# the field prefix 'CONTENT:' (8 characters), which is stripped below.
field_prefix_len = len('CONTENT:')

query_lens = []
for topic in topics:
    q = topic.find(queryKeywordsField).text.strip()
    # A few titles contain '/', which the query parser cannot handle unless
    # the special characters are escaped first.
    escaped_q = title_parser.escape(q)
    query = title_parser.parse(escaped_q)
    query_terms = [term.strip()[field_prefix_len:] for term in query.toString().split()]
    query_lens.append(len(query_terms))

if not query_lens:
    # Fail with a readable message instead of a bare ZeroDivisionError.
    raise ValueError('No topics found: cannot compute the average query length.')
avgdl_query = sum(query_lens)/len(query_lens)

# Average document length of the collection: total number of term
# occurrences in the CONTENT field divided by the number of documents.
# Used in BM25_docVec().
N = indexReader.numDocs()
avgdl_collection = indexReader.getSumTotalTermFreq('CONTENT')/N


def tf_idf_query(term, query_terms):
    """Return the TF-IDF weight of `term` inside the query.

    Args:
        term (str): the query term to weight
        query_terms (list): all terms of the query (duplicates included)

    Returns:
        float: (count of term in query / query length) * log(N / (df + 1)),
        where N is the number of docs in the index and df is the document
        frequency of the term in the CONTENT field.
    """
    num_docs = indexReader.numDocs()
    doc_freq = indexReader.docFreq(Term('CONTENT', term))
    relative_tf = query_terms.count(term) / len(query_terms)
    return relative_tf * math.log(num_docs / (doc_freq + 1))


def tf_idf_docVec(docVec, D):
    """Replace the raw statistics in a document vector by TF-IDF weights.

    Args:
        docVec (dict): maps each term to a sequence whose first item is the
            term frequency in the document and whose second item is the
            document frequency of the term
        D (int): document length, i.e. total number of tokens in the doc

    Returns:
        dict: the same `docVec` object, updated in place, now mapping each
        term to (tf / D) * log(N / (df + 1)).
    """
    num_docs = indexReader.numDocs()    # total number of docs in the corpus
    for token, stats in docVec.items():
        term_freq = stats[0]
        doc_freq = stats[1]
        inverse_df = math.log(num_docs / (doc_freq + 1))
        # Only the value of an existing key is replaced, so the dict can be
        # updated while it is being iterated.
        docVec[token] = (term_freq / D) * inverse_df
    return docVec


def BM25_query(term, query_terms, k1=0.8, b=0.4):
    """Return the Okapi BM25 weight of `term` inside the query.

    Args:
        term (str): the query term to weight
        query_terms (list): all terms of the query (duplicates included)
        k1 (float): term-frequency saturation parameter
        b (float): length-normalisation parameter

    Returns:
        float: BM25 weight, with the query length normalised by the
        module-level `avgdl_query`.
    """
    query_len = len(query_terms)
    num_docs = indexReader.numDocs()
    term_freq = query_terms.count(term)
    doc_freq = indexReader.docFreq(Term('CONTENT', term))

    inverse_df = math.log(1 + ((num_docs - doc_freq + 0.5) / (doc_freq + 0.5)))
    length_norm = (1 - b) + (b * query_len / avgdl_query)
    saturated_tf = (term_freq * (1 + k1)) / (term_freq + k1 * length_norm)
    return saturated_tf * inverse_df


def BM25_docVec(docVec, D, k1=0.8, b=0.4):
    """Replace the raw statistics in a document vector by Okapi BM25 weights.

    Args:
        docVec (dict): maps each term to a sequence whose first item is the
            term frequency in the document and whose second item is the
            document frequency of the term
        D (int): document length, i.e. total number of tokens in the doc
        k1 (float): term-frequency saturation parameter
        b (float): length-normalisation parameter

    Returns:
        dict: the same `docVec` object, updated in place, now mapping each
        term to its BM25 weight (document length is normalised by the
        module-level `avgdl_collection`).
    """
    num_docs = indexReader.numDocs()    # total number of docs in the corpus
    for token, stats in docVec.items():
        term_freq = stats[0]
        doc_freq = stats[1]
        inverse_df = math.log(1 + ((num_docs - doc_freq + 0.5) / (doc_freq + 0.5)))
        length_norm = (1 - b) + (b * D / avgdl_collection)
        # Only the value of an existing key is replaced, so the dict can be
        # updated while it is being iterated.
        docVec[token] = ((term_freq * (1 + k1)) / (term_freq + k1 * length_norm)) * inverse_df
    return docVec


def getDocumentVector(luceneDocid, FIELDNAME, weightScheme):
    """Build a weighted term vector for one indexed document.

    Args:
        luceneDocid (int): Lucene-internal id of the document
        FIELDNAME (str): field whose stored term vector is read
        weightScheme (string): 'TFIDF' or 'BM25' for term weighting

    Returns:
        dict: maps each term of the document to its weight under
        `weightScheme`. An empty dict is returned when no term vector is
        available for the document/field. For any other `weightScheme`
        value the raw [tf, df] pairs are returned unweighted.
    """
    from org.apache.lucene.util import BytesRefIterator

    terms = indexReader.getTermVector(luceneDocid, FIELDNAME)
    if terms is None:
        # No term vector stored for this doc/field: contribute nothing
        # instead of failing on terms.iterator().
        return {}

    docVec = {}                     # term -> [tf, df], later replaced by the weight
    D = 0                           # doc length, i.e., total no. of tokens in the doc
    iterator = terms.iterator()
    for term in BytesRefIterator.cast_(iterator):
        t = term.utf8ToString()
        tf = iterator.totalTermFreq()                   # termFreq of term t in this doc
        df = indexReader.docFreq(Term(FIELDNAME, t))    # docFreq of term t in the index
        D += tf
        docVec[t] = [tf, df]

    if weightScheme == 'TFIDF':
        return tf_idf_docVec(docVec, D)
    if weightScheme == 'BM25':
        return BM25_docVec(docVec, D)
    return docVec


def rocchio_PRF(query, top_k_docs, N, alpha, beta, weightScheme):
    """Implements Rocchio's relevance feedback and returns a modified query

    Only positive feedback is used: every document in `top_k_docs` is
    treated as relevant, and there is no non-relevant (gamma) component.

    Args:
        query (org.apache.lucene.search.Query): lucene parsed version of the initial/original query
        top_k_docs (lucene._lucene.JArray_object): scoreDocs returned after performing search with top k results
        N (int): number of terms to be in the returned modified query
        alpha (float): weight for original query
        beta (float): weight for positive feedback
        weightScheme (string): TFIDF or BM25 for term weighting

    Returns:
        org.apache.lucene.search.BooleanQuery: expanded query made of at most
        N SHOULD clauses on the CONTENT field, each term boosted by its
        Rocchio score. Terms with a negative score are left out.
    """

    # Extract the query terms from the parsed query. Each whitespace-separated
    # token of query.toString() is expected to start with the 8-character
    # field prefix 'CONTENT:', which is stripped here.
    query_terms = [term.strip()[8:] for term in query.toString().split()]

    # creating query vector Q0
    Q0_vector = {}
    for term in query_terms:
        if weightScheme == 'TFIDF':
            Q0_vector[term] = tf_idf_query(term, query_terms)
        elif weightScheme == 'BM25':
            Q0_vector[term] = BM25_query(term, query_terms)

    # Sum of the pseudo-relevant document vectors, accumulated in place so
    # that each document costs only as much as its own vocabulary.
    sumRelDocsVector = {}
    numRel = 0
    for scoreDoc in top_k_docs:
        docVec = getDocumentVector(scoreDoc.doc, 'CONTENT', weightScheme)
        numRel += 1
        for term, weight in docVec.items():
            sumRelDocsVector[term] = sumRelDocsVector.get(term, 0) + weight

    # centroid (mean) of the relevant document vectors
    r = {term: total/numRel for term, total in sumRelDocsVector.items()}

    # final Rocchio formula for Qm
    expanded_query = [
        (term, alpha*Q0_vector.get(term, 0) + beta*r.get(term, 0))
        for term in set(Q0_vector) | set(r)
    ]

    # Highest score first; ties are broken alphabetically so that the chosen
    # top-N terms do not depend on set iteration order.
    expanded_query.sort(key=lambda item: (-item[1], item[0]))
    Qm_with_scores = expanded_query[:N]     # selecting top N expanded query terms

    # Raise the clause limit once, before any clause is added.
    BooleanQuery.setMaxClauseCount(4096)

    # weighting expanded query terms
    booleanQuery = BooleanQuery.Builder()
    for term, score in Qm_with_scores:
        if score < 0:
            # BoostQuery does not accept a negative boost; such terms are dropped.
            continue
        boostedTermQuery = BoostQuery(TermQuery(Term('CONTENT', term)), score)
        booleanQuery.add(boostedTermQuery, BooleanClause.Occur.SHOULD)

    return booleanQuery.build()   # modified query

### LM-JM and LM-JM + Rocchio Retrieval

In [10]:
def lmjm_rocchio(pr_docs_num, N, alpha, beta, weightScheme='TFIDF'):
    """ Performs LMJM search with Rocchio relevance feedback
        on a set of queries and output the result in a file

    For every topic in the module-level `topics`, the title is searched with
    LM-JM similarity, the top `pr_docs_num` hits are fed to rocchio_PRF(),
    and the top 1000 hits of the expanded query are written in TREC run
    format to a .res file under ./Rocchio_output/ (created if missing).

    Args:
        pr_docs_num: no. of pseudo relevant docs
        N: no. of expansion terms
        alpha, beta: Rocchio model parameters
        weightScheme (string): TFIDF or BM25 for term weighting; any other
            value prints a warning and falls back to TFIDF

    Returns:
        None
    """
    import os

    LAMBDA = 0.4   # LM-JM baseline lambda parameter
    similarityModel = LMJelinekMercerSimilarity(LAMBDA)

    if weightScheme not in ('BM25', 'TFIDF'):
        print('Warning: weightScheme enetered not a valid parameter value. Taking default weightScheme: TFIDF')
        weightScheme = 'TFIDF'

    # change result file path below
    rocchioOutputPath = f"./Rocchio_output/LMJM_Rocchio_#PRdocs={pr_docs_num}_N={N}_alpha={alpha}_beta={beta}_{weightScheme}.res"
    # The output folder may not exist yet on a fresh checkout.
    os.makedirs(os.path.dirname(rocchioOutputPath), exist_ok=True)

    # setting up the searcher
    analyzer = EnglishAnalyzer()    # used same analyzer as indexer
    parser = QueryParser('CONTENT', analyzer)   # one parser for all topics
    index_path = './index/'
    directory = FSDirectory.open(File(index_path).toPath())
    reader = DirectoryReader.open(directory)
    try:
        searcher = IndexSearcher(reader)
        # setting the similarity model
        searcher.setSimilarity(similarityModel)

        print('\nRetrieving ...')

        qidField = 'num'
        queryKeywordsField = 'title'     # other fields are 'desc' and 'narr'
        k = 1000                         # no. of results written per query
        runTag = f"aman_lmjm_{LAMBDA}-rocchio_{alpha}_{beta}"

        # `with` guarantees the result file is closed even if a query fails.
        with open(rocchioOutputPath, 'w') as f:
            # search on 50 queries from the topic file 'trec6.xml'
            for topic in topics:
                qid = topic.find(qidField).text.strip()
                q = topic.find(queryKeywordsField).text.strip()

                # A few titles contain '/', which the query parser cannot
                # handle unless the special characters are escaped first.
                escaped_q = parser.escape(q)
                query = parser.parse(escaped_q)

                print(f'Rocchio, PRdocs = {pr_docs_num}, N = {N}, alpha = {alpha}, beta = {beta}; qid = {qid}, retrieving & writing ...', end=' ')

                # getting the top pseudo relevant docs using the searcher
                scoreDocs = searcher.search(query, pr_docs_num).scoreDocs

                # Rocchio expanded query retrieval
                modified_query = rocchio_PRF(query, scoreDocs, N=N, alpha=alpha, beta=beta, weightScheme=weightScheme)

                # getting the top k search results using the searcher
                scoreDocs = searcher.search(modified_query, k).scoreDocs

                # writing all k doc results in TREC format, one write per query
                rows = []
                for rank, scoreDoc in enumerate(scoreDocs, start=1):
                    doc = searcher.doc(scoreDoc.doc)
                    rows.append(f"{qid}\tQ0\t{doc.get('DOCID')}\t{rank}\t{scoreDoc.score}\t{runTag}\n")
                f.write(''.join(rows))

                print('complete!')
    finally:
        # Release the index handles; a parameter sweep calls this function
        # many times and would otherwise leave one reader open per call.
        reader.close()
        directory.close()

    print('Search completed! Search results exported to a .res file in the current directory.\n')

In [None]:
# Grid search: run Rocchio (default TFIDF weighting) for every combination
# of feedback-set size, expansion size, alpha and beta.

# alpha and beta each take the values 0.25, 0.50, ..., 4.00
alphas = [hundredths/100 for hundredths in range(25, 401, 25)]
betas = [hundredths/100 for hundredths in range(25, 401, 25)]

pr_doc_counts = range(10, 31, 5)       # 10, 15, ..., 30 pseudo relevant docs
expansion_sizes = range(50, 121, 10)   # 50, 60, ..., 120 expansion terms

for pr_docs_num in pr_doc_counts:
    for N in expansion_sizes:
        for alpha in alphas:
            for beta in betas:
                lmjm_rocchio(pr_docs_num=pr_docs_num, N=N, alpha=alpha, beta=beta)

In [26]:
# Inspection cell: build the Rocchio-expanded query for one topic with
# hand-picked parameters and display it as the cell output.

# --- parameters of choice ---
topic = topics[0]
pr_docs_num = 10    # no. of pseudo relevant docs
N = 50              # no. of expansion terms
alpha = 1
beta = 4

# --- searcher with LM-JM similarity ---
model = 'lmjm'
LAMBDA = 0.4   # LM-JM baseline lambda parameter
index_path = './index/'
directory = FSDirectory.open(File(index_path).toPath())
searcher = IndexSearcher(DirectoryReader.open(directory))
similarityModel = LMJelinekMercerSimilarity(LAMBDA)
searcher.setSimilarity(similarityModel)

# --- topic id and title text ---
qidField = 'num'
queryKeywordsField = 'title'     # other fields are 'desc' and 'narr'
qid = topic.find(qidField).text.strip()
q = topic.find(queryKeywordsField).text.strip()

# --- parse the title (same analyzer as the indexer) ---
# A few titles contain '/', which the query parser cannot handle unless the
# special characters are escaped first.
analyzer = EnglishAnalyzer()
escaped_q = QueryParser('CONTENT', analyzer).escape(q)
query = QueryParser('CONTENT', analyzer).parse(escaped_q)

# --- first-pass retrieval, then Rocchio expansion ---
scoreDocs = searcher.search(query, pr_docs_num).scoreDocs
modified_query = rocchio_PRF(query, scoreDocs, N=N, alpha=alpha, beta=beta, weightScheme='BM25')

modified_query

<BooleanQuery: (CONTENT:crime)^25.174725 (CONTENT:organ)^14.74925 (CONTENT:russian)^13.528239 (CONTENT:yerin)^12.443677 (CONTENT:kramarev)^11.237789 (CONTENT:russia)^10.95366 (CONTENT:affair)^10.204604 (CONTENT:mvd)^9.883023 (CONTENT:crimin)^9.741795 (CONTENT:fight)^9.304386 (CONTENT:moscow)^9.254298 (CONTENT:enforc)^8.793086 (CONTENT:sphere)^8.579011 (CONTENT:combat)^8.454302 (CONTENT:arkadii)^7.974865 (CONTENT:p)^7.5909805 (CONTENT:ministri)^7.531326 (CONTENT:intern)^7.284298 (CONTENT:petersburg)^7.240315 (CONTENT:pm2705083194)^7.1797585 (CONTENT:pm2706135794)^7.1101203 (CONTENT:pm0902160994)^7.0929213 (CONTENT:sov)^7.023738 (CONTENT:ta1504192694)^7.0129485 (CONTENT:944f0615d)^6.930076 (CONTENT:944f0559b)^6.8468547 (CONTENT:pm3005102594)^6.830904 (CONTENT:edict)^6.7100906 (CONTENT:feder)^6.6716194 (CONTENT:law)^6.55573 (CONTENT:vasylyshyn)^6.5148463 (CONTENT:leningrad)^6.293019 (CONTENT:pm1506102194)^6.252441 (CONTENT:antimafia)^6.1444435 (CONTENT:drug)^6.111882 (CONTENT:usr)^6.10779

In [34]:
# Single retrieval run with one hand-picked parameter setting.
# Pass weightScheme='TFIDF' instead to produce the TF-IDF run.
pr_docs_num, N = 10, 50
alpha, beta = 1, 0.25

lmjm_rocchio(pr_docs_num=pr_docs_num, N=N, alpha=alpha, beta=beta, weightScheme='BM25')


Retrieving ...
Rocchio, PRdocs = 10, N = 50, alpha = 1, beta = 0.25; qid = 301, retrieving & writing ... complete!
Rocchio, PRdocs = 10, N = 50, alpha = 1, beta = 0.25; qid = 302, retrieving & writing ... complete!
Rocchio, PRdocs = 10, N = 50, alpha = 1, beta = 0.25; qid = 303, retrieving & writing ... complete!
Rocchio, PRdocs = 10, N = 50, alpha = 1, beta = 0.25; qid = 304, retrieving & writing ... complete!
Rocchio, PRdocs = 10, N = 50, alpha = 1, beta = 0.25; qid = 305, retrieving & writing ... complete!
Rocchio, PRdocs = 10, N = 50, alpha = 1, beta = 0.25; qid = 306, retrieving & writing ... complete!
Rocchio, PRdocs = 10, N = 50, alpha = 1, beta = 0.25; qid = 307, retrieving & writing ... complete!
Rocchio, PRdocs = 10, N = 50, alpha = 1, beta = 0.25; qid = 308, retrieving & writing ... complete!
Rocchio, PRdocs = 10, N = 50, alpha = 1, beta = 0.25; qid = 309, retrieving & writing ... complete!
Rocchio, PRdocs = 10, N = 50, alpha = 1, beta = 0.25; qid = 310, retrieving & writin