In [1]:
import xml.etree.ElementTree as ET

# Location of the TREC topic file (250 queries).
topicFilePath = '../trec678-robust.xml'

# Parse the XML once; `topics` is the root element, and the cells below
# iterate over its children (one child element per topic).
tree = ET.parse(topicFilePath)
topics = tree.getroot()

In [2]:
import lucene
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.search.similarities import ClassicSimilarity
from org.apache.lucene.search.similarities import LMDirichletSimilarity
from org.apache.lucene.analysis.en import EnglishAnalyzer
from java.io import File

In [3]:
# Initialise PyLucene's JVM bridge; this cell runs before any Lucene object
# is constructed in the cells below. The call returns the JCC environment
# object, which the notebook displays as this cell's output.
lucene.initVM()

<jcc.JCCEnv at 0x7f71b810e9d0>

In [4]:
# Name of the indexed field that queries are parsed against.
FIELDNAME = 'CONTENT'

# Open the on-disk Lucene index: java.io.File -> Path -> FSDirectory -> reader.
index_path = '../index/'
index_dir_path = File(index_path).toPath()
directory = FSDirectory.open(index_dir_path)
indexReader = DirectoryReader.open(directory)

### TF-IDF and LM-Dirichlet retrieval

In [5]:
# Setting up the searcher.
analyzer = EnglishAnalyzer()    # same analyzer as used by the indexer

# Reuse the reader already opened on `directory` instead of opening (and
# never closing) a second DirectoryReader over the same index.
searcher = IndexSearcher(indexReader)

# Child tags read from each topic element; hoisted because they are loop-invariant.
qidField = 'num'
queryKeywordsField = 'title'     # other fields are 'desc' and 'narr'

# One parser is enough for every topic; building two per topic was redundant.
queryParser = QueryParser(FIELDNAME, analyzer)

# List of (query id, parsed Lucene query) pairs consumed by the retrieval functions.
queries = []
for topic in topics:
    qid = topic.find(qidField).text.strip()
    q = topic.find(queryKeywordsField).text.strip()

    # A few titles contain '/' (query-syntax special character), which the
    # parser cannot handle unless those characters are escaped first.
    escaped_q = queryParser.escape(q)
    query = queryParser.parse(escaped_q)
    queries.append((qid, query))


def tfidf(k=1000):
    """Run every query with ClassicSimilarity and write a TREC-format run file.

    Sets the similarity of the module-level `searcher` to ClassicSimilarity,
    searches each (qid, query) pair in the module-level `queries` list and
    writes one tab-separated line per hit:
    qid, 'Q0', DOCID, rank (starting at 1), score, run tag.

    Args:
        k: maximum number of hits requested per query (default 1000).

    Side effects:
        Changes the similarity on the shared `searcher`; creates the output
        directory if it is missing and overwrites the result file.
    """
    import os

    model = 'tfidf'
    # setting the similarity model
    searcher.setSimilarity(ClassicSimilarity())

    # change result file path below
    outputPath = './Models res files/TFIDF_678robust.res'
    # Make sure the target folder exists so open() does not fail on a fresh checkout.
    os.makedirs(os.path.dirname(outputPath), exist_ok=True)

    # The context manager guarantees the file is closed even if a search or
    # write raises part-way through.
    with open(outputPath, 'w') as f:
        print('\nTFIDF; retrieving...', end=' ')

        for qid, query in queries:
            # getting the top k search results using the searcher
            scoreDocs = searcher.search(query, k).scoreDocs

            # writing all returned hits in TREC format; ranks start at 1
            for rank, scoreDoc in enumerate(scoreDocs, start=1):
                doc = searcher.doc(scoreDoc.doc)
                f.write(f"{qid}\tQ0\t{doc.get('DOCID')}\t{rank}\t{scoreDoc.score}\taman_{model}\n")

        print('complete!')

    print('Search completed! Search results exported to a .res file in the current directory.\n')
    
    
def lmdir(mu, k=1000):
    """Run every query with LMDirichletSimilarity(mu) and write a TREC-format run file.

    Sets the similarity of the module-level `searcher` to
    LMDirichletSimilarity(mu), searches each (qid, query) pair in the
    module-level `queries` list and writes one tab-separated line per hit:
    qid, 'Q0', DOCID, rank (starting at 1), score, run tag.

    Args:
        mu: value passed to LMDirichletSimilarity; it is also embedded in the
            output file name and in the run tag.
        k: maximum number of hits requested per query (default 1000).

    Side effects:
        Changes the similarity on the shared `searcher`; creates the output
        directory if it is missing and overwrites the result file.
    """
    import os

    model = 'lmdir'
    # setting the similarity model
    searcher.setSimilarity(LMDirichletSimilarity(mu))

    # change result file path below
    outputPath = f'./Models res files/LMDir{mu}_678robust.res'
    # Make sure the target folder exists so open() does not fail on a fresh checkout.
    os.makedirs(os.path.dirname(outputPath), exist_ok=True)

    # The context manager guarantees the file is closed even if a search or
    # write raises part-way through.
    with open(outputPath, 'w') as f:
        print(f'\nLMDir, mu = {mu}; retrieving...', end=' ')

        for qid, query in queries:
            # getting the top k search results using the searcher
            scoreDocs = searcher.search(query, k).scoreDocs

            # writing all returned hits in TREC format; ranks start at 1
            for rank, scoreDoc in enumerate(scoreDocs, start=1):
                doc = searcher.doc(scoreDoc.doc)
                f.write(f"{qid}\tQ0\t{doc.get('DOCID')}\t{rank}\t{scoreDoc.score}\taman_{model}_{mu}\n")

        print('complete!')

    print('Search completed! Search results exported to a .res file in the current directory.\n')

In [6]:
# Produce the TF-IDF run file (ClassicSimilarity) for all loaded queries.
tfidf()

# Produce the LM-Dirichlet run file with smoothing parameter mu = 1000.
lmdir(mu=1000.0)


TFIDF; retrieving... complete!
Search completed! Search results exported to a .res file in the current directory.


LMDir, mu = 1000.0; retrieving... complete!
Search completed! Search results exported to a .res file in the current directory.

