In [1]:
import math
corpus = [[] for i in range(100)]  

In [2]:
i = 0
for i in range(100):
    file_name ="Datasets/doc_" + str(i) + ".txt"
    with open(file_name, 'r', encoding = "utf-8") as file:
        for line in file:
            for word in line.split():
                corpus[i].append(word)

In [3]:
i = 0
# contains all the words in the corpus in a set without duplicates
unique_words = {"and"}
for i in range(100):
    unique_words = unique_words.union(set(corpus[i]))

In [4]:
# average length of all the documents
def avgDocLen():
    avg = 0
    for i in range(100):
        avg = avg + len(corpus[i])
    return avg/100

In [5]:
# calculate frequency of documents in which word occurs
def calculateFreqWord(word):
    num_of_docs = 0
    normalizedTermsTemp = []
    for j in range(len(corpus[i])):
        normalizedTermsTemp.append(corpus[i][j].lower())
    if word.lower() in normalizedTermsTemp:
        num_of_docs += 1
    return num_of_docs;

In [6]:
# occurence of a word in a document
def freqWordDoc(word, doc):
    count = 0
    freqTerm = dict()
    normalizedTerms = []
    for j in range(len(doc)):
        normalizedTerms.append(doc[j].lower())
    for term in normalizedTerms:
        if term == word:
            count = count + 1
    return count

In [7]:
# frequency of a word in a query
def freqWordQuery(word, query):
    count = 0
    for i in range(len(query)):
        if word == query[i]:
            count = count + 1
    return count

In [8]:
S1 = 5.0
K1 = 0.2
def calculateFij1(word, doc):
    fij = freqWordDoc(word, doc)
    num = S1 * fij
    den = K1 + fij
    return num/den

In [9]:
def calculateFij2(word, doc):
    fij = freqWordDoc(word, doc)
    num = S1*fij
    den = (K1 + len(doc))/avgDocLen() + fij
    return num/den

In [10]:
S3 = 3.0
K3 = 2.3
def calculateFiq(word, query):
    fiq = freqWordQuery(word, query)
    num = S3 + fiq
    den = K3 + fiq
    return num/den

# Part 1: Original BM Formulas

In [11]:
N = 100
# the query list is created to contain only words that is present in the corpus
def BM1Original(queryList, doc):
    rankBM1 = 0.0
    for i in range(len(queryList)):
        num = N - calculateFreqWord(queryList[i]) + 0.5
        den = calculateFreqWord(queryList[i]) + 0.5
        rankBM1 += math.log(num/den)
    return rankBM1

In [12]:
def BM11Original(queryList, doc):
    rankBM11 = 0.0
    for i in range(len(queryList)):
        num = N - calculateFreqWord(queryList[i]) + 0.5
        den = calculateFreqWord(queryList[i]) + 0.5
        rankBM11 += calculateFij2(queryList[i], doc)*calculateFiq(queryList[i], queryList)*math.log(num/den)
    return rankBM11

In [13]:
def BM15Original(queryList, doc):
    rankBM15 = 0.0
    for i in range(len(queryList)):
        num = N - calculateFreqWord(queryList[i]) + 0.5
        den = calculateFreqWord(queryList[i]) + 0.5
        rankBM15 += calculateFij1(queryList[i], doc)*calculateFiq(queryList[i], queryList)*math.log(num/den)
    return rankBM15

## Part 2: Simpler BM Formulas

In [14]:
def BM11Simplified(queryList, doc):
    rankBM11 = 0.0
    for i in range(len(queryList)):
        num = N - calculateFreqWord(queryList[i]) + 0.5
        den = calculateFreqWord(queryList[i]) + 0.5
        temp = ((K1 + 1)*freqWordDoc(queryList[i], doc))/((K1*len(doc))/avgDocLen() + freqWordDoc(queryList[i], doc))
        rankBM11 += (temp*math.log(num/den))
    return rankBM11

In [15]:
def BM15Simplified(queryList, doc):
    rankBM15 = 0.0
    for i in range(len(queryList)):
        num = N - calculateFreqWord(queryList[i]) + 0.5
        den = calculateFreqWord(queryList[i]) + 0.5
        temp = (K1+1)*freqWordDoc(queryList[i], doc)/(K1+freqWordDoc(queryList[i], doc))
        rankBM15 += (temp*math.log(num/den))
    return rankBM15

## Part 3: BM25

In [16]:
b = 0.6
def calculateBij(word, doc):
    num = (K1 + 1)*freqWordDoc(word, doc)
    den = K1*((1-b)+ b*len(doc)/avgDocLen()) + freqWordDoc(word, doc)
    return num/den

In [17]:
def BM25(queryList, doc):
    rankBM25 = 0.0
    for i in range(len(queryList)):
        num = N - calculateFreqWord(queryList[i]) + 0.5
        den = calculateFreqWord(queryList[i]) + 0.5
        temp = calculateBij(queryList[i], doc)
        rankBM25 += (temp*math.log(num/den))
    return rankBM25

## Final Rankings

In [18]:
# converts a given query to a list
def makeQueryList(query):
    li = list(query.split(" "))
    return li

In [19]:
rankedDocs = dict()
# generates a ranked list of documents
def generateRankedList(query):
    queryList = []
    for i in range(len(query)):
        for term in unique_words:
            if term == query[i]:
                queryList.append(term)
    for i in range(100):
        rankedDocs[i] = BM25(queryList, corpus[i])
    return queryList

In [20]:
#q = "Americans victory"
#q = "squads for centralised"
#q = "Washington news for today"
# q = "eat healthy"
q = "Washington is the capital of America. Eat healthy and stay happy. Victory of Americans."
queryList = makeQueryList(q)
generateRankedList(queryList)
# sorts the documents in decreasing rank
sortedRankedDocs = sorted(rankedDocs.items(), key=lambda kv: kv[1], reverse= True)
print(sortedRankedDocs)

[(71, 24.71036951775119), (34, 24.506426921241918), (11, 24.429105399839955), (18, 24.3316480655729), (93, 24.157421184634174), (12, 24.085250007834812), (79, 24.045646301750875), (64, 24.0034283627492), (8, 23.955593852973816), (46, 23.925895947254443), (44, 23.870988045602765), (75, 23.825323898437905), (13, 23.77608549953814), (25, 23.543861845152737), (32, 23.26330910296604), (38, 23.17957775573173), (72, 23.162526160056704), (66, 22.72741236552405), (69, 22.72741236552405), (61, 22.46341073738064), (65, 19.612618962509472), (84, 19.547179209786997), (92, 19.538505321168188), (26, 19.491860713453118), (91, 19.491860713453118), (94, 19.37790086885186), (42, 19.37548457130805), (98, 19.36998405657417), (70, 19.34524987383677), (17, 19.33267972532781), (15, 19.314023874809056), (87, 19.301570649006333), (37, 19.24183015226704), (85, 19.24183015226704), (73, 19.221832830060563), (95, 19.20070571563903), (5, 19.199832694381946), (21, 19.18894760983015), (99, 19.18894760983015), (9, 19.1