In [28]:
from collections import defaultdict

DIRICHLET_MU = 500   # default Dirichlet smoothing parameter (mu)

class DocLangModel:
    """Document maximum-likelihood language model with Dirichlet smoothing.

    The document text is pre-processed and tokenized once, at construction
    time. The resulting term frequencies and document length are later combined
    with a collection model (attached via ``add_collection_model``) to give

        M_d(t) = (f_(t,d) + mu * M_c(t)) / (l_d + mu)
    """

    def __init__(self, doc_id, doc_text, dirichlet_mu=None):
        """Build the term-frequency table for one document.

        Args:
            doc_id: Identifier stored unchanged on the instance.
            doc_text: Raw document text; it is passed through ``preProcessText``
                (lower-casing, punctuation removal, stop-word removal) and then
                through ``SimpleTokenizer`` before the tokens are counted.
            dirichlet_mu: Optional smoothing parameter for this document.
                ``None`` (the default) selects the module-level ``DIRICHLET_MU``.
        """
        self.doc_id = doc_id
        self.doc_text = preProcessText(doc_text, lowercase=True, punctuations=True,
                                       digits=False, stemming=False, Stopwords=True)
        self.tokenizer = SimpleTokenizer()
        tokens = self.tokenizer.tokenize(self.doc_text)
        self.doc_length = len(tokens)
        self.token_freq = defaultdict(int)
        for token in tokens:
            self.token_freq[token] += 1
        # Filled in by add_collection_model(); required before token_probability()
        self.collection_model = None
        self.dirichlet_mu = DIRICHLET_MU if dirichlet_mu is None else dirichlet_mu

    def add_collection_model(self, CollectionLangModel):
        """Attach the collection model whose ``calc_M_c`` supplies the background probability."""
        self.collection_model = CollectionLangModel

    def token_probability(self, token):
        """Return the Dirichlet-smoothed probability of ``token`` in this document.

        Args:
            token: The term to look up.

        Returns:
            ``(f_(t,d) + mu * M_c(t)) / (l_d + mu)``, or ``0.0`` when the
            denominator is zero (empty document with ``mu == 0``).

        Raises:
            RuntimeError: If no collection model has been attached yet.
        """
        if self.collection_model is None:
            raise RuntimeError(
                "DocLangModel.token_probability() needs a collection model; "
                "call add_collection_model() first"
            )
        denominator = self.doc_length + self.dirichlet_mu
        if denominator == 0:
            return 0.0
        # .get() instead of indexing keeps look-ups of unseen tokens from growing the defaultdict
        smoothed_count = (self.token_freq.get(token, 0)
                          + self.dirichlet_mu * self.collection_model.calc_M_c(token))
        return smoothed_count / denominator


class CollectionLangModel:
    """Maximum-likelihood language model based on term frequencies in the collection as a whole."""

    def __init__(self):
        self.coll_total_tokens = 0                  # sum of the lengths of all added documents
        self.coll_token_freq = defaultdict(int)     # token -> frequency over all added documents

    def add_DocLangModel(self, docModel: "DocLangModel"):
        """Fold one document's length and term frequencies into the collection statistics.

        Args:
            docModel: Object exposing ``doc_length`` and a ``token_freq`` mapping.
        """
        self.coll_total_tokens += docModel.doc_length
        for token, freq in docModel.token_freq.items():
            self.coll_token_freq[token] += freq

    def add_unk(self, percentage):
        """Set the '<UNK>' pseudo-count to ``percentage`` percent of the current token total.

        The count is truncated with ``int()`` and overwrites any previous
        '<UNK>' entry. ``coll_total_tokens`` is left unchanged, so after this
        call the values returned by ``calc_M_c`` over the whole vocabulary sum
        to slightly more than 1.
        """
        self.coll_token_freq['<UNK>'] = int(percentage / 100 * self.coll_total_tokens)

    def calc_M_c(self, token):
        """Return the collection probability M_c(token) = frequency / total tokens.

        Unseen tokens give 0. An empty collection (no tokens added yet) gives
        0.0 instead of raising ZeroDivisionError.
        """
        if self.coll_total_tokens == 0:
            return 0.0
        # .get() avoids inserting unseen tokens into the defaultdict
        return self.coll_token_freq.get(token, 0) / self.coll_total_tokens

In [20]:
import os
import re
import numpy as np
import nltk
import string
# nltk.download('popular')
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

# Pre-process text. Each boolean flag switches one step ON when True:
#   lowercase    -> convert the text to lower case
#   Stopwords    -> drop NLTK English stop words
#   punctuations -> delete every character in string.punctuation
#   stemming     -> Snowball-stem each token
#   digits       -> replace numbers with the literal "<NUMBER>"
_STOP_WORDS = None                                          # English stop-word set, loaded on first use
_STEMMER = None                                             # SnowballStemmer instance, created on first use
_DIGITS_RE = re.compile(r"\d+(\.\d+)?")
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def _snowball_stemmer():
    """Return a shared English SnowballStemmer, creating it on first use."""
    global _STEMMER
    if _STEMMER is None:
        _STEMMER = SnowballStemmer("english")
    return _STEMMER


def preProcessText(text,lowercase=True,punctuations=True,digits=False,stemming=False,Stopwords=True):
    """Normalise ``text`` according to the enabled steps and return the new string.

    Steps run in a fixed order: lower-casing, stop-word removal, punctuation
    removal, stemming, number replacement. Because stop words are removed
    before punctuation is stripped, a stop word that still has punctuation
    attached (e.g. "the,") is not removed.

    Args:
        text: Input string.
        lowercase: Lower-case the text.
        punctuations: Delete all ``string.punctuation`` characters.
        digits: Replace each number (optionally with a decimal part) by "<NUMBER>".
        stemming: Tokenize with ``wordpunct_tokenize`` and Snowball-stem every token.
        Stopwords: Drop whitespace-separated words found in NLTK's English stop list.

    Returns:
        The processed text.
    """
    if lowercase:
        text = text.lower()

    if Stopwords:
        stop_words = _english_stop_words()
        # word.lower() keeps the match case-insensitive even when lowercase=False
        text = ' '.join(word for word in text.split() if word.lower() not in stop_words)

    if punctuations:
        text = text.translate(_PUNCT_TABLE)

    if stemming:
        stemmer = _snowball_stemmer()
        words = nltk.tokenize.wordpunct_tokenize(text)
        text = ' '.join(stemmer.stem(word) for word in words)

    if digits:
        text = _DIGITS_RE.sub("<NUMBER>", text)

    return text


def KL_Divergence(document_model, relevance_model_probabilities):
    """Return sum_t P_d(t) * log10(P_d(t) / P_r(t)) over the tokens of the relevance model.

    Args:
        document_model: Object whose ``token_probability(token)`` gives P_d(t).
        relevance_model_probabilities: Mapping token -> P_r(t); its keys define
            the tokens summed over.

    Returns:
        The divergence as a NumPy float. Tokens with P_d(t) == 0 contribute 0
        (the usual 0 * log 0 = 0 convention) instead of turning the sum into
        NaN. A token with P_d(t) > 0 and P_r(t) == 0 makes the result ``inf``.
    """
    tokens = list(relevance_model_probabilities)
    doc_prob = np.array([document_model.token_probability(token) for token in tokens], dtype=float)
    rel_prob = np.array([relevance_model_probabilities[token] for token in tokens], dtype=float)

    # Restrict the sum to the document model's support so that 0 * log(0) never appears
    support = doc_prob > 0
    with np.errstate(divide='ignore'):
        terms = doc_prob[support] * np.log10(doc_prob[support] / rel_prob[support])
    return np.sum(terms)

In [4]:
import re
from typing import List
from nltk.tokenize import WordPunctTokenizer

class SimpleTokenizer:
    """Tokenizer that keeps only the alphanumeric tokens of a text, lower-cased."""

    def __init__(self):
        # Underlying NLTK tokenizer; its output is filtered in tokenize()
        self._tokenizer_ = WordPunctTokenizer()

    def tokenize(self, text: str)->List[str]:
        """Split ``text`` with the wrapped tokenizer, drop non-alphanumeric tokens, lower-case the rest."""
        raw_tokens = self._tokenizer_.tokenize(text)
        alnum_tokens = filter(str.isalnum, raw_tokens)
        return list(map(str.lower, alnum_tokens))

In [6]:
import os
import csv
import re
import sys
from collections import defaultdict

# csv.field_size_limit() raises OverflowError when the value does not fit in a C long
# (e.g. sys.maxsize on Windows), so back off until the platform accepts the limit.
_csv_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_csv_field_limit)
        break
    except OverflowError:
        _csv_field_limit //= 2

# Directory holding the assignment data; override it with the COL764_DATA_DIR environment variable.
DATA_DIR = os.environ.get("COL764_DATA_DIR", "/Users/jeet/Downloads/COL764-A2-2024")

query_file = os.path.join(DATA_DIR, "queries.tsv")
top_100_file = os.path.join(DATA_DIR, "top100docs.tsv")
collection_file = os.path.join(DATA_DIR, "docs.tsv")
output_file = "output-file"
expansion_file = "expansion-file"

Queries = {}    # query_id -> text
Top100 = {}     # query_id -> { rank -> (doc_id,score) }
DocIDs = set()  # set of doc_id
Docs = {}       # doc_id -> { url,title,body }

In [7]:
# Make Queries dictionary: query_id -> query text
# newline='' is how the csv module documentation says files handed to csv.reader should be opened.
with open(query_file, 'r', newline='') as qf:
    tsv_reader = csv.reader(qf, delimiter='\t')
    next(tsv_reader, None)      # skip the first row; the default keeps an empty file from raising StopIteration
    for row in tsv_reader:
        if len(row) < 2:
            continue            # blank or truncated line: nothing to store
        Queries[row[0]] = row[1]

Queries

{'42255': 'average salary for dental hygienist in nebraska',
 '47210': 'average wedding dress alteration cost',
 '67316': 'can fever cause miscarriage early pregnancy',
 '135802': 'definition of laudable',
 '156498': 'do google docs auto save',
 '169208': 'does mississippi have an income tax',
 '174463': 'dog day afternoon meaning',
 '258062': 'how long does it take to remove wisdom tooth',
 '324585': 'how much money do motivational speakers make',
 '330975': 'how much would it cost to install my own wind turbine',
 '332593': 'how often to button quail lay eggs',
 '336901': 'how old is vanessa redgrave',
 '673670': 'what is a alm',
 '701453': 'what is a statutory deed',
 '730539': 'what is chronometer who invented it',
 '768208': 'what is mamey',
 '877809': 'what metal are hip replacements made of',
 '911232': 'what type of conflict does della face in o, henry the gift of the magi',
 '938400': 'when did family feud come out?',
 '940547': 'when did rock n roll begin?',
 '997622': 'where

In [8]:
# Make Top100 dictionary & DocIDs set
# Ranks are assigned in file order, per query. The rank comes from the number of documents
# already stored for that query, so rows of one query need not be contiguous in the file.
with open(top_100_file, 'r', newline='') as top:
    tsv_reader = csv.reader(top, delimiter='\t')
    next(tsv_reader, None)      # skip the first row; tolerate an empty file
    queries_seen = set()        # queries met during this pass (a re-run of the cell starts them afresh)
    for row in tsv_reader:
        if len(row) < 3:
            continue            # blank or truncated line
        query = row[0]
        if query not in queries_seen:
            queries_seen.add(query)
            Top100[query] = {}
        rank = len(Top100[query]) + 1
        Top100[query][rank] = (row[1], row[2])
        DocIDs.add(row[1])
len(DocIDs)

2400

In [9]:
# Make Docs dictionary: doc_id -> {url, title, body} for every document listed in DocIDs
# Only ids not yet present in Docs are looked for, and the scan stops as soon as all of them
# have been found (the stop condition follows DocIDs instead of a hard-coded count).
# NOTE(review): csv.reader's default dialect treats '"' as a quote character; if document
# bodies can contain unbalanced quotes, rows may get merged. Confirm the file format and
# consider quoting=csv.QUOTE_NONE.
missing_ids = DocIDs.difference(Docs)
if missing_ids:
    with open(collection_file, 'r', newline='') as cf:
        tsv_reader = csv.reader(cf, delimiter='\t')
        for row in tsv_reader:
            if not row or row[0] not in missing_ids:
                continue
            Docs[row[0]] = {'url': row[1], 'title': row[2], 'body': row[3]}
            missing_ids.discard(row[0])
            if not missing_ids:
                break

CPU times: user 1min 51s, sys: 8.93 s, total: 2min
Wall time: 2min 3s


In [32]:
# For each query: re-rank its retrieved documents with a relevance model estimated from those documents.
# The run file is opened once, in write mode, so that executing this cell again replaces the
# previous results instead of appending a duplicate copy of every line.
with open(output_file, 'w') as out:
    for qid, qtext in Queries.items():
        query = preProcessText(qtext, lowercase=True, punctuations=True, digits=False, stemming=False, Stopwords=True)

        # Language model (token frequencies + document length) for each retrieved document, in rank order
        doc_lang_models = []
        for doc_id, _score in Top100[qid].values():
            doc_text = Docs[doc_id]['title'] + " " + Docs[doc_id]['body']
            doc_lang_models.append(DocLangModel(doc_id, doc_text))

        # Collection statistics - the "collection" is this query's retrieved documents only
        Coll_lang_model = CollectionLangModel()
        for DLM in doc_lang_models:
            Coll_lang_model.add_DocLangModel(DLM)

        Coll_lang_model.add_unk(0.5)

        # Attach the collection statistics so each document can compute
        # M_d(t) = (f_(t,d) + mu * M_c(t))/(l_d + mu)
        for DLM in doc_lang_models:
            DLM.add_collection_model(Coll_lang_model)

        # Query terms absent from the collection vocabulary are mapped to '<UNK>'
        query_tokens = [token if token in Coll_lang_model.coll_token_freq else '<UNK>' for token in query.split()]
        print(query_tokens)

        # Probability of entering q as a query given that d is a relevant document:
        # the product of the smoothed probabilities of the query terms
        Query_prob_per_doc = []
        for DLM in doc_lang_models:
            query_prob = 1
            for term in query_tokens:
                query_prob *= DLM.token_probability(term)
            Query_prob_per_doc.append(query_prob)

        Avg_query_prob = sum(Query_prob_per_doc)/len(Query_prob_per_doc)

        # Relevance model: for every vocabulary token, the query-likelihood-weighted average of its
        # document probabilities, normalised by the average query likelihood
        relevance_model_prob = {}
        for token in Coll_lang_model.coll_token_freq:
            weighted_sum = 0
            for DLM, doc_query_prob in zip(doc_lang_models, Query_prob_per_doc):
                weighted_sum += DLM.token_probability(token) * doc_query_prob
            relevance_model_prob[token] = weighted_sum / len(doc_lang_models) / Avg_query_prob

        # Score = 1 - divergence, so documents closer to the relevance model rank higher
        results = [(DLM.doc_id, 1 - KL_Divergence(DLM, relevance_model_prob)) for DLM in doc_lang_models]
        results.sort(key=lambda x: x[1], reverse=True)

        for position, (doc_id, score) in enumerate(results, start=1):
            out.write(f"{qid} Q0 {doc_id} {position} {score:.6f} runid1\n")

        print(f"Processed Query : {qid}\r")


['average', 'salary', 'dental', 'hygienist', 'nebraska']
Processed Query : 42255
['average', 'wedding', 'dress', 'alteration', 'cost']
Processed Query : 47210
['fever', 'cause', 'miscarriage', 'early', 'pregnancy']
Processed Query : 67316
['definition', 'laudable']
Processed Query : 135802
['google', 'docs', 'auto', 'save']
Processed Query : 156498
['mississippi', 'income', 'tax']
Processed Query : 169208
['dog', 'day', 'afternoon', 'meaning']
Processed Query : 174463
['long', 'take', 'remove', 'wisdom', 'tooth']
Processed Query : 258062
['much', 'money', 'motivational', 'speakers', 'make']
Processed Query : 324585
['much', 'would', 'cost', 'install', 'wind', 'turbine']
Processed Query : 330975
['often', 'button', 'quail', 'lay', 'eggs']
Processed Query : 332593
['old', 'vanessa', 'redgrave']
Processed Query : 336901
['alm']
Processed Query : 673670
['statutory', 'deed']
Processed Query : 701453
['chronometer', 'invented']
Processed Query : 730539
['mamey']
Processed Query : 768208
['m

In [36]:
import re
from typing import List
from nltk.tokenize import RegexpTokenizer


class SimpleTokenizer:
    """Tokenizer that lower-cases a text and splits it on runs of the given delimiter characters.

    NOTE(review): this class reuses the name of the no-argument SimpleTokenizer defined in an
    earlier cell; once this cell has run, code that calls ``SimpleTokenizer()`` without
    arguments (e.g. ``DocLangModel.__init__``) raises TypeError until that earlier cell is re-run.
    """

    def __init__(self, delimiters):
        self.delimiters = delimiters
        # All delimiter characters go into one escaped character class; "+" merges adjacent delimiters
        escaped_chars = re.escape(''.join(delimiters))
        self._tokenizer_ = RegexpTokenizer(pattern=f"[{escaped_chars}]+", gaps=True)

    def tokenize(self, text: str)->List[str]:
        """Return the pieces of the lower-cased ``text`` produced by the wrapped tokenizer."""
        lowered = text.lower()
        return self._tokenizer_.tokenize(lowered)
    
if __name__ == "__main__":
    # Demo: tokenize two sample sentences with a fixed set of delimiter characters
    text = """This, is the 'text' I want to tokenize; My name is:chinmay. """
    text2 = "He said: 'I'm 50% sure that it's 10:30 am - (early morning) on 03/05/2024; check [this]!"
    tokenizer = SimpleTokenizer(list(" ,.:;\"'/-%()[]"))
    tokens, tokens2 = (tokenizer.tokenize(sample) for sample in (text, text2))
    print(tokens, tokens2, sep="\n")


['this', 'is', 'the', 'text', 'i', 'want', 'to', 'tokenize', 'my', 'name', 'is', 'chinmay']
['he', 'said', 'i', 'm', '50', 'sure', 'that', 'it', 's', '10', '30', 'am', 'early', 'morning', 'on', '03', '05', '2024', 'check', 'this', '!']


In [32]:
class SimpleTokenizer2:
    """Tokenizer built on ``re.split``: splits on runs of delimiter characters and lower-cases the pieces."""

    def __init__(self):
        # Character class of delimiters; "+" makes consecutive delimiters act as one split point
        self.pattern = r"[\ ,\.:;\"'/\-%\(\)\[\]]+"

    def tokenize(self, text):
        """Return the non-empty, lower-cased pieces of ``text`` between delimiter runs."""
        pieces = re.split(self.pattern, text)
        # filter(None, ...) discards the empty strings produced at the edges of the text
        return list(map(str.lower, filter(None, pieces)))

if __name__ == "__main__":
    # Demo: run the re.split-based tokenizer on the same two sample sentences
    text = """This, is the 'text' I want to tokenize; My name is:chinmay. """
    text2 = "He said: 'I'm 50% sure that it's 10:30 am - (early morning) on 03/05/2024; check [this]!"
    tokenizer = SimpleTokenizer2()
    tokens, tokens2 = (tokenizer.tokenize(sample) for sample in (text, text2))
    print(tokens, tokens2, sep="\n")

['this', 'is', 'the', 'text', 'i', 'want', 'to', 'tokenize', 'my', 'name', 'is', 'chinmay']
['he', 'said', 'i', 'm', '50', 'sure', 'that', 'it', 's', '10', '30', 'am', 'early', 'morning', 'on', '03', '05', '2024', 'check', 'this', '!']


In [28]:
# Show how re.escape renders the delimiter characters once they are joined into a single string
print(re.escape(" ,.:;\"'/-%()[]"))

\ ,\.:;"'/\-%\(\)\[\]
