In [1]:
import re
from nltk.tokenize import WordPunctTokenizer, TreebankWordTokenizer

class SimpleTokenizer:      # Tokenize and remove punctuation  

    def __init__(self):
        self._tokenizer_ = WordPunctTokenizer()

    def tokenize(self, text):
        tokens = self._tokenizer_.tokenize(text)
        tokens = [token.lower() for token in tokens if token.isalnum()]
        return tokens

class SimpleTokenizer2:     # Tokenizes and removes punctuation but does not remove contractions

    def __init__(self):
        self._tokenizer_ = TreebankWordTokenizer()

    def tokenize(self, text):
        tokens = self._tokenizer_.tokenize(text)
        tokens = [token.lower() for token in tokens if not (len(token)==1 and not token.isalnum())]
        return tokens

In [2]:
import os
import re
import numpy as np
import string
import nltk
import math
nltk.download('popular')
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not","can't": "can not","can't've": "cannot have",
"'cause": "because","could've": "could have","couldn't": "could not","couldn't've": "could not have",
"didn't": "did not","doesn't": "does not","don't": "do not","hadn't": "had not","hadn't've": "had not have",
"hasn't": "has not","haven't": "have not","he'd": "he would","he'd've": "he would have","he'll": "he will",
"he'll've": "he will have","how'd": "how did","how'd'y": "how do you","how'll": "how will","i'd": "i would",
"i'd've": "i would have","i'll": "i will","i'll've": "i will have","i'm": "i am","i've": "i have",
"isn't": "is not","it'd": "it would","it'd've": "it would have","it'll": "it will","it'll've": "it will have",
"let's": "let us","ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not",
"mightn't've": "might not have","must've": "must have","mustn't": "must not","mustn't've": "must not have",
"needn't": "need not","needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
"oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
"shan't've": "shall not have","she'd": "she would","she'd've": "she would have","she'll": "she will",
"she'll've": "she will have","should've": "should have","shouldn't": "should not",
"shouldn't've": "should not have","so've": "so have","that'd": "that would","that'd've": "that would have",
"there'd": "there would","there'd've": "there would have",
"they'd": "they would","they'd've": "they would have","they'll": "they will","they'll've": "they will have",
"they're": "they are","they've": "they have","to've": "to have","wasn't": "was not","we'd": "we would",
"we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have",
"weren't": "were not","what'll": "what will","what'll've": "what will have","what're": "what are",
"what've": "what have","when've": "when have","where'd": "where did",
"where've": "where have","who'll": "who will","who'll've": "who will have","who've": "who have",
"why've": "why have","will've": "will have","won't": "will not","won't've": "will not have",
"would've": "would have","wouldn't": "would not","wouldn't've": "would not have","y'all": "you all",
"y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
"you'd": "you would","you'd've": "you would have","you'll": "you will","you'll've": "you will have",
"you're": "you are","you've": "you have"}

# Expand contractions helper function
def expand_contractions(text, contractions_dict=contractions_dict):
    contractions_re = re.compile('(%s)' % '|'.join(re.escape(key) for key in contractions_dict.keys()))
    return contractions_re.sub(lambda match: contractions_dict[match.group(0)], text)

# Pre-Process Text Parameters are boolean. TRUE if you want to remove that.
def preProcessText(text,lowercase=True,contractions=False,punctuations=True,digits=False,stemming=False,Stopwords=True):

    if lowercase:
        text = text.lower()

    if contractions:
        text = expand_contractions(text)

    if Stopwords:
        stop_words = set(stopwords.words('english'))
        text = ' '.join(word for word in text.split()if word.lower() not in stop_words)
        
    if punctuations:
        translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
        text = text.translate(translator)

    if stemming:
        stemmer = SnowballStemmer("english")
        words = nltk.tokenize.wordpunct_tokenize(text)
        stemmed_words = [stemmer.stem(word) for word in words]
        text = ' '.join(stemmed_words)

    if digits:
        digits_pattern = r"\d+(\.\d+)?"
        text = re.sub(digits_pattern, "<NUMBER>", text)    

    text = ' '.join(word for word in text.split() if (word.isascii() and len(word) > 1))

    return text


def KL_Divergence(document_model, relevance_model_probabilities):
    doc_prob = []
    rel_prob = []
    for token in relevance_model_probabilities.keys():
        doc_prob.append(document_model.token_probability(token))
        rel_prob.append(relevance_model_probabilities[token])
    doc_prob = np.array(doc_prob)
    rel_prob = np.array(rel_prob)
    
    return np.sum(doc_prob * np.log10(doc_prob/rel_prob))

def KL_Divergence_Reverse(document_model, query_model):
    accumulator = 0
    for word in query_model.keys():
        if query_model[word] > 0:
            accumulator += query_model[word] * math.log10(query_model[word]/document_model.token_probability(word))
    return accumulator

def parse_results_file(results_file_path):    
    results = {}    # query_id -> rank -> doc_id

    with open(results_file_path, 'r') as rf:
        for line in rf.readlines():
            if line.strip():
                query_id, ignore_col, doc_id, rank, score, run_id = line.split()
                query_id = int(query_id)
                rank = int(rank)
                if query_id not in results:
                    results[query_id] = {}
                results[query_id][rank] = doc_id
    return results

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/jeet/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/jeet/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/jeet/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/jeet/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/jeet/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/jeet/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!

In [15]:
from collections import defaultdict

class DocLangModel:     # Document Maximnum Likelihood Language Model with Dirichlet Smoothing
    def __init__(self,doc_id,doc_text,DIRICHLET_MU=500,tokenizer='S1'):
        self.doc_id = doc_id
        self.doc_text = preProcessText(doc_text,lowercase=True,punctuations=True,digits=False,stemming=False,Stopwords=True,contractions=True)
        self.token_freq = defaultdict(int)
        if tokenizer == "S1":
            self.tokenizer = SimpleTokenizer()
        else:
            self.tokenizer = SimpleTokenizer2()
        tokens = self.tokenizer.tokenize(self.doc_text)
        self.doc_length = len(tokens)
        for token in tokens:
            self.token_freq[token] += 1
        self.collection_model = None
        self.dirichlet_mu = DIRICHLET_MU

    def add_collection_model(self,CollectionLangModel):
        self.collection_model = CollectionLangModel
    
    def token_probability(self,token):
        return ((self.token_freq.get(token,0) + self.dirichlet_mu * self.collection_model.calc_M_c(token))/(self.doc_length + self.dirichlet_mu))


class CollectionLangModel:     # Maximnum Likelihood Language Model based on term frequencies in the collection as a whole
    def __init__(self):
        self.coll_total_tokens = 0
        self.coll_token_freq = defaultdict(int)

    def add_DocLangModel(self,docModel:DocLangModel):
        self.coll_total_tokens += docModel.doc_length
        for token,freq in docModel.token_freq.items():
            self.coll_token_freq[token] += freq

    def add_unk(self, percentage):
        self.coll_token_freq['<UNK>'] = int(percentage/100 * self.coll_total_tokens)
    
    def calc_M_c(self,token):
        return (self.coll_token_freq.get(token,0)/self.coll_total_tokens)

In [4]:
import os
import csv
import re
import sys
from collections import defaultdict,Counter
from gensim.models import Word2Vec

csv.field_size_limit(sys.maxsize)

query_file = "/Users/jeet/Downloads/COL764-A2-2024/queries.tsv"
top_100_file = "/Users/jeet/Downloads/COL764-A2-2024/top100docs.tsv"
collection_file = "/Users/jeet/Downloads/COL764-A2-2024/docs.tsv"
output_file = "output-file"
expansion_file = "expansion-file"

Queries = {}    # query_id -> text
Top100 = {}     # query_id -> { rank -> (doc_id,score) }
DocIDs = set()  # set of doc_id
Docs = {}       # doc_id -> { url,title,body }


In [5]:
# Make Queries dictionary
with open(query_file,'r') as qf:
    tsv_reader = csv.reader(qf,delimiter='\t')
    next(tsv_reader)
    for row in tsv_reader:
        Queries[row[0]] = row[1]

In [6]:
# Make Top100 dictionary & DocIDs set
with open(top_100_file,'r') as top:
    tsv_reader = csv.reader(top,delimiter='\t')
    next(tsv_reader)
    query = ''
    rank = 1
    for row in tsv_reader:
        if row[0]!= query:
            query = row[0]
            rank = 1
            Top100[row[0]] = {}
        Top100[row[0]][rank] = (row[1],row[2])
        DocIDs.add(row[1])
        rank+=1

In [7]:
# Make Docs dictionary
with open(collection_file,'r') as cf:
    tsv_reader = csv.reader(cf,delimiter='\t')
    for row in tsv_reader:
        if row[0] in DocIDs:
            Docs[row[0]] = {'url':row[1],'title':row[2],'body':row[3]}
        if len(Docs)==2400:
            break

In [12]:
doc_info = Docs['D750949']
text = preProcessText((doc_info['title']+' '+doc_info['body']),lowercase=True,Stopwords=True,contractions=True,punctuations=True,digits=False,stemming=False)
text

'worthy worthy also found in thesaurus medical legal idioms encyclopedia wikipedia th adj worth merit value worthy cause honorable admirable worthy fellow sufficient worth deserving worthy revered worthy acclaim pl eminent distinguished person adv american dictionary english language fifth edition copyright 2016 houghton mifflin harcourt publishing company published houghton mifflin harcourt publishing company rights reserved worthy adj thier thiest1 postpositive often foll infinitive sufficient merit value for something someone specified deserving2 worth value meritn pl thiesoften facetious person distinguished character merit collins english dictionary complete unabridged 12th edition 2014 harper collins publishers 1991 1994 1998 2000 2003 2006 2007 2009 2011 adj adj adequate great merit character value worthy successor commendable excellence merit deserving meritorious effort worthy praise person eminent worth merit position adv worthya combining form worthy used meanings of fit new

In [30]:
# Prepare corpus for training
corpus = []

for query_id, query_text in Queries.items():
    corpus.append(preProcessText(query_text,lowercase=True,Stopwords=True,contractions=True,punctuations=True,digits=False,stemming=False).split())

for doc_id, doc_info in Docs.items():
    corpus.append(preProcessText((doc_info['title']+' '+doc_info['body']),lowercase=True,Stopwords=True,contractions=True,punctuations=True,digits=False,stemming=False).split())
                                                                                    

In [42]:
# Train Word2Vec to generate local embeddings
for sg in [0,1]:
    print('sg: ',sg)
    if sg == 1:
        mod = "Skip"
    else:
        mod = "CBOW"
    for window in [3,5,7,9]:
        print('window: ',window)

        w2v_model = Word2Vec(sentences=corpus, vector_size=300, window=window, min_count=5, sg=sg)

        word_to_index = w2v_model.wv.key_to_index
        embeddings = []
        index_to_word = {}
        for word,idx in word_to_index.items():
            index_to_word[idx] = word
        
        for word,idx in word_to_index.items():
            embeddings.append(w2v_model.wv[word])

        embeddings = np.array(embeddings)    
        V, k = embeddings.shape 
                
        query_count = 0
        for qid,qtext in Queries.items():   # query_id, query_text
            query_count +=1
            query = preProcessText(qtext,lowercase=True,punctuations=True,digits=False,stemming=False,Stopwords=True,contractions=True)

            original_QT = SimpleTokenizer2().tokenize(query)     # Original Query Terms

            query_vector = np.zeros(shape=(V,1))    # binary vector to represent terms present in query
            for word, idx in word_to_index.items():
                query_vector[idx, 0] = original_QT.count(word.lower())
            
            temp = np.matmul(embeddings.T,query_vector)     
            query_sim_scores = np.matmul(embeddings, temp)      # Similarity scores of trained W2V words to the query terms

            flat_sim_matrix = query_sim_scores.flatten()
            sorted_idx = np.argsort(flat_sim_matrix)[::-1]      # Sort the similaity scores in descending order and store the indexes

            for Top_N in [5,10,15,20]:
                sim_word_idx = [(idx, 0) for idx in sorted_idx[:Top_N]]    # Select the top N words
                
                expanded_QT = []    # Expanded Query Terms
                
                # Write expansion terms to Expansions file
                with open(expansion_file+'_'+mod+'_win_'+str(window)+'@Top_'+str(Top_N),'a') as expansion:
                    expansion.write(f"{qid}: ")
                    for term_idx,_ in sim_word_idx:
                        expansion.write(f"{index_to_word[term_idx]} ")
                        score = flat_sim_matrix[term_idx]
                        expanded_QT.append((index_to_word[term_idx], score))
                    expansion.write("\n")

                norm_c = sum([score for word, score in expanded_QT])
                EQT_score = {k: v/norm_c for k, v in expanded_QT}   # Normalised scores for expanded query term.    Expansion Term : score
                OQT_score = Counter(original_QT)                    # Original Query Term : 1

                # Get Language Model for each of the top 100 doc for the query containing token freq and doc length
                for DIRICHLET_MU in [250,500,750,1000,1250]:
                    doc_lang_models = []
                    for rank,(doc_id,score) in Top100[qid].items():
                        doc_text = Docs[doc_id]['title'] + " " + Docs[doc_id]['body']
                        doc_lang_models.append(DocLangModel(doc_id,doc_text,DIRICHLET_MU,'S1'))
                    
                    # Get Collection Statistics - Collection is of the top 100 docs
                    Coll_lang_model  = CollectionLangModel()
                    for DLM in doc_lang_models:
                        Coll_lang_model.add_DocLangModel(DLM)

                    Coll_lang_model.add_unk(0.5)

                    # Add Collection Statistics to each Document language Model to calculate M_d(t) = (f_(t,d) + mu * M_c(t))/(l_d + mu)
                    for DLM in doc_lang_models:
                        DLM.add_collection_model(Coll_lang_model)

                    # Recalculate Original Query Term scores only for terms present in the collection else <UNK>
                    new_OQT_score = defaultdict(int)
                    for QT, score in OQT_score.items():
                        key = QT if Coll_lang_model.coll_token_freq.get(QT) is not None else '<UNK>'
                        new_OQT_score[key] += score
                    OQT_score = new_OQT_score

                    # Recalculate Expanded Query Term scores only for terms present in the collection else <UNK>
                    new_EQT_score = defaultdict(int)
                    for QT, score in EQT_score.items():
                        key = QT if Coll_lang_model.coll_token_freq.get(QT) is not None else '<UNK>'
                        new_EQT_score[key] += score
                    EQT_score = new_EQT_score

                    EQT_norm = sum(EQT_score.values())  # EQT scores normalization constant
                    OQT_norm = sum(OQT_score.values())  # OQT scores normalization constant

                    # compute relevance model probabilities
                    for Model_Lambda in [0.15,0.3,0.45,0.6,0.75,0.9]:
                        relevance_model_prob = defaultdict(int)
                        for token in Coll_lang_model.coll_token_freq.keys():
                            relevance_model_prob[token] += (Model_Lambda) * (EQT_score.get(token,0)/EQT_norm)
                            relevance_model_prob[token] += (1-Model_Lambda) * (OQT_score.get(token,0)/OQT_norm)

                        results = []
                        for i in range(len(doc_lang_models)):
                            results.append((doc_lang_models[i].doc_id, 1-KL_Divergence_Reverse(doc_lang_models[i], relevance_model_prob)))

                        results.sort(key=lambda x: x[1], reverse=True)

                        with open(output_file+'_'+mod+'_win_'+str(window)+'@mu_'+str(DIRICHLET_MU)+'@Top_'+str(Top_N)+'@LAMBDA_'+str(Model_Lambda),'a') as out:
                            for idx, (doc_id,score) in enumerate(results):
                                out.write(f"{qid} Q0 {doc_id} {idx+1} {score:.6f} runid1\n")
                
                # print(f"Processed Query Number: {query_count}\r")



sg:  0
window:  3
window:  5
window:  7
window:  9
sg:  1
window:  3
window:  5
window:  7
window:  9


In [25]:
word_to_index

{' ': 0,
 'e': 1,
 'a': 2,
 'i': 3,
 'r': 4,
 's': 5,
 't': 6,
 'n': 7,
 'o': 8,
 'l': 9,
 'd': 10,
 'c': 11,
 'm': 12,
 'u': 13,
 'p': 14,
 'g': 15,
 'h': 16,
 'y': 17,
 'b': 18,
 'f': 19,
 'w': 20,
 'v': 21,
 'k': 22,
 '0': 23,
 '1': 24,
 '2': 25,
 'x': 26,
 'j': 27,
 '9': 28,
 '5': 29,
 '3': 30,
 '4': 31,
 '7': 32,
 '8': 33,
 'q': 34,
 '6': 35,
 'z': 36}

In [9]:
# For each Query
query_count = 0
qid = '42255'
qtext = Queries['42255']
query_count +=1
query = preProcessText(qtext,lowercase=True,punctuations=True,digits=False,stemming=False,Stopwords=True,contractions=False)


In [10]:
query_vector = np.zeros(shape=(V,1)) ### binary vector representing terms present in query
for word, idx in word_to_index.items():
    query_vector[idx, 0] = query.lower().split().count(word)

In [11]:
# sim_scores = np.matmul(embeddings, embeddings.T)
# query_sim_scores = np.matmul(sim_scores, query_vector)

temp = np.matmul(embeddings.T,query_vector)
query_sim_scores = np.matmul(embeddings, temp)

In [15]:
flat_sim_matrix = query_sim_scores.flatten()
sorted_idx = np.argsort(flat_sim_matrix)[::-1]

In [29]:
Top_N = 20
sim_word_idx = [(idx, 0) for idx in sorted_idx[:Top_N]]

original_QT = query.lower().split()     # Original Query Terms
expanded_QT = []    # Expanded Query Terms

for term_idx,_ in sim_word_idx:
    score = flat_sim_matrix[term_idx]
    expanded_QT.append((index_to_word[term_idx], score))

In [38]:
from collections import Counter
norm_c = sum([score for word, score in expanded_QT])
EQT_score = {k: v/norm_c for k, v in expanded_QT}
OQT_score = Counter(original_QT)

In [43]:
doc_lang_models = []
for rank,(doc_id,score) in Top100[qid].items():
    doc_text = Docs[doc_id]['title'] + " " + Docs[doc_id]['body']
    doc_lang_models.append(DocLangModel(doc_id,doc_text,'S2'))

In [45]:
Coll_lang_model  = CollectionLangModel()
for DLM in doc_lang_models:
    Coll_lang_model.add_DocLangModel(DLM)

Coll_lang_model.add_unk(0.5)

# Add Collection Statistics to each Document language Model to calculate M_d(t) = (f_(t,d) + mu * M_c(t))/(l_d + mu)
for DLM in doc_lang_models:
    DLM.add_collection_model(Coll_lang_model)

In [48]:
new_OQT_score = defaultdict(int)
for QT, score in OQT_score.items():
    key = QT if Coll_lang_model.coll_token_freq.get(QT) is not None else '<UNK>'
    new_OQT_score[key] += score
OQT_score = new_OQT_score

new_EQT_score = defaultdict(int)
for QT, score in EQT_score.items():
    key = QT if Coll_lang_model.coll_token_freq.get(QT) is not None else '<UNK>'
    new_EQT_score[key] += score
EQT_score = new_EQT_score

EQT_norm = sum(EQT_score.values())
OQT_norm = sum(OQT_score.values())

In [62]:
EQT_score.get('dental',0)

0.04829463661729081

In [66]:
W2V_Lambda = 0.5
relevance_model_prob = defaultdict(int)
for token in Coll_lang_model.coll_token_freq.keys():
    relevance_model_prob[token] += (W2V_Lambda) * (EQT_score.get(token,0)/EQT_norm)
    relevance_model_prob[token] += (1-W2V_Lambda) * (OQT_score.get(token,0)/OQT_norm)

results = []
for i in range(len(doc_lang_models)):
    results.append((doc_lang_models[i].doc_id, 1-KL_Divergence_Reverse(doc_lang_models[i], relevance_model_prob)))

results.sort(key=lambda x: x[1], reverse=True)

In [68]:
len(results)

100

In [29]:
tok = SimpleTokenizer()
tok.tokenize("I should've been there. Y'all know what I'm sayin'.")

['i',
 'should',
 've',
 'been',
 'there',
 'y',
 'all',
 'know',
 'what',
 'i',
 'm',
 'sayin']