# IT550 Information Retrieval Assignment-5
## Student ID - 202011032

## Importing Gensim and other necessary libraries and unzipping FIRE 2010 dataset

In [1]:
!pip install --upgrade gensim
import pprint
import os
import re
import json
import gzip
import joblib
import logging
import numpy as np
import pandas as pd
from smart_open import open, register_compressor
from gensim.parsing.preprocessing import preprocess_string
from gensim import corpora, models

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# FILE_PATHS = [os.path.join(dp, f) for dp, dn, filenames in os.walk(PATH) for f in filenames]
DATASET_PATH = "/content/drive/MyDrive/FIRE_Dataset_EN_2010/English-Data.tgz"
TOPICS_PATH = "/content/drive/MyDrive/FIRE_Dataset_EN_2010/en.topics.76-125.2010.txt"
QRELS_PATH = "/content/drive/MyDrive/FIRE_Dataset_EN_2010/en.qrels.76-125.2010.rel.txt"

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/5c/4e/afe2315e08a38967f8a3036bbe7e38b428e9b7a90e823a83d0d49df1adf5/gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 157kB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.8.3


**Create a class for retrieving data from FIRE Dataset**

In [2]:
class FIRECorpus:
    def __init__(self, path):
        self._docno_list = []
        self._text_list = []
        self._dataset_path = path
        self._process_corpus()

    def _handle_tgz(self, fileobj, mode):
        return gzip.GzipFile(fileobj=fileobj, mode=mode)

    def _process_corpus(self):
        register_compressor('.tgz', self._handle_tgz)

        self._docno_list = []
        self._text_list = []
        text_str = ""
        text_flag = 0

        for line in open(self._dataset_path, encoding='utf-8'):
            if line.startswith('<DOCNO>'):
                self._docno_list.append(line.replace('<DOCNO>', '').replace('</DOCNO>', '').strip())
                continue
            
            if line.startswith('<TEXT>'):
                text_flag = 1
                continue
            
            if text_flag:
                if line.startswith('</TEXT>'):
                    text_flag = 0
                    text_tokens = list(map(str, preprocess_string(text_str)))
                    self._text_list.append(text_tokens)
                    text_tokens = []
                    text_str = ""
                    continue
                
                text_str += " " + line.strip()
    
    def __iter__(self):
        for text in self._text_list:
            yield text
    
    def get_docno_list(self):
        return self._docno_list
    
    def get_tokens_list(self):
        return self._text_list

## Create word vectors / word embedding matrices from documents

In [3]:
fire_corpus = FIRECorpus(DATASET_PATH)

### Create and save word2vec model and word embedding matrix using skipgram approach

In [37]:
fire_skipgram_model = models.word2vec.Word2Vec(
    sentences=fire_corpus,
    size=300,
    window=10,
    min_count=1,
    workers=4,
    sg=1,  # 1 for skip-gram; otherwise CBOW
    hs=1,  # 1 for hierarchical softmax; 0 for negative sampling
    negative=0,
    iter=5
)

2021-03-10 09:38:36,275 : INFO : collecting all words and their counts
2021-03-10 09:38:36,279 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-03-10 09:38:36,790 : INFO : PROGRESS: at sentence #10000, processed 2408443 words, keeping 52140 word types
2021-03-10 09:38:37,266 : INFO : PROGRESS: at sentence #20000, processed 4528056 words, keeping 72421 word types
2021-03-10 09:38:37,672 : INFO : PROGRESS: at sentence #30000, processed 6510850 words, keeping 84647 word types
2021-03-10 09:38:38,056 : INFO : PROGRESS: at sentence #40000, processed 8241739 words, keeping 101474 word types
2021-03-10 09:38:38,554 : INFO : PROGRESS: at sentence #50000, processed 10388700 words, keeping 114428 word types
2021-03-10 09:38:38,953 : INFO : PROGRESS: at sentence #60000, processed 12252237 words, keeping 123698 word types
2021-03-10 09:38:39,685 : INFO : PROGRESS: at sentence #70000, processed 15575041 words, keeping 136990 word types
2021-03-10 09:38:40,094 : INFO :

In [38]:
fire_skipgram_model.save('/content/drive/MyDrive/IR_Assignment5/fire_skipgram_model.model')

2021-03-10 10:27:45,422 : INFO : saving Word2Vec object under /content/drive/MyDrive/IR_Assignment5/fire_skipgram_model.model, separately None
2021-03-10 10:27:45,424 : INFO : storing np array 'vectors' to /content/drive/MyDrive/IR_Assignment5/fire_skipgram_model.model.wv.vectors.npy
2021-03-10 10:27:48,232 : INFO : not storing attribute vectors_norm
2021-03-10 10:27:48,239 : INFO : storing np array 'syn1' to /content/drive/MyDrive/IR_Assignment5/fire_skipgram_model.model.trainables.syn1.npy
2021-03-10 10:27:51,868 : INFO : not storing attribute cum_table
2021-03-10 10:27:57,223 : INFO : saved /content/drive/MyDrive/IR_Assignment5/fire_skipgram_model.model


In [39]:
fire_skipgram_model.wv.save('/content/drive/MyDrive/IR_Assignment5/fire_skipgram.wordvectors')

2021-03-10 10:28:02,304 : INFO : saving Word2VecKeyedVectors object under /content/drive/MyDrive/IR_Assignment5/fire_skipgram.wordvectors, separately None
2021-03-10 10:28:02,306 : INFO : storing np array 'vectors' to /content/drive/MyDrive/IR_Assignment5/fire_skipgram.wordvectors.vectors.npy
2021-03-10 10:28:03,418 : INFO : not storing attribute vectors_norm
2021-03-10 10:28:07,199 : INFO : saved /content/drive/MyDrive/IR_Assignment5/fire_skipgram.wordvectors


### Create and save word2vec model and word embedding matrix using CBOW approach

In [40]:
fire_cbow_model = models.word2vec.Word2Vec(
    sentences=fire_corpus,
    size=300,
    window=10,
    min_count=1,
    workers=4,
    sg=0,  # 1 for skip-gram; otherwise CBOW
    hs=1,  # 1 for hierarchical softmax; 0 for negative sampling
    negative=0,
    iter=5
)

2021-03-10 10:28:55,519 : INFO : collecting all words and their counts
2021-03-10 10:28:55,521 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-03-10 10:28:55,996 : INFO : PROGRESS: at sentence #10000, processed 2408443 words, keeping 52140 word types
2021-03-10 10:28:56,446 : INFO : PROGRESS: at sentence #20000, processed 4528056 words, keeping 72421 word types
2021-03-10 10:28:56,843 : INFO : PROGRESS: at sentence #30000, processed 6510850 words, keeping 84647 word types
2021-03-10 10:28:57,224 : INFO : PROGRESS: at sentence #40000, processed 8241739 words, keeping 101474 word types
2021-03-10 10:28:57,661 : INFO : PROGRESS: at sentence #50000, processed 10388700 words, keeping 114428 word types
2021-03-10 10:28:58,056 : INFO : PROGRESS: at sentence #60000, processed 12252237 words, keeping 123698 word types
2021-03-10 10:28:58,746 : INFO : PROGRESS: at sentence #70000, processed 15575041 words, keeping 136990 word types
2021-03-10 10:28:59,169 : INFO :

In [43]:
fire_cbow_model.save('/content/drive/MyDrive/IR_Assignment5/fire_cbow_model.model')

2021-03-10 10:40:31,374 : INFO : saving Word2Vec object under /content/drive/MyDrive/IR_Assignment5/fire_cbow_model.model, separately None
2021-03-10 10:40:31,376 : INFO : storing np array 'vectors' to /content/drive/MyDrive/IR_Assignment5/fire_cbow_model.model.wv.vectors.npy
2021-03-10 10:40:32,422 : INFO : not storing attribute vectors_norm
2021-03-10 10:40:32,423 : INFO : storing np array 'syn1' to /content/drive/MyDrive/IR_Assignment5/fire_cbow_model.model.trainables.syn1.npy
2021-03-10 10:40:33,476 : INFO : not storing attribute cum_table
2021-03-10 10:40:37,723 : INFO : saved /content/drive/MyDrive/IR_Assignment5/fire_cbow_model.model


In [44]:
fire_cbow_model.wv.save('/content/drive/MyDrive/IR_Assignment5/fire_cbow.wordvectors')

2021-03-10 10:40:50,108 : INFO : saving Word2VecKeyedVectors object under /content/drive/MyDrive/IR_Assignment5/fire_cbow.wordvectors, separately None
2021-03-10 10:40:50,116 : INFO : storing np array 'vectors' to /content/drive/MyDrive/IR_Assignment5/fire_cbow.wordvectors.vectors.npy
2021-03-10 10:40:50,968 : INFO : not storing attribute vectors_norm
2021-03-10 10:40:54,413 : INFO : saved /content/drive/MyDrive/IR_Assignment5/fire_cbow.wordvectors


## Build document and query vectors from word vectors of both approaches

In [45]:
SKIPGRAM_WV_PATH = "/content/drive/MyDrive/IR_Assignment5/fire_skipgram.wordvectors"
CBOW_WV_PATH = "/content/drive/MyDrive/IR_Assignment5/fire_cbow.wordvectors"

In [46]:
# Load skipgram and cbow word vectors
skipgram_wv = models.KeyedVectors.load(SKIPGRAM_WV_PATH)
cbow_wv = models.KeyedVectors.load(CBOW_WV_PATH)

2021-03-10 10:41:04,628 : INFO : loading Word2VecKeyedVectors object from /content/drive/MyDrive/IR_Assignment5/fire_skipgram.wordvectors
2021-03-10 10:41:06,089 : INFO : loading vectors from /content/drive/MyDrive/IR_Assignment5/fire_skipgram.wordvectors.vectors.npy with mmap=None
2021-03-10 10:41:06,561 : INFO : setting ignored attribute vectors_norm to None
2021-03-10 10:41:06,562 : INFO : loaded /content/drive/MyDrive/IR_Assignment5/fire_skipgram.wordvectors
2021-03-10 10:41:06,650 : INFO : loading Word2VecKeyedVectors object from /content/drive/MyDrive/IR_Assignment5/fire_cbow.wordvectors
2021-03-10 10:41:09,042 : INFO : loading vectors from /content/drive/MyDrive/IR_Assignment5/fire_cbow.wordvectors.vectors.npy with mmap=None
2021-03-10 10:41:09,707 : INFO : setting ignored attribute vectors_norm to None
2021-03-10 10:41:09,709 : INFO : loaded /content/drive/MyDrive/IR_Assignment5/fire_cbow.wordvectors


In [56]:
# Define functions to build document vectors and query vectors from word embedding matrices
def build_document_vectors(corpus, wordvectors):
    docno_list = corpus.get_docno_list()
    tokens_list = corpus.get_tokens_list()
    doc_vectors = {}

    for idx in range(len(docno_list)):
        doc_vec = np.zeros((wordvectors.vectors.shape[1],))
        vectors_added = 0
        for token in tokens_list[idx]:
            try:
                doc_vec += wordvectors.get_vector(token)
                vectors_added += 1
            except:
                continue
        if vectors_added != 0:
            doc_vec = doc_vec / vectors_added
        doc_vectors[docno_list[idx]] = doc_vec
    
    return doc_vectors

def build_query_vectors(queries_path, wordvectors):
    from bs4 import BeautifulSoup
    with open(queries_path) as topics_file:
        soup = BeautifulSoup(topics_file, features="html.parser")

        qid_all = [int(num.text) for num in soup.find_all("num")]
        # Here we are taking text/query from the <desc> tag.
        text_all = [preprocess_string(desc.text) for desc in soup.find_all("desc")]
    
    query_vectors = {}

    for idx in range(len(text_all)):
        query_vec = np.zeros((wordvectors.vectors.shape[1],))
        vectors_added = 0
        for qtoken in text_all[idx]:
            try:
                query_vec += wordvectors.get_vector(qtoken)
                vectors_added += 1
            except:
                continue
        if vectors_added != 0:
            query_vec = query_vec / vectors_added
        query_vectors[qid_all[idx]] = query_vec
    
    return query_vectors

In [57]:
sg_doc_vectors = build_document_vectors(fire_corpus, skipgram_wv)
cb_doc_vectors = build_document_vectors(fire_corpus, cbow_wv)

sg_query_vectors = build_query_vectors(TOPICS_PATH, skipgram_wv)
cb_query_vectors = build_query_vectors(TOPICS_PATH, cbow_wv)

'''
Document vectors will be of the form:
{
    'docno1.utf8': array([...], dtype=float32),
    'docno2.utf8': array([...], dtype=float32),
    ...
    'docnoN.utf8': array([...], dtype=float32),
}
where each value is a numpy array of shape (300,)
'''

## Perform retrieval for the queries and get top 10 documents rank list for Skipgram and CBoW approaches

In [58]:
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / ((np.linalg.norm(vec1))*(np.linalg.norm(vec2)))

def perform_retrieval(doc_vectors, query_vectors):
    '''
    Calculates cosine similarity of query vectors with document vectors and
    retrieves top 10 documents for each query with their scores.
    '''
    top10docs_per_query = {qid: dict() for qid in query_vectors.keys()}
    
    for i, (docno, doc_vec) in enumerate(doc_vectors.items()):
        is_doc_empty = False
        for qid, query_vec in query_vectors.items():
            if all(doc_vec == 0):
                top10docs_per_query[qid].update({docno: 0.0})
                is_doc_empty = True
            else:
                top10docs_per_query[qid].update({
                    docno: cosine_similarity(doc_vec, query_vec) 
                })
        
        if is_doc_empty:
            print(f"Assigned 0.0 score to empty document {docno} for all queries.")
        if i % 10000 == 0:
            print(f"Processed {i+1} documents.")

    print("\nRetrieving top 10 documents for each query...")
    top10docs_per_query = {
        qid: {
            docno: score 
            for docno, score 
            in sorted(top10docs_per_query[qid].items(), 
                      key= lambda x: x[1], reverse=True)[:10]
        } 
        for qid in top10docs_per_query.keys()
    }

    return top10docs_per_query

In [59]:
sg_rank_list = perform_retrieval(sg_doc_vectors, sg_query_vectors)
cb_rank_list = perform_retrieval(cb_doc_vectors, cb_query_vectors)

Processed 1 documents.
Processed 10001 documents.
Processed 20001 documents.
Assigned 0.0 score to empty document 1051119_nation_story_5495933.utf8 for all queries.
Assigned 0.0 score to empty document 1050420_nation_index.utf8 for all queries.
Processed 30001 documents.
Assigned 0.0 score to empty document 1050822_sports_index.utf8 for all queries.
Processed 40001 documents.
Assigned 0.0 score to empty document 1041105_bengal_index.utf8 for all queries.
Assigned 0.0 score to empty document 1040908_opinion_story_3728792.utf8 for all queries.
Assigned 0.0 score to empty document 1040908_opinion_story_3728789.utf8 for all queries.
Assigned 0.0 score to empty document 1041105_calcutta_story_3940880.utf8 for all queries.
Assigned 0.0 score to empty document 1041105_calcutta_story_3964865.utf8 for all queries.
Assigned 0.0 score to empty document 1041105_calcutta_story_3968774.utf8 for all queries.
Assigned 0.0 score to empty document 1041105_calcutta_story_3968787.utf8 for all queries.
Ass

In [61]:
pprint.pprint(sg_rank_list)
pprint.pprint(cb_rank_list)

{76: {'1060705_opinion_story_6435923.utf8': 0.7237748408992479,
      '1070110_nation_story_7242573.utf8': 0.717629959841929,
      '1070530_nation_story_7849973.utf8': 0.7578472024556191,
      '1070530_nation_story_7849974.utf8': 0.7411521493820568,
      '1070531_opinion_story_7852019.utf8': 0.7227926090723727,
      '1070602_nation_story_7865944.utf8': 0.7118074605613458,
      '1070603_nation_story_7869357.utf8': 0.740247584804927,
      '1070607_opinion_story_7886010.utf8': 0.714936293352962,
      '1070611_nation_story_7906812.utf8': 0.7405594869098351,
      '1070710_frontpage_story_8037684.utf8': 0.7226299452912215},
 77: {'1051229_foreign_story_5657775.utf8': 0.89850312156005,
      '1060713_foreign_story_6472449.utf8': 0.8882040379904228,
      '1060714_foreign_story_6477224.utf8': 0.8879688335217292,
      '1060716_foreign_story_6485226.utf8': 0.883279389522009,
      '1060719_foreign_story_6496938.utf8': 0.884135874985177,
      '1060720_foreign_story_6501311.utf8': 0.8886

## Calculate **Mean Average Precision** for both Skipgram and CboW approaches

In [67]:
def read_qrels(qrels_path):
    '''Reads and return qrels objects from the given path as a dictionary with the relevant documents.'''
    qrels = {}
    with open(qrels_path) as qrels_file:
        lines = qrels_file.readlines()
        for line in lines:
            line = line.strip()
            if line.endswith('1'):
                line = line.split()
                qid = int(line[0])
                qrels[qid] = qrels.get(qid, []) + [line[2]]
    return qrels

def get_avg_precision(rank_list, qrels):
    '''Returns a dictionary with qids and their relevant average precisions.'''
    # Store (precision x relevance) as a list for a query
    q_prec_rel = {qid: 0 for qid in qrels.keys()}

    for qid, rank_dict in sorted(rank_list.items()):
        rel_doc_ctr = 0
        avg_prec = 0.0
        for pos, (doc, _) in enumerate(sorted(rank_dict.items(), key=lambda x: x[1], reverse=True), 1):
            # Calculate precision@pos+1 x relevance@pos+1 for the retrieved documents
            if doc in qrels[qid]:
                rel_doc_ctr += 1
                avg_prec += rel_doc_ctr / (pos + 1)
            # print(f"doc in qrels: {doc in qrels[qid]}, pos+1: {pos+1}, rel_doc_ctr: {rel_doc_ctr}, avg_prec: {avg_prec}")
        if rel_doc_ctr != 0:
            q_prec_rel[qid] = ( 1 / rel_doc_ctr ) * avg_prec
        else:
            q_prec_rel[qid] = avg_prec
        # print(f"Qid: {qid}, Avg_prec: {q_prec_rel[qid]}")
    
    return q_prec_rel

**Average precision for each queries in skipgram approach**

In [71]:
sg_avg_prec = get_avg_precision(sg_rank_list, read_qrels(QRELS_PATH))

print("qid \t Avg. Precision")
for qid, ap in sg_avg_prec.items():
    print(f"{qid} \t {ap: 0.4}")

qid 	 Avg. Precision
76 	  0.5333
77 	  0.4446
78 	  0.0
79 	  0.5
80 	  0.3278
81 	  0.4304
82 	  0.5848
83 	  0.2778
84 	  0.5556
85 	  0.2508
86 	  0.1667
87 	  0.0
88 	  0.393
89 	  0.5054
90 	  0.3778
91 	  0.6073
92 	  0.3492
93 	  0.5
94 	  0.798
95 	  0.4843
96 	  0.2
97 	  0.1742
98 	  0.4821
99 	  0.0
100 	  0.5639
101 	  0.3882
102 	  0.2
103 	  0.0
104 	  0.4459
105 	  0.4778
106 	  0.5833
107 	  0.4709
108 	  0.3929
109 	  0.3333
110 	  0.6479
111 	  0.5613
112 	  0.7347
113 	  0.5595
114 	  0.6079
115 	  0.5667
116 	  0.6092
117 	  0.0
118 	  0.0
119 	  0.2786
120 	  0.6155
121 	  0.3333
122 	  0.09091
123 	  0.6479
124 	  0.09091
125 	  0.4433


**Average precision for each queries in cbow approach**

In [73]:
cb_avg_prec = get_avg_precision(cb_rank_list, read_qrels(QRELS_PATH))

print("qid \t Avg. Precision")
for qid, ap in cb_avg_prec.items():
    print(f"{qid} \t {ap: 0.4}")

qid 	 Avg. Precision
76 	  0.3778
77 	  0.4974
78 	  0.3333
79 	  0.5
80 	  0.3667
81 	  0.2761
82 	  0.2667
83 	  0.375
84 	  0.5889
85 	  0.3056
86 	  0.0
87 	  0.125
88 	  0.3929
89 	  0.2944
90 	  0.2917
91 	  0.3175
92 	  0.284
93 	  0.6098
94 	  0.798
95 	  0.6679
96 	  0.25
97 	  0.0
98 	  0.4167
99 	  0.0
100 	  0.6073
101 	  0.4083
102 	  0.5556
103 	  0.0
104 	  0.4988
105 	  0.4268
106 	  0.5833
107 	  0.5417
108 	  0.5
109 	  0.4016
110 	  0.71
111 	  0.605
112 	  0.5938
113 	  0.5644
114 	  0.5613
115 	  0.5139
116 	  0.6239
117 	  0.0
118 	  0.1
119 	  0.225
120 	  0.5903
121 	  0.5
122 	  0.0
123 	  0.4635
124 	  0.0
125 	  0.4028


### Get $MAP$ value for both approaches

In [76]:
sg_N = len(sg_avg_prec.keys())
sg_map = (1 / sg_N) * sum(sg_avg_prec.values())
print(f"Skipgram approach Mean Average Precision: {sg_map: 0.4}")

cb_N = len(cb_avg_prec.keys())
cb_map = (1 / cb_N) * sum(cb_avg_prec.values())
print(f"CBoW Approach Mean Average Precision: {cb_map: 0.4}")

Skipgram approach Mean Average Precision:  0.3917
CBoW Approach Mean Average Precision:  0.3862
