## IT550 Information Retrieval Assignment - 6
### Student ID - 202011032

## Importing necessary libraries and setting paths

In [1]:
!pip install --upgrade gensim
import pprint
import os
import re
import json
import gzip
import logging
import nltk
import scipy
import numpy as np
import pandas as pd
from smart_open import open, register_compressor
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.parsing.preprocessing import preprocess_string
from gensim import corpora, models

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# FILE_PATHS = [os.path.join(dp, f) for dp, dn, filenames in os.walk(PATH) for f in filenames]
DATASET_PATH = "/content/drive/MyDrive/FIRE_Dataset_EN_2010/English-Data.tgz"
TOPICS_PATH = "/content/drive/MyDrive/FIRE_Dataset_EN_2010/en.topics.76-125.2010.txt"
QRELS_PATH = "/content/drive/MyDrive/FIRE_Dataset_EN_2010/en.qrels.76-125.2010.rel.txt"

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/5c/4e/afe2315e08a38967f8a3036bbe7e38b428e9b7a90e823a83d0d49df1adf5/gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 66.0MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.8.3
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


## Creating classes for FIRE corpus, Query expansion and Rocchio algorithm

In [61]:
class FIRECorpus:
    
    def __init__(self, path):
        self._docno_list = []
        self._text_list = []
        self._tokens_list = []
        self._dataset_path = path
        self._process_corpus()
    
    def __iter__(self):
        for text in self._tokens_list:
            yield text
    
    @property
    def docno_list(self):
        return self._docno_list
    
    @property
    def text_list(self):
        return self._text_list

    @property
    def tokens_list(self):
        return self._tokens_list

    def _handle_tgz(self, fileobj, mode):
        return gzip.GzipFile(fileobj=fileobj, mode=mode)

    def _process_corpus(self):
        register_compressor('.tgz', self._handle_tgz)

        self._docno_list = []
        self._text_list = []
        text_str = ""
        text_flag = 0

        for line in open(self._dataset_path, encoding='utf-8'):
            if line.startswith('<DOCNO>'):
                self._docno_list.append(line.replace('<DOCNO>', '').replace('</DOCNO>', '').strip())
                continue
            
            if line.startswith('<TEXT>'):
                text_flag = 1
                continue
            
            if text_flag:
                if line.startswith('</TEXT>'):
                    text_flag = 0
                    text_tokens = list(map(str, preprocess_string(text_str)))
                    
                    self._text_list.append(text_str)
                    self._tokens_list.append(text_tokens)
                    
                    text_tokens = []
                    text_str = ""
                    continue
                
                text_str += " " + line.strip()


In [7]:
class QueryExpansionRocchio:
    
    def query_expansion(self, queries):
        from nltk.corpus import wordnet
        # For each word finding at most 10 synonyms if available using wordnet
        queries_with_synonyms = []
        count = 0
        for i in range(50):
            synonyms = []
            for x in queries[i].split():
                synonyms.append(x)
                for syn in wordnet.synsets(x):
                    for lemma in syn.lemmas():
                        if count < 10:
                            if lemma.name() not in synonyms:
                                if lemma.name() not in queries[i].split():
                                    synonyms.append(lemma.name())
                                    #print(synonyms)
                                    count += 1
            count = 0
            queries_with_synonyms.append(' '.join(synonyms))
        return queries_with_synonyms
    
    def rocchio_query_vec(self, query_mat, doc_mat, qrels, docno_list, alpha=1, beta=0.75, gamma=0.15):
        # Get the sum of all document vectors
        docs_sum = np.sum(doc_mat, axis=0)

        # Get the sum of relevant document vectors per query
        sum_rel_doc = np.zeros((50,350))
        for q_no in range(50):
            for f_name in qrels[q_no + 76]:
                sum_rel_doc[q_no, :] += doc_mat[docno_list.index(f_name), :]

        # Get the sum of non relevant document vectors per query
        sum_non_rel_doc = np.zeros((50,350))
        for q_no in range(50):
            sum_non_rel_doc[q_no, :] = docs_sum - sum_rel_doc[q_no, :]

        # Apply Rocchio's Equation
        new_queries = np.zeros((50,350))
        for q_no in range(50):
            len_rel_docs = len(qrels[q_no + 76])
            if len_rel_docs != 0:
                new_queries[q_no, :] = (alpha * query_mat[q_no, :]) + ((beta / len_rel_docs) * sum_rel_doc[q_no, :]) - ((gamma / (doc_mat.shape[0] - len_rel_docs)) * sum_non_rel_doc[q_no, :])
            else:
                new_queries[q_no, :] = query_mat[q_no, :]

        # Return new query vector
        return new_queries


## Creating class for extracting queries and qrels, and performing retrieval and calculating MAP score

In [8]:
class QueryQrelsExtractor:
    
    def extract_queries(self, queries_path):
        from bs4 import BeautifulSoup
        with open(queries_path) as topics_file:
            soup = BeautifulSoup(topics_file, features="html.parser")

            qid_all = [int(num.text) for num in soup.find_all("num")]
            # Here we are taking text/query from the <desc> tag.
            text_all = [' '.join(preprocess_string(desc.text)) for desc in soup.find_all("desc")]
        
        return {qid: query for qid, query in zip(qid_all, text_all)}
    
    def extract_qrels(self, qrels_path):
        qrels = {}
        with open(qrels_path) as qrels_file:
            lines = qrels_file.readlines()
            for line in lines:
                line = line.strip()
                if line.endswith('1'):
                    line = line.split()
                    qid = int(line[0])
                    qrels[qid] = qrels.get(qid, []) + [line[2]]
        return qrels


In [35]:
class Retrieval:

    def cosine_similarity(self, vec1, vec2):
        return np.dot(vec1, vec2) / ((np.linalg.norm(vec1))*(np.linalg.norm(vec2)))

    def perform_retrieval(self, doc_vectors, query_vectors):
        '''
        Calculates cosine similarity of query vectors with document vectors and
        retrieves top 10 documents for each query with their scores.
        '''
        top10docs_per_query = {qid: dict() for qid in query_vectors.keys()}
        
        for i, (docno, doc_vec) in enumerate(doc_vectors.items()):
            is_doc_empty = False
            for qid, query_vec in query_vectors.items():
                if all(doc_vec == 0):
                    top10docs_per_query[qid].update({docno: 0.0})
                    is_doc_empty = True
                else:
                    top10docs_per_query[qid].update({
                        docno: self.cosine_similarity(doc_vec, query_vec) 
                    })
            
            if is_doc_empty:
                print(f"Assigned 0.0 score to empty document {docno} for all queries.")
            if i % 10000 == 0:
                print(f"Processed {i+1} documents.")

        print("\nRetrieving top 10 documents for each query...")
        top10docs_per_query = {
            qid: {
                docno: score 
                for docno, score 
                in sorted(top10docs_per_query[qid].items(), 
                        key= lambda x: x[1], reverse=True)[:10]
            } 
            for qid in top10docs_per_query.keys()
        }

        return top10docs_per_query
    
    def get_avg_precision(self, rank_list, qrels):
        '''Returns a dictionary with qids and their relevant average precisions.'''
        # Store (precision x relevance) as a list for a query
        q_prec_rel = {qid: 0 for qid in qrels.keys()}

        for qid, rank_dict in sorted(rank_list.items()):
            rel_doc_ctr = 0
            avg_prec = 0.0
            for pos, (doc, _) in enumerate(sorted(rank_dict.items(), key=lambda x: x[1], reverse=True), 1):
                # Calculate precision@pos+1 x relevance@pos+1 for the retrieved documents
                if doc in qrels[qid]:
                    rel_doc_ctr += 1
                    avg_prec += rel_doc_ctr / (pos + 1)
                # print(f"doc in qrels: {doc in qrels[qid]}, pos+1: {pos+1}, rel_doc_ctr: {rel_doc_ctr}, avg_prec: {avg_prec}")
            if rel_doc_ctr != 0:
                q_prec_rel[qid] = ( 1 / rel_doc_ctr ) * avg_prec
            else:
                q_prec_rel[qid] = avg_prec
            # print(f"Qid: {qid}, Avg_prec: {q_prec_rel[qid]}")
        
        return q_prec_rel
    
    def get_map(self, avg_prec_dict):
        return (1 / len(avg_prec_dict)) * sum(avg_prec_dict.values())


## Performing Query expansion and Rocchio algorithm experiments on the corpus

### Using TF-IDF Document Representation

Create FIRECorpus object (Takes around 5 mins to preprocess the corpus)

In [10]:
fire_corpus = FIRECorpus(DATASET_PATH)

In [13]:
pprint.pprint(fire_corpus.text_list[:3])

['   The Telegraph - Calcutta : Frontpage  Delayer drug passes a test  - '
 'Formulation to treat bedroom woes works in trials  Washington, May 24 '
 '(Reuters): The first drug formulated to treat premature ejaculation delays '
 'climax and also increases reported satisfaction, researchers said.  The '
 'drug, called dapoxetine, helped men delay their orgasms significantly and '
 'doubled the numbers of men and their female partners reporting good sexual '
 'satisfaction, they told a conference.  Premature ejaculation is a really '
 'common problem, affecting between 10 and 30 per cent of all men. Here is '
 'something for the first time that we have that works, said Dr Jon Pryor, '
 'chairman of the department of urologic surgery at the University of '
 'Minnesota, who led the study.  He said the drug worked quickly and with few '
 'side-effects.  It gets in rapidly. It gets out rapidly. You can take it one '
 'to three hours before intercourse, Pryor said in a telephone interview.  T

Represent Documents using TF-IDF

In [14]:
vectorizer = TfidfVectorizer(stop_words='english')
fire_corpus_tfidf = vectorizer.fit_transform(fire_corpus.text_list)
fire_corpus_tfidf.shape

(125586, 250353)

In [15]:
fire_corpus_tfidf = fire_corpus_tfidf.T
fire_corpus_tfidf.shape

(250353, 125586)

In [16]:
# Perform Dimensionality Reduction using SVD on TF-IDF matrix to reduce processing
u, s, vt = scipy.sparse.linalg.svds(fire_corpus_tfidf, k=350)

s_diag = np.diag(s)
s_diag = np.linalg.inv(s_diag)

pprint.pprint(u.shape)
pprint.pprint(vt.shape)
pprint.pprint(s_diag.shape)

vt = np.matrix.transpose(vt)

(250353, 350)
(350, 125586)
(350, 350)


In [18]:
# Save u, s_diag and vt in disk
import pickle

name = '/content/drive/MyDrive/IR_Assignment6/data/TF_IDF_SVD_350_u'
with open(name, 'wb') as obj:
    pickle.dump(u, obj)

name = '/content/drive/MyDrive/IR_Assignment6/data/TF_IDF_SVD_350_sdiag'
with open(name, 'wb') as obj:
    pickle.dump(s_diag, obj)

name = '/content/drive/MyDrive/IR_Assignment6/data/TF_IDF_SVD_350_vt'
with open(name, 'wb') as obj:
    pickle.dump(vt, obj)

In [19]:
# Read u, s_diag, vt from disk

with open('/content/drive/MyDrive/IR_Assignment6/data/TF_IDF_SVD_350_u','rb') as file:
  u = pickle.load(file)

with open('/content/drive/MyDrive/IR_Assignment6/data/TF_IDF_SVD_350_sdiag','rb') as file:
  s_diag = pickle.load(file)

with open('/content/drive/MyDrive/IR_Assignment6/data/TF_IDF_SVD_350_vt','rb') as file:
  vt = pickle.load(file)

Read queries and qrels

In [20]:
query_qrels_extractor = QueryQrelsExtractor()
queries_dict = query_qrels_extractor.extract_queries(TOPICS_PATH)
qrels_dict = query_qrels_extractor.extract_qrels(QRELS_PATH)

pprint.pprint(queries_dict[125])
pprint.pprint(qrels_dict[125])

'sieg lal masjid islamabad fundamentalist student'
['1070521_foreign_story_7806832.utf8',
 '1070705_foreign_index.utf8',
 '1070705_foreign_story_8019084.utf8',
 '1070705_foreign_story_8019086.utf8',
 '1070706_foreign_story_8023448.utf8',
 '1070707_foreign_story_8027755.utf8',
 '1070711_foreign_story_8044170.utf8',
 '1070711_frontpage_story_8044207.utf8',
 '1070712_foreign_story_8049110.utf8',
 '1070712_foreign_story_8049115.utf8',
 '1070715_foreign_story_8061044.utf8',
 '1070716_foreign_story_8064283.utf8',
 '1070726_foreign_story_8108245.utf8']


Get TF-IDF matrix of queries

In [22]:
queries_tfidf = vectorizer.transform(queries_dict.values())
pprint.pprint(queries_tfidf)

<50x250353 sparse matrix of type '<class 'numpy.float64'>'
	with 424 stored elements in Compressed Sparse Row format>


In [23]:
# Query transformation
queries_tfidf = queries_tfidf.toarray()
tmp = np.matmul(queries_tfidf, u)
svd_queries_tfidf = np.matmul(tmp, s_diag)
svd_queries_tfidf.shape

(50, 350)

#### Using Rocchio algorithm on normal queries

In [24]:
query_exp_rocchio = QueryExpansionRocchio()
rocchio_queries = query_exp_rocchio.rocchio_query_vec(
    svd_queries_tfidf, vt, qrels_dict, fire_corpus.docno_list
)

In [28]:
doc_vecs = {docno: doc_vec for docno, doc_vec in zip(fire_corpus.docno_list, vt)}
doc_vecs['1040901_bengal_story_3702359.utf8'].shape

(350,)

In [30]:
query_vecs = {qid: q_vec for qid, q_vec in zip(queries_dict.keys(), rocchio_queries)}
query_vecs[125].shape

(350,)

Perform retrieval and get MAP score

In [36]:
retrieval = Retrieval()
rank_list = retrieval.perform_retrieval(doc_vecs, query_vecs)
avg_prec_dict = retrieval.get_avg_precision(rank_list, qrels_dict)
rocchio_map = retrieval.get_map(avg_prec_dict)

Processed 1 documents.
Processed 10001 documents.
Processed 20001 documents.
Processed 30001 documents.
Processed 40001 documents.
Processed 50001 documents.
Processed 60001 documents.
Processed 70001 documents.
Processed 80001 documents.
Processed 90001 documents.
Processed 100001 documents.
Processed 110001 documents.
Processed 120001 documents.

Retrieving top 10 documents for each query...


In [37]:
print(rocchio_map)

0.5213616591080877


#### Using query expansion + rocchio algorithm

In [40]:
expanded_queries = query_exp_rocchio.query_expansion(list(queries_dict.values()))
pprint.pprint(expanded_queries[49])

('sieg lal masjid musjid islamabad Islamabad capital_of_Pakistan '
 'fundamentalist fundamentalistic student pupil educatee scholar '
 'scholarly_person bookman')


In [41]:
queries_tfidf = vectorizer.transform(expanded_queries)
pprint.pprint(queries_tfidf)

<50x250353 sparse matrix of type '<class 'numpy.float64'>'
	with 756 stored elements in Compressed Sparse Row format>


In [42]:
# Query transformation
queries_tfidf = queries_tfidf.toarray()
tmp = np.matmul(queries_tfidf, u)
svd_queries_tfidf = np.matmul(tmp, s_diag)
svd_queries_tfidf.shape

(50, 350)

In [43]:
rocchio_queries = query_exp_rocchio.rocchio_query_vec(
    svd_queries_tfidf, vt, qrels_dict, fire_corpus.docno_list
)
query_vecs = {qid: q_vec for qid, q_vec in zip(queries_dict.keys(), rocchio_queries)}
query_vecs[125].shape

(350,)

In [44]:
rank_list = retrieval.perform_retrieval(doc_vecs, query_vecs)
avg_prec_dict = retrieval.get_avg_precision(rank_list, qrels_dict)
expanded_rocchio_map = retrieval.get_map(avg_prec_dict)
print(expanded_rocchio_map)

Processed 1 documents.
Processed 10001 documents.
Processed 20001 documents.
Processed 30001 documents.
Processed 40001 documents.
Processed 50001 documents.
Processed 60001 documents.
Processed 70001 documents.
Processed 80001 documents.
Processed 90001 documents.
Processed 100001 documents.
Processed 110001 documents.
Processed 120001 documents.

Retrieving top 10 documents for each query...
0.5325267848553564


### Using Word2Vec Document Representation

Training a cbow word2vec model

In [62]:
fire_cbow_model = models.Word2Vec(
    sentences=fire_corpus.tokens_list,
    size=350,
    window=10,
    min_count=1,
    workers=4,
    sg=0,
    hs=1,
    negative=0,
    iter=5
)

2021-03-16 20:02:25,092 : INFO : collecting all words and their counts
2021-03-16 20:02:25,093 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-03-16 20:02:25,473 : INFO : PROGRESS: at sentence #10000, processed 2408443 words, keeping 52140 word types
2021-03-16 20:02:25,830 : INFO : PROGRESS: at sentence #20000, processed 4528056 words, keeping 72421 word types
2021-03-16 20:02:26,140 : INFO : PROGRESS: at sentence #30000, processed 6510850 words, keeping 84647 word types
2021-03-16 20:02:26,443 : INFO : PROGRESS: at sentence #40000, processed 8241739 words, keeping 101474 word types
2021-03-16 20:02:26,790 : INFO : PROGRESS: at sentence #50000, processed 10388700 words, keeping 114428 word types
2021-03-16 20:02:27,101 : INFO : PROGRESS: at sentence #60000, processed 12252237 words, keeping 123698 word types
2021-03-16 20:02:27,638 : INFO : PROGRESS: at sentence #70000, processed 15575041 words, keeping 136990 word types
2021-03-16 20:02:27,952 : INFO :

In [66]:
fire_cbow_model.save("/content/drive/MyDrive/IR_Assignment6/fire_cbow_350.model")

2021-03-16 20:14:38,110 : INFO : saving Word2Vec object under /content/drive/MyDrive/IR_Assignment6/fire_cbow_350.model, separately None
2021-03-16 20:14:38,114 : INFO : storing np array 'vectors' to /content/drive/MyDrive/IR_Assignment6/fire_cbow_350.model.wv.vectors.npy
2021-03-16 20:14:39,477 : INFO : not storing attribute vectors_norm
2021-03-16 20:14:39,479 : INFO : storing np array 'syn1' to /content/drive/MyDrive/IR_Assignment6/fire_cbow_350.model.trainables.syn1.npy
2021-03-16 20:14:41,301 : INFO : not storing attribute cum_table
2021-03-16 20:14:43,925 : INFO : saved /content/drive/MyDrive/IR_Assignment6/fire_cbow_350.model


In [67]:
fire_cbow_model = models.Word2Vec.load("/content/drive/MyDrive/IR_Assignment6/fire_cbow_350.model")

2021-03-16 20:15:05,279 : INFO : loading Word2Vec object from /content/drive/MyDrive/IR_Assignment6/fire_cbow_350.model
2021-03-16 20:15:07,404 : INFO : loading wv recursively from /content/drive/MyDrive/IR_Assignment6/fire_cbow_350.model.wv.* with mmap=None
2021-03-16 20:15:07,405 : INFO : loading vectors from /content/drive/MyDrive/IR_Assignment6/fire_cbow_350.model.wv.vectors.npy with mmap=None
2021-03-16 20:15:07,737 : INFO : setting ignored attribute vectors_norm to None
2021-03-16 20:15:07,738 : INFO : loading vocabulary recursively from /content/drive/MyDrive/IR_Assignment6/fire_cbow_350.model.vocabulary.* with mmap=None
2021-03-16 20:15:07,738 : INFO : loading trainables recursively from /content/drive/MyDrive/IR_Assignment6/fire_cbow_350.model.trainables.* with mmap=None
2021-03-16 20:15:07,746 : INFO : loading syn1 from /content/drive/MyDrive/IR_Assignment6/fire_cbow_350.model.trainables.syn1.npy with mmap=None
2021-03-16 20:15:08,099 : INFO : setting ignored attribute cum_ta

In [68]:
# Define functions to build document vectors and query vectors from word embedding matrices
def build_document_vectors(corpus, wordvectors):
    docno_list = corpus.docno_list
    tokens_list = corpus.tokens_list
    doc_vectors = {}

    for idx in range(len(docno_list)):
        doc_vec = np.zeros((wordvectors.vectors.shape[1],))
        vectors_added = 0
        for token in tokens_list[idx]:
            try:
                doc_vec += wordvectors.get_vector(token)
                vectors_added += 1
            except:
                continue
        if vectors_added != 0:
            doc_vec = doc_vec / vectors_added
        doc_vectors[docno_list[idx]] = doc_vec
    
    return doc_vectors

def build_query_vectors(queries_dict, wordvectors):
    query_vectors = {}

    for qid, query in queries_dict.items():
        query_vec = np.zeros((wordvectors.vectors.shape[1],))
        vectors_added = 0
        for qtoken in query.split():
            try:
                query_vec += wordvectors.get_vector(qtoken)
                vectors_added += 1
            except:
                continue
        if vectors_added != 0:
            query_vec = query_vec / vectors_added
        query_vectors[qid] = query_vec
    
    return query_vectors

In [69]:
doc_vecs = build_document_vectors(fire_corpus, fire_cbow_model.wv)
query_vecs = build_query_vectors(queries_dict, fire_cbow_model.wv)

docs_mat = np.array([vec for vec in doc_vecs.values()])
queries_mat = np.array([qvec for qvec in query_vecs.values()])

#### Using Rocchio algorithm on normal word2vec queries

In [72]:
rocchio_queries = query_exp_rocchio.rocchio_query_vec(
    queries_mat, docs_mat, qrels_dict, fire_corpus.docno_list
)
query_vecs.update({qid: q_vec for qid, q_vec in zip(queries_dict.keys(), rocchio_queries)})
query_vecs[125].shape

(350,)

In [73]:
rank_list = retrieval.perform_retrieval(doc_vecs, query_vecs)
avg_prec_dict = retrieval.get_avg_precision(rank_list, qrels_dict)
rocchio_map = retrieval.get_map(avg_prec_dict)
print(rocchio_map)

Processed 1 documents.
Processed 10001 documents.
Processed 20001 documents.
Assigned 0.0 score to empty document 1051119_nation_story_5495933.utf8 for all queries.
Assigned 0.0 score to empty document 1050420_nation_index.utf8 for all queries.
Processed 30001 documents.
Assigned 0.0 score to empty document 1050822_sports_index.utf8 for all queries.
Processed 40001 documents.
Assigned 0.0 score to empty document 1041105_bengal_index.utf8 for all queries.
Assigned 0.0 score to empty document 1040908_opinion_story_3728792.utf8 for all queries.
Assigned 0.0 score to empty document 1040908_opinion_story_3728789.utf8 for all queries.
Assigned 0.0 score to empty document 1041105_calcutta_story_3940880.utf8 for all queries.
Assigned 0.0 score to empty document 1041105_calcutta_story_3964865.utf8 for all queries.
Assigned 0.0 score to empty document 1041105_calcutta_story_3968774.utf8 for all queries.
Assigned 0.0 score to empty document 1041105_calcutta_story_3968787.utf8 for all queries.
Ass

#### Using query expansion + rocchio algorithm

In [79]:
expanded_queries_dict = {qid: exp_query for qid, exp_query in zip(queries_dict.keys(), expanded_queries)}
query_vecs = build_query_vectors(expanded_queries_dict, fire_cbow_model.wv)

queries_mat = np.array([qvec for qvec in query_vecs.values()])

In [80]:
rocchio_queries = query_exp_rocchio.rocchio_query_vec(
    queries_mat, docs_mat, qrels_dict, fire_corpus.docno_list
)
query_vecs.update({qid: q_vec for qid, q_vec in zip(queries_dict.keys(), rocchio_queries)})
query_vecs[125].shape

(350,)

In [81]:
rank_list = retrieval.perform_retrieval(doc_vecs, query_vecs)
avg_prec_dict = retrieval.get_avg_precision(rank_list, qrels_dict)
expanded_rocchio_map = retrieval.get_map(avg_prec_dict)
print(expanded_rocchio_map)

Processed 1 documents.
Processed 10001 documents.
Processed 20001 documents.
Assigned 0.0 score to empty document 1051119_nation_story_5495933.utf8 for all queries.
Assigned 0.0 score to empty document 1050420_nation_index.utf8 for all queries.
Processed 30001 documents.
Assigned 0.0 score to empty document 1050822_sports_index.utf8 for all queries.
Processed 40001 documents.
Assigned 0.0 score to empty document 1041105_bengal_index.utf8 for all queries.
Assigned 0.0 score to empty document 1040908_opinion_story_3728792.utf8 for all queries.
Assigned 0.0 score to empty document 1040908_opinion_story_3728789.utf8 for all queries.
Assigned 0.0 score to empty document 1041105_calcutta_story_3940880.utf8 for all queries.
Assigned 0.0 score to empty document 1041105_calcutta_story_3964865.utf8 for all queries.
Assigned 0.0 score to empty document 1041105_calcutta_story_3968774.utf8 for all queries.
Assigned 0.0 score to empty document 1041105_calcutta_story_3968787.utf8 for all queries.
Ass

## Final Results

### TF-IDF Document Representation
1.   MAP using Rocchio: ***0.521***
2.   MAP using Query Expansion + Rocchio: ***0.533***

### Word2Vec Document Representation
1.   MAP using Rocchio: ***0.459***
2.   MAP using Query Expansion + Rocchio: ***0.466***