# IT550 Information Retrieval Assignment - 3
Student ID - 202011032

## Unzipping dataset, importing necessary libraries, and setting paths

In [1]:
!tar -xvf "/content/drive/MyDrive/FIRE_Dataset_EN_2010/English-Data.tgz" -C "/content/"

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
TELEGRAPH_UTF8/2007_utf8/sports/1070225_sports_story_7438352.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070621_sports_story_7952331.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070207_sports_story_7360730.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070919_sports_story_8334184.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070218_sports_story_7407969.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070317_sports_story_7529504.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070411_sports_story_7632626.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070507_sports_story_7743729.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070318_sports_story_7533511.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070625_sports_story_7969700.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070521_sports_story_7807303.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070517_sports_story_7787900.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070814_sports_story_8191386.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070908_sports_story_8291527.utf8
TELEGRAPH_U

In [1]:
import nltk
import os
import re
import json
import joblib
import numpy as np
import pandas as pd
from collections import Counter
from scipy import sparse
from bs4 import BeautifulSoup

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

PATH = "TELEGRAPH_UTF8"
FILE_PATHS = [os.path.join(dp, f) for dp, dn, filenames in os.walk(PATH) for f in filenames]
TOPICS_PATH = "/content/drive/MyDrive/FIRE_Dataset_EN_2010/en.topics.76-125.2010.txt"

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Extracting text from documents and performing tokenization
Here a generator function is made to make the text extraction from documents faster.

A generator function for tokenization is also there for the above mentioned reason.


In [None]:
def tokenize_text(text_data):
    '''Tokenizer for documents which uses Porter Stemmer.'''
    stopwords = nltk.corpus.stopwords.words("english")
    # Creating PorterStemmer object for lemmatization of words
    stemmer = nltk.stem.PorterStemmer()

    for word in nltk.word_tokenize(text_data):
        if word not in stopwords:
            yield stemmer.stem(word)
    # return (lemmatizer.lemmatize(word) for word in nltk.word_tokenize(text_data) if word not in stopwords)

def fire_docs_iter():
    '''Generator function for extracting contents from the documents'''
    for i, doc_file_path in enumerate(FILE_PATHS):
        if i % 10000 == 0:
            print(f"Processing document {i}")
        
        with open(doc_file_path) as doc_file:
            soup = BeautifulSoup(doc_file, features="html.parser")

            docno = soup.find("docno").text
            text = soup.find("text").text.lower().replace("\n", " ")

            text = re.sub(r"[^a-zA-Z\s]", "", text).strip()
            text = "".join(filter(lambda x: not x.isdigit(), text))

            text = tokenize_text(text)
        
        yield {"docno": docno, "text": text}

### Creating dataframe from extracted contents of `<docno>` and `<text>` tags

In [None]:
docs_iter = fire_docs_iter()
docs_df = pd.DataFrame(docs_iter)

Processing document 0
Processing document 10000
Processing document 20000
Processing document 30000
Processing document 40000
Processing document 50000
Processing document 60000
Processing document 70000
Processing document 80000
Processing document 90000
Processing document 100000
Processing document 110000
Processing document 120000


Run all the generator functions in the text column

In [None]:
docs_df['text'] = docs_df['text'].apply(list)

In [None]:
docs_df

Unnamed: 0,docno,text
0,1041215_careergraph_index.utf8,"[telegraph, calcutta, careergraph, wednesday, ..."
1,1041229_careergraph_story_4188659.utf8,"[telegraph, calcutta, knowhow, better, privat,..."
2,1041215_careergraph_story_4116626.utf8,"[telegraph, calcutta, careergraph, short, take..."
3,1041215_careergraph_story_4127477.utf8,"[telegraph, calcutta, careergraph, classact, t..."
4,1041208_careergraph_story_4098235.utf8,"[telegraph, calcutta, careergraph, plastic, dr..."
...,...,...
125581,1061130_calcutta_story_7064558.utf8,"[telegraph, calcutta, metro, day, runway, jack..."
125582,1060830_calcutta_story_6672732.utf8,"[telegraph, calcutta, metro, pep, pill, resear..."
125583,1061106_calcutta_restadd.utf8,"[telegraph, calcutta, metro, restaur, bengali,..."
125584,1060123_calcutta_story_5749847.utf8,"[telegraph, calcutta, metro, care, indian, art..."


Save the generated document dataframe using joblib

In [None]:
joblib.dump(docs_df, 'docs_df.gz', compress=True)

['docs_df.gz']

In [None]:
del docs_df

## Calculating TF-IDF matrix for the dataset with topics

In [3]:
DOCS_DF_PATH = "/content/drive/MyDrive/FIRE_Dataset_EN_2010/docs_df.gz"

Creating Dataframe of queries and merging it with docs dataframe

In [4]:
def tokenize_topics(text_data):
    '''Tokenizer for topics using Porter Stemmer.'''
    stopwords = nltk.corpus.stopwords.words("english")
    # Creating PorterStemmer object for lemmatization of words
    stemmer = nltk.stem.PorterStemmer()

    return [stemmer.stem(word) for word in nltk.word_tokenize(text_data) if word not in stopwords]

def fire_topics():
    '''Returns dictionary of qid taken from <num> tag and query from <narr> tag.'''
    with open(TOPICS_PATH) as topics_file:
        soup = BeautifulSoup(topics_file, features="html.parser")

        qid_all = [int(num.text) for num in soup.find_all("num")]
        # Here we are taking text/query from the <narr> tag.
        text_all = [desc.text.lower().replace("\n", " ") for desc in soup.find_all("narr")]

        text_all = [re.sub(r"[^a-zA-Z\s]", "", desc).strip() for desc in text_all]
        text_all = [tokenize_topics(desc) for desc in text_all]
    
    return {"qid": qid_all, "query": text_all}

In [11]:
topics_df = pd.DataFrame(fire_topics())
topics_df.rename(columns={'qid': 'docno', 'query': 'text'}, inplace=True)

docs_df = joblib.load(DOCS_DF_PATH)

docs_topics_df = pd.concat([docs_df, topics_df], ignore_index=True)
docs_topics_df

Unnamed: 0,docno,text
0,1041215_careergraph_index.utf8,"[telegraph, calcutta, careergraph, wednesday, ..."
1,1041229_careergraph_story_4188659.utf8,"[telegraph, calcutta, knowhow, better, privat,..."
2,1041215_careergraph_story_4116626.utf8,"[telegraph, calcutta, careergraph, short, take..."
3,1041215_careergraph_story_4127477.utf8,"[telegraph, calcutta, careergraph, classact, t..."
4,1041208_careergraph_story_4098235.utf8,"[telegraph, calcutta, careergraph, plastic, dr..."
...,...,...
125631,121,"[relev, document, report, blast, samjhauta, ex..."
125632,122,"[relev, document, contain, inform, surrend, sa..."
125633,123,"[relev, document, contain, inform, death, pale..."
125634,124,"[relev, document, mention, variou, place, indi..."


In [12]:
del topics_df, docs_df
joblib.dump(docs_topics_df, "/content/drive/MyDrive/FIRE_Dataset_EN_2010/docs_topics_df.gz", compress=True)

['/content/drive/MyDrive/FIRE_Dataset_EN_2010/docs_topics_df.gz']

In [13]:
del docs_topics_df

Given below functions are generators which loads the data from the above generated and saved document dataframe using joblib.

In [2]:
DOCS_TOPICS_DF_PATH = "/content/drive/MyDrive/FIRE_Dataset_EN_2010/docs_topics_df.gz"

In [5]:
def docno_gen():
    for docno in joblib.load(DOCS_TOPICS_DF_PATH)['docno']:
        yield docno

def words_gen():
    return (text for text in joblib.load(DOCS_TOPICS_DF_PATH)['text'])

def tf_gen():
    for text in joblib.load(DOCS_TOPICS_DF_PATH)['text']:
        tokens, counts = np.unique(text, return_counts=True)
        counts = (counts / len(counts)).astype(np.float16)
        yield [(token, count) for token, count in zip(tokens, counts)]
        

Get list of docnos/qids or document names / query num

In [6]:
docnos = docno_gen()
docnos = list(docnos)

Generate a df dictionary of vocabulary

In [7]:
doc_freq = []
for text in words_gen():
    doc_freq.extend(text)
doc_freq = Counter(doc_freq)
vocab = list(doc_freq.keys())

In [8]:
def fit_and_save(docnos, doc_freq, tf_gen):
    '''Generates TF-IDF dictionary for the documents and directly writes them to a csv file.'''
    from csv import DictWriter
    N = len(docnos)
    # tf_idf = sparse.csr_matrix(np.zeros( (N, len(doc_freq.keys()) ) ), dtype=np.float16)
    with open("/content/drive/MyDrive/FIRE_Dataset_EN_2010/tf_idf.csv",'w') as tf_idf_file:
        wrtObj = DictWriter(tf_idf_file, fieldnames=['docno','scores'])
        wrtObj.writeheader()
        for i, (docno, tf_list) in enumerate(zip(docnos, tf_gen)):
            # doc_idx = docnos.index(docno)
            tf_idf = {}
            for token, tf in tf_list:
                # token_idx = vocab.index(token)
                tf_idf_val = tf * (1 + np.log10(N / (1 + doc_freq[token])))
                tf_idf.update({token: tf_idf_val})
            
            wrtObj.writerow({"docno": docno, "scores" :json.dumps(tf_idf)})
            if i % 10000 == 0:
                print(f"Calculated scores for {i} documents.")
    
    return True

Save the TF-IDF matrix as a csv file.

In [9]:
fit_and_save(docnos, doc_freq, tf_gen())

Calculated scores for 0 documents.
Calculated scores for 10000 documents.
Calculated scores for 20000 documents.
Calculated scores for 30000 documents.
Calculated scores for 40000 documents.
Calculated scores for 50000 documents.
Calculated scores for 60000 documents.
Calculated scores for 70000 documents.
Calculated scores for 80000 documents.
Calculated scores for 90000 documents.
Calculated scores for 100000 documents.
Calculated scores for 110000 documents.
Calculated scores for 120000 documents.


True

In [10]:
def read_tfidf(file):
    '''Reads and returns a tf-idf dictionary for the documents.'''
    import csv
    tfidf_vec_mat = {}
    with open(file) as f:
        data = csv.reader(f,delimiter=',')
        for i, line in enumerate(data):
            if i == 0:
                continue
            # if i % 10000 == 0:
            #     print(i, line)
            scores = line[1]
            tfidf_vec_mat.update({line[0]: scores})
    return tfidf_vec_mat

In [11]:
del docnos, doc_freq, vocab

## Calculating cosine similarity between document vectors and query vectors and Performing Retrieval task

In [12]:
TFIDF_PATH = "/content/drive/MyDrive/FIRE_Dataset_EN_2010/tf_idf.csv"

Loading term-document matrix

In [14]:
tf_idf = read_tfidf(TFIDF_PATH)

In [51]:
json.loads(tf_idf['1040908_opinion_story_3728789.utf8'])

{}

Performing retrieval and generating rank list of top 10 documents per query

In [59]:
def buildVector(dict1, dict2):
    all_items = set(dict1.keys()).union( set(dict2.keys()) )
    vector1 = np.array([dict1.get(k, 0) for k in all_items])
    vector2 = np.array([dict2.get(k, 0) for k in all_items])
    return vector1, vector2

def perform_retrieval(tf_idf):
    '''Retrieves top 10 documents for each query based on the cosine similarity'''
    qids, docnos = [], []
    for id in tf_idf.keys():
        if id.isnumeric(): qids.append(id)
        else: docnos.append(id)
    
    top10docs_per_query = {qid: dict() for qid in qids}
    
    for i, docno in enumerate(docnos):
        doc_vec = json.loads(tf_idf[docno])
        is_doc_empty = False
        for qid in sorted(qids):
            query_vec = json.loads(tf_idf[qid])
            d_vec, q_vec = buildVector(doc_vec, query_vec)
            if all(d_vec == 0):
                top10docs_per_query[qid].update({docno: 0})
                is_doc_empty = True
            else:
                top10docs_per_query[qid].update({
                    docno: 1 - nltk.cluster.cosine_distance(d_vec, q_vec) 
                })
        
        if is_doc_empty:
            print(f"Assigned 0 score to empty document {docno} for all queries.")
        if i % 10000 == 0:
            print(f"Processed {i+1} documents.")

    print("\nRetrieving top 10 documents for each query...")
    top10docs_per_query = {
        qid: {
            docno: score 
            for docno, score 
            in sorted(top10docs_per_query[qid].items(), 
                      key= lambda x: x[1], reverse=True)[:10]
        } 
        for qid in top10docs_per_query.keys()
    }

    return top10docs_per_query
        

In [60]:
rank_list_per_query = perform_retrieval(tf_idf)
rank_list_per_query

Processed 1 documents.
Assigned 0 score to empty document 1040908_opinion_story_3728789.utf8 for all queries.
Assigned 0 score to empty document 1040908_opinion_story_3728792.utf8 for all queries.
Assigned 0 score to empty document 1041105_bengal_index.utf8 for all queries.
Assigned 0 score to empty document 1041130_foreign_index.utf8 for all queries.
Processed 10001 documents.
Assigned 0 score to empty document 1041105_business_index.utf8 for all queries.
Assigned 0 score to empty document 1041105_calcutta_story_3968787.utf8 for all queries.
Assigned 0 score to empty document 1041105_calcutta_story_3966828.utf8 for all queries.
Assigned 0 score to empty document 1041105_calcutta_story_3965095.utf8 for all queries.
Assigned 0 score to empty document 1041105_calcutta_story_3967239.utf8 for all queries.
Assigned 0 score to empty document 1041105_calcutta_story_3965348.utf8 for all queries.
Assigned 0 score to empty document 1041105_calcutta_story_3964865.utf8 for all queries.
Assigned 0 

{'100': {'1041004_nation_story_3838101.utf8': 0.4024996892742122,
  '1041110_nation_story_3987208.utf8': 0.35675482517213264,
  '1050227_nation_story_4430335.utf8': 0.3247956709335571,
  '1050810_nation_story_5095926.utf8': 0.3279453587941916,
  '1051112_frontpage_story_5468137.utf8': 0.3723329717345698,
  '1051112_nation_story_5468011.utf8': 0.35141094582695365,
  '1051115_nation_story_5478696.utf8': 0.34079856936470776,
  '1051125_nation_story_5520551.utf8': 0.33730969155531065,
  '1070720_nation_story_8082600.utf8': 0.3633408226954087,
  '1070724_nation_story_8096554.utf8': 0.3271599678042594},
 '101': {'1060603_frontpage_story_6305867.utf8': 0.35507261275824364,
  '1060603_nation_story_6305901.utf8': 0.34799171781699334,
  '1060603_nation_story_6306769.utf8': 0.4107809429240743,
  '1060607_frontpage_story_6320995.utf8': 0.3506695196237888,
  '1060608_frontpage_story_6325839.utf8': 0.3463756485819073,
  '1060609_opinion_story_6310526.utf8': 0.3798475163688244,
  '1060718_frontpage_s

### Save the rank list

In [61]:
from csv import DictWriter
with open("/content/drive/MyDrive/FIRE_Dataset_EN_2010/rank_list.csv",'w') as rank_list_file:
        wrtObj = DictWriter(rank_list_file, fieldnames=['qid','retrieved_docs'])
        wrtObj.writeheader()
        for qid, top_docs in rank_list_per_query.items():
            wrtObj.writerow({"qid": qid, "retrieved_docs" :json.dumps(top_docs)})

## Calculating AP and MAP from the rank list

Read qrels file for the given topics

In [62]:
QRELS_PATH = "/content/drive/MyDrive/FIRE_Dataset_EN_2010/en.qrels.76-125.2010.rel.txt"
RANK_LIST_PATH = "/content/drive/MyDrive/FIRE_Dataset_EN_2010/rank_list.csv"

In [63]:
def read_qrels(qrels_path):
    '''Reads and return qrels objects from the given path as a dictionary with the relevant documents.'''
    qrels = {}
    with open(qrels_path) as qrels_file:
        lines = qrels_file.readlines()
        for line in lines:
            line = line.strip()
            if line.endswith('1'):
                line = line.split()
                qrels[line[0]] = qrels.get(line[0], []) + [line[2]]
    return qrels

def read_rank_list(file_path):
    '''Reads and returns the saved rank list from the given path.'''
    import csv
    rank_list = {}
    with open(file_path) as f:
        data = csv.reader(f,delimiter=',')
        for i, line in enumerate(data):
            if i == 0:
                continue
            doc_score_dict = json.loads(line[1])
            rank_list.update({line[0]: doc_score_dict})
    return rank_list

Calculate **Average Precision** and **Mean Average Precision**

In [64]:
qrels = read_qrels(QRELS_PATH)
rank_list = read_rank_list(RANK_LIST_PATH)

In [65]:
qrels

{'100': ['1041107_frontpage_story_3975998.utf8',
  '1041110_nation_story_3987208.utf8',
  '1041117_nation_story_4013184.utf8',
  '1050227_nation_story_4430335.utf8',
  '1050721_nation_story_5015427.utf8',
  '1050810_nation_story_5095926.utf8',
  '1051112_frontpage_story_5468009.utf8',
  '1051112_frontpage_story_5468137.utf8',
  '1051112_nation_story_5468011.utf8',
  '1051115_nation_story_5478696.utf8',
  '1051119_frontpage_story_5495967.utf8',
  '1051121_nation_story_5502034.utf8',
  '1051123_frontpage_story_5511154.utf8',
  '1051123_nation_story_5511465.utf8',
  '1051206_nation_story_5564372.utf8',
  '1051213_nation_story_5593623.utf8',
  '1060118_nation_story_5734651.utf8',
  '1060929_nation_story_6807710.utf8',
  '1061001_nation_story_6815631.utf8',
  '1070518_nation_story_7794373.utf8',
  '1070726_nation_story_8107361.utf8'],
 '101': ['1060603_frontpage_story_6305867.utf8',
  '1060603_frontpage_story_6306133.utf8',
  '1060603_nation_story_6305903.utf8',
  '1060604_frontpage_story_6

In [74]:
def get_avg_precision(rank_list, qrels):
    '''Returns a dictionary with qids and their relevant average precisions.'''
    # Store (precision x relevance) as a list for a query
    q_prec_rel = {qid: 0 for qid in qrels.keys()}

    for qid, rank_dict in sorted(rank_list.items()):
        rel_doc_ctr = 0
        avg_prec = 0.0
        for pos, (doc, _) in enumerate(sorted(rank_dict.items(), key=lambda x: x[1], reverse=True), 1):
            # Calculate precision@pos+1 x relevance@pos+1 for the retrieved documents
            if doc in qrels[qid]:
                rel_doc_ctr += 1
                avg_prec += rel_doc_ctr / (pos + 1)
            # print(f"doc in qrels: {doc in qrels[qid]}, pos+1: {pos+1}, rel_doc_ctr: {rel_doc_ctr}, avg_prec: {avg_prec}")
        if rel_doc_ctr != 0:
            q_prec_rel[qid] = ( 1 / rel_doc_ctr ) * avg_prec
        else:
            q_prec_rel[qid] = avg_prec
        # print(f"Qid: {qid}, Avg_prec: {q_prec_rel[qid]}")
    
    return q_prec_rel


**Average Precision for each query**

In [75]:
q_avg_prec = get_avg_precision(rank_list, qrels)

print("QId \t Avg Precision")
for qid, ap in q_avg_prec.items():
    print(f"{qid} \t {ap: 0.4}")

QId 	 Avg Precision
76 	  0.5333
77 	  0.3595
78 	  0.4167
79 	  0.25
80 	  0.4821
81 	  0.6544
82 	  0.514
83 	  0.5
84 	  0.5
85 	  0.09091
86 	  0.0
87 	  0.1111
88 	  0.4709
89 	  0.2667
90 	  0.1111
91 	  0.6076
92 	  0.5214
93 	  0.2758
94 	  0.6973
95 	  0.6347
96 	  0.0
97 	  0.5891
98 	  0.6083
99 	  0.3333
100 	  0.4843
101 	  0.3667
102 	  0.4659
103 	  0.4988
104 	  0.3909
105 	  0.612
106 	  0.5556
107 	  0.7198
108 	  0.225
109 	  0.2528
110 	  0.225
111 	  0.4909
112 	  0.5
113 	  0.5317
114 	  0.625
115 	  0.519
116 	  0.6509
117 	  0.0
118 	  0.1
119 	  0.4583
120 	  0.7198
121 	  0.5417
122 	  0.1833
123 	  0.6454
124 	  0.0
125 	  0.1623


**Mean Average Precision for the queries**

In [76]:
N = len(q_avg_prec.keys())
mean_avg_precision = (1 / N) * sum(q_avg_prec.values())
print("Mean Average Precision:", mean_avg_precision)

Mean Average Precision: 0.409071675084175
