# Assignment 3: Document Retrieval and Evaluation
## Dreamy pujara - 202211005

### Extracting files

In [1]:
!pip install rarfile

Collecting rarfile
  Downloading rarfile-4.0-py3-none-any.whl (28 kB)
Installing collected packages: rarfile
Successfully installed rarfile-4.0


In [2]:
# Colab-only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import rarfile
import tarfile

# Source archive on the mounted Drive and the local directory to unpack into.
main_path = "/content/drive/MyDrive/FIRE_Dataset_EN_2010.rar"
data_extracted = "/content/extracted_files"

# exist_ok=True makes this a no-op when the directory is already present,
# so no separate (racy) existence check is needed.
os.makedirs(data_extracted, exist_ok=True)

# Unpack the outer RAR archive.
with rarfile.RarFile(main_path, 'r') as main_rar:
    main_rar.extractall(data_extracted)

# Expand every nested .tgz next to where it was found, then delete the archive.
# NOTE(review): extractall() trusts the member paths inside the archive;
# only run this on a dataset from a trusted source.
for root, dirs, files in os.walk(data_extracted):
    for file in files:
        if file.lower().endswith('.tgz'):
            tgz_file_path = os.path.join(root, file)
            with tarfile.open(tgz_file_path, 'r:gz') as nested_tgz:
                nested_tgz.extractall(root)
            os.remove(tgz_file_path)

print("Extraction completed.")

Extraction completed.


In [None]:
import gzip
import shutil

# Compressed relevance judgements and the path of the decompressed copy.
gz_file_path = '/content/extracted_files/FIRE_Dataset_EN_2010/en.qrels.76-125.2010.txt.gz'
extracted_file_path = '/content/extracted_files/FIRE_Dataset_EN_2010/en.qrels.76-125.2010.txt'

# Stream the decompressed bytes in chunks instead of reading the whole
# file into memory with a single read().
with gzip.open(gz_file_path, 'rb') as gz_file, open(extracted_file_path, 'wb') as extracted_file:
    shutil.copyfileobj(gz_file, extracted_file)

print("Extraction complete.")

Extraction complete.


### Read the files using the BeautifulSoup library and perform preprocessing steps

In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup

# Root directory that is walked for ".utf8" document files.
main_file_location = "/content/extracted_files/FIRE_Dataset_EN_2010/TELEGRAPH_UTF8"
# Maps document file name -> extracted text; filled by the directory walk below.
corpus = {}

def extract_text_from_xml(xml_path):
    """Return the combined text of all <TEXT> elements in a file.

    The file is opened as UTF-8 and parsed with BeautifulSoup's 'xml'
    parser. The non-empty texts of the <TEXT> elements are joined with a
    single space and the result is stripped of surrounding whitespace.

    Returns None when the file contains no <TEXT> element.
    """
    with open(xml_path, 'r', encoding='utf-8') as handle:
        parsed = BeautifulSoup(handle, 'xml')

    nodes = parsed.find_all('TEXT')
    if not nodes:
        return None

    pieces = []
    for node in nodes:
        content = node.get_text()
        if content:
            pieces.append(content)
    return " ".join(pieces).strip()

# Walk the collection tree and record the text of every ".utf8" file,
# keyed by its file name. Files that yield no text are skipped.
for root_dir, _, files in os.walk(main_file_location):
    for file in files:
        if not file.endswith(".utf8"):
            continue

        doc_text = extract_text_from_xml(os.path.join(root_dir, file))
        if not doc_text:
            continue

        corpus[file] = doc_text

# One row per document: file name as "docno", raw text as "Text".
corpus_df = pd.DataFrame(list(corpus.items()), columns=["docno", "Text"])

In [None]:
corpus_df

Unnamed: 0,docno,Text
0,1050702_calcutta_story_4937658.utf8,The Telegraph - Calcutta : Metro\n\n The final...
1,1050128_calcutta_story_4300622.utf8,The Telegraph - Calcutta : Metro\n\n ENGLISH\n...
2,1050731_calcutta_story_5055128.utf8,The Telegraph - Calcutta : Metro\n\n Metro Rec...
3,1050330_calcutta_story_4552367.utf8,The Telegraph - Calcutta : Metro\n\n An actor ...
4,1051102_calcutta_story_5425738.utf8,The Telegraph - Calcutta : Metro\n\nGuide foxe...
...,...,...
125511,1041023_frontpage_index.utf8,The Telegraph - Calcutta : Frontpage
125512,1041011_frontpage_story_3867944.utf8,The Telegraph - Calcutta : Frontpage\n\n Manmo...
125513,1040930_frontpage_index.utf8,The Telegraph - Calcutta : Frontpage
125514,1040910_frontpage_story_3740017.utf8,The Telegraph - Calcutta : Frontpage\n\n Murde...


In [None]:
corpus_df['Text'][0]

'The Telegraph - Calcutta : Metro\n\n The final journey\n\n Based on a novel by Kamal Kumar Mazumdar, Gobardanga Shilpayans production, Antarjali Jatra, is a commentary on the cruelty and barbarism that women were subjected to in olden days. Being staged as part of Hutch Odeon 2005, the ongoing theatre festival that aims to promote Calcutta?s cultural heritage, it is a heart-wrenching performance that evokes feelings of pity, compassion and outrage at the heinous practices that were committed against women in the name of religious customs and rituals. The play relates the story of Jashobati, a beautiful, young girl, who becomes a victim of the Hindu koulinya protha. Born into a poor family, her father is compelled to marry her off to an old, dying man, or else the bride?s family runs the risk of being ostracised for not following a custom (protha) believed to be sacrosanct by upper class Hindus. As his newly wed bride, Jashobati accompanies her groom to the riverside, waiting for him t

In [None]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download the NLTK resources used by the preprocessing step.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Shared across all preprocess() calls: English stopword set and Porter stemmer.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    """Normalise a piece of text for indexing or querying.

    Steps, in order: replace every character that is neither a word
    character nor whitespace with a space, lowercase, tokenize with NLTK,
    drop tokens found in `stop_words`, stem the rest with `stemmer`.

    Returns the surviving stems joined by single spaces.
    """
    cleaned = re.sub(r'[^\w\s]', ' ', text).lower()
    tokens = nltk.word_tokenize(cleaned)
    content_tokens = (token for token in tokens if token not in stop_words)
    return " ".join(stemmer.stem(token) for token in content_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Run preprocess() on every document's raw text and keep the result in a new column.
corpus_df['processed_text'] = corpus_df['Text'].apply(preprocess)

In [None]:
corpus_df['processed_text'][0]

'telegraph calcutta metro final journey base novel kamal kumar mazumdar gobardanga shilpayan product antarjali jatra commentari cruelti barbar women subject olden day stage part hutch odeon 2005 ongo theatr festiv aim promot calcutta cultur heritag heart wrench perform evok feel piti compass outrag heinou practic commit women name religi custom ritual play relat stori jashobati beauti young girl becom victim hindu koulinya protha born poor famili father compel marri old die man els bride famili run risk ostracis follow custom protha believ sacrosanct upper class hindu newli wed bride jashobati accompani groom riversid wait die surprisingli baijunath dom burn ghat one express feel human poor girl fate direct ashi chattopadhyay event play bengali antarjali jatra produc gobardanga shilpayan today 6 30 pm rabindra sadan'

### Index the documents using TF-IDF.

In [None]:
!pip install python-terrier

Collecting python-terrier
  Downloading python-terrier-0.9.2.tar.gz (104 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/104.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m92.2/104.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.4/104.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wget (from python-terrier)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius>=1.4.2 (from python-terrier)
  Downloading pyjnius-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting matchpy (from python-terrier)
  Downloading matchpy-0.5.5-py3-none-any.whl (69 kB)


In [None]:
# Initialise PyTerrier before any indexing or retrieval call.
# On a fresh machine this downloads the Terrier jars (see the cell output).
import pyterrier as pt
pt.init()

terrier-assemblies 5.7 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.7 jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



In [None]:
# Build the index from the preprocessed text, using the file name as docno.
# overwrite=True lets this cell be re-run: without it, indexing fails once
# ./pd_index already exists from a previous run.
# NOTE(review): the text is already stopword-filtered and stemmed by preprocess();
# confirm whether the indexer's own term pipeline should also be applied on top.
pd_indexer = pt.DFIndexer("./pd_index", overwrite=True)
indexref = pd_indexer.index(corpus_df["processed_text"].astype(str), corpus_df["docno"].astype(str))

  for column, value in meta_column[1].iteritems():


In [None]:
# Open the index that was just built and print its collection statistics
# (document, term, posting and token counts) as a sanity check.
index = pt.IndexFactory.of(indexref)
print(index.getCollectionStatistics().toString())

Number of documents: 125516
Number of terms: 188876
Number of postings: 19833976
Number of fields: 0
Number of tokens: 32873281
Field names: []
Positions:   false



### Use the title text of each query and convert it to TF-IDF.

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd

# Location of the topics (query) file.
path_query_file = "/content/extracted_files/FIRE_Dataset_EN_2010/en.topics.76-125.2010.txt"

# Parse the topics XML and build one [qid, query] row per <top> element:
# the <num> text is the query id, the preprocessed <title> text is the query.
columns = ["qid", "query"]
root = ET.parse(path_query_file).getroot()
queries = [
    [topic.find('num').text, preprocess(topic.find('title').text)]
    for topic in root.findall('top')
]

query_df = pd.DataFrame(queries, columns=columns)

query_df

Unnamed: 0,qid,query
0,76,clash gurjar meena
1,77,attack hezbollah guerrilla
2,78,conflict advani singhal ram mandir issu
3,79,build road china mount everest
4,80,babri masjid demolit case start advani
5,81,problem relat immun programm japanes enceph india
6,82,propos bu servic srinagar muzaffarabad
7,83,elect campaign laloo prasad yadav ram vila paswan
8,84,brinda karat alleg swami ramdev
9,85,abu salem accus mumbai bomb blast case jail cu...


In [None]:
# Rank documents for every query with Terrier's TF_IDF weighting model.
# The result has one row per (query, retrieved document) with rank and score.
br = pt.BatchRetrieve(index, wmodel="TF_IDF")
res = br.transform(query_df)
res

Unnamed: 0,qid,docid,docno,rank,score,query
0,76,99361,1070603_nation_story_7869357.utf8,0,20.186933,clash gurjar meena
1,76,99688,1070611_nation_story_7906812.utf8,1,16.385560,clash gurjar meena
2,76,100514,1070602_nation_story_7865940.utf8,2,13.973796,clash gurjar meena
3,76,70248,1060912_nation_story_6733766.utf8,3,12.765150,clash gurjar meena
4,76,101971,1070602_nation_story_7865944.utf8,4,12.434667,clash gurjar meena
...,...,...,...,...,...,...
49995,125,61577,1060225_sports_index.utf8,995,3.938441,attack lal masjid
49996,125,91159,1070608_opinion_index.utf8,996,3.938441,attack lal masjid
49997,125,29317,1050513_nation_story_4733846.utf8,997,3.938288,attack lal masjid
49998,125,87630,1070430_foreign_story_7715573.utf8,998,3.934818,attack lal masjid


### Perform retrieval and evaluate it using:
* Precision@10
* Recall@10
* F1-score

**Ground Truth**

In [None]:
# Relevance judgements (ground truth) decompressed earlier in the notebook.
q_path = "/content/extracted_files/FIRE_Dataset_EN_2010/en.qrels.76-125.2010.txt"
qrels = pt.io.read_qrels(q_path)
qrels

Unnamed: 0,qid,docno,label
0,76,1040901_nation_story_3702283.utf8,0
1,76,1040901_opinion_story_3675790.utf8,0
2,76,1040902_nation_story_3707291.utf8,0
3,76,1040904_opinion_story_3713095.utf8,0
4,76,1040908_calcutta_story_3729202.utf8,0
...,...,...,...
15130,125,1070914_nation_story_8315528.utf8,0
15131,125,1070914_nation_story_8315898.utf8,0
15132,125,1070918_nation_story_8329783.utf8,0
15133,125,1070921_foreign_story_8343703.utf8,0


**Query-wise evaluation metric values:**

In [None]:
# Per-query P@10 and recall@10. The result is stored under a descriptive
# name so the builtin `eval` is not shadowed for the rest of the kernel session.
per_query_metrics = pt.Utils.evaluate(res, qrels, metrics=["P_10", "recall_10"], perquery=True)

# Transpose so that each row is a query and each column a metric.
eval_df = pd.DataFrame(per_query_metrics).T

# F1 = 2PR / (P + R). When P and R are both 0 the division gives NaN, and
# F1 is defined as 0 for those queries. The filled Series is assigned back
# directly: an in-place fillna on a column selection is chained assignment,
# which warns in pandas 2.x and has no effect under Copy-on-Write.
precision_plus_recall = eval_df['P_10'] + eval_df['recall_10']
eval_df['F1_10'] = (2 * (eval_df['P_10'] * eval_df['recall_10']) / precision_plus_recall).fillna(0)
eval_df

Unnamed: 0,P_10,recall_10,F1_10
76,0.4,1.0,0.571429
77,0.0,0.0,0.0
78,0.3,1.0,0.461538
79,0.3,0.75,0.428571
80,0.3,0.375,0.333333
81,0.2,0.333333,0.25
82,0.4,0.173913,0.242424
83,0.0,0.0,0.0
84,0.3,1.0,0.461538
85,0.5,0.227273,0.3125


**Thus, the overall evaluation scores are as follows:**

In [None]:
# Overall P@10 and recall@10 for the whole run. The result is stored under a
# descriptive name so the builtin `eval` is not shadowed.
overall_metrics = pt.Utils.evaluate(res, qrels, metrics=["P_10", "recall_10"])
p = overall_metrics['P_10']
r = overall_metrics['recall_10']

# F1 is computed from the overall P and R values (not as the mean of the
# per-query F1 scores). Guard the degenerate case where both are 0, which
# would otherwise divide by zero.
overall_metrics['F1_10'] = 2 * p * r / (p + r) if (p + r) else 0.0

eval_df = pd.DataFrame([overall_metrics])
eval_df

Unnamed: 0,P_10,recall_10,F1_10
0,0.36,0.340001,0.349715
