Mount Google Drive and Install Requirements

In [7]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [8]:
!pip install nltk



Importing packages

In [9]:
import nltk
import re
import math
import xml.etree.ElementTree as ET
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from collections import defaultdict
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

Assigning path for dataset

In [10]:
CRAN_DOCS_PATH = "/content/drive/MyDrive/MOS/cranfield-trec-dataset-main/cran.all.1400 (1).xml"
CRAN_QUERIES_PATH = "/content/drive/MyDrive/MOS/cranfield-trec-dataset-main/sorted_cran.qry (2).xml"
CRAN_RELEVANCE_PATH = "/content/drive/MyDrive/MOS/cranfield-trec-dataset-main/cranqrel.trec.txt"

Creating Text processor class using lemmatation

In [11]:
class TextProcessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def process_text(self, text):
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
        tokens = text.split()
        filtered = [word for word in tokens if word not in self.stop_words]
        lemmatized = [self.lemmatizer.lemmatize(word) for word in filtered]

        return lemmatized

Creating Text Processor class using Stemming

In [12]:
class TextProcessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()

    def process_text(self, text):
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
        tokens = text.split()
        filtered = [word for word in tokens if word not in self.stop_words]
        stemmed = [self.stemmer.stem(word) for word in filtered]

        return stemmed

Creating inverted index

In [13]:
class InvertedIndex:
    def __init__(self, processor):
        self.index = defaultdict(dict)
        self.doc_lengths = {}
        self.total_docs = 0
        self.avg_doc_length = 0
        self.processor = processor

    def build_index(self, docs):
        self.total_docs = len(docs)
        total_length = 0

        for doc_id, text in docs.items():
            terms = self.processor.process_text(text)
            self.doc_lengths[doc_id] = len(terms)
            total_length += len(terms)
            term_counts = defaultdict(int)
            for term in terms:
                term_counts[term] += 1

            for term, count in term_counts.items():
                self.index[term][doc_id] = count

        self.avg_doc_length = total_length / self.total_docs

Creating class for 3 ranking models

In [14]:
class Ranker:
    def __init__(self, index):
        self.index = index
        self.k1 = 1.5
        self.b = 0.75
        self.mu = 2000

    def vsm_score(self, query_terms, doc_id):
        score = 0.0
        for term in query_terms:
            if term in self.index.index:
                tf = self.index.index[term].get(doc_id, 0)
                df = len(self.index.index[term])
                idf = math.log((self.index.total_docs + 1) / (df + 0.5))
                score += tf * idf
        return score

    def bm25_score(self, query_terms, doc_id):
        score = 0.0
        doc_length = self.index.doc_lengths[doc_id]
        for term in query_terms:
            if term in self.index.index:
                tf = self.index.index[term].get(doc_id, 0)
                df = len(self.index.index[term])
                idf = math.log((self.index.total_docs - df + 0.5) / (df + 0.5))
                numerator = tf * (self.k1 + 1)
                denominator = tf + self.k1 * (1 - self.b + self.b * (doc_length / self.index.avg_doc_length))
                score += idf * (numerator / denominator)
        return score

    def lm_score(self, query_terms, doc_id):
        score = 0.0
        doc_length = self.index.doc_lengths[doc_id]
        for term in query_terms:
            if term in self.index.index:
                tf = self.index.index[term].get(doc_id, 0)
                collection_cf = sum(self.index.index[term].values())
                p = (tf + self.mu * (collection_cf / self.index.total_docs)) / (doc_length + self.mu)
                score += math.log(p) if p > 0 else 0
        return score


Function for loading Data into program

In [15]:
def load_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    data = {}
    for elem in root.findall('.//doc'):
        doc_id = elem.find('docno').text.strip()
        text = "".join(elem.find('text').itertext()).strip()
        data[doc_id] = text
    return data

def load_queries_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    queries = {}
    for top in root.findall('.//top'):
        query_num = top.find('num').text.strip()
        query_title = top.find('title').text.strip()
        queries[query_num] = query_title
    return queries

def load_relevance(relevance_path):
    qrel = defaultdict(dict)
    with open(relevance_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            q_id, _, doc_id, rel = parts
            qrel[q_id][doc_id] = int(rel)
    return qrel


Function for genetating Trec Output

In [16]:
def generate_trec_output(queries, index, ranker, output_prefix):
    with open(f"{output_prefix}_vsm.trec", 'w') as vsm_file, \
         open(f"{output_prefix}_bm25.trec", 'w') as bm25_file, \
         open(f"{output_prefix}_lm.trec", 'w') as lm_file:

        for q_id, query_text in queries.items():
            query_terms = processor.process_text(query_text)
            vsm_scores = []
            bm25_scores = []
            lm_scores = []
            for doc_id in index.doc_lengths:
                vsm = ranker.vsm_score(query_terms, doc_id)
                bm25 = ranker.bm25_score(query_terms, doc_id)
                lm = ranker.lm_score(query_terms, doc_id)
                vsm_scores.append(f"{q_id} 0 {doc_id} 1 {vsm:.4f} VSM")
                bm25_scores.append(f"{q_id} 0 {doc_id} 1 {bm25:.4f} BM25")
                lm_scores.append(f"{q_id} 0 {doc_id} 1 {lm:.4f} LM")
            vsm_file.write("\n".join(vsm_scores[:2500]) + "\n")
            bm25_file.write("\n".join(bm25_scores[:2500]) + "\n")
            lm_file.write("\n".join(lm_scores[:1500]) + "\n")



Main Class

In [17]:
if __name__ == "__main__":
    processor = TextProcessor()

    print("Loading documents...")
    documents = load_xml(CRAN_DOCS_PATH)
    queries = load_queries_xml(CRAN_QUERIES_PATH)
    relevance = load_relevance(CRAN_RELEVANCE_PATH)

    print("Building index...")
    index = InvertedIndex(processor)
    index.build_index(documents)

    ranker = Ranker(index)


    print("Generating results...")
    generate_trec_output(queries, index, ranker, "results")  # Use "results" as the prefix

    print("Done! Results saved to:")
    print("- results_vsm.trec (Vector Space Model)")
    print("- results_bm25.trec (BM25)")
    print("- results_lm.trec (Language Model)")
    print("Use trec_eval with commands:")
    print(f"!./trec_eval {CRAN_RELEVANCE_PATH} results_vsm.trec -m all_trec")
    print(f"!./trec_eval {CRAN_RELEVANCE_PATH} results_bm25.trec -m all_trec")
    print(f"!./trec_eval {CRAN_RELEVANCE_PATH} results_lm.trec -m all_trec")

Loading documents...
Building index...
Generating results...
Done! Results saved to:
- results_vsm.trec (Vector Space Model)
- results_bm25.trec (BM25)
- results_lm.trec (Language Model)
Use trec_eval with commands:
!./trec_eval /content/drive/MyDrive/MOS/cranfield-trec-dataset-main/cranqrel.trec.txt results_vsm.trec -m all_trec
!./trec_eval /content/drive/MyDrive/MOS/cranfield-trec-dataset-main/cranqrel.trec.txt results_bm25.trec -m all_trec
!./trec_eval /content/drive/MyDrive/MOS/cranfield-trec-dataset-main/cranqrel.trec.txt results_lm.trec -m all_trec


Evaluating Results

In [18]:
!wget -q https://github.com/usnistgov/trec_eval/archive/refs/tags/v9.0.7.tar.gz
!tar -xzf v9.0.7.tar.gz
!cd trec_eval-9.0.7 && make


gcc -g -I.  -Wall -DVERSIONID=\"9.0.7\"  -o trec_eval trec_eval.c formats.c meas_init.c meas_acc.c meas_avg.c meas_print_single.c meas_print_final.c get_qrels.c get_trec_results.c get_prefs.c get_qrels_prefs.c get_qrels_jg.c form_res_rels.c form_res_rels_jg.c form_prefs_counts.c utility_pool.c get_zscores.c convert_zscores.c measures.c  m_map.c m_P.c m_num_q.c m_num_ret.c m_num_rel.c m_num_rel_ret.c m_gm_map.c m_Rprec.c m_recip_rank.c m_bpref.c m_iprec_at_recall.c m_recall.c m_Rprec_mult.c m_utility.c m_11pt_avg.c m_ndcg.c m_ndcg_cut.c m_Rndcg.c m_ndcg_rel.c m_binG.c m_G.c m_rel_P.c m_success.c m_infap.c m_map_cut.c m_gm_bpref.c m_runid.c m_relstring.c m_set_P.c m_set_recall.c m_set_rel_P.c m_set_map.c m_set_F.c m_num_nonrel_judged_ret.c m_prefs_num_prefs_poss.c m_prefs_num_prefs_ful.c m_prefs_num_prefs_ful_ret.c m_prefs_simp.c m_prefs_pair.c m_prefs_avgjg.c m_prefs_avgjg_Rnonrel.c m_prefs_simp_ret.c m_prefs_pair_ret.c m_prefs_avgjg_ret.c m_prefs_avgjg_Rnonrel_ret.c m_prefs_simp_imp.c 

In [19]:
!trec_eval-9.0.7/trec_eval /content/drive/MyDrive/MOS/cranfield-trec-dataset-main/cranqrel.trec.txt /content/results_bm25.trec -m all_trec


runid                 	all	BM25
num_q                 	all	225
num_ret               	all	315000
num_rel               	all	1612
num_rel_ret           	all	1612
map                   	all	0.2957
gm_map                	all	0.1809
Rprec                 	all	0.2905
bpref                 	all	0.2851
recip_rank            	all	0.5365
iprec_at_recall_0.00  	all	0.5828
iprec_at_recall_0.10  	all	0.5470
iprec_at_recall_0.20  	all	0.4911
iprec_at_recall_0.30  	all	0.4106
iprec_at_recall_0.40  	all	0.3629
iprec_at_recall_0.50  	all	0.3208
iprec_at_recall_0.60  	all	0.2457
iprec_at_recall_0.70  	all	0.2048
iprec_at_recall_0.80  	all	0.1474
iprec_at_recall_0.90  	all	0.1106
iprec_at_recall_1.00  	all	0.1047
P_5                   	all	0.3138
P_10                  	all	0.2311
P_15                  	all	0.1846
P_20                  	all	0.1520
P_30                  	all	0.1179
P_100                 	all	0.0488
P_200                 	all	0.0278
P_500                 	all	0.0127
P_1000                	

In [20]:
!trec_eval-9.0.7/trec_eval /content/drive/MyDrive/MOS/cranfield-trec-dataset-main/cranqrel.trec.txt /content/results_vsm.trec -m all_trec


runid                 	all	VSM
num_q                 	all	225
num_ret               	all	315000
num_rel               	all	1612
num_rel_ret           	all	1612
map                   	all	0.2152
gm_map                	all	0.1234
Rprec                 	all	0.1973
bpref                 	all	0.3753
recip_rank            	all	0.4550
iprec_at_recall_0.00  	all	0.4774
iprec_at_recall_0.10  	all	0.4449
iprec_at_recall_0.20  	all	0.3579
iprec_at_recall_0.30  	all	0.2937
iprec_at_recall_0.40  	all	0.2479
iprec_at_recall_0.50  	all	0.2198
iprec_at_recall_0.60  	all	0.1556
iprec_at_recall_0.70  	all	0.1373
iprec_at_recall_0.80  	all	0.0949
iprec_at_recall_0.90  	all	0.0698
iprec_at_recall_1.00  	all	0.0656
P_5                   	all	0.2142
P_10                  	all	0.1667
P_15                  	all	0.1384
P_20                  	all	0.1218
P_30                  	all	0.0994
P_100                 	all	0.0460
P_200                 	all	0.0270
P_500                 	all	0.0127
P_1000                	a

In [21]:
!trec_eval-9.0.7/trec_eval /content/drive/MyDrive/MOS/cranfield-trec-dataset-main/cranqrel.trec.txt /content/results_lm.trec -m all_trec


runid                 	all	LM
num_q                 	all	225
num_ret               	all	315000
num_rel               	all	1612
num_rel_ret           	all	1612
map                   	all	0.0368
gm_map                	all	0.0116
Rprec                 	all	0.0376
bpref                 	all	0.3221
recip_rank            	all	0.1033
iprec_at_recall_0.00  	all	0.1092
iprec_at_recall_0.10  	all	0.1000
iprec_at_recall_0.20  	all	0.0727
iprec_at_recall_0.30  	all	0.0502
iprec_at_recall_0.40  	all	0.0360
iprec_at_recall_0.50  	all	0.0322
iprec_at_recall_0.60  	all	0.0172
iprec_at_recall_0.70  	all	0.0160
iprec_at_recall_0.80  	all	0.0107
iprec_at_recall_0.90  	all	0.0101
iprec_at_recall_1.00  	all	0.0100
P_5                   	all	0.0320
P_10                  	all	0.0276
P_15                  	all	0.0240
P_20                  	all	0.0184
P_30                  	all	0.0141
P_100                 	all	0.0085
P_200                 	all	0.0072
P_500                 	all	0.0057
P_1000                	al

In [22]:
from google.colab import files
files.download('mos_final.ipynb')


FileNotFoundError: Cannot find file: mos_final.ipynb