In [None]:
from google.colab import drive
import os
import re
import nltk
import numpy as np
import xml.etree.ElementTree as ET
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import subprocess

In [None]:
# Mount Google Drive
drive.mount('/content/drive')
path = '/content/drive/MyDrive/searchEngine'
print(os.listdir(path))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['cran.all.1400.xml', 'cran.qry.xml', 'cranqrel.trec.txt']


In [None]:
 #Paths to files
all_docs_file = "/content/drive/MyDrive/searchEngine/cran.all.1400.xml"
all_querys_file = "/content/drive/MyDrive/searchEngine/cran.qry.xml"
actual_results_file = "/content/drive/MyDrive/searchEngine/cranqrel.trec.txt"


In [None]:
# Download NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stopwords_set = set(stopwords.words("english"))

In [None]:
# Step 1: Preprocessing
def pre_processing(text):
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)   # Remove numbers
    text = text.lower()               # Convert to lowercase
    words = word_tokenize(text)       # Tokenize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords_set and len(word) > 2]  # Lemmatize and remove stopwords
    return " ".join(words)

In [None]:
# Step 2: Extract documents and queries
def get_all_docs_data(path):
    with open(path, 'r', encoding='utf-8') as f:
        content = f.read()
    docs = re.findall(r"<text>(.*?)</text>", content, re.DOTALL | re.IGNORECASE)
    return [doc.strip() for doc in docs]

In [None]:
def get_queries(queryfile):
    tree = ET.parse(queryfile)
    root = tree.getroot()
    queries = []
    for query_el in root.findall("top"):
        query_text_el = query_el.find("title")
        if query_text_el is not None:
            query_text = query_text_el.text.strip()
            queries.append(pre_processing(query_text))
    return queries

In [None]:
# Load and preprocess documents and queries
docs = get_all_docs_data(all_docs_file)
pre_processed_docs = [pre_processing(doc) for doc in docs]
queries = get_queries(all_querys_file)

In [None]:
# Step 3: Build Inverted Index
def build_inverted_index(docs):
    inverted_index = defaultdict(list)
    for doc_id, doc in enumerate(docs):
        words = set(word_tokenize(doc))
        for word in words:
            inverted_index[word].append(doc_id)
    return inverted_index

In [None]:
#Building models
#Parameters for BM25+
k1 = 1.5
b = 0.75
delta = 1.0  # smoothing parameter for BM25+

# Dirichlet smoothing parameter
mu = 2000

# Preprocessing and setup
vectorizer = TfidfVectorizer()
mat = vectorizer.fit_transform(pre_processed_docs)

# VSM Scoring using Cosine Similarity with TF-IDF
def vsm_score(qry):
    qry_vec = vectorizer.transform([qry])
    scores = cosine_similarity(qry_vec, mat).flatten()
    return scores

# BM25+ setup
term_counts = [Counter(word_tokenize(doc)) for doc in pre_processed_docs]
doc_lengths = np.array([sum(doc.values()) for doc in term_counts])
avg_length = np.mean(doc_lengths)
doc_freq = Counter()

for doc_count in term_counts:
    doc_freq.update(doc_count.keys())

N = len(pre_processed_docs)
idf = {term: np.log((N - freq + 0.5) / (freq + 0.5) + 1) for term, freq in doc_freq.items()}

# BM25+ scoring function
def bm25_plus_score(qry, doc_index):
    query_tokens = word_tokenize(qry)
    doc_counts = term_counts[doc_index]
    doc_length = sum(doc_counts.values())
    score = 0
    for token in query_tokens:
        if token in doc_counts:
            term_freq = doc_counts[token]
            numerator = (term_freq * (k1 + 1)) + delta
            denominator = term_freq + k1 * (1 - b + b * (doc_length / avg_length)) + delta
            score += idf.get(token, 0) * (numerator / denominator)
    return score

# Language Model setup (Unigram with Dirichlet smoothing)
collection_counts = Counter()
for count in term_counts:
    collection_counts.update(count)
collection_size = sum(collection_counts.values())

# Language Model scoring with Dirichlet smoothing
def lm_dirichlet_score(query, doc_index):
    query_tokens = word_tokenize(query)
    doc_counts = term_counts[doc_index]
    doc_length = sum(doc_counts.values())
    score = 0.0

    for token in query_tokens:
        term_freq = doc_counts.get(token, 0)
        collection_freq = collection_counts.get(token, 0)

        prob = (term_freq + mu * (collection_freq / collection_size)) / (doc_length + mu)
        score += np.log(prob)

    return score

In [None]:
#Ranking the Documents for Each Query
trec_results_vsm = []
trec_results_bm25 = []
trec_results_lm = []

In [None]:
# Implementation of scoring and ranking results

trec_results_vsm = []
trec_results_bm25 = []
trec_results_lm = []

top_n = 100

for idx, query in enumerate(queries, start=1):
    vsm_scores = vsm_score(query)
    bm25_scores = [bm25_plus_score(query, i) for i in range(N)]
    lm_scores = [lm_dirichlet_score(query, i) for i in range(N)]

    vsm_top = sorted(range(N), key=lambda i: vsm_scores[i], reverse=True)[:top_n]
    bm25_top = sorted(range(N), key=lambda i: bm25_scores[i], reverse=True)[:top_n]
    lm_top = sorted(range(N), key=lambda i: lm_scores[i], reverse=True)[:top_n]

    for rank, doc_id in enumerate(vsm_top):
        trec_results_vsm.append(f"{idx} Q0 {doc_id} {rank + 1} {vsm_scores[doc_id]:.6f} VSM")

    for rank, doc_id in enumerate(bm25_top):
        trec_results_bm25.append(f"{idx} Q0 {doc_id} {rank + 1} {bm25_scores[doc_id]:.6f} BM25+")

    for rank, doc_id in enumerate(lm_top):
        trec_results_lm.append(f"{idx} Q0 {doc_id} {rank + 1} {lm_scores[doc_id]:.6f} LM")

  score += np.log(prob)


In [None]:
#Evaluation using TREC Eval
!wget https://trec.nist.gov/trec_eval/trec_eval-9.0.7.tar.gz
!tar -xvzf trec_eval-9.0.7.tar.gz
%cd trec_eval-9.0.7
!make
!cp trec_eval /usr/local/bin/

--2025-03-09 22:22:37--  https://trec.nist.gov/trec_eval/trec_eval-9.0.7.tar.gz
Resolving trec.nist.gov (trec.nist.gov)... 132.163.4.175, 2610:20:6005:13::19
Connecting to trec.nist.gov (trec.nist.gov)|132.163.4.175|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 181743 (177K) [application/x-gzip]
Saving to: ‘trec_eval-9.0.7.tar.gz’


2025-03-09 22:22:38 (3.52 MB/s) - ‘trec_eval-9.0.7.tar.gz’ saved [181743/181743]

trec_eval-9.0.7/
trec_eval-9.0.7/CHANGELOG
trec_eval-9.0.7/Makefile
trec_eval-9.0.7/README
trec_eval-9.0.7/README.windows.md
trec_eval-9.0.7/bpref_bug
trec_eval-9.0.7/common.h
trec_eval-9.0.7/convert_zscores.c
trec_eval-9.0.7/form_prefs_counts.c
trec_eval-9.0.7/form_res_rels.c
trec_eval-9.0.7/form_res_rels_jg.c
trec_eval-9.0.7/formats.c
trec_eval-9.0.7/functions.h
trec_eval-9.0.7/get_prefs.c
trec_eval-9.0.7/get_qrels.c
trec_eval-9.0.7/get_qrels_jg.c
trec_eval-9.0.7/get_qrels_prefs.c
trec_eval-9.0.7/get_trec_results.c
trec_eval-9.0.7/get_zscores.c
tr

In [None]:
# Run TREC eval for each model
def run_trec_eval(command):
    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        return result.stdout
    except subprocess.CalledProcessError as e:
        print(f"Error while running trec_eval: {e.stderr}")
        return None

In [None]:
def get_metric(metric_name, results_file):
    command = ["trec_eval", "-m", metric_name, actual_results_file, results_file]
    return run_trec_eval(command)


In [None]:
# Save the results to text files
output_file_vsm = "/content/trec_results_vsm.txt"
with open(output_file_vsm, "w") as f:
    f.write("\n".join(trec_results_vsm))

output_file_bm25 = "/content/trec_results_bm25.txt"
with open(output_file_bm25, "w") as f:
    f.write("\n".join(trec_results_bm25))

output_file_lm = "/content/trec_results_lm.txt"
with open(output_file_lm, "w") as f:
    f.write("\n".join(trec_results_lm))

# Check the content of the files
with open(output_file_vsm, 'r') as f:
    print(f.read())

with open(output_file_bm25, 'r') as f:
    print(f.read())

with open(output_file_lm, 'r') as f:
    print(f.read())


1 Q0 183 1 0.269963 VSM
1 Q0 358 2 0.248519 VSM
1 Q0 12 3 0.244611 VSM
1 Q0 11 4 0.231409 VSM
1 Q0 50 5 0.210076 VSM
1 Q0 55 6 0.190338 VSM
1 Q0 664 7 0.179487 VSM
1 Q0 1185 8 0.171706 VSM
1 Q0 485 9 0.169350 VSM
1 Q0 1267 10 0.149819 VSM
1 Q0 873 11 0.138578 VSM
1 Q0 326 12 0.135644 VSM
1 Q0 140 13 0.131314 VSM
1 Q0 252 14 0.130980 VSM
1 Q0 662 15 0.126777 VSM
1 Q0 745 16 0.126534 VSM
1 Q0 434 17 0.124688 VSM
1 Q0 877 18 0.122226 VSM
1 Q0 13 19 0.119349 VSM
1 Q0 816 20 0.114428 VSM
1 Q0 791 21 0.114116 VSM
1 Q0 24 22 0.112169 VSM
1 Q0 1168 23 0.112063 VSM
1 Q0 493 24 0.111937 VSM
1 Q0 875 25 0.108976 VSM
1 Q0 874 26 0.107844 VSM
1 Q0 878 27 0.106163 VSM
1 Q0 746 28 0.104967 VSM
1 Q0 1143 29 0.102464 VSM
1 Q0 153 30 0.102025 VSM
1 Q0 572 31 0.101042 VSM
1 Q0 817 32 0.099609 VSM
1 Q0 799 33 0.098748 VSM
1 Q0 77 34 0.098402 VSM
1 Q0 56 35 0.095291 VSM
1 Q0 539 36 0.095276 VSM
1 Q0 171 37 0.094680 VSM
1 Q0 179 38 0.094528 VSM
1 Q0 428 39 0.093907 VSM
1 Q0 1360 40 0.090864 VSM
1 Q0 331 41 

In [None]:
# After collecting the results in 'trec_results_vsm', check the format
print("Sample result from VSM:")
print(trec_results_vsm[:5])  # Print the first 5 results


Sample result from VSM:
['1 Q0 183 1 0.269963 VSM', '1 Q0 358 2 0.248519 VSM', '1 Q0 12 3 0.244611 VSM', '1 Q0 11 4 0.231409 VSM', '1 Q0 50 5 0.210076 VSM']


In [None]:
!trec_eval /content/drive/MyDrive/searchEngine/cranqrel.trec.txt /content/trec_results_vsm.txt


runid                 	all	VSM
num_q                 	all	225
num_ret               	all	22500
num_rel               	all	1612
num_rel_ret           	all	623
map                   	all	0.1062
gm_map                	all	0.0211
Rprec                 	all	0.1248
bpref                 	all	0.3932
recip_rank            	all	0.2897
iprec_at_recall_0.00  	all	0.3143
iprec_at_recall_0.10  	all	0.2922
iprec_at_recall_0.20  	all	0.2337
iprec_at_recall_0.30  	all	0.1584
iprec_at_recall_0.40  	all	0.1113
iprec_at_recall_0.50  	all	0.0923
iprec_at_recall_0.60  	all	0.0476
iprec_at_recall_0.70  	all	0.0287
iprec_at_recall_0.80  	all	0.0240
iprec_at_recall_0.90  	all	0.0161
iprec_at_recall_1.00  	all	0.0153
P_5                   	all	0.1378
P_10                  	all	0.1129
P_15                  	all	0.0933
P_20                  	all	0.0758
P_30                  	all	0.0581
P_100                 	all	0.0277
P_200                 	all	0.0138
P_500                 	all	0.0055
P_1000                	all

In [None]:
# Run TREC eval for each model
def run_trec_eval(command):
    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        return result.stdout
    except subprocess.CalledProcessError as e:
        print(f"Error while running trec_eval: {e.stderr}")
        return None

# Run TREC eval for each model
evaluation_results_vsm = run_trec_eval(["trec_eval", actual_results_file, output_file_vsm])
evaluation_results_bm25 = run_trec_eval(["trec_eval", actual_results_file, output_file_bm25])
evaluation_results_lm = run_trec_eval(["trec_eval", actual_results_file, output_file_lm])


In [None]:
output_file_vsm = "/content/drive/MyDrive/searchEngine/trec_results_vsm.txt"
output_file_bm25 = "/content/drive/MyDrive/searchEngine/trec_results_bm25.txt"
output_file_lm = "/content/drive/MyDrive/searchEngine/trec_results_lm.txt"


In [None]:
# Debugging step
with open("/content/trec_results_vsm.txt", 'r') as f:
    print(f.read())

with open("/content/trec_results_bm25.txt", 'r') as f:
    print(f.read())

with open("/content/trec_results_lm.txt", 'r') as f:
    print(f.read())

# Ensure the content of the result files is in the expected format


1 Q0 183 1 0.269963 VSM
1 Q0 358 2 0.248519 VSM
1 Q0 12 3 0.244611 VSM
1 Q0 11 4 0.231409 VSM
1 Q0 50 5 0.210076 VSM
1 Q0 55 6 0.190338 VSM
1 Q0 664 7 0.179487 VSM
1 Q0 1185 8 0.171706 VSM
1 Q0 485 9 0.169350 VSM
1 Q0 1267 10 0.149819 VSM
1 Q0 873 11 0.138578 VSM
1 Q0 326 12 0.135644 VSM
1 Q0 140 13 0.131314 VSM
1 Q0 252 14 0.130980 VSM
1 Q0 662 15 0.126777 VSM
1 Q0 745 16 0.126534 VSM
1 Q0 434 17 0.124688 VSM
1 Q0 877 18 0.122226 VSM
1 Q0 13 19 0.119349 VSM
1 Q0 816 20 0.114428 VSM
1 Q0 791 21 0.114116 VSM
1 Q0 24 22 0.112169 VSM
1 Q0 1168 23 0.112063 VSM
1 Q0 493 24 0.111937 VSM
1 Q0 875 25 0.108976 VSM
1 Q0 874 26 0.107844 VSM
1 Q0 878 27 0.106163 VSM
1 Q0 746 28 0.104967 VSM
1 Q0 1143 29 0.102464 VSM
1 Q0 153 30 0.102025 VSM
1 Q0 572 31 0.101042 VSM
1 Q0 817 32 0.099609 VSM
1 Q0 799 33 0.098748 VSM
1 Q0 77 34 0.098402 VSM
1 Q0 56 35 0.095291 VSM
1 Q0 539 36 0.095276 VSM
1 Q0 171 37 0.094680 VSM
1 Q0 179 38 0.094528 VSM
1 Q0 428 39 0.093907 VSM
1 Q0 1360 40 0.090864 VSM
1 Q0 331 41 

In [None]:
# Get metrics for all models
evaluation_results_vsm = run_trec_eval(["trec_eval", actual_results_file, "/content/trec_results_vsm.txt"])
evaluation_results_bm25 = run_trec_eval(["trec_eval", actual_results_file, "/content/trec_results_bm25.txt"])
evaluation_results_lm = run_trec_eval(["trec_eval", actual_results_file, "/content/trec_results_lm.txt"])


In [None]:
# Get MAP, P@5, and NDCG for each model
map_score_vsm = get_metric("map", "/content/trec_results_vsm.txt")
p5_score_vsm = get_metric("P.5", "/content/trec_results_vsm.txt")
ndcg_score_vsm = get_metric("ndcg", "/content/trec_results_vsm.txt")

map_score_bm25 = get_metric("map", "/content/trec_results_bm25.txt")
p5_score_bm25 = get_metric("P.5", "/content/trec_results_bm25.txt")
ndcg_score_bm25 = get_metric("ndcg", "/content/trec_results_bm25.txt")

map_score_lm = get_metric("map", "/content/trec_results_lm.txt")
p5_score_lm = get_metric("P.5", "/content/trec_results_lm.txt")
ndcg_score_lm = get_metric("ndcg", "/content/trec_results_lm.txt")

In [None]:
# Evaluation scores for different models
evaluation_results = {
    "VSM": {"MAP Score": map_score_vsm, "P@5 Score": p5_score_vsm, "NDCG Score": ndcg_score_vsm},
    "BM25": {"MAP Score": map_score_bm25, "P@5 Score": p5_score_bm25, "NDCG Score": ndcg_score_bm25},
    "LM": {"MAP Score": map_score_lm, "P@5 Score": p5_score_lm, "NDCG Score": ndcg_score_lm},
}

# Print results using a loop
for model, scores in evaluation_results.items():
    print(f"\n{model} Evaluation Results:")
    for metric, score in scores.items():
        print(f"{metric}: {score}")



VSM Evaluation Results:
MAP Score: map                   	all	0.1062

P@5 Score: P_5                   	all	0.1378

NDCG Score: ndcg                  	all	0.2463


BM25 Evaluation Results:
MAP Score: map                   	all	0.1045

P@5 Score: P_5                   	all	0.1467

NDCG Score: ndcg                  	all	0.2409


LM Evaluation Results:
MAP Score: map                   	all	0.0774

P@5 Score: P_5                   	all	0.1040

NDCG Score: ndcg                  	all	0.1902

