# INFORMATION RETRIEVAL
## Deep Merge

- Abhirup Paul - CB.EN.U4CSE20401 <br>
- Anjali Ganesan - CB.EN.U4CSE20612 <br>
- Devaganga - CB.EN.U4CSE20415 <br>
- Sasidharan GS - CB.EN.U4CSE20458 <br>
- Shivaramakrishnan SS - CB.EN.U4CSE20460

<hr>

## Loading & Pre-processing

In [1]:
import nltk
from nltk.corpus import stopwords
import numpy as np
from collections import defaultdict

In [2]:
# Free-text query that each retrieval model in this notebook is scored against.
query = "dancing little vulture"

In [3]:
def read_text_file(file_path, encoding="utf-8"):
    """Return the complete contents of a text file as one string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. It is passed
            explicitly so the result does not depend on the platform's
            locale default.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()

# Files that make up the document collection; a document's id is its
# position in this list.
file_paths = ["control.txt", "imagination.txt", "rap_god.txt"]

In [4]:
def preprocess(document):
    """Turn raw text into a list of cleaned tokens.

    The text is tokenised with NLTK, tokens that are not purely
    alphanumeric are dropped, the remainder is lower-cased, and English
    stop words are removed.

    Args:
        document: Raw text to tokenise.

    Returns:
        List of lower-cased, alphanumeric, non-stop-word tokens in their
        original order.
    """
    # Load the stop-word list once per call and keep it in a set; looking it
    # up inside the filter would rebuild and linearly scan it for every token.
    stop_words = set(stopwords.words('english'))

    tokens = nltk.word_tokenize(document)
    # The alphanumeric test is applied before lower-casing, as before.
    tokens = [word.lower() for word in tokens if word.isalnum()]
    return [word for word in tokens if word not in stop_words]

# Read each file in collection order and reduce it to its cleaned token list.
preprocessed_documents = [
    preprocess(raw_text) for raw_text in map(read_text_file, file_paths)
]

In [5]:
# Map every term to the ids of the documents it occurs in. An id is appended
# once per occurrence, so a posting list repeats the id of a document that
# contains the term more than once.
inverted_index = defaultdict(list)

for doc_number, doc_tokens in enumerate(preprocessed_documents):
    for token in doc_tokens:
        inverted_index[token].append(doc_number)

print(inverted_index)

defaultdict(<class 'list'>, {'little': [0, 0, 2, 2], 'vulture': [0, 0], 'come': [0, 0, 2, 2], 'nest': [0, 0], 'catch': [0, 0], 'world': [0, 0], 'impressed': [0, 0], 'built': [0, 0], 'culture': [0, 0], 'lies': [0, 0], 'fortress': [0, 0], 'fit': [0, 0], 'surprise': [0, 0], 'shit': [0, 2, 2], 'uncover': [0], 'sonnets': [0], 'sweeping': [0], 'heads': [0], 'move': [0], 'sand': [0], 'blocks': [0], 'roots': [0], 'wish': [0], 'spread': [0], 'need': [0, 2], 'control': [0, 1, 1], 'eyes': [0], 'every': [0, 2], 'corner': [0], 'nothing': [0, 1], 'could': [0, 2], 'know': [0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2], 'speck': [0], 'disorder': [0], 'buildings': [0, 0], 'glisten': [0, 0], 'around': [0, 0, 1, 1, 1, 2], 'cities': [0, 0], 'rising': [0, 0], 'ground': [0, 0], 'shape': [0], 'heart': [0], 'drag': [0], 'dark': [0], 'oh': [0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2], 'paranoia': [0, 0], 'led': [0, 0], 'detain': [0, 0], 'million': [0, 0, 2], 'getting': [0, 0], 'way': [0, 0, 2, 2, 2], 'feel': [0, 0, 2, 2, 2, 2, 

<hr>

## TF-IDF based retrieval model

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the collection. The documents are already
# tokenised, so each token list is re-joined with single spaces to form the
# string input the vectorizer is given.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(
    list(map(" ".join, preprocessed_documents))
)

In [7]:
def retrieve_documents_tfidf(query, tfidf_matrix, vectorizer=None):
    """Rank the indexed documents against ``query`` using TF-IDF vectors.

    Each document's score is the dot product of its TF-IDF row with the
    query's TF-IDF vector. That equals cosine similarity when both vectors
    are L2-normalised (NOTE: this relies on the vectorizer normalising its
    output, which is the scikit-learn ``TfidfVectorizer`` default).

    Args:
        query: Raw query string.
        tfidf_matrix: Sparse document-term matrix with one row per document,
            produced by the same vectorizer that transforms the query.
        vectorizer: Fitted vectorizer exposing ``transform``. When omitted,
            the module-level ``tfidf_vectorizer`` is used, which keeps
            existing two-argument calls working.

    Returns:
        List of ``(document_index, score)`` pairs sorted by descending
        score. Documents with equal scores keep ascending index order
        because the sort is stable.
    """
    if vectorizer is None:
        vectorizer = tfidf_vectorizer

    query_vector = vectorizer.transform([query])
    # `@` is a matrix product for every scipy.sparse container, whereas `*`
    # is element-wise for sparse arrays.
    similarities = (tfidf_matrix @ query_vector.T).toarray().ravel()
    return sorted(enumerate(similarities), key=lambda pair: pair[1], reverse=True)

In [8]:
# Score and rank every document against the query with the TF-IDF model.
retrieved_documents_tfidf = retrieve_documents_tfidf(query, tfidf_matrix)
print("Cosine Similarity scores:\n")
# Rankings carry 0-based document ids; the report labels documents from 1.
for doc_id, score in retrieved_documents_tfidf:
    print(f"Doc{doc_id + 1}->{file_paths[doc_id]}\t: {score}")

Cosine Similarity scores:

Doc1->control.txt	: 0.15142078194717465
Doc2->imagination.txt	: 0.1003258105717071
Doc3->rap_god.txt	: 0.015386187973523315


<hr>

## BM25 retrieval model

In [9]:
from rank_bm25 import BM25Okapi

# Create a BM25 index over the tokenised documents. No constructor options
# are passed, so the library's default parameters apply.
bm25 = BM25Okapi(preprocessed_documents)

In [10]:
def retrieve_documents_bm25(query, bm25, preprocessed_documents=None):
    """Rank the indexed documents against ``query`` using BM25 scores.

    The query goes through the same ``preprocess`` step as the documents
    before it is scored.

    Args:
        query: Raw query string.
        bm25: BM25 index exposing ``get_scores(tokens)``, which returns one
            score per indexed document.
        preprocessed_documents: Unused. The index already holds the corpus;
            the parameter is kept (now optional) so existing three-argument
            calls keep working.

    Returns:
        List of ``(document_index, score)`` pairs sorted by descending
        score. Documents with equal scores keep ascending index order
        because the sort is stable.
    """
    tokenized_query = preprocess(query)
    scores = bm25.get_scores(tokenized_query)
    return sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

In [11]:
# Score and rank every document against the query with the BM25 model.
retrieved_documents_bm25 = retrieve_documents_bm25(query, bm25, preprocessed_documents)
print("BM25 scores:\n")
# Rankings carry 0-based document ids; the report labels documents from 1.
for doc_id, score in retrieved_documents_bm25:
    print(f"Doc{doc_id + 1}->{file_paths[doc_id]}\t: {score}")

BM25 scores:

Doc1->control.txt	: 1.1356501864548376
Doc2->imagination.txt	: 0.9998366301469569
Doc3->rap_god.txt	: 0.10877415031604586


<hr>

## Deep Merge based retrieval model

In [12]:
# Both rankings are ordered by score; re-order them by document id so that
# position j refers to the same document in each model's list.
sorted_tfidf = sorted(retrieved_documents_tfidf, key=lambda pair: pair[0])
sorted_bm25 = sorted(retrieved_documents_bm25, key=lambda pair: pair[0])

# Row 0 holds the TF-IDF results, row 1 the BM25 results.
rel_score = [sorted_tfidf, sorted_bm25]

print("Relevance scores:")
for row in rel_score:
    print(row)

# Keep only the scores: a (models x documents) matrix in the same row order.
relevance_scores = np.array(
    [[score for _, score in ranking] for ranking in rel_score]
)

Relevance scores:
[(0, 0.15142078194717465), (1, 0.1003258105717071), (2, 0.015386187973523315)]
[(0, 1.1356501864548376), (1, 0.9998366301469569), (2, 0.10877415031604586)]


In [13]:
# Weighting factor per retrieval model, in the row order of relevance_scores.
weights = np.array([0.45, 0.55])

# Lambda of a document = weighted average of its scores across the models.
# Broadcasting the weights down the rows computes it for all documents at once.
weighted_scores = relevance_scores * weights[:, np.newaxis]
lambdas = weighted_scores.sum(axis=0) / np.sum(weights)

print(f"Lambda Scores:\n{lambdas}")

Lambda Scores:
[0.69274695 0.59505676 0.06674957]


In [14]:
# overall relevance score (for each document) using the lambda values
overall_scores = np.sum(lambdas * relevance_scores, axis=0)
print(overall_scores)

[0.89161449 0.6546591  0.00828765]


In [15]:
# Document indices ordered from the highest to the lowest merged score.
sorted_indices = np.argsort(overall_scores)[::-1]
print("LambdaMerged scores:\n")
for i in sorted_indices:
    # Indices are 0-based; label documents from 1 so the same file carries
    # the same "DocN" label as in the TF-IDF and BM25 reports.
    print(f"Doc{i + 1}->{file_paths[i]}\t: {overall_scores[i]}")

LambdaMerged scores:

Doc0->control.txt	: 0.8916144934911177
Doc1->imagination.txt	: 0.6546590989198616
Doc2->rap_god.txt	: 0.008287648851921185
