In [55]:
#import the neccessary libraries which are required
import re
import spacy
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

In [56]:
# Load the dataset
queries_df = pd.read_csv('/kaggle/input/dataset/queries.csv')
docs_df = pd.read_csv('/kaggle/input/dataset/docs.csv')
qdrel_df = pd.read_csv('/kaggle/input/dataset/qdrel.csv')

# Task 1 

*  Preprocess the docs and queries – remove characters other than alphanumeric or whitespaces.
*  Correct spelling in the queries and documents using SpaCy. Only for each query with some correction, print the original and corrected query in separate lines, followed by two newlines (\n).
*  Tokenize the words in the documents using spacy. Remove all words that occur in less than 5 documents or more than 85% of the documents.
*  For each query, find the cosine similarity of its vector with that of the documents. Use this to find the top 5 and top 10 most similar documents.
*  Calculate the Precision@k scores: report P@1, P@5 and P@10 averaged over all queries

In [57]:
# Preprocess the Docs and Queries
# The purpose of preprocess_text function is to clean the text.
# It removes alphanumeric or whitespaces in queries and docs.

def preprocess_text(text):
    processed_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return processed_text

queries_df['query_text_preprocessed'] = queries_df['query_text'].apply(preprocess_text)
docs_df['doc_text_preprocessed'] = docs_df['doc_text'].apply(preprocess_text)

# print(queries_df)
# print(docs_df)

In [58]:
# the function correct_spelling corrects the spellings,
# to get the correct version of the text.

def correct_spelling(text):
    doc = nlp(text)
    corrected_text = " ".join(token.text for token in doc)    
    return corrected_text

queries_df['query_text_corrected'] = queries_df['query_text_preprocessed'].apply(correct_spelling)

# printing the original and corrected query
for i, row in queries_df.iterrows():
    original_query = row['query_text_preprocessed']
    corrected_query = row['query_text_corrected']
    if(original_query != row['query_text_corrected']):
        #print if corrected is not equal to original query
        if (original_query != corrected_query):
            print(f"Original Query: {original_query}")
            print(f"Corrected Query: {corrected_query}\n\n")



Original Query: Kindly tell me whole process of admission at vits Vellore for biotechi m a bio student in 12I dont have math there
Corrected Query: Kindly tell me whole process of admission at vits Vellore for biotechi m a bio student in 12I do nt have math there


Original Query: Whats your secret to success
Corrected Query: What s your secret to success


Original Query: Why do people say Dhanush South Indian actor is ugly I dont think so
Corrected Query: Why do people say Dhanush South Indian actor is ugly I do nt think so


Original Query: How do I reset my Gmail password when I dont remember my recovery information
Corrected Query: How do I reset my Gmail password when I do nt remember my recovery information




In [59]:
# tokenize the text in the document

tokenized_docs = []

for _, row in docs_df.iterrows():
    doc_text = row['doc_text_preprocessed']
    doc_tokens = []
    for token in nlp(doc_text):
        if token.is_alpha and not token.is_stop:
            doc_tokens.append(token.text.lower())
    tokenized_docs.append(doc_tokens)
all_tokens = [token for doc_tokens in tokenized_docs for token in doc_tokens] 
    
# Count frequency of each word
# And remove all words that occur in less than 5
# documents or more than 85% of the documents

word_document_frequency = Counter(all_tokens)
min_document_frequency = 5
max_document_frequency_percentage = 0.85

filtered_tokens=[]
#filtered_tokens 
for token, count in word_document_frequency.items():
    if min_document_frequency <= count <= len(docs_df) * max_document_frequency_percentage:
        filtered_tokens.append(token)
        
# Tokenize documents & queries using the filtered vocabulary

tokenized_docs = []
for doc_text in docs_df['doc_text_preprocessed']:
    doc_tokens = [tok.text.lower() for tok in nlp(doc_text) if tok.text.lower() in filtered_tokens]
    tokenized_docs.append(doc_tokens)

tokenized_queries = []
for query_text in queries_df['query_text_corrected']:
    query_tokens = [tok.text.lower() for tok in nlp(query_text) if tok.text.lower() in filtered_tokens]
    tokenized_queries.append(query_tokens)

In [60]:
# Create TF-IDF 
# For each document and query the TF-IDF vectors
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([" ".join(tokens) for tokens in tokenized_docs + tokenized_queries])
tfidf_doc_matrix = tfidf_matrix[:len(docs_df)]
tfidf_queries_matrix = tfidf_matrix[len(docs_df):]

print("\nTF-IDF Matrix for Queries:")
print(tfidf_queries_matrix)

print("\nTF-IDF Matrix for Document:")
print(tfidf_doc_matrix)


TF-IDF Matrix for Queries:
  (0, 1409)	0.6876081489380635
  (0, 106)	0.5372837027503974
  (0, 1541)	0.4883863801057443
  (1, 170)	1.0
  (3, 650)	1.0
  (4, 2053)	0.5337099720015381
  (4, 1858)	0.6274904124627819
  (4, 1387)	0.4105031208761348
  (4, 958)	0.3910199941235213
  (5, 1105)	0.5209225664041514
  (5, 622)	0.8536039361500816
  (6, 553)	0.7746235893302349
  (6, 353)	0.6324225603606687
  (7, 1646)	0.7225072874844494
  (7, 1349)	0.6913633050226656
  (8, 1183)	0.45957608669235206
  (8, 27)	0.46696082530765126
  (8, 1316)	0.2840229847626598
  (8, 1494)	0.42157269809062087
  (8, 1837)	0.3895450492149863
  (8, 1900)	0.4007486333747614
  (9, 690)	0.4810378733611324
  (9, 266)	0.4909296377731392
  (9, 188)	0.3906393868051933
  (9, 1657)	0.48746872945023484
  :	:
  (91, 597)	0.5865181077656336
  (91, 337)	0.5840967867765781
  (91, 891)	0.5610948698217595
  (92, 1610)	1.0
  (93, 1749)	0.8441007948952699
  (93, 1084)	0.5361845279912254
  (94, 449)	0.6179342718794169
  (94, 9)	0.575468921843

In [61]:
# For each query, find the cosine similarity of its vector with that of the documents
cosine_similarities = cosine_similarity(tfidf_queries_matrix, tfidf_doc_matrix)
print(f"\n For each query, the cosine similarity of its vector with that of the documents is :\n", cosine_similarities)


 For each query, the cosine similarity of its vector with that of the documents is :
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [62]:
# get_top_indices function to get the indices of the top N scores in descending order.
def get_top_indices(similarity_scores, N):
    return np.argsort(similarity_scores)[::-1][:N]

top_5_indices = []
top_10_indices = []

for similarities in cosine_similarities:
    top_5_indices.append(get_top_indices(similarities, 5))
    top_10_indices.append(get_top_indices(similarities, 10))

# print the results
for i, (top_5, top_10) in enumerate(zip(top_5_indices, top_10_indices)):
    print(f"\n\nTop 5 documents for Query {i + 1}: {top_5}")
    print(f"Top 10 documents for Query {i + 1}: {top_10}")



Top 5 documents for Query 1: [1328   44 4275 2521 1720]
Top 10 documents for Query 1: [1328   44 4275 2521 1720 4439 8892 2287 4274 2520]


Top 5 documents for Query 2: [7669  417 4907 3978 1850]
Top 10 documents for Query 2: [7669  417 4907 3978 1850 2137 2136 2249 2250 6915]


Top 5 documents for Query 3: [9999 3329 3336 3335 3334]
Top 10 documents for Query 3: [9999 3329 3336 3335 3334 3333 3332 3331 3330 3328]


Top 5 documents for Query 4: [9954 2138 9978 2854 7717]
Top 10 documents for Query 4: [9954 2138 9978 2854 7717 7354 9979 2139 6134 6133]


Top 5 documents for Query 5: [9221  249 8140 3288  561]
Top 10 documents for Query 5: [9221  249 8140 3288  561  560 5288 6682 8660  269]


Top 5 documents for Query 6: [7385 4947 4946  138 2103]
Top 10 documents for Query 6: [7385 4947 4946  138 2103 9449  276 3087 5858 4090]


Top 5 documents for Query 7: [7083 1052 8311 1843  516]
Top 10 documents for Query 7: [7083 1052 8311 1843  516 7207 8310 9451 5466  691]


Top 5 documents fo

In [63]:
# Calculates the Precision@k scores: report P@1, P@5 and P@10 averaged over all queries

k_values = [1, 5, 10]
# to store Precision@k scores for each k
precision_at_k = {k: [] for k in k_values}

# precision@k for each query
for i,query_vector in enumerate(tfidf_queries_matrix):
    query_id = queries_df['query_id'].iloc[i]
    relevant_docs = set(qdrel_df[qdrel_df['query_id'] == query_id]['doc_id'])

    # evaluate cosine similarities
    cosine_similarities = cosine_similarity(query_vector, tfidf_doc_matrix)[0]
    sorted_indices = np.argsort(cosine_similarities)[::-1]

    # for each k calculating Precision@k
    for k in k_values:
        top_k_indices =sorted_indices[:k]
        retrieved_docs = set(docs_df['doc_id'].iloc[top_k_indices])

        # Calculating Precision@k
        precision = 1 if (len(relevant_docs.intersection(retrieved_docs)) / k) >0  else 0
        precision_at_k[k].append(precision)

# Calculating average Precision@k
average_precision_at_k= {k: np.mean(scores) for k, scores in precision_at_k.items()}

# # print the results
print("\nAverage Precisions before Stemming and lemmatization : ")
for k, avg_precision in average_precision_at_k.items():
    print(f"\nAverage Precision@{k}: {avg_precision}")


print("\nThe vocabulary size before Stemming and lemmatization : ",vectorizer.get_feature_names_out().shape)


Average Precisions before Stemming and lemmatization : 

Average Precision@1: 0.53

Average Precision@5: 0.76

Average Precision@10: 0.79

The vocabulary size before Stemming and lemmatization :  (2129,)


# Task 2

*  Improve the performance of Task1 by stemming the tokens (using spacy) before calculating the vocabulary.
*  Improve the performance of Task1 by lemmatizing the tokens (using spacy) before calculating the vocabulary.
*  Report the size of the vocabulary you obtained as part of Task 1, the vocabulary size after stemming and the vocabulary size after lemmatization.
*  Report the performance metrics in both these cases and discuss the results (why or why not performance has increased).


In [64]:
# By stemming the tokens
print("\n Improve the performance of Task1 by stemming the tokens \n")
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

tokenized_docs = []
all_tokens = []
# using stemming to improve performance
for i, row in docs_df.iterrows():
    doc_text = row['doc_text_preprocessed']
    doc_tokens = [stemmer.stem(token.text.lower()) for token in nlp(doc_text) if not token.is_stop and token.is_alpha]
    tokenized_docs.append(doc_tokens)
    all_tokens.extend(doc_tokens)

#countinf the frequency of tokens
word_document_frequency = Counter(all_tokens)

# Filter out tokens
min_document_frequency = 5
max_document_frequency_percentage = 0.85

filtered_tokens=[]
for token, count in word_document_frequency.items():
    if min_document_frequency <= count <= len(docs_df) * max_document_frequency_percentage:
        filtered_tokens.append(token)
# Tokenize documents and queries using the filtered vocabulary
tokenized_docs = []

for doc_text in docs_df['doc_text_preprocessed']:
    doc_tokens = [tok.text.lower() for tok in nlp(doc_text) if tok.text.lower() in filtered_tokens]
    tokenized_docs.append(doc_tokens)

tokenized_queries = []

for query_text in queries_df['query_text_corrected']:
    query_tokens = [tok.text.lower() for tok in nlp(query_text) if tok.text.lower() in filtered_tokens]
    tokenized_queries.append(query_tokens)

#print(tokenized_docs[0])
# print(tokenized_queries)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([" ".join(tokens) for tokens in tokenized_docs + tokenized_queries])

# Separate TF-IDF vecctors
tfidf_doc_matrix = tfidf_matrix[:len(docs_df)]
tfidf_queries_matrix = tfidf_matrix[len(docs_df):]

# get the vocabulary size
print("After Stemming")
print("The vocabulary size : ",vectorizer.get_feature_names_out().shape)

# Calculate cosine similarity between query & documents
cosine_similarities = cosine_similarity(tfidf_queries_matrix, tfidf_doc_matrix)
print(f"\n For each query, the cosine similarity of its vector with that of the documents is :\n", cosine_similarities)


#to calculate performance matrix
# Calculate Precision@k for each query
for i, query_vector in enumerate(tfidf_queries_matrix):
    query_id = queries_df['query_id'].iloc[i]
    relevant_docs = set(qdrel_df[qdrel_df['query_id'] == query_id]['doc_id'])

    # cosine similarities
    cosine_similarities = cosine_similarity(query_vector, tfidf_doc_matrix)[0]
    sorted_indices = np.argsort(cosine_similarities)[::-1]

    # Calculate Precision@k for each k
    for k in k_values:
        top_k_indices = sorted_indices[:k]
        retrieved_docs = set(docs_df['doc_id'].iloc[top_k_indices])

        # Calculating Precision@k
        precision = 1 if (len(relevant_docs.intersection(retrieved_docs)) / k) >0  else 0
        precision_at_k[k].append(precision)

#average Precision@k
average_precision_at_k = {k: np.mean(scores) for k, scores in precision_at_k.items()}

# # print the results
print("\nAverage Precisions after stemming : ")
for k, avg_precision in average_precision_at_k.items():
    print(f"\nAverage Precision@{k}: {avg_precision}")




 Improve the performance of Task1 by stemming the tokens 

After Stemming
The vocabulary size :  (1336,)

 For each query, the cosine similarity of its vector with that of the documents is :
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Average Precisions after stemming : 

Average Precision@1: 0.43

Average Precision@5: 0.655

Average Precision@10: 0.705


In [65]:
# performance improvrment by lemmatization
print("\n Improve the performance of Task1 by lemmatization the tokens \n")

tokenized_docs = []
all_tokens = []

#lemmatizing the tokens
for i, row in docs_df.iterrows():
    doc_text = row['doc_text_preprocessed']
    doc_tokens = [token.lemma_ for token in nlp(doc_text) if not token.is_stop and token.is_alpha]
    tokenized_docs.append(doc_tokens)
    all_tokens.extend(doc_tokens)

# fitlering out the vocubalary
word_document_frequency = Counter(all_tokens)
min_document_frequency = 5
max_document_frequency_percentage = 0.85

filtered_tokens = [token for token, count in word_document_frequency.items() if min_document_frequency <= count <= len(docs_df) * max_document_frequency_percentage]

# using filtered tokens, tokenize the doc and queries
tokenized_docs = []
for doc_text in docs_df['doc_text_preprocessed']:
    doc_tokens = [tok.text.lower() for tok in nlp(doc_text) if tok.text.lower() in filtered_tokens]
    tokenized_docs.append(doc_tokens)

tokenized_queries = []
for query_text in queries_df['query_text_corrected']:
    query_tokens = [tok.text.lower() for tok in nlp(query_text) if tok.text.lower() in filtered_tokens]
    tokenized_queries.append(query_tokens)

# print(tokenized_docs[0])
# print(tokenized_queries)
# TF-IDF Vectorization

#tf-idf vectors for both doc and query
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([" ".join(tokens) for tokens in tokenized_docs + tokenized_queries])

# Separate TF-IDF vectors for documents and queries
tfidf_doc_matrix = tfidf_matrix[:len(docs_df)]
tfidf_queries_matrix = tfidf_matrix[len(docs_df):]

print("After lemmatization : \n")
print("The vocabulary size : ",vectorizer.get_feature_names_out().shape)
# print(tfidf_queries_matrix)

# calculating cosine similarity of tfidf_queries_matrix & tfidf_doc_matrix
cosine_similarities = cosine_similarity(tfidf_queries_matrix, tfidf_doc_matrix)
print(f"\n cosine similarity of tfidf_queries_matrix & tfidf_doc_matrix : \n", cosine_similarities)

# Calculating the precisions
precision_at_k = {k: [] for k in k_values}

for i,query_vector in enumerate(tfidf_queries_matrix):
    query_id = queries_df['query_id'].iloc[i]
    relevant_docs = set(qdrel_df[qdrel_df['query_id'] == query_id]['doc_id'])

    # evaluate cosine similarities
    cosine_similarities = cosine_similarity(query_vector, tfidf_doc_matrix)[0]
    sorted_indices = np.argsort(cosine_similarities)[::-1]

    # for each k calculating Precision@k
    for k in k_values:
        top_k_indices =sorted_indices[:k]
        retrieved_docs = set(docs_df['doc_id'].iloc[top_k_indices])

        # Calculating Precision@k
        precision = 1 if (len(relevant_docs.intersection(retrieved_docs)) / k) >0  else 0
        precision_at_k[k].append(precision)

average_precision_at_k = {k: np.mean(scores) for k, scores in precision_at_k.items()}

# print the results
for k, avg_precision in average_precision_at_k.items():
    print(f"\nAverage Precision@{k}: {avg_precision}")


 Improve the performance of Task1 by lemmatization the tokens 

After lemmatization : 

The vocabulary size :  (1585,)

 cosine similarity of tfidf_queries_matrix & tfidf_doc_matrix : 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Average Precision@1: 0.4

Average Precision@5: 0.58

Average Precision@10: 0.62


# Task 3

*   Improve the model from Task 2.2 further with Named Entity Recognition (NER) and Parts-Of-Speech (POS) tagging using spaCy.
*   For each query and document vector, give more weightage to some important words. In essence, for each of the tf-idf vectors, multiply 2 along the dimensions which contain nouns, and multiply 4 for the named entities.
*   Report the performance metrics 

In [66]:
print(" \n Improvement of the model from Task 2.2 further with Named Entity Recognition (NER) and Parts-Of-Speech(POS) tagging\n ")
# extract_E_P is a function to extract named entities and POS tags
def extract_E_P(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents]
    pos_tags = [token.pos_ for token in doc]
    return entities, pos_tags

# apply it to document and queries
docs_df['entities'], docs_df['pos_tags'] = zip(*docs_df['doc_text_preprocessed'].apply(extract_E_P))
queries_df['entities'], queries_df['pos_tags'] = zip(*queries_df['query_text_corrected'].apply(extract_E_P))

#tokenizing it 
tokenized_docs = []
for i in docs_df['doc_text_preprocessed']:
    doc_tokens = [tok.text.lower() for tok in nlp(i) if tok.text.lower() in filtered_tokens]
    tokenized_docs.append(doc_tokens)

tokenized_queries = []
for query_text in queries_df['query_text_corrected']:
    query_tokens = [tok.text.lower() for tok in nlp(query_text) if tok.text.lower() in filtered_tokens]
    tokenized_queries.append(query_tokens)

all_tokenized_texts = [" ".join(tokens) for tokens in tokenized_docs + tokenized_queries]

# creating if-idf vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_tokenized_texts)

tfidf_doc_matrix = tfidf_matrix[:len(docs_df)]
tfidf_queries_matrix = tfidf_matrix[len(docs_df):]

# Append Named Entity Recognition (NER) and Parts-Of-Speech features to TF-IDF matrix
tfidf_doc_matrix_with_features = pd.concat([pd.DataFrame(tfidf_doc_matrix.toarray()), docs_df[['entities', 'pos_tags']]], axis=1)
tfidf_queries_matrix_with_features = pd.concat([pd.DataFrame(tfidf_queries_matrix.toarray()), queries_df[['entities', 'pos_tags']]], axis=1)

k=vectorizer.get_feature_names_out()
print("\n The vocabulary size : ",k.shape)
x,y=extract_E_P(' '.join(k))

# modify_tfidf_vector is a function that gives more weightage to 
# some important words - multiply 2 along the dimensions which contain nouns, and multiply 4 for the
#  named entities.
def modify_tfidf_vector(tfidf_vector, pos_tags, entities):
    if len(pos_tags) != tfidf_vector.shape[1]:
        raise ValueError("Number of dimensions in TF-IDF matrix is not matching with length of pos_tags.")

    noun_indices = [i for i, pos_tag in enumerate(pos_tags) if 'NOUN' in pos_tag]

    if max(noun_indices, default=-1) >= tfidf_vector.shape[1]:
        raise ValueError("Index in noun_indices is out of range")

    tfidf_vector[:, noun_indices] *= 2
    
    entity_indices = [i for i,j in enumerate(k) if j in entities]

    if max(entity_indices, default=-1) >= tfidf_vector.shape[1]:
        raise ValueError("Index in entity_indices is out of range")

    tfidf_vector[:, entity_indices] *= 4

    return tfidf_vector

#do the modifications in doc and queries matrix
modified_tfidf_doc_matrix = modify_tfidf_vector(tfidf_doc_matrix, y,x)
modified_tfidf_queries_matrix = modify_tfidf_vector(tfidf_queries_matrix, y, x)

# Calculating cosine similarity 
cosine_similarities = cosine_similarity(modified_tfidf_queries_matrix, modified_tfidf_doc_matrix)
# print(cosine_similarities)

# calculating precision
precision_at_k = {k: [] for k in k_values}

for i,query_vector in enumerate(tfidf_queries_matrix):
    query_id = queries_df['query_id'].iloc[i]
    relevant_docs = set(qdrel_df[qdrel_df['query_id'] == query_id]['doc_id'])

    # evaluate cosine similarities
    cosine_similarities = cosine_similarity(query_vector, tfidf_doc_matrix)[0]
    sorted_indices = np.argsort(cosine_similarities)[::-1]

    # for each k calculating Precision@k
    for k in k_values:
        top_k_indices =sorted_indices[:k]
        retrieved_docs = set(docs_df['doc_id'].iloc[top_k_indices])

        # Calculating Precision@k
        precision = 1 if (len(relevant_docs.intersection(retrieved_docs)) / k) >0  else 0
        precision_at_k[k].append(precision)

# Calculating average Precision@k scores
average_precision_at_k = {k: np.mean(scores) for k, scores in precision_at_k.items()}

# print result
for k, avg_precision in average_precision_at_k.items():
    print(f"\nAverage Precision@{k}: {avg_precision}")



 
 Improvement of the model from Task 2.2 further with Named Entity Recognition (NER) and Parts-Of-Speech(POS) tagging
 

 The vocabulary size :  (1585,)

Average Precision@1: 0.43

Average Precision@5: 0.56

Average Precision@10: 0.6
