In [9]:
import ir_datasets
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import pandas as pd
import pickle
import os
import re
from typing import List
import joblib
from gensim.models import Word2Vec, KeyedVectors
import numpy as np

In [10]:


dataset = ir_datasets.load("antique/train")


df = pd.read_csv('collection.tsv', sep='\t', header=None, names=['doc_id', 'text'])

# Build the corpus dictionary
corpus = {}
counter =0
for index, row in df.iterrows():
    if counter<2000:
 
        if isinstance(row['text'], str):
            corpus[row['doc_id']] = row['text']
        else: 
            corpus[row['doc_id']] = ""
            counter +=1
    else:
        break
# Convert the corpus to a list of documents and handle NaN values
documents = list(corpus.values())

def custom_tokenizer(text: str) -> List[str]:
    """Tokenizes and lowercases the text."""
    tokens = word_tokenize(text.lower())
    return tokens

def get_wordnet_pos(tag):
    """Converts POS tag to a format that WordNetLemmatizer can understand."""
    tag = tag[0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def remove_markers(tokens: List[str]) -> List[str]:
    """Removes specific markers from tokens."""
    return [re.sub(r'\u00AE', '', token) for token in tokens]

def remove_punctuation(tokens: List[str]) -> List[str]:
    """Removes punctuation from tokens."""
    return [token.translate(str.maketrans('', '', string.punctuation)) for token in tokens]

def replace_under_score_with_space(tokens: List[str]) -> List[str]:
    """Replaces underscores with spaces in tokens."""
    return [re.sub(r'_', ' ', token) for token in tokens]

def remove_apostrophe(tokens: List[str]) -> List[str]:
    """Removes apostrophes from tokens."""
    return [token.replace("'", " ") for token in tokens]
def normalize_appreviations( tokens: List[str]) -> List[str]:
        new_tokens = []
        resolved_terms = {}
        for token in tokens:

            if len(token) >= 2:
                synsets = wordnet.synsets(token)
                if synsets:
                    resolved_term = synsets[0].lemmas()[0].name()
                    resolved_terms[token] = resolved_term

        for abbreviation, resolved_term in resolved_terms.items():
            for i in range(len(tokens)):
                if tokens[i] == abbreviation:
                    tokens[i] = resolved_term
                    break

        return tokens
def preprocess_text(text: str) -> str:
    """Preprocesses the input text by tokenizing, removing punctuation, stopwords, and then stemming and lemmatizing."""
    # Convert text to lowercase and tokenize
    words = custom_tokenizer(text)
    
    # Remove punctuation
    words = [word.translate(str.maketrans('', '', string.punctuation)) for word in words]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
  
    # Further token cleaning
    words = remove_markers(words)
    words = replace_under_score_with_space(words)
    words = remove_apostrophe(words)
    
    # Stemming and Lemmatization
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    
    pos_tags = pos_tag(words)
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in pos_tags]
    words = normalize_appreviations(words)
    return ' '.join(words)


In [None]:


# Preprocess documents
processed_documents = [preprocess_text(doc) for doc in documents]

# Train Word2Vec model
word2vec_model = Word2Vec([doc.split() for doc in processed_documents], vector_size=100, sg=1, epochs=35)
# Save the Word2Vec model
word2vec_model.save("word2vec_model.kv")
# Load the Word2Vec model

word2vec_model = Word2Vec.load("word2vec_model.kv")



In [12]:

def vectorize_documents(documents: List[str]) -> List[np.ndarray]:
    document_vectors = []
    for document in documents:
        vectors = [word2vec_model.wv[token] for token in document.split() if token in word2vec_model.wv]
        if vectors:
            document_vectors.append(np.mean(vectors, axis=0))
        else:
            document_vectors.append(np.zeros(100))
    return document_vectors

# Compute document vectors
doc_vectors = vectorize_documents(processed_documents)
print(doc_vectors)
# Save and load functions for TF-IDF data
def save_file(file_location: str, content):
    with open(file_location, 'wb') as file:
        pickle.dump(content, file, protocol=pickle.HIGHEST_PROTOCOL)

def load_file(file_location: str):
    with open(file_location, 'rb') as file:
        loaded_file = pickle.load(file)
    return loaded_file
save_file("doc_vectors.pkl",doc_vectors)
doc_vectors = load_file("doc_vectors.pkl")

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [13]:

def compute_relevance_scores(query_text: str) -> List[str]:
    """Compute relevance scores between a query vector and all document vectors."""
    query_tokens = preprocess_text(query_text)
    query_vec = vectorize_documents([query_tokens])[0].reshape(1, -1)
    similarities = cosine_similarity(doc_vectors, query_vec)
    top_10_indices = similarities.argsort(axis=0)[-10:][::-1].flatten()
    return [list(corpus.keys())[index] for index in top_10_indices]



In [14]:
def calculate_recall_precision(query_id):
    relevant_docs = []
    retrieved_docs = []
    for qrel in dataset.qrels_iter():
        if qrel[0] == query_id and qrel[2] > 0:
            relevant_docs.append(qrel[1])

    for query in dataset.queries_iter():
        if query[0] == query_id:
            retrieved_docs = compute_relevance_scores(query[1])
            break

    truncated_retrieved_docs = retrieved_docs[:len(relevant_docs)]
    y_true = [1 if doc in relevant_docs else 0 for doc in retrieved_docs]
    true_positives = sum(1 for doc in truncated_retrieved_docs if doc in relevant_docs)
    recall_at_10 = true_positives / len(relevant_docs)
    precision_at_10 = true_positives / 10
    print(f"query id: {query_id}, Recall@10: {recall_at_10}")
    print(f"query id: {query_id}, Precision@10: {precision_at_10}")

    return recall_at_10

queries_ids = {qrel[0]: '' for qrel in dataset.qrels_iter()}

for query_id in queries_ids.keys():
    calculate_recall_precision(query_id)

query id: 2531329, Recall@10: 0.0
query id: 2531329, Precision@10: 0.0
query id: 3825668, Recall@10: 0.0
query id: 3825668, Precision@10: 0.0
query id: 2146313, Recall@10: 0.1
query id: 2146313, Precision@10: 0.3
query id: 4038667, Recall@10: 0.037037037037037035
query id: 4038667, Precision@10: 0.1
query id: 2550445, Recall@10: 0.0
query id: 2550445, Precision@10: 0.0
query id: 1384464, Recall@10: 0.1
query id: 1384464, Precision@10: 0.1
query id: 204818, Recall@10: 0.0
query id: 204818, Precision@10: 0.0
query id: 3751940, Recall@10: 0.0
query id: 3751940, Precision@10: 0.0
query id: 2908187, Recall@10: 0.0
query id: 2908187, Precision@10: 0.0
query id: 524318, Recall@10: 0.0
query id: 524318, Precision@10: 0.0
query id: 1335330, Recall@10: 0.0
query id: 1335330, Precision@10: 0.0
query id: 639012, Recall@10: 0.0
query id: 639012, Precision@10: 0.0
query id: 1949736, Recall@10: 0.1111111111111111
query id: 1949736, Precision@10: 0.1
query id: 4457820, Recall@10: 0.0
query id: 4457820

In [15]:

def calculate_MAP(query_id):
    relevant_docs = [qrel[1] for qrel in dataset.qrels_iter() if qrel[0] == query_id]
    relevant_docs = []

    for query in dataset.queries_iter():
        if query[0] == query_id:
            relevant_docs = compute_relevance_scores(query[1])
            break

    pk_sum = 0
    total_relevant = 0
    for i in range(1, 11):
        relevant_ret = sum(1 for j in range(i) if j < len(relevant_docs) and relevant_docs[j] in relevant_docs)
        p_at_k = (relevant_ret / i) * (1 if i - 1 < len(relevant_docs) and relevant_docs[i - 1] in relevant_docs else 0)
        pk_sum += p_at_k
        if i - 1 < len(relevant_docs) and relevant_docs[i - 1] in relevant_docs:
            total_relevant += 1

    return 0 if total_relevant == 0 else pk_sum / total_relevant

map_sum = sum(calculate_MAP(query_id) for query_id in queries_ids.keys())
print(map_sum / dataset.queries_count())

1.0


In [19]:
def calculate_MAP(query_id):
    relevant_docs = [qrel[1] for qrel in dataset.qrels_iter() if qrel[0] == query_id]
    ordered_results = []

    for query in dataset.queries_iter():
        if query[0] == query_id:
            ordered_results = compute_relevance_scores(query[1])
            break

    pk_sum = 0
    total_relevant = 0
    for i in range(1, 11):
        relevant_ret = sum(1 for j in range(i) if j < len(ordered_results) and ordered_results[j] in relevant_docs)
        p_at_k = (relevant_ret / i) * (1 if i - 1 < len(ordered_results) and ordered_results[i - 1] in relevant_docs else 0)
        pk_sum += p_at_k
        if i - 1 < len(ordered_results) and ordered_results[i - 1] in relevant_docs:
            total_relevant += 1

    return 0 if total_relevant == 0 else pk_sum / total_relevant

map_sum = sum(calculate_MAP(query_id) for query_id in queries_ids.keys())
print(map_sum / dataset.queries_count())

0.2681936733847459


In [16]:
def calculate_MRR(query_id):
    relevant_docs = []
    for qrel in dataset.qrels_iter():
        if qrel.query_id == query_id and qrel.relevance > 0:
            relevant_docs.append(qrel.doc_id)
    
    retrieved_docs = []
    for query in dataset.queries_iter():
        if query.query_id == query_id:
            retrieved_docs = compute_relevance_scores(query.text)
            break

    for i, result in enumerate(retrieved_docs):
        if result in relevant_docs:
            return 1 / (i + 1)

    return 0

queries_ids = {}
for qrel in dataset.qrels_iter():
    queries_ids.update({qrel.query_id: ''})

mrr_sum = 0
for query_id in list(queries_ids.keys()):
    mrr_sum += calculate_MRR(query_id)

print(f"Mean Reciprocal Rank : {mrr_sum / len(queries_ids)}")

Mean Reciprocal Rank : 0.29515271071330473
