In [89]:
import ir_datasets
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import pandas as pd
import pickle
import os
from spellchecker import SpellChecker
import re
from typing import List
import joblib
from gensim.models import Word2Vec
import numpy as np

In [90]:

dataset = ir_datasets.load("antique/test")


df = pd.read_csv('collection.tsv', sep='\t', header=None, names=['doc_id', 'text'])

# Build the corpus dictionary
corpus = {}
for index, row in df.iterrows():
    
    if  len(row['text']) >0:
        if isinstance(row['text'], str):
            corpus[row['doc_id']] = row['text']
        else: 
            corpus[row['doc_id']] = ""
  
  
    

# Convert the corpus to a list of documents and handle NaN values
documents = list(corpus.values())

def custom_tokenizer(text: str) -> List[str]:
    """Tokenizes and lowercases the text."""
    tokens = word_tokenize(text.lower())
    return tokens

def get_wordnet_pos(tag):
    """Converts POS tag to a format that WordNetLemmatizer can understand."""
    tag = tag[0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def remove_markers(tokens: List[str]) -> List[str]:
    """Removes specific markers from tokens."""
    return [re.sub(r'\u00AE', '', token) for token in tokens]

def remove_punctuation(tokens: List[str]) -> List[str]:
    """Removes punctuation from tokens."""
    return [token.translate(str.maketrans('', '', string.punctuation)) for token in tokens]

def replace_under_score_with_space(tokens: List[str]) -> List[str]:
    """Replaces underscores with spaces in tokens."""
    return [re.sub(r'_', ' ', token) for token in tokens]

def remove_apostrophe(tokens: List[str]) -> List[str]:
    """Removes apostrophes from tokens."""
    return [token.replace("'", " ") for token in tokens]

def normalize_country_names(text):

    # List of country names and their variations
    country_names = {
    "uae": "united arab emirates", "u.a.e": "united arab emirates",
    "cn": "china", "china": "china",
    "sy": "syria", "syria": "syria",
    "usa": "united states of america", "u.s.a": "united states of america", 
    "us": "united states of america", "u.s.": "united states of america",
    "uk": "united kingdom", "u.k.": "united kingdom", "united kingdom": "united kingdom",
    "england": "united kingdom", "gb": "united kingdom", "g.b.": "united kingdom",
    "great britain": "united kingdom", "fr": "france", "france": "france",
    "de": "germany", "germany": "germany", "deutschland": "germany",
    "jp": "japan", "japan": "japan", "it": "italy", "italy": "italy",
    "itália": "italy", "es": "spain", "spain": "spain", "españa": "spain",
    "ru": "russia", "russia": "russia", "россия": "russia", "in": "india",
    "india": "india", "br": "brazil", "brazil": "brazil", "brasil": "brazil",
    "au": "australia", "australia": "australia", "ca": "canada", "canada": "canada",
    "mx": "mexico", "mexico": "mexico", "méxico": "mexico", "za": "south africa",
    "south africa": "south africa", "southafrica": "south africa", "kr": "south korea",
    "south korea": "south korea", "southkorea": "south korea", "sa": "saudi arabia",
    "saudi arabia": "saudi arabia", "ksa": "saudi arabia", "kingdom of saudi arabia": "saudi arabia",
    "tr": "turkey", "turkey": "turkey", "trkiye": "turkey", "ch": "switzerland",
    "switzerland": "switzerland", "suisse": "switzerland", "chile": "chile", 
    "pt": "portugal", "portugal": "portugal", "pl": "poland", "poland": "poland",
    "polska": "poland", "eg": "egypt", "egypt": "egypt", "egito": "egypt",
    "ng": "nigeria", "nigeria": "nigeria", "nigéria": "nigeria", "ar": "argentina",
    "argentina": "argentina", "gr": "greece", "greece": "greece", "ellada": "greece",
    "se": "sweden", "sweden": "sweden", "sverige": "sweden", "no": "norway",
    "norway": "norway", "norge": "norway", "fi": "finland", "finland": "finland",
    "suomi": "finland", "nl": "netherlands", "netherlands": "netherlands", 
    "holland": "netherlands", "vn": "vietnam", "vietnam": "vietnam", "hk": "hong kong",
    "hong kong": "hong kong", "ir": "iran", "iran": "iran", "iq": "iraq", 
    "iraq": "iraq", "ph": "philippines", "philippines": "philippines", "pk": "pakistan",
    "pakistan": "pakistan", "th": "thailand", "thailand": "thailand", "my": "malaysia",
    "malaysia": "malaysia", "id": "indonesia", "indonesia": "indonesia", 
    "bd": "bangladesh", "bangladesh": "bangladesh", "af": "afghanistan",
    "afghanistan": "afghanistan", "il": "israel", "israel": "israel", "at": "austria",
    "austria": "austria", "be": "belgium", "belgium": "belgium", "cl": "chile",
    "co": "colombia", "colombia": "colombia", "cz": "czech republic",
    "czech republic": "czech republic", "dk": "denmark", "denmark": "denmark",
    "hu": "hungary", "hungary": "hungary", "is": "iceland", "iceland": "iceland",
    "ie": "ireland", "ireland": "ireland", "ke": "kenya", "kenya": "kenya", 
    "lt": "lithuania", "lithuania": "lithuania", "lu": "luxembourg", 
    "luxembourg": "luxembourg", "mt": "malta", "malta": "malta", "ma": "morocco",
    "morocco": "morocco", "nz": "new zealand", "new zealand": "new zealand", 
    "pe": "peru", "peru": "peru", "ro": "romania", "romania": "romania", 
    "sg": "singapore", "singapore": "singapore", "sk": "slovakia", 
    "slovakia": "slovakia", "tw": "taiwan", "taiwan": "taiwan", "ua": "ukraine", 
    "ukraine": "ukraine", "ve": "venezuela", "venezuela": "venezuela"
}
    return country_names.get(text, text)    
def preprocess_text(text: str) -> str:
    """Preprocesses the input text by tokenizing, removing punctuation, stopwords, and then stemming and lemmatizing."""
    # Convert text to lowercase and tokenize
    words = custom_tokenizer(text)
    
    # Remove punctuation
    words = [word.translate(str.maketrans('', '', string.punctuation)) for word in words]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
  
    # Further token cleaning
    words = remove_markers(words)
    words = replace_under_score_with_space(words)
    words = remove_apostrophe(words)
    words = [normalize_country_names(word) for word in words]

    # Stemming and Lemmatization
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    
    pos_tags = pos_tag(words)
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in pos_tags]
 
    return ' '.join(words)


In [91]:

# Preprocess documents
processed_documents = [preprocess_text(doc) for doc in documents]
word2vec_model = Word2Vec([doc.split() for doc in processed_documents], vector_size=100, sg=1, epochs=35)
word2vec_model.save("word2vec_model")

# Load the Word2Vec model
word2vec_model = Word2Vec.load("word2vec_model")


# Train Word2Vec model
#word2vec_model = Word2Vec(processed_documents, vector_size=100, sg=1, epochs=35)
#word2vec_model.save("word2vec_model")

# Load the Word2Vec model
#word2vec_model = Word2Vec.load("word2vec_model")


In [92]:
def vectorize_documents(documents: List[str]) -> List[np.ndarray]:
    document_vectors = []
    for document in documents:
        vectors = [word2vec_model.wv[token] for token in document.split() if token in word2vec_model.wv]
        if vectors:
            document_vectors.append(np.mean(vectors, axis=0))
        else:
            document_vectors.append(np.zeros(100))
    return document_vectors

# Compute document vectors
doc_vectors = vectorize_documents(processed_documents)
print(doc_vectors)
# Save and load functions for TF-IDF data
def save_file(file_location: str, content):
    with open(file_location, 'wb') as file:
        pickle.dump(content, file, protocol=pickle.HIGHEST_PROTOCOL)

def load_file(file_location: str):
    with open(file_location, 'rb') as file:
        loaded_file = pickle.load(file)
    return loaded_file
save_file("doc_vectors.pkl",doc_vectors)
doc_vectors = load_file("doc_vectors.pkl")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [93]:

def compute_relevance_scores(query_text: str) -> List[str]:
    """Compute relevance scores between a query vector and all document vectors."""
    query_tokens = preprocess_text(query_text)
    query_vec = vectorize_documents([query_tokens])[0].reshape(1, -1)
    similarities = cosine_similarity(doc_vectors, query_vec)
    top_10_indices = similarities.argsort(axis=0)[-10:][::-1].flatten()
    return [list(corpus.keys())[index] for index in top_10_indices]



In [94]:
def calculate_recall_precision(query_id):
    relevant_docs = []
    retrieved_docs = []
    for qrel in dataset.qrels_iter():
        if qrel[0] == query_id and qrel[2] > 0:
            relevant_docs.append(qrel[1])

    for query in dataset.queries_iter():
        if query[0] == query_id:
            retrieved_docs = compute_relevance_scores(query[1])
            break

    truncated_retrieved_docs = retrieved_docs[:len(relevant_docs)]
    y_true = [1 if doc in relevant_docs else 0 for doc in retrieved_docs]
    true_positives = sum(1 for doc in truncated_retrieved_docs if doc in relevant_docs)
    recall_at_10 = true_positives / len(relevant_docs)
    precision_at_10 = true_positives / 10
    print(f"Query ID: {query_id}, Recall@10: {recall_at_10}")
    print(f"Query ID: {query_id}, Precision@10: {precision_at_10}")

    return recall_at_10

queries_ids = {qrel[0]: '' for qrel in dataset.qrels_iter()}

for query_id in queries_ids.keys():
    calculate_recall_precision(query_id)

Query ID: 2531329, Recall@10: 0.0
Query ID: 2531329, Precision@10: 0.0
Query ID: 3825668, Recall@10: 0.0
Query ID: 3825668, Precision@10: 0.0
Query ID: 2146313, Recall@10: 0.1
Query ID: 2146313, Precision@10: 0.3
Query ID: 4038667, Recall@10: 0.037037037037037035
Query ID: 4038667, Precision@10: 0.1
Query ID: 2550445, Recall@10: 0.0
Query ID: 2550445, Precision@10: 0.0
Query ID: 1384464, Recall@10: 0.1
Query ID: 1384464, Precision@10: 0.1
Query ID: 204818, Recall@10: 0.0
Query ID: 204818, Precision@10: 0.0
Query ID: 3751940, Recall@10: 0.05555555555555555
Query ID: 3751940, Precision@10: 0.1
Query ID: 2908187, Recall@10: 0.0
Query ID: 2908187, Precision@10: 0.0
Query ID: 524318, Recall@10: 0.0
Query ID: 524318, Precision@10: 0.0
Query ID: 1335330, Recall@10: 0.0
Query ID: 1335330, Precision@10: 0.0
Query ID: 639012, Recall@10: 0.0
Query ID: 639012, Precision@10: 0.0
Query ID: 1949736, Recall@10: 0.1111111111111111
Query ID: 1949736, Precision@10: 0.1
Query ID: 4457820, Recall@10: 0.0
Q

Query ID: 3400018, Recall@10: 0.0
Query ID: 3400018, Precision@10: 0.0
Query ID: 1130835, Recall@10: 0.125
Query ID: 1130835, Precision@10: 0.2
Query ID: 3146069, Recall@10: 0.21428571428571427
Query ID: 3146069, Precision@10: 0.3
Query ID: 3334489, Recall@10: 0.0
Query ID: 3334489, Precision@10: 0.0
Query ID: 483676, Recall@10: 0.0
Query ID: 483676, Precision@10: 0.0
Query ID: 2433082, Recall@10: 0.16666666666666666
Query ID: 2433082, Precision@10: 0.4
Query ID: 3490142, Recall@10: 0.0
Query ID: 3490142, Precision@10: 0.0
Query ID: 3752287, Recall@10: 0.08333333333333333
Query ID: 3752287, Precision@10: 0.2
Query ID: 1136016, Recall@10: 0.0
Query ID: 1136016, Precision@10: 0.0
Query ID: 205155, Recall@10: 0.0
Query ID: 205155, Precision@10: 0.0
Query ID: 1663332, Recall@10: 0.2
Query ID: 1663332, Precision@10: 0.4
Query ID: 98662, Recall@10: 0.2
Query ID: 98662, Precision@10: 0.2
Query ID: 3465597, Recall@10: 0.16666666666666666
Query ID: 3465597, Precision@10: 0.1
Query ID: 3064194, 

Query ID: 2466505, Recall@10: 0.2222222222222222
Query ID: 2466505, Precision@10: 0.2
Query ID: 2835161, Recall@10: 0.0
Query ID: 2835161, Precision@10: 0.0
Query ID: 3097310, Recall@10: 0.0
Query ID: 3097310, Precision@10: 0.0
Query ID: 2310879, Recall@10: 0.11764705882352941
Query ID: 2310879, Precision@10: 0.2
Query ID: 975584, Recall@10: 0.3333333333333333
Query ID: 975584, Precision@10: 0.3
Query ID: 2384614, Recall@10: 0.0
Query ID: 2384614, Precision@10: 0.0
Query ID: 2777831, Recall@10: 0.0
Query ID: 2777831, Precision@10: 0.0
Query ID: 3703530, Recall@10: 0.0
Query ID: 3703530, Precision@10: 0.0
Query ID: 751, Recall@10: 0.0
Query ID: 751, Precision@10: 0.0
Query ID: 1303286, Recall@10: 0.0
Query ID: 1303286, Precision@10: 0.0
Query ID: 3097338, Recall@10: 0.3333333333333333
Query ID: 3097338, Precision@10: 0.2
Query ID: 934652, Recall@10: 0.0
Query ID: 934652, Precision@10: 0.0
Query ID: 1254145, Recall@10: 0.0
Query ID: 1254145, Precision@10: 0.0
Query ID: 2441987, Recall@10

Query ID: 2982969, Recall@10: 0.0
Query ID: 2982969, Precision@10: 0.0
Query ID: 1049660, Recall@10: 0.0
Query ID: 1049660, Precision@10: 0.0
Query ID: 1246269, Recall@10: 0.2
Query ID: 1246269, Precision@10: 0.1
Query ID: 894016, Recall@10: 0.14285714285714285
Query ID: 894016, Precision@10: 0.1
Query ID: 1893441, Recall@10: 0.3333333333333333
Query ID: 1893441, Precision@10: 0.2
Query ID: 1107012, Recall@10: 0.25
Query ID: 1107012, Precision@10: 0.1
Query ID: 476230, Recall@10: 0.07142857142857142
Query ID: 476230, Precision@10: 0.1
Query ID: 894023, Recall@10: 0.0
Query ID: 894023, Precision@10: 0.0
Query ID: 1066058, Recall@10: 0.0
Query ID: 1066058, Precision@10: 0.0
Query ID: 1123407, Recall@10: 0.5
Query ID: 1123407, Precision@10: 0.4
Query ID: 2466898, Recall@10: 0.0
Query ID: 2466898, Precision@10: 0.0
Query ID: 1598547, Recall@10: 0.0
Query ID: 1598547, Precision@10: 0.0
Query ID: 574551, Recall@10: 0.0
Query ID: 574551, Precision@10: 0.0
Query ID: 2147417, Recall@10: 0.0
Que

Query ID: 460171, Recall@10: 0.2222222222222222
Query ID: 460171, Precision@10: 0.2
Query ID: 2745742, Recall@10: 0.2727272727272727
Query ID: 2745742, Precision@10: 0.3
Query ID: 2106769, Recall@10: 0.0
Query ID: 2106769, Precision@10: 0.0
Query ID: 1877395, Recall@10: 0.24
Query ID: 1877395, Precision@10: 0.6
Query ID: 2180502, Recall@10: 0.058823529411764705
Query ID: 2180502, Precision@10: 0.1
Query ID: 845209, Recall@10: 0.05555555555555555
Query ID: 845209, Precision@10: 0.2
Query ID: 4400543, Recall@10: 0.125
Query ID: 4400543, Precision@10: 0.1
Query ID: 3114401, Recall@10: 0.0
Query ID: 3114401, Precision@10: 0.0
Query ID: 4392357, Recall@10: 0.18181818181818182
Query ID: 4392357, Precision@10: 0.2
Query ID: 746919, Recall@10: 0.0
Query ID: 746919, Precision@10: 0.0
Query ID: 107944, Recall@10: 0.1
Query ID: 107944, Precision@10: 0.1
Query ID: 1369513, Recall@10: 0.0
Query ID: 1369513, Precision@10: 0.0
Query ID: 4253098, Recall@10: 0.25
Query ID: 4253098, Precision@10: 0.4
Qu

Query ID: 2885400, Recall@10: 0.0
Query ID: 2885400, Precision@10: 0.0
Query ID: 3524378, Recall@10: 0.0
Query ID: 3524378, Precision@10: 0.0
Query ID: 4179739, Recall@10: 0.25
Query ID: 4179739, Precision@10: 0.1
Query ID: 2098975, Recall@10: 0.0
Query ID: 2098975, Precision@10: 0.0
Query ID: 1140512, Recall@10: 0.06896551724137931
Query ID: 1140512, Precision@10: 0.2
Query ID: 3147557, Recall@10: 0.15789473684210525
Query ID: 3147557, Precision@10: 0.3
Query ID: 788265, Recall@10: 0.1
Query ID: 788265, Precision@10: 0.1
Query ID: 3213105, Recall@10: 0.0
Query ID: 3213105, Precision@10: 0.0
Query ID: 4433714, Recall@10: 0.2
Query ID: 4433714, Precision@10: 0.2
Query ID: 2044211, Recall@10: 0.0
Query ID: 2044211, Precision@10: 0.0
Query ID: 4097849, Recall@10: 0.0
Query ID: 4097849, Precision@10: 0.0
Query ID: 894780, Recall@10: 0.08695652173913043
Query ID: 894780, Precision@10: 0.2
Query ID: 1034050, Recall@10: 0.0
Query ID: 1034050, Precision@10: 0.0
Query ID: 1132356, Recall@10: 0.

Query ID: 1132659, Recall@10: 0.0
Query ID: 1132659, Precision@10: 0.0
Query ID: 1501302, Recall@10: 0.25
Query ID: 1501302, Precision@10: 0.1
Query ID: 1173625, Recall@10: 0.0
Query ID: 1173625, Precision@10: 0.0
Query ID: 1247354, Recall@10: 0.0
Query ID: 1247354, Precision@10: 0.0
Query ID: 3442042, Recall@10: 0.23076923076923078
Query ID: 3442042, Precision@10: 0.3
Query ID: 3614846, Recall@10: 0.25
Query ID: 3614846, Precision@10: 0.2
Query ID: 2418824, Recall@10: 0.4
Query ID: 2418824, Precision@10: 0.4
Query ID: 1587762, Recall@10: 0.5
Query ID: 1587762, Precision@10: 0.3
Query ID: 182417, Recall@10: 0.0
Query ID: 182417, Precision@10: 0.0
Query ID: 1960083, Recall@10: 0.25
Query ID: 1960083, Precision@10: 0.4
Query ID: 3598485, Recall@10: 0.0
Query ID: 3598485, Precision@10: 0.0
Query ID: 1878166, Recall@10: 0.0
Query ID: 1878166, Precision@10: 0.0
Query ID: 4106400, Recall@10: 1.0
Query ID: 4106400, Precision@10: 0.4
Query ID: 920603, Recall@10: 0.0
Query ID: 920603, Precision

Query ID: 3172912, Recall@10: 0.2
Query ID: 3172912, Precision@10: 0.1
Query ID: 764472, Recall@10: 0.0
Query ID: 764472, Precision@10: 0.0
Query ID: 4352570, Recall@10: 0.0
Query ID: 4352570, Precision@10: 0.0
Query ID: 1858655, Recall@10: 0.0
Query ID: 1858655, Precision@10: 0.0
Query ID: 1468989, Recall@10: 0.1111111111111111
Query ID: 1468989, Precision@10: 0.3
Query ID: 1976900, Recall@10: 0.0
Query ID: 1976900, Precision@10: 0.0
Query ID: 994401, Recall@10: 0.0
Query ID: 994401, Precision@10: 0.0
Query ID: 2689609, Recall@10: 0.0
Query ID: 2689609, Precision@10: 0.0
Query ID: 911954, Recall@10: 0.0
Query ID: 911954, Precision@10: 0.0
Query ID: 3017307, Recall@10: 0.058823529411764705
Query ID: 3017307, Precision@10: 0.1
Query ID: 4115042, Recall@10: 0.0
Query ID: 4115042, Precision@10: 0.0
Query ID: 2345573, Recall@10: 0.0
Query ID: 2345573, Precision@10: 0.0
Query ID: 4270694, Recall@10: 0.0
Query ID: 4270694, Precision@10: 0.0
Query ID: 961127, Recall@10: 0.0
Query ID: 961127, 

Query ID: 330693, Recall@10: 0.09090909090909091
Query ID: 330693, Precision@10: 0.1
Query ID: 1158088, Recall@10: 0.09090909090909091
Query ID: 1158088, Precision@10: 0.2
Query ID: 4238378, Recall@10: 0.0
Query ID: 4238378, Precision@10: 0.0
Query ID: 658382, Recall@10: 0.0
Query ID: 658382, Precision@10: 0.0
Query ID: 1608655, Recall@10: 0.0
Query ID: 1608655, Precision@10: 0.0
Query ID: 4246480, Recall@10: 0.0
Query ID: 4246480, Precision@10: 0.0
Query ID: 2558930, Recall@10: 0.0
Query ID: 2558930, Precision@10: 0.0
Query ID: 4241596, Recall@10: 0.0
Query ID: 4241596, Precision@10: 0.0
Query ID: 3271638, Recall@10: 0.2
Query ID: 3271638, Precision@10: 0.2
Query ID: 3312600, Recall@10: 0.0
Query ID: 3312600, Precision@10: 0.0
Query ID: 1387481, Recall@10: 0.0
Query ID: 1387481, Precision@10: 0.0
Query ID: 2190298, Recall@10: 0.0
Query ID: 2190298, Precision@10: 0.0
Query ID: 1772508, Recall@10: 0.5
Query ID: 1772508, Precision@10: 0.1
Query ID: 2837473, Recall@10: 0.10526315789473684

Query ID: 3370270, Recall@10: 0.0
Query ID: 3370270, Precision@10: 0.0
Query ID: 3427615, Recall@10: 0.0
Query ID: 3427615, Precision@10: 0.0
Query ID: 949467, Recall@10: 0.1
Query ID: 949467, Precision@10: 0.1
Query ID: 863525, Recall@10: 0.0
Query ID: 863525, Precision@10: 0.0
Query ID: 1051943, Recall@10: 0.0
Query ID: 1051943, Precision@10: 0.0
Query ID: 3213191, Recall@10: 0.0
Query ID: 3213191, Precision@10: 0.0
Query ID: 2895152, Recall@10: 0.0
Query ID: 2895152, Precision@10: 0.0
Query ID: 3747127, Recall@10: 0.3333333333333333
Query ID: 3747127, Precision@10: 0.1
Query ID: 2067770, Recall@10: 0.2
Query ID: 2067770, Precision@10: 0.1
Query ID: 1781052, Recall@10: 0.0
Query ID: 1781052, Precision@10: 0.0
Query ID: 380222, Recall@10: 0.3333333333333333
Query ID: 380222, Precision@10: 0.1
Query ID: 1936704, Recall@10: 0.0
Query ID: 1936704, Precision@10: 0.0
Query ID: 3738948, Recall@10: 0.0
Query ID: 3738948, Precision@10: 0.0
Query ID: 1224005, Recall@10: 0.0
Query ID: 1224005, 

Query ID: 372324, Recall@10: 0.2
Query ID: 372324, Precision@10: 0.1
Query ID: 3313253, Recall@10: 0.0
Query ID: 3313253, Precision@10: 0.0
Query ID: 3911272, Recall@10: 0.0
Query ID: 3911272, Precision@10: 0.0
Query ID: 1691245, Recall@10: 0.0
Query ID: 1691245, Precision@10: 0.0
Query ID: 1175152, Recall@10: 0.0
Query ID: 1175152, Precision@10: 0.0
Query ID: 2248306, Recall@10: 0.0
Query ID: 2248306, Precision@10: 0.0
Query ID: 2657907, Recall@10: 0.0
Query ID: 2657907, Precision@10: 0.0
Query ID: 175732, Recall@10: 0.0
Query ID: 175732, Precision@10: 0.0
Query ID: 2764410, Recall@10: 0.13043478260869565
Query ID: 2764410, Precision@10: 0.6
Query ID: 1019517, Recall@10: 0.0625
Query ID: 1019517, Precision@10: 0.1
Query ID: 4337286, Recall@10: 0.0
Query ID: 4337286, Precision@10: 0.0
Query ID: 3026569, Recall@10: 0.2
Query ID: 3026569, Precision@10: 0.2
Query ID: 3059341, Recall@10: 0.0
Query ID: 3059341, Precision@10: 0.0
Query ID: 1404559, Recall@10: 0.0
Query ID: 1404559, Precision

Query ID: 2035638, Recall@10: 0.15384615384615385
Query ID: 2035638, Precision@10: 0.2
Query ID: 2421407, Recall@10: 0.0
Query ID: 2421407, Precision@10: 0.0
Query ID: 323517, Recall@10: 0.17391304347826086
Query ID: 323517, Precision@10: 0.4
Query ID: 3084222, Recall@10: 0.0
Query ID: 3084222, Precision@10: 0.0
Query ID: 3141572, Recall@10: 0.0
Query ID: 3141572, Precision@10: 0.0
Query ID: 2060239, Recall@10: 0.07692307692307693
Query ID: 2060239, Precision@10: 0.1
Query ID: 2404311, Recall@10: 0.0
Query ID: 2404311, Precision@10: 0.0
Query ID: 2502618, Recall@10: 0.0
Query ID: 2502618, Precision@10: 0.0
Query ID: 2764765, Recall@10: 0.25
Query ID: 2764765, Precision@10: 0.3
Query ID: 2723810, Recall@10: 0.0
Query ID: 2723810, Precision@10: 0.0
Query ID: 167923, Recall@10: 0.3333333333333333
Query ID: 167923, Precision@10: 0.1
Query ID: 4345846, Recall@10: 0.0
Query ID: 4345846, Precision@10: 0.0
Query ID: 3985402, Recall@10: 0.2
Query ID: 3985402, Precision@10: 0.1
Query ID: 4345857

Query ID: 2724170, Recall@10: 0.0
Query ID: 2724170, Precision@10: 0.0
Query ID: 2421078, Recall@10: 0.0
Query ID: 2421078, Precision@10: 0.0
Query ID: 2330967, Recall@10: 0.0
Query ID: 2330967, Precision@10: 0.0
Query ID: 1847644, Recall@10: 0.0
Query ID: 1847644, Precision@10: 0.0
Query ID: 2224478, Recall@10: 0.0
Query ID: 2224478, Precision@10: 0.0
Query ID: 2380130, Recall@10: 0.0
Query ID: 2380130, Precision@10: 0.0
Query ID: 2806118, Recall@10: 0.1875
Query ID: 2806118, Precision@10: 0.3
Query ID: 4436327, Recall@10: 0.06666666666666667
Query ID: 4436327, Precision@10: 0.1
Query ID: 1566780, Recall@10: 0.0
Query ID: 1566780, Precision@10: 0.0
Query ID: 1308733, Recall@10: 0.25
Query ID: 1308733, Precision@10: 0.2
Query ID: 897401, Recall@10: 0.0
Query ID: 897401, Precision@10: 0.0
Query ID: 3191168, Recall@10: 0.05263157894736842
Query ID: 3191168, Precision@10: 0.2
Query ID: 2355586, Recall@10: 0.0
Query ID: 2355586, Precision@10: 0.0
Query ID: 373125, Recall@10: 0.0
Query ID: 

Query ID: 4059874, Recall@10: 0.0967741935483871
Query ID: 4059874, Precision@10: 0.3
Query ID: 2585320, Recall@10: 0.0
Query ID: 2585320, Precision@10: 0.0
Query ID: 4068073, Recall@10: 0.0
Query ID: 4068073, Precision@10: 0.0
Query ID: 1991463, Recall@10: 0.17647058823529413
Query ID: 1991463, Precision@10: 0.3
Query ID: 586477, Recall@10: 0.16666666666666666
Query ID: 586477, Precision@10: 0.1
Query ID: 3044079, Recall@10: 0.0
Query ID: 3044079, Precision@10: 0.0
Query ID: 1708787, Recall@10: 0.0
Query ID: 1708787, Precision@10: 0.0
Query ID: 1905396, Recall@10: 0.09090909090909091
Query ID: 1905396, Precision@10: 0.1
Query ID: 3764982, Recall@10: 0.0
Query ID: 3764982, Precision@10: 0.0
Query ID: 2380537, Recall@10: 0.08333333333333333
Query ID: 2380537, Precision@10: 0.1
Query ID: 545531, Recall@10: 0.125
Query ID: 545531, Precision@10: 0.2
Query ID: 226048, Recall@10: 0.0
Query ID: 226048, Precision@10: 0.0
Query ID: 668422, Recall@10: 0.0
Query ID: 668422, Precision@10: 0.0
Quer

Query ID: 652392, Recall@10: 0.14285714285714285
Query ID: 652392, Precision@10: 0.1
Query ID: 2528361, Recall@10: 0.17391304347826086
Query ID: 2528361, Precision@10: 0.4
Query ID: 2645522, Recall@10: 0.05263157894736842
Query ID: 2645522, Precision@10: 0.1
Query ID: 3495025, Recall@10: 0.10714285714285714
Query ID: 3495025, Precision@10: 0.3
Query ID: 4142200, Recall@10: 0.0
Query ID: 4142200, Precision@10: 0.0
Query ID: 3150973, Recall@10: 0.030303030303030304
Query ID: 3150973, Precision@10: 0.1
Query ID: 1913984, Recall@10: 0.375
Query ID: 1913984, Precision@10: 0.3
Query ID: 144514, Recall@10: 0.0
Query ID: 144514, Precision@10: 0.0
Query ID: 2659459, Recall@10: 0.0
Query ID: 2659459, Precision@10: 0.0
Query ID: 1791111, Recall@10: 0.0
Query ID: 1791111, Precision@10: 0.0
Query ID: 1242249, Recall@10: 0.14285714285714285
Query ID: 1242249, Precision@10: 0.1
Query ID: 1815696, Recall@10: 0.0
Query ID: 1815696, Precision@10: 0.0
Query ID: 2233493, Recall@10: 0.0
Query ID: 2233493, 

Query ID: 2495991, Recall@10: 0.0
Query ID: 2495991, Precision@10: 0.0
Query ID: 4060665, Recall@10: 0.13793103448275862
Query ID: 4060665, Precision@10: 0.4
Query ID: 3282426, Recall@10: 0.0
Query ID: 3282426, Precision@10: 0.0
Query ID: 1873405, Recall@10: 0.06666666666666667
Query ID: 1873405, Precision@10: 0.2
Query ID: 1881599, Recall@10: 0.0
Query ID: 1881599, Precision@10: 0.0
Query ID: 1005057, Recall@10: 0.0
Query ID: 1005057, Precision@10: 0.0
Query ID: 136707, Recall@10: 0.14285714285714285
Query ID: 136707, Precision@10: 0.1
Query ID: 1537542, Recall@10: 0.058823529411764705
Query ID: 1537542, Precision@10: 0.1
Query ID: 325889, Recall@10: 0.0
Query ID: 325889, Precision@10: 0.0
Query ID: 1398281, Recall@10: 0.0
Query ID: 1398281, Precision@10: 0.0
Query ID: 2536970, Recall@10: 0.0
Query ID: 2536970, Precision@10: 0.0
Query ID: 2577932, Recall@10: 0.0
Query ID: 2577932, Precision@10: 0.0
Query ID: 833038, Recall@10: 0.0
Query ID: 833038, Precision@10: 0.0
Query ID: 2946575,

Query ID: 977032, Recall@10: 0.11764705882352941
Query ID: 977032, Precision@10: 0.2
Query ID: 866113, Recall@10: 0.0
Query ID: 866113, Precision@10: 0.0
Query ID: 2783044, Recall@10: 0.0
Query ID: 2783044, Precision@10: 0.0
Query ID: 2500918, Recall@10: 0.0
Query ID: 2500918, Precision@10: 0.0
Query ID: 3716940, Recall@10: 0.0
Query ID: 3716940, Precision@10: 0.0
Query ID: 513871, Recall@10: 0.0
Query ID: 513871, Precision@10: 0.0
Query ID: 1357649, Recall@10: 0.07692307692307693
Query ID: 1357649, Precision@10: 0.1
Query ID: 3364692, Recall@10: 0.0
Query ID: 3364692, Precision@10: 0.0
Query ID: 2561878, Recall@10: 0.2222222222222222
Query ID: 2561878, Precision@10: 0.4
Query ID: 587607, Recall@10: 0.0
Query ID: 587607, Precision@10: 0.0
Query ID: 3626847, Recall@10: 0.0
Query ID: 3626847, Precision@10: 0.0
Query ID: 391008, Recall@10: 0.2
Query ID: 391008, Precision@10: 0.1
Query ID: 735075, Recall@10: 0.0
Query ID: 735075, Precision@10: 0.0
Query ID: 3864422, Recall@10: 0.25
Query I

Query ID: 743628, Recall@10: 0.0
Query ID: 743628, Precision@10: 0.0
Query ID: 2521298, Recall@10: 0.07142857142857142
Query ID: 2521298, Precision@10: 0.1
Query ID: 3193043, Recall@10: 0.0
Query ID: 3193043, Precision@10: 0.0
Query ID: 2799828, Recall@10: 0.0
Query ID: 2799828, Precision@10: 0.0
Query ID: 1841367, Recall@10: 0.0
Query ID: 1841367, Precision@10: 0.0
Query ID: 1546463, Recall@10: 0.08333333333333333
Query ID: 1546463, Precision@10: 0.1
Query ID: 2062564, Recall@10: 0.08
Query ID: 2062564, Precision@10: 0.2
Query ID: 489705, Recall@10: 0.09090909090909091
Query ID: 489705, Precision@10: 0.1
Query ID: 922490, Recall@10: 0.0
Query ID: 922490, Precision@10: 0.0
Query ID: 227569, Recall@10: 0.2727272727272727
Query ID: 227569, Precision@10: 0.3
Query ID: 4069618, Recall@10: 0.0
Query ID: 4069618, Precision@10: 0.0
Query ID: 3111156, Recall@10: 0.2222222222222222
Query ID: 3111156, Precision@10: 0.2
Query ID: 530682, Recall@10: 0.07692307692307693
Query ID: 530682, Precision@

Query ID: 1239473, Recall@10: 0.0
Query ID: 1239473, Precision@10: 0.0
Query ID: 1759762, Recall@10: 0.0
Query ID: 1759762, Precision@10: 0.0
Query ID: 2914849, Recall@10: 0.16666666666666666
Query ID: 2914849, Precision@10: 0.1
Query ID: 973349, Recall@10: 0.0
Query ID: 973349, Precision@10: 0.0
Query ID: 522791, Recall@10: 0.0
Query ID: 522791, Precision@10: 0.0
Query ID: 566364, Recall@10: 0.0
Query ID: 566364, Precision@10: 0.0
Query ID: 3152429, Recall@10: 0.0
Query ID: 3152429, Precision@10: 0.0
Query ID: 3562030, Recall@10: 0.3333333333333333
Query ID: 3562030, Precision@10: 0.3
Query ID: 285233, Recall@10: 0.058823529411764705
Query ID: 285233, Precision@10: 0.2
Query ID: 2456114, Recall@10: 0.0
Query ID: 2456114, Precision@10: 0.0
Query ID: 137779, Recall@10: 0.0
Query ID: 137779, Precision@10: 0.0
Query ID: 4110902, Recall@10: 1.0
Query ID: 4110902, Precision@10: 0.2
Query ID: 3373628, Recall@10: 0.0
Query ID: 3373628, Precision@10: 0.0
Query ID: 1595983, Recall@10: 0.2857142

Query ID: 3545997, Recall@10: 0.16666666666666666
Query ID: 3545997, Precision@10: 0.1
Query ID: 3808142, Recall@10: 0.15789473684210525
Query ID: 3808142, Precision@10: 0.3
Query ID: 2489233, Recall@10: 0.0
Query ID: 2489233, Precision@10: 0.0
Query ID: 2169746, Recall@10: 0.0
Query ID: 2169746, Precision@10: 0.0
Query ID: 138140, Recall@10: 0.0
Query ID: 138140, Precision@10: 0.0
Query ID: 3767197, Recall@10: 0.0
Query ID: 3767197, Precision@10: 0.0
Query ID: 613280, Recall@10: 0.3333333333333333
Query ID: 613280, Precision@10: 0.2
Query ID: 3750817, Recall@10: 0.4
Query ID: 3750817, Precision@10: 0.4
Query ID: 4234146, Recall@10: 0.0
Query ID: 4234146, Precision@10: 0.0
Query ID: 2046885, Recall@10: 0.0
Query ID: 2046885, Precision@10: 0.0
Query ID: 3382182, Recall@10: 0.2727272727272727
Query ID: 3382182, Precision@10: 0.3
Query ID: 3677095, Recall@10: 0.25
Query ID: 3677095, Precision@10: 0.3
Query ID: 591004, Recall@10: 0.29411764705882354
Query ID: 591004, Precision@10: 0.5
Quer

Query ID: 1219786, Recall@10: 0.0
Query ID: 1219786, Precision@10: 0.0
Query ID: 2333900, Recall@10: 0.75
Query ID: 2333900, Precision@10: 0.6
Query ID: 802001, Recall@10: 0.0
Query ID: 802001, Precision@10: 0.0
Query ID: 1637590, Recall@10: 0.0
Query ID: 1637590, Precision@10: 0.0
Query ID: 900313, Recall@10: 0.5
Query ID: 900313, Precision@10: 0.5
Query ID: 3710170, Recall@10: 0.16666666666666666
Query ID: 3710170, Precision@10: 0.1
Query ID: 1629403, Recall@10: 0.05
Query ID: 1629403, Precision@10: 0.1
Query ID: 4127969, Recall@10: 0.0
Query ID: 4127969, Precision@10: 0.0
Query ID: 2768098, Recall@10: 0.14285714285714285
Query ID: 2768098, Precision@10: 0.1
Query ID: 3751146, Recall@10: 0.0
Query ID: 3751146, Precision@10: 0.0
Query ID: 1703147, Recall@10: 0.0
Query ID: 1703147, Precision@10: 0.0
Query ID: 2063597, Recall@10: 0.14285714285714285
Query ID: 2063597, Precision@10: 0.1
Query ID: 1948910, Recall@10: 0.0
Query ID: 1948910, Precision@10: 0.0
Query ID: 2153711, Recall@10: 0

Query ID: 2489934, Recall@10: 0.2222222222222222
Query ID: 2489934, Precision@10: 0.4
Query ID: 1807629, Recall@10: 0.0
Query ID: 1807629, Precision@10: 0.0
Query ID: 3104337, Recall@10: 0.0
Query ID: 3104337, Precision@10: 0.0
Query ID: 237138, Recall@10: 0.0
Query ID: 237138, Precision@10: 0.0
Query ID: 4193878, Recall@10: 0.0
Query ID: 4193878, Precision@10: 0.0
Query ID: 3554911, Recall@10: 0.0
Query ID: 3554911, Precision@10: 0.0
Query ID: 2809444, Recall@10: 0.0
Query ID: 2809444, Precision@10: 0.0
Query ID: 3325541, Recall@10: 0.5
Query ID: 3325541, Precision@10: 0.1
Query ID: 1015398, Recall@10: 0.0
Query ID: 1015398, Precision@10: 0.0
Query ID: 188009, Recall@10: 0.5
Query ID: 188009, Precision@10: 0.2
Query ID: 2825834, Recall@10: 0.0
Query ID: 2825834, Precision@10: 0.0
Query ID: 818796, Recall@10: 0.1111111111111111
Query ID: 818796, Precision@10: 0.1
Query ID: 4272061, Recall@10: 0.1111111111111111
Query ID: 4272061, Precision@10: 0.3
Query ID: 1687161, Recall@10: 0.052631

Query ID: 335819, Recall@10: 0.75
Query ID: 335819, Precision@10: 0.6
Query ID: 1458125, Recall@10: 0.0
Query ID: 1458125, Precision@10: 0.0
Query ID: 3751886, Recall@10: 0.06666666666666667
Query ID: 3751886, Precision@10: 0.1
Query ID: 3096528, Recall@10: 0.0
Query ID: 3096528, Precision@10: 0.0
Query ID: 2452131, Recall@10: 0.0
Query ID: 2452131, Precision@10: 0.0
Query ID: 835544, Recall@10: 0.0
Query ID: 835544, Precision@10: 0.0
Query ID: 917465, Recall@10: 0.16666666666666666
Query ID: 917465, Precision@10: 0.1
Query ID: 4235227, Recall@10: 0.23076923076923078
Query ID: 4235227, Precision@10: 0.3
Query ID: 344029, Recall@10: 0.4
Query ID: 344029, Precision@10: 0.2
Query ID: 966622, Recall@10: 0.2
Query ID: 966622, Precision@10: 0.2
Query ID: 475108, Recall@10: 0.0
Query ID: 475108, Precision@10: 0.0
Query ID: 3801064, Recall@10: 0.0
Query ID: 3801064, Precision@10: 0.0
Query ID: 2359288, Recall@10: 0.0
Query ID: 2359288, Precision@10: 0.0
Query ID: 884731, Recall@10: 0.2
Query I

In [95]:
def calculate_MAP(query_id):
    relevant_docs = [qrel[1] for qrel in dataset.qrels_iter() if qrel[0] == query_id]
    ordered_results = []

    for query in dataset.queries_iter():
        if query[0] == query_id:
            ordered_results = compute_relevance_scores(query[1])
            break

    pk_sum = 0
    total_relevant = 0
    for i in range(1, 11):
        relevant_ret = sum(1 for j in range(i) if j < len(ordered_results) and ordered_results[j] in relevant_docs)
        p_at_k = (relevant_ret / i) * (1 if i - 1 < len(ordered_results) and ordered_results[i - 1] in relevant_docs else 0)
        pk_sum += p_at_k
        if i - 1 < len(ordered_results) and ordered_results[i - 1] in relevant_docs:
            total_relevant += 1

    return 0 if total_relevant == 0 else pk_sum / total_relevant

map_sum = sum(calculate_MAP(query_id) for query_id in queries_ids.keys())
print(map_sum / dataset.queries_count())

0.2688845431992837
