In [74]:
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List
import re
import numpy as np
import os 
import pickle
# Load dataset and corpus
import ir_datasets
from sklearn.metrics.pairwise import cosine_similarity

dataset = ir_datasets.load("antique/test")


df = pd.read_csv('collection.tsv', sep='\t', header=None, names=['doc_id', 'text'])

# Build the corpus dictionary
corpus = {}
counter=1
for index, row in df.iterrows():
    
    if isinstance(row['text'], str):
       corpus[row['doc_id']] = row['text']
    else: 
         corpus[row['doc_id']] = ""
    # counter+=1
    # if  counter>=10000:
    #        break
    

# Convert the corpus to a list of documents and handle NaN values
documents = list(corpus.values())




In [56]:

def normalize_country_names(text):

    # List of country names and their variations
    country_names = {
    "uae": "united arab emirates", "u.a.e": "united arab emirates",
    "cn": "china", "china": "china",
    "sy": "syria", "syria": "syria",
    "usa": "united states of america", "u.s.a": "united states of america", 
    "us": "united states of america", "u.s.": "united states of america",
    "uk": "united kingdom", "u.k.": "united kingdom", "united kingdom": "united kingdom",
    "england": "united kingdom", "gb": "united kingdom", "g.b.": "united kingdom",
    "great britain": "united kingdom", "fr": "france", "france": "france",
    "de": "germany", "germany": "germany", "deutschland": "germany",
    "jp": "japan", "japan": "japan", "it": "italy", "italy": "italy",
    "itália": "italy", "es": "spain", "spain": "spain", "españa": "spain",
    "ru": "russia", "russia": "russia", "россия": "russia", "in": "india",
    "india": "india", "br": "brazil", "brazil": "brazil", "brasil": "brazil",
    "au": "australia", "australia": "australia", "ca": "canada", "canada": "canada",
    "mx": "mexico", "mexico": "mexico", "méxico": "mexico", "za": "south africa",
    "south africa": "south africa", "southafrica": "south africa", "kr": "south korea",
    "south korea": "south korea", "southkorea": "south korea", "sa": "saudi arabia",
    "saudi arabia": "saudi arabia", "ksa": "saudi arabia", "kingdom of saudi arabia": "saudi arabia",
    "tr": "turkey", "turkey": "turkey", "trkiye": "turkey", "ch": "switzerland",
    "switzerland": "switzerland", "suisse": "switzerland", "chile": "chile", 
    "pt": "portugal", "portugal": "portugal", "pl": "poland", "poland": "poland",
    "polska": "poland", "eg": "egypt", "egypt": "egypt", "egito": "egypt",
    "ng": "nigeria", "nigeria": "nigeria", "nigéria": "nigeria", "ar": "argentina",
    "argentina": "argentina", "gr": "greece", "greece": "greece", "ellada": "greece",
    "se": "sweden", "sweden": "sweden", "sverige": "sweden", "no": "norway",
    "norway": "norway", "norge": "norway", "fi": "finland", "finland": "finland",
    "suomi": "finland", "nl": "netherlands", "netherlands": "netherlands", 
    "holland": "netherlands", "vn": "vietnam", "vietnam": "vietnam", "hk": "hong kong",
    "hong kong": "hong kong", "ir": "iran", "iran": "iran", "iq": "iraq", 
    "iraq": "iraq", "ph": "philippines", "philippines": "philippines", "pk": "pakistan",
    "pakistan": "pakistan", "th": "thailand", "thailand": "thailand", "my": "malaysia",
    "malaysia": "malaysia", "id": "indonesia", "indonesia": "indonesia", 
    "bd": "bangladesh", "bangladesh": "bangladesh", "af": "afghanistan",
    "afghanistan": "afghanistan", "il": "israel", "israel": "israel", "at": "austria",
    "austria": "austria", "be": "belgium", "belgium": "belgium", "cl": "chile",
    "co": "colombia", "colombia": "colombia", "cz": "czech republic",
    "czech republic": "czech republic", "dk": "denmark", "denmark": "denmark",
    "hu": "hungary", "hungary": "hungary", "is": "iceland", "iceland": "iceland",
    "ie": "ireland", "ireland": "ireland", "ke": "kenya", "kenya": "kenya", 
    "lt": "lithuania", "lithuania": "lithuania", "lu": "luxembourg", 
    "luxembourg": "luxembourg", "mt": "malta", "malta": "malta", "ma": "morocco",
    "morocco": "morocco", "nz": "new zealand", "new zealand": "new zealand", 
    "pe": "peru", "peru": "peru", "ro": "romania", "romania": "romania", 
    "sg": "singapore", "singapore": "singapore", "sk": "slovakia", 
    "slovakia": "slovakia", "tw": "taiwan", "taiwan": "taiwan", "ua": "ukraine", 
    "ukraine": "ukraine", "ve": "venezuela", "venezuela": "venezuela"
}
    return country_names.get(text, text)

In [57]:

# Custom tokenizer
def custom_tokenizer(text: str) -> list[str]:
    tokens = word_tokenize(text.lower())
    return tokens

def get_wordnet_pos(tag):
    tag = tag[0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def correct_sentence_spelling(tokens):
    spell = SpellChecker()
    misspelled = spell.unknown(tokens)
    for i, token in enumerate(tokens):
        if token in misspelled:
            corrected = spell.correction(token)
            if corrected is not None:
                tokens[i] = corrected
    return tokens

def remove_punctuation(tokens: List[str]) -> List[str]:
    """Removes punctuation from tokens."""
    return [token.translate(str.maketrans('', '', string.punctuation)) for token in tokens]

def remove_apostrophe(tokens):
    new_tokens = []
    for token in tokens:
        new_tokens.append(str(np.char.replace(token, "'", " ")))
    return new_tokens

def remove_markers(tokens):
    new_tokens = []
    for token in tokens:
        new_tokens.append(re.sub(r'\u00AE', '', token))
    return new_tokens
def remove_links(text):
    url_pattern = re.compile(r"http[s]?://\S+|www\.\S+")
    cleaned_text = re.sub(url_pattern, "", text)
    
    return cleaned_text  
    

def remove_apostrophe(tokens: List[str]) -> List[str]:
    """Removes apostrophes from tokens."""
    return [token.replace("'", " ") for token in tokens]

def replace_under_score_with_space(tokens: List[str]) -> List[str]:
    """Replaces underscores with spaces in tokens."""
    return [re.sub(r'_', ' ', token) for token in tokens]
def preprocess_text(text: str) -> str:
    """Preprocesses the input text by tokenizing, removing punctuation, stopwords, and then stemming and lemmatizing."""
    #remove links
    text=remove_links(text)
        
    # Convert text to lowercase and tokenize
    text = text.lower()
    words = word_tokenize(text)
    
    # Remove punctuation
    words = [word.translate(str.maketrans('', '', string.punctuation)) for word in words]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Correct spelling
    #words = correct_sentence_spelling(words)
    
    # Further token cleaning
    words = remove_markers(words)
    words = replace_under_score_with_space(words)
    words = remove_apostrophe(words)
    
    # Stemming and Lemmatization
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    
    pos_tags = pos_tag(words)
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in pos_tags]
    
    return ' '.join(words)


In [58]:
# Vectorizer setup
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, preprocessor=preprocess_text)
tfidf_matrix = vectorizer.fit_transform(documents)
tfidf_model = vectorizer
print("TF-IDF DataFrame created successfully.")



TF-IDF DataFrame created successfully.


In [59]:

# Save and load functions for TF-IDF data
def save_file(file_location: str, content):
    if os.path.exists(file_location):
        os.remove(file_location)
    with open(file_location, 'wb') as handle:
        pickle.dump(content, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_file(file_location: str):
    with open(file_location, 'rb') as handle:
        content = pickle.load(handle)
    return content

def save_tfidf_data(tfidf_matrix, tfidf_model):
    save_file("tfidf_matrix.pickle", tfidf_matrix)
    save_file("tfidf_model.pickle", tfidf_model)


save_tfidf_data(tfidf_matrix, tfidf_model)

In [60]:
def process_query(query: str, tfidf_model, tfidf_matrix):
    query_tfidf = tfidf_model.transform([query])
    cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    ranked_doc_indices = cosine_similarities.argsort()[::-1]
    return ranked_doc_indices, cosine_similarities

tfidf_matrix = load_file("tfidf_matrix.pickle")
tfidf_model = load_file("tfidf_model.pickle")

def getRetrievedQueries(query: str, k=10):
    preprocessed_query = preprocess_text(query)
    ranked_indices, _ = process_query(preprocessed_query, tfidf_model, tfidf_matrix)
    idsList = []
    for idx in ranked_indices[:k]:
        doc_id = list(corpus.keys())[idx]
        idsList.append(doc_id)
    return idsList

def calculate_recall_precision(query_id):
    relevant_docs = []
    for qrel in dataset.qrels_iter():
        if qrel[0] == query_id and qrel[2] > 0:
            relevant_docs.append(qrel[1])

    retrieved_docs = []
    for query in dataset.queries_iter():
        if query[0] == query_id:
            retrieved_docs = getRetrievedQueries(query[1])
            break  

    y_true = [1 if doc_id in relevant_docs else 0 for doc_id in retrieved_docs]
    true_positives = sum(y_true)
    recall_at_10 = true_positives / len(relevant_docs) if relevant_docs else 0
    precision_at_10 = true_positives / 10
    print(f"Query ID: {query_id}, Recall@10: {recall_at_10}")
    print(f"Query ID: {query_id}, Precision@10: {precision_at_10}")    
    return recall_at_10
queries_ids = {}
for qrel in dataset.qrels_iter():
    queries_ids.update({qrel[0]: ''})
    
for query_id in list(queries_ids.keys()):
    calculate_recall_precision(query_id)

Query ID: 1964316, Recall@10: 0.06060606060606061
Query ID: 1964316, Precision@10: 0.2
Query ID: 2418598, Recall@10: 0.0
Query ID: 2418598, Precision@10: 0.0
Query ID: 1167882, Recall@10: 0.0
Query ID: 1167882, Precision@10: 0.0
Query ID: 1880028, Recall@10: 0.0
Query ID: 1880028, Precision@10: 0.0
Query ID: 2192891, Recall@10: 0.0
Query ID: 2192891, Precision@10: 0.0
Query ID: 949154, Recall@10: 0.0
Query ID: 949154, Precision@10: 0.0
Query ID: 1844896, Recall@10: 0.0
Query ID: 1844896, Precision@10: 0.0
Query ID: 2634143, Recall@10: 0.0
Query ID: 2634143, Precision@10: 0.0
Query ID: 2382487, Recall@10: 0.05454545454545454
Query ID: 2382487, Precision@10: 0.3
Query ID: 229303, Recall@10: 0.0
Query ID: 229303, Precision@10: 0.0
Query ID: 1015624, Recall@10: 0.0
Query ID: 1015624, Precision@10: 0.0
Query ID: 2785579, Recall@10: 0.05128205128205128
Query ID: 2785579, Precision@10: 0.2
Query ID: 4003223, Recall@10: 0.11428571428571428
Query ID: 4003223, Precision@10: 0.4
Query ID: 481173,

Query ID: 3116977, Recall@10: 0.5
Query ID: 3116977, Precision@10: 0.4
Query ID: 3602419, Recall@10: 0.0
Query ID: 3602419, Precision@10: 0.0
Query ID: 225204, Recall@10: 0.2
Query ID: 225204, Precision@10: 0.3
Query ID: 2035638, Recall@10: 0.15384615384615385
Query ID: 2035638, Precision@10: 0.2
Query ID: 2421407, Recall@10: 0.0
Query ID: 2421407, Precision@10: 0.0
Query ID: 323517, Recall@10: 0.21739130434782608
Query ID: 323517, Precision@10: 0.5
Query ID: 3084222, Recall@10: 0.0
Query ID: 3084222, Precision@10: 0.0
Query ID: 3141572, Recall@10: 0.0
Query ID: 3141572, Precision@10: 0.0
Query ID: 2060239, Recall@10: 0.0
Query ID: 2060239, Precision@10: 0.0
Query ID: 2404311, Recall@10: 0.0
Query ID: 2404311, Precision@10: 0.0
Query ID: 2502618, Recall@10: 0.0
Query ID: 2502618, Precision@10: 0.0
Query ID: 2764765, Recall@10: 0.25
Query ID: 2764765, Precision@10: 0.3
Query ID: 2723810, Recall@10: 0.0
Query ID: 2723810, Precision@10: 0.0
Query ID: 167923, Recall@10: 0.6666666666666666


Query ID: 2969924, Recall@10: 0.0
Query ID: 2969924, Precision@10: 0.0
Query ID: 2920776, Recall@10: 0.2
Query ID: 2920776, Precision@10: 0.1
Query ID: 1511753, Recall@10: 0.0
Query ID: 1511753, Precision@10: 0.0
Query ID: 2724170, Recall@10: 0.0
Query ID: 2724170, Precision@10: 0.0
Query ID: 2421078, Recall@10: 0.0
Query ID: 2421078, Precision@10: 0.0
Query ID: 2330967, Recall@10: 0.0
Query ID: 2330967, Precision@10: 0.0
Query ID: 1847644, Recall@10: 0.0
Query ID: 1847644, Precision@10: 0.0
Query ID: 2224478, Recall@10: 0.07142857142857142
Query ID: 2224478, Precision@10: 0.1
Query ID: 2380130, Recall@10: 0.0
Query ID: 2380130, Precision@10: 0.0
Query ID: 2806118, Recall@10: 0.0625
Query ID: 2806118, Precision@10: 0.1
Query ID: 4436327, Recall@10: 0.0
Query ID: 4436327, Precision@10: 0.0
Query ID: 1566780, Recall@10: 0.0
Query ID: 1566780, Precision@10: 0.0
Query ID: 1308733, Recall@10: 0.125
Query ID: 1308733, Precision@10: 0.1
Query ID: 897401, Recall@10: 0.0
Query ID: 897401, Preci

Query ID: 2765527, Recall@10: 0.0
Query ID: 2765527, Precision@10: 0.0
Query ID: 2593499, Recall@10: 0.05263157894736842
Query ID: 2593499, Precision@10: 0.1
Query ID: 1618655, Recall@10: 0.06666666666666667
Query ID: 1618655, Precision@10: 0.1
Query ID: 4059874, Recall@10: 0.12903225806451613
Query ID: 4059874, Precision@10: 0.4
Query ID: 2585320, Recall@10: 0.0
Query ID: 2585320, Precision@10: 0.0
Query ID: 4068073, Recall@10: 0.0
Query ID: 4068073, Precision@10: 0.0
Query ID: 1991463, Recall@10: 0.11764705882352941
Query ID: 1991463, Precision@10: 0.2
Query ID: 586477, Recall@10: 0.16666666666666666
Query ID: 586477, Precision@10: 0.1
Query ID: 3044079, Recall@10: 0.0
Query ID: 3044079, Precision@10: 0.0
Query ID: 1708787, Recall@10: 0.0
Query ID: 1708787, Precision@10: 0.0
Query ID: 1905396, Recall@10: 0.0
Query ID: 1905396, Precision@10: 0.0
Query ID: 3764982, Recall@10: 0.0
Query ID: 3764982, Precision@10: 0.0
Query ID: 2380537, Recall@10: 0.08333333333333333
Query ID: 2380537, P

Query ID: 1176673, Recall@10: 0.058823529411764705
Query ID: 1176673, Precision@10: 0.1
Query ID: 4174946, Recall@10: 0.0
Query ID: 4174946, Precision@10: 0.0
Query ID: 3274939, Recall@10: 0.0
Query ID: 3274939, Precision@10: 0.0
Query ID: 4052068, Recall@10: 0.15384615384615385
Query ID: 4052068, Precision@10: 0.2
Query ID: 652392, Recall@10: 0.0
Query ID: 652392, Precision@10: 0.0
Query ID: 2528361, Recall@10: 0.0
Query ID: 2528361, Precision@10: 0.0
Query ID: 2645522, Recall@10: 0.10526315789473684
Query ID: 2645522, Precision@10: 0.2
Query ID: 3495025, Recall@10: 0.14285714285714285
Query ID: 3495025, Precision@10: 0.4
Query ID: 4142200, Recall@10: 0.0
Query ID: 4142200, Precision@10: 0.0
Query ID: 3150973, Recall@10: 0.0
Query ID: 3150973, Precision@10: 0.0
Query ID: 1913984, Recall@10: 0.25
Query ID: 1913984, Precision@10: 0.2
Query ID: 144514, Recall@10: 0.0
Query ID: 144514, Precision@10: 0.0
Query ID: 2659459, Recall@10: 0.0
Query ID: 2659459, Precision@10: 0.0
Query ID: 17911

Query ID: 2373092, Recall@10: 0.0
Query ID: 2373092, Precision@10: 0.0
Query ID: 292326, Recall@10: 0.14285714285714285
Query ID: 292326, Precision@10: 0.4
Query ID: 1243388, Recall@10: 0.0
Query ID: 1243388, Precision@10: 0.0
Query ID: 808428, Recall@10: 0.0
Query ID: 808428, Precision@10: 0.0
Query ID: 2495991, Recall@10: 0.0
Query ID: 2495991, Precision@10: 0.0
Query ID: 4060665, Recall@10: 0.0
Query ID: 4060665, Precision@10: 0.0
Query ID: 3282426, Recall@10: 0.0
Query ID: 3282426, Precision@10: 0.0
Query ID: 1873405, Recall@10: 0.03333333333333333
Query ID: 1873405, Precision@10: 0.1
Query ID: 1881599, Recall@10: 0.0
Query ID: 1881599, Precision@10: 0.0
Query ID: 1005057, Recall@10: 0.0
Query ID: 1005057, Precision@10: 0.0
Query ID: 136707, Recall@10: 0.0
Query ID: 136707, Precision@10: 0.0
Query ID: 1537542, Recall@10: 0.0
Query ID: 1537542, Precision@10: 0.0
Query ID: 325889, Recall@10: 0.0
Query ID: 325889, Precision@10: 0.0
Query ID: 1398281, Recall@10: 0.0
Query ID: 1398281, 

Query ID: 2250547, Recall@10: 0.0
Query ID: 2250547, Precision@10: 0.0
Query ID: 276277, Recall@10: 0.0
Query ID: 276277, Precision@10: 0.0
Query ID: 3569462, Recall@10: 0.5
Query ID: 3569462, Precision@10: 0.7
Query ID: 977032, Recall@10: 0.11764705882352941
Query ID: 977032, Precision@10: 0.2
Query ID: 866113, Recall@10: 0.0
Query ID: 866113, Precision@10: 0.0
Query ID: 2783044, Recall@10: 0.0
Query ID: 2783044, Precision@10: 0.0
Query ID: 2500918, Recall@10: 0.0
Query ID: 2500918, Precision@10: 0.0
Query ID: 3716940, Recall@10: 0.1
Query ID: 3716940, Precision@10: 0.1
Query ID: 513871, Recall@10: 0.2
Query ID: 513871, Precision@10: 0.1
Query ID: 1357649, Recall@10: 0.0
Query ID: 1357649, Precision@10: 0.0
Query ID: 3364692, Recall@10: 0.0
Query ID: 3364692, Precision@10: 0.0
Query ID: 2561878, Recall@10: 0.05555555555555555
Query ID: 2561878, Precision@10: 0.1
Query ID: 587607, Recall@10: 0.0
Query ID: 587607, Precision@10: 0.0
Query ID: 3626847, Recall@10: 0.043478260869565216
Quer

Query ID: 3643580, Recall@10: 0.0
Query ID: 3643580, Precision@10: 0.0
Query ID: 4315325, Recall@10: 0.0
Query ID: 4315325, Precision@10: 0.0
Query ID: 3397823, Recall@10: 0.4
Query ID: 3397823, Precision@10: 0.2
Query ID: 3995844, Recall@10: 0.2
Query ID: 3995844, Precision@10: 0.1
Query ID: 743628, Recall@10: 0.0
Query ID: 743628, Precision@10: 0.0
Query ID: 2521298, Recall@10: 0.0
Query ID: 2521298, Precision@10: 0.0
Query ID: 3193043, Recall@10: 0.0
Query ID: 3193043, Precision@10: 0.0
Query ID: 2799828, Recall@10: 0.0
Query ID: 2799828, Precision@10: 0.0
Query ID: 1841367, Recall@10: 0.0
Query ID: 1841367, Precision@10: 0.0
Query ID: 1546463, Recall@10: 0.16666666666666666
Query ID: 1546463, Precision@10: 0.2
Query ID: 2062564, Recall@10: 0.0
Query ID: 2062564, Precision@10: 0.0
Query ID: 489705, Recall@10: 0.0
Query ID: 489705, Precision@10: 0.0
Query ID: 922490, Recall@10: 0.25
Query ID: 922490, Precision@10: 0.2
Query ID: 227569, Recall@10: 0.18181818181818182
Query ID: 227569,

Query ID: 1538567, Recall@10: 0.09090909090909091
Query ID: 1538567, Precision@10: 0.1
Query ID: 1038856, Recall@10: 0.0
Query ID: 1038856, Precision@10: 0.0
Query ID: 2095626, Recall@10: 0.08333333333333333
Query ID: 2095626, Precision@10: 0.1
Query ID: 1239473, Recall@10: 0.0
Query ID: 1239473, Precision@10: 0.0
Query ID: 1759762, Recall@10: 0.0
Query ID: 1759762, Precision@10: 0.0
Query ID: 2914849, Recall@10: 0.16666666666666666
Query ID: 2914849, Precision@10: 0.1
Query ID: 973349, Recall@10: 0.0
Query ID: 973349, Precision@10: 0.0
Query ID: 522791, Recall@10: 0.0
Query ID: 522791, Precision@10: 0.0
Query ID: 566364, Recall@10: 0.0
Query ID: 566364, Precision@10: 0.0
Query ID: 3152429, Recall@10: 0.0
Query ID: 3152429, Precision@10: 0.0
Query ID: 3562030, Recall@10: 0.2222222222222222
Query ID: 3562030, Precision@10: 0.2
Query ID: 285233, Recall@10: 0.058823529411764705
Query ID: 285233, Precision@10: 0.2
Query ID: 2456114, Recall@10: 0.0
Query ID: 2456114, Precision@10: 0.0
Query

Query ID: 1907578, Recall@10: 0.034482758620689655
Query ID: 1907578, Precision@10: 0.1
Query ID: 2505599, Recall@10: 0.25
Query ID: 2505599, Precision@10: 0.1
Query ID: 2325383, Recall@10: 0.5
Query ID: 2325383, Precision@10: 0.1
Query ID: 3545997, Recall@10: 0.16666666666666666
Query ID: 3545997, Precision@10: 0.1
Query ID: 3808142, Recall@10: 0.2631578947368421
Query ID: 3808142, Precision@10: 0.5
Query ID: 2489233, Recall@10: 0.0
Query ID: 2489233, Precision@10: 0.0
Query ID: 2169746, Recall@10: 0.0
Query ID: 2169746, Precision@10: 0.0
Query ID: 138140, Recall@10: 0.0
Query ID: 138140, Precision@10: 0.0
Query ID: 3767197, Recall@10: 0.0
Query ID: 3767197, Precision@10: 0.0
Query ID: 613280, Recall@10: 0.3333333333333333
Query ID: 613280, Precision@10: 0.2
Query ID: 3750817, Recall@10: 0.3
Query ID: 3750817, Precision@10: 0.3
Query ID: 4234146, Recall@10: 0.16666666666666666
Query ID: 4234146, Precision@10: 0.1
Query ID: 2046885, Recall@10: 0.0
Query ID: 2046885, Precision@10: 0.0
Q

Query ID: 3095740, Recall@10: 0.0
Query ID: 3095740, Precision@10: 0.0
Query ID: 1686720, Recall@10: 0.0
Query ID: 1686720, Precision@10: 0.0
Query ID: 320716, Recall@10: 0.0
Query ID: 320716, Precision@10: 0.0
Query ID: 1219786, Recall@10: 0.0
Query ID: 1219786, Precision@10: 0.0
Query ID: 2333900, Recall@10: 0.25
Query ID: 2333900, Precision@10: 0.2
Query ID: 802001, Recall@10: 0.0
Query ID: 802001, Precision@10: 0.0
Query ID: 1637590, Recall@10: 0.0
Query ID: 1637590, Precision@10: 0.0
Query ID: 900313, Recall@10: 0.5
Query ID: 900313, Precision@10: 0.5
Query ID: 3710170, Recall@10: 0.0
Query ID: 3710170, Precision@10: 0.0
Query ID: 1629403, Recall@10: 0.0
Query ID: 1629403, Precision@10: 0.0
Query ID: 4127969, Recall@10: 0.0
Query ID: 4127969, Precision@10: 0.0
Query ID: 2768098, Recall@10: 0.14285714285714285
Query ID: 2768098, Precision@10: 0.1
Query ID: 3751146, Recall@10: 0.0
Query ID: 3751146, Precision@10: 0.0
Query ID: 1703147, Recall@10: 0.0
Query ID: 1703147, Precision@10:

Query ID: 2080327, Recall@10: 0.1
Query ID: 2080327, Precision@10: 0.1
Query ID: 652556, Recall@10: 0.0
Query ID: 652556, Precision@10: 0.0
Query ID: 2489934, Recall@10: 0.2222222222222222
Query ID: 2489934, Precision@10: 0.4
Query ID: 1807629, Recall@10: 0.0
Query ID: 1807629, Precision@10: 0.0
Query ID: 3104337, Recall@10: 0.0
Query ID: 3104337, Precision@10: 0.0
Query ID: 237138, Recall@10: 0.0
Query ID: 237138, Precision@10: 0.0
Query ID: 4193878, Recall@10: 0.0
Query ID: 4193878, Precision@10: 0.0
Query ID: 3554911, Recall@10: 0.0
Query ID: 3554911, Precision@10: 0.0
Query ID: 2809444, Recall@10: 0.0
Query ID: 2809444, Precision@10: 0.0
Query ID: 3325541, Recall@10: 0.5
Query ID: 3325541, Precision@10: 0.1
Query ID: 1015398, Recall@10: 0.0
Query ID: 1015398, Precision@10: 0.0
Query ID: 188009, Recall@10: 0.0
Query ID: 188009, Precision@10: 0.0
Query ID: 2825834, Recall@10: 0.0
Query ID: 2825834, Precision@10: 0.0
Query ID: 818796, Recall@10: 0.1111111111111111
Query ID: 818796, Pr

Query ID: 2695109, Recall@10: 0.0
Query ID: 2695109, Precision@10: 0.0
Query ID: 3383242, Recall@10: 0.0
Query ID: 3383242, Precision@10: 0.0
Query ID: 335819, Recall@10: 0.375
Query ID: 335819, Precision@10: 0.3
Query ID: 1458125, Recall@10: 0.0
Query ID: 1458125, Precision@10: 0.0
Query ID: 3751886, Recall@10: 0.13333333333333333
Query ID: 3751886, Precision@10: 0.2
Query ID: 3096528, Recall@10: 0.0
Query ID: 3096528, Precision@10: 0.0
Query ID: 2452131, Recall@10: 0.0
Query ID: 2452131, Precision@10: 0.0
Query ID: 835544, Recall@10: 0.0
Query ID: 835544, Precision@10: 0.0
Query ID: 917465, Recall@10: 0.16666666666666666
Query ID: 917465, Precision@10: 0.1
Query ID: 4235227, Recall@10: 0.23076923076923078
Query ID: 4235227, Precision@10: 0.3
Query ID: 344029, Recall@10: 0.6
Query ID: 344029, Precision@10: 0.3
Query ID: 966622, Recall@10: 0.0
Query ID: 966622, Precision@10: 0.0
Query ID: 475108, Recall@10: 0.0
Query ID: 475108, Precision@10: 0.0
Query ID: 3801064, Recall@10: 0.0
Query

In [61]:

def calculate_MAP(query_id):
    relevant_docs = []
    for qrel in dataset.qrels_iter():
        if qrel[0] == query_id and qrel[2] > 0:
            relevant_docs.append(qrel[1])

    retrieved_docs = []
    for query in dataset.queries_iter():
        if query[0] == query_id:
            retrieved_docs = getRetrievedQueries(query[1])
            break

    pk_sum = 0
    total_relevant = 0
    for i in range(1, 11):
        relevant_ret = 0
        for j in range(i):
            if j < len(retrieved_docs) and retrieved_docs[j] in relevant_docs:
                relevant_ret += 1
        p_at_k = (relevant_ret / i) * (1 if i - 1 < len(retrieved_docs) and retrieved_docs[i - 1] in relevant_docs else 0)
        pk_sum += p_at_k
        if i - 1 < len(retrieved_docs) and retrieved_docs[i - 1] in relevant_docs:
            total_relevant += 1

    return 0 if total_relevant == 0 else pk_sum / total_relevant

queries_ids = {qrel[0]: '' for qrel in dataset.qrels_iter()}

map_sum = 0
for query_id in list(queries_ids.keys()):
    map_sum += calculate_MAP(query_id)

print(f"Mean Average Precision (MAP@10): {map_sum / len(queries_ids)}")


Mean Average Precision (MAP@10): 0.22330059523809528


In [78]:
dataset = ir_datasets.load("antique/test/non-offensive")

In [79]:
def process_query(query: str, tfidf_model, tfidf_matrix):
    query_tfidf = tfidf_model.transform([query])
    cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    ranked_doc_indices = cosine_similarities.argsort()[::-1]
    return ranked_doc_indices, cosine_similarities

tfidf_matrix = load_file("tfidf_matrix.pickle")
tfidf_model = load_file("tfidf_model.pickle")

def getRetrievedQueries(query: str, k=10):
    preprocessed_query = preprocess_text(query)
    ranked_indices, _ = process_query(preprocessed_query, tfidf_model, tfidf_matrix)
    idsList = []
    for idx in ranked_indices[:k]:
        doc_id = list(corpus.keys())[idx]
        idsList.append(doc_id)
    return idsList

def calculate_recall_precision(query_id):
    relevant_docs = []
    for qrel in dataset.qrels_iter():
        if qrel[0] == query_id and qrel[2] > 0:
            relevant_docs.append(qrel[1])

    retrieved_docs = []
    for query in dataset.queries_iter():
        if query[0] == query_id:
            retrieved_docs = getRetrievedQueries(query[1])
            break  

    y_true = [1 if doc_id in relevant_docs else 0 for doc_id in retrieved_docs]
    true_positives = sum(y_true)
    recall_at_10 = true_positives / len(relevant_docs) if relevant_docs else 0
    precision_at_10 = true_positives / 10
    print(f"Query ID: {query_id}, Recall@10: {recall_at_10}")
    print(f"Query ID: {query_id}, Precision@10: {precision_at_10}")    
    return recall_at_10
queries_ids = {}
for qrel in dataset.qrels_iter():
    queries_ids.update({qrel[0]: ''})
    
for query_id in list(queries_ids.keys()):
    calculate_recall_precision(query_id)

Query ID: 1964316, Recall@10: 0.24242424242424243
Query ID: 1964316, Precision@10: 0.8
Query ID: 2418598, Recall@10: 0.05405405405405406
Query ID: 2418598, Precision@10: 0.2
Query ID: 1167882, Recall@10: 0.0
Query ID: 1167882, Precision@10: 0.0
Query ID: 1880028, Recall@10: 0.25806451612903225
Query ID: 1880028, Precision@10: 0.8
Query ID: 2192891, Recall@10: 0.05263157894736842
Query ID: 2192891, Precision@10: 0.2
Query ID: 949154, Recall@10: 0.17857142857142858
Query ID: 949154, Precision@10: 0.5
Query ID: 1844896, Recall@10: 0.2
Query ID: 1844896, Precision@10: 0.6
Query ID: 2634143, Recall@10: 0.03333333333333333
Query ID: 2634143, Precision@10: 0.1
Query ID: 229303, Recall@10: 0.05555555555555555
Query ID: 229303, Precision@10: 0.2
Query ID: 1015624, Recall@10: 0.2727272727272727
Query ID: 1015624, Precision@10: 0.6
Query ID: 2785579, Recall@10: 0.02564102564102564
Query ID: 2785579, Precision@10: 0.1
Query ID: 3396066, Recall@10: 0.08571428571428572
Query ID: 3396066, Precision@1

In [80]:

def calculate_MAP(query_id):
    relevant_docs = []
    for qrel in dataset.qrels_iter():
        if qrel[0] == query_id and qrel[2] > 0:
            relevant_docs.append(qrel[1])

    retrieved_docs = []
    for query in dataset.queries_iter():
        if query[0] == query_id:
            retrieved_docs = getRetrievedQueries(query[1])
            break

    pk_sum = 0
    total_relevant = 0
    for i in range(1, 11):
        relevant_ret = 0
        for j in range(i):
            if j < len(retrieved_docs) and retrieved_docs[j] in relevant_docs:
                relevant_ret += 1
        p_at_k = (relevant_ret / i) * (1 if i - 1 < len(retrieved_docs) and retrieved_docs[i - 1] in relevant_docs else 0)
        pk_sum += p_at_k
        if i - 1 < len(retrieved_docs) and retrieved_docs[i - 1] in relevant_docs:
            total_relevant += 1

    return 0 if total_relevant == 0 else pk_sum / total_relevant

queries_ids = {qrel[0]: '' for qrel in dataset.qrels_iter()}

map_sum = 0
for query_id in list(queries_ids.keys()):
    map_sum += calculate_MAP(query_id)

print(f"Mean Average Precision (MAP@10): {map_sum / len(queries_ids)}")


Mean Average Precision (MAP@10): 0.6416032060972541
