In [1]:
import joblib
import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = joblib.load("/models/arguana/tf-idf/vectorizer.joblib")
tfidf_matrix = joblib.load("/models/arguana/vectors.joblib")

with open("/models/arguana/inverted_index.joblib", "rb") as f:
    inverted_index = pickle.load(f)
import json

queries = {}
with open("C:/Users/USER/DataSets/arguana/queries.jsonl", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        queries[item["_id"]] = item["text"]

from collections import defaultdict
import csv

qrels = defaultdict(list)
with open("C:/Users/USER/DataSets/arguana/qrels/test.tsv", encoding="utf-8") as f:
    tsv_reader = csv.reader(f, delimiter="\t")
    next(tsv_reader)  # لتجاوز رأس الجدول (header)
    for row in tsv_reader:
        query_id, doc_id, score = row
        if float(score) > 0:
            qrels[query_id].append(doc_id)


In [2]:
def retrieve_top_k_docs(query_text, k=10):
    query_vector = vectorizer.transform([query_text])
    scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
    top_k_indices = np.argsort(scores)[::-1][:k]
    return top_k_indices, scores[top_k_indices]


In [3]:
def precision_at_k(relevant, retrieved, k=10):
    retrieved_k = retrieved[:k]
    relevant_set = set(relevant)
    return sum(1 for doc in retrieved_k if doc in relevant_set) / k


def recall_at_k(relevant, retrieved, k=10):
    relevant_set = set(relevant)
    retrieved_k = retrieved[:k]
    return sum(1 for doc in retrieved_k if doc in relevant_set) / len(relevant)


def average_precision(relevant, retrieved):
    relevant_set = set(relevant)
    hits = 0
    sum_precisions = 0.0
    for i, doc_id in enumerate(retrieved):
        if doc_id in relevant_set:
            hits += 1
            sum_precisions += hits / (i + 1)
    if hits == 0:
        return 0.0
    return sum_precisions / hits


def reciprocal_rank(relevant, retrieved):
    for i, doc_id in enumerate(retrieved):
        if doc_id in set(relevant):
            return 1 / (i + 1)
    return 0.0


In [4]:
precisions, recalls, average_precisions, reciprocal_ranks = [], [], [], []

for qid, query_text in queries.items():
    relevant_docs = qrels.get(qid, [])
    retrieved_docs, _ = retrieve_top_k_docs(query_text, k=100)  # تقييم على top-100 مثلاً

    precisions.append(precision_at_k(relevant_docs, retrieved_docs, k=10))
    recalls.append(recall_at_k(relevant_docs, retrieved_docs, k=10))
    average_precisions.append(average_precision(relevant_docs, retrieved_docs))
    reciprocal_ranks.append(reciprocal_rank(relevant_docs, retrieved_docs))

print(f"Precision@10: {np.mean(precisions):.4f}")
print(f"Recall@10: {np.mean(recalls):.4f}")
print(f"MAP: {np.mean(average_precisions):.4f}")
print(f"MRR: {np.mean(reciprocal_ranks):.4f}")


Precision@10: 0.0000
Recall@10: 0.0000
MAP: 0.0000
MRR: 0.0000
