In [1]:
import joblib
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import json
import os

def match_queries_by_topic(
    docs_topic_path: str,
    docs_embedding_path: str,
    queries_embedding_path: str,
    output_path: str = "bert_topic_results.json",
    top_k: int = 100
):
    """Rank documents for each query inside the query's closest topic.

    For every query embedding, the topic whose embedding has the highest
    cosine similarity is selected. The documents labelled with that topic are
    then scored against the query by cosine similarity and the ``top_k`` best
    ``(doc_id, score)`` pairs are kept. The resulting mapping
    ``query_id -> ranked pairs`` is written to ``output_path`` as JSON; a query
    whose predicted topic has no documents maps to an empty list.

    Args:
        docs_topic_path: joblib file containing a mapping with the keys
            ``"topics"`` (sequence of topic labels) and ``"topic_embeddings"``
            (mapping of topic id to vector).
        docs_embedding_path: joblib file containing a mapping with the keys
            ``"doc_ids"`` and ``"embeddings_matrix"``.
        queries_embedding_path: joblib file containing a mapping with the keys
            ``"query_ids"`` and ``"embeddings"``.
        output_path: destination JSON file. Its parent directory is created
            when the path has one; a bare file name is written to the current
            working directory.
        top_k: maximum number of documents kept per query.

    Returns:
        None. The results are only written to ``output_path``.

    Note:
        Position ``j`` of ``"topics"`` is used to index both the document
        embeddings and ``"doc_ids"``, so the three sequences are assumed to be
        aligned; this is not validated here.
    """
    print("📥 تحميل بيانات BERTopic + BERT embeddings للوثائق...")
    # joblib.load unpickles its input, so only files from a trusted source
    # should be passed in.
    topic_data = joblib.load(docs_topic_path)
    bert_data = joblib.load(docs_embedding_path)
    query_data = joblib.load(queries_embedding_path)

    doc_ids = bert_data["doc_ids"]
    doc_embeddings = np.vstack(bert_data["embeddings_matrix"])

    query_ids = query_data["query_ids"]
    query_embeddings = np.vstack(query_data["embeddings"])

    topics = topic_data["topics"]
    topic_embeddings = topic_data["topic_embeddings"]
    topic_ids = list(topic_embeddings.keys())
    topic_vectors = np.array([topic_embeddings[tid] for tid in topic_ids])

    print(f"🧠 عدد التوبيكات: {len(topic_ids)}")
    print(f"📊 عدد الاستعلامات: {len(query_ids)}, عدد الوثائق: {len(doc_ids)}")

    # Group document row indices by topic once. Rebuilding this list for every
    # query would rescan all topic labels on each iteration.
    topic_to_doc_rows = {}
    for doc_row, doc_topic in enumerate(topics):
        topic_to_doc_rows.setdefault(doc_topic, []).append(doc_row)

    results = {}

    for i, query_embedding in tqdm(enumerate(query_embeddings), total=len(query_embeddings), desc="🔍 مطابقة الاستعلامات"):
        query_id = query_ids[i]

        # 1) Pick the topic whose embedding is closest to the query.
        topic_similarities = cosine_similarity([query_embedding], topic_vectors)[0]
        best_topic_index = np.argmax(topic_similarities)
        predicted_topic = topic_ids[best_topic_index]

        # 2) Collect the documents assigned to that topic.
        topic_doc_indices = topic_to_doc_rows.get(predicted_topic)
        if not topic_doc_indices:
            results[query_id] = []
            continue

        topic_doc_embeddings = doc_embeddings[topic_doc_indices]
        topic_doc_ids = [doc_ids[j] for j in topic_doc_indices]

        # 3) Score the topic's documents and keep the best top_k, highest first.
        similarities = cosine_similarity([query_embedding], topic_doc_embeddings)[0]
        top_indices = np.argsort(similarities)[::-1][:top_k]

        results[query_id] = [
            (topic_doc_ids[j], float(similarities[j])) for j in top_indices
        ]

    # os.path.dirname returns "" for a bare file name and os.makedirs("")
    # raises, so the directory is only created when there is one to create.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"✅ تم حفظ النتائج في: {output_path}")


In [4]:
# Inputs and output location for the BEIR Quora (test split) run.
quora_run_args = {
    "docs_topic_path": r"C:\Users\Azzam\PycharmProjects\PythonProject\TopicResults\merged_topics_1500.joblib",
    "docs_embedding_path": r"C:\Users\Azzam\PycharmProjects\PythonProject\Data Representation\Bert\beir\quora\test\doc\bert_embedding.joblib",
    "queries_embedding_path": r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\Bertquery\BEIR\quora\test\query_embeddings\bert_query_embeddings.joblib",
    "output_path": r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Matching & Ranking\Evaluation\bert_topic_results.json",
    "top_k": 100,
}
match_queries_by_topic(**quora_run_args)


📥 تحميل بيانات BERTopic + BERT embeddings للوثائق...
🧠 عدد التوبيكات: 5
📊 عدد الاستعلامات: 10000, عدد الوثائق: 522931


🔍 مطابقة الاستعلامات: 100%|██████████| 10000/10000 [1:33:38<00:00,  1.78it/s]


✅ تم حفظ النتائج في: C:\Users\Azzam\PycharmProjects\PythonProject\Query Matching & Ranking\Evaluation\bert_topic_results.json


In [4]:
# Inputs and output location for the ANTIQUE run.
antique_run_args = {
    "docs_topic_path": r"C:\Users\Azzam\PycharmProjects\PythonProject\TopicResults\antique_train_bertopic_results.joblib",
    "docs_embedding_path": r"C:\Users\Azzam\PycharmProjects\PythonProject\Data Representation\Bert\antique\train\doc\bert_embedding.joblib",
    "queries_embedding_path": r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\Bertquery\antique\train\query_embeddings\bert_query_embeddings.joblib",
    "output_path": r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Matching & Ranking\Evaluation\bert_topic_results_antique.json",
    "top_k": 100,
}
match_queries_by_topic(**antique_run_args)


📥 تحميل بيانات BERTopic + BERT embeddings للوثائق...
🧠 عدد التوبيكات: 4753
📊 عدد الاستعلامات: 176, عدد الوثائق: 401768


🔍 مطابقة الاستعلامات:  67%|██████▋   | 118/176 [00:05<00:02, 19.90it/s]


KeyboardInterrupt: 

In [5]:
import json
import ir_datasets
from collections import defaultdict
import numpy as np
from tqdm import tqdm

# Load the BEIR Quora test qrels, keeping only judgments with relevance > 0.
dataset = ir_datasets.load("beir/quora/test")
qrels = defaultdict(set)
for judgment in dataset.qrels_iter():
    if int(judgment.relevance) <= 0:
        continue
    qrels[judgment.query_id].add(judgment.doc_id)

# Load the ranked lists produced by the matching step from their JSON file.
results_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Matching & Ranking\Evaluation\bert_topic_results.json"
with open(results_path, "r", encoding="utf-8") as results_file:
    results = json.load(results_file)

# Evaluation metrics
def precision_at_k(retrieved, relevant, k):
    """Return the number of relevant documents in ``retrieved[:k]`` divided by ``k``.

    Returns 0.0 when the truncated ranking is empty.
    """
    top_docs = retrieved[:k]
    if top_docs:
        hit_count = sum(1 for doc_id in top_docs if doc_id in relevant)
        return hit_count / k
    return 0.0

def recall_at_k(retrieved, relevant, k):
    """Return the share of ``relevant`` documents that appear in ``retrieved[:k]``.

    Returns 0.0 when ``relevant`` is empty.
    """
    top_docs = retrieved[:k]
    if relevant:
        found = sum(1 for doc_id in top_docs if doc_id in relevant)
        return found / len(relevant)
    return 0.0

def average_precision(retrieved, relevant, k):
    """Return average precision over the first ``k`` retrieved documents.

    The precision at each rank holding a relevant document is summed and the
    total is divided by ``min(len(relevant), k)``.

    Returns 0.0 when ``relevant`` is empty or when ``k`` is not positive; the
    latter would otherwise divide by zero (``k == 0``) or produce a negative
    score (``k < 0``).
    """
    if not relevant or k <= 0:
        return 0.0

    precision_sum = 0.0
    hits = 0
    for rank, doc_id in enumerate(retrieved[:k], start=1):
        if doc_id in relevant:
            hits += 1
            # Precision at this rank: relevant documents seen so far / rank.
            precision_sum += hits / rank
    return precision_sum / min(len(relevant), k)

def dcg(retrieved, relevant, k):
    """Return the binary-gain discounted cumulative gain of the first ``k`` results.

    A document at zero-based position ``i`` contributes ``1 / log2(i + 2)``
    when it is in ``relevant`` and 0 otherwise.
    """
    depth = min(len(retrieved), k)
    total = 0
    for position in range(depth):
        gain = 1 if retrieved[position] in relevant else 0
        total = total + gain / np.log2(position + 2)
    return total

def idcg(relevant, k):
    """Return the ideal DCG: every one of the first ``min(len(relevant), k)`` ranks is a hit."""
    ideal_depth = min(len(relevant), k)
    total = 0
    for position in range(ideal_depth):
        total = total + 1 / np.log2(position + 2)
    return total

def ndcg_at_k(retrieved, relevant, k):
    """Return DCG divided by ideal DCG at cutoff ``k``, or 0.0 when the ideal DCG is not positive."""
    actual = dcg(retrieved, relevant, k)
    ideal = idcg(relevant, k)
    if ideal > 0:
        return actual / ideal
    return 0.0

# Score every query in the results file at cutoff k.
k = 10
precisions = []
recalls = []
maps = []
ndcgs = []

for qid, retrieved_docs in tqdm(results.items(), desc="📊 تقييم الاستعلامات"):
    # Each entry is a (doc_id, score) pair; only the ids are needed here.
    retrieved_doc_ids = [ranked_id for ranked_id, _score in retrieved_docs]
    relevant_doc_ids = qrels[qid]
    metric_args = (retrieved_doc_ids, relevant_doc_ids, k)

    precisions.append(precision_at_k(*metric_args))
    recalls.append(recall_at_k(*metric_args))
    maps.append(average_precision(*metric_args))
    ndcgs.append(ndcg_at_k(*metric_args))

# Final averages over all queries, rounded to four decimals.
evaluation_results = {
    label: round(np.mean(per_query_scores), 4)
    for label, per_query_scores in (
        ("Precision@10", precisions),
        ("Recall@10", recalls),
        ("MAP@10", maps),
        ("NDCG@10", ndcgs),
    )
}

print("📈 نتائج التقييم:")
for metric, value in evaluation_results.items():
    print(f"{metric}: {value}")


📊 تقييم الاستعلامات: 100%|██████████| 10000/10000 [00:00<00:00, 14558.03it/s]

📈 نتائج التقييم:
Precision@10: 0.0484
Recall@10: 0.3382
MAP@10: 0.2935
NDCG@10: 0.3156





In [4]:
import json
import ir_datasets
from collections import defaultdict
import numpy as np
from tqdm import tqdm


# Read the ANTIQUE test qrels straight from the local ir_datasets cache.
qrels_path = r"C:\Users\Azzam\.ir_datasets\antique\test\qrels"
qrels = defaultdict(set)

with open(qrels_path, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.split()
        if len(parts) < 4:
            # Skip blank or truncated lines.
            continue
        # Only the first four whitespace-separated fields are used; slicing
        # keeps a line with extra trailing columns from failing the unpack.
        qid, _, docid, rel = parts[:4]
        # NOTE(review): ANTIQUE judgments appear to be graded 1-4, with 1-2
        # meaning non-relevant; `> 0` would then count every judged document
        # as relevant. Confirm the intended threshold against the dataset docs.
        if int(rel) > 0:
            qrels[qid].add(docid)

# Load the ranked lists produced by the matching step from their JSON file.
with open(r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Matching & Ranking\Evaluation\bert_topic_results_antique.json", "r", encoding="utf-8") as f:
    results = json.load(f)

# Evaluation metrics
def precision_at_k(retrieved, relevant, k):
    """Return the number of relevant documents in ``retrieved[:k]`` divided by ``k``.

    Returns 0.0 when the truncated ranking is empty.
    """
    top_docs = retrieved[:k]
    if top_docs:
        hit_count = sum(1 for doc_id in top_docs if doc_id in relevant)
        return hit_count / k
    return 0.0

def recall_at_k(retrieved, relevant, k):
    """Return the share of ``relevant`` documents that appear in ``retrieved[:k]``.

    Returns 0.0 when ``relevant`` is empty.
    """
    top_docs = retrieved[:k]
    if relevant:
        found = sum(1 for doc_id in top_docs if doc_id in relevant)
        return found / len(relevant)
    return 0.0

def average_precision(retrieved, relevant, k):
    """Return average precision over the first ``k`` retrieved documents.

    The precision at each rank holding a relevant document is summed and the
    total is divided by ``min(len(relevant), k)``.

    Returns 0.0 when ``relevant`` is empty or when ``k`` is not positive; the
    latter would otherwise divide by zero (``k == 0``) or produce a negative
    score (``k < 0``).
    """
    if not relevant or k <= 0:
        return 0.0

    precision_sum = 0.0
    hits = 0
    for rank, doc_id in enumerate(retrieved[:k], start=1):
        if doc_id in relevant:
            hits += 1
            # Precision at this rank: relevant documents seen so far / rank.
            precision_sum += hits / rank
    return precision_sum / min(len(relevant), k)

def dcg(retrieved, relevant, k):
    """Return the binary-gain discounted cumulative gain of the first ``k`` results.

    A document at zero-based position ``i`` contributes ``1 / log2(i + 2)``
    when it is in ``relevant`` and 0 otherwise.
    """
    depth = min(len(retrieved), k)
    total = 0
    for position in range(depth):
        gain = 1 if retrieved[position] in relevant else 0
        total = total + gain / np.log2(position + 2)
    return total

def idcg(relevant, k):
    """Return the ideal DCG: every one of the first ``min(len(relevant), k)`` ranks is a hit."""
    ideal_depth = min(len(relevant), k)
    total = 0
    for position in range(ideal_depth):
        total = total + 1 / np.log2(position + 2)
    return total

def ndcg_at_k(retrieved, relevant, k):
    """Return DCG divided by ideal DCG at cutoff ``k``, or 0.0 when the ideal DCG is not positive."""
    actual = dcg(retrieved, relevant, k)
    ideal = idcg(relevant, k)
    if ideal > 0:
        return actual / ideal
    return 0.0

# Score every query in the results file at cutoff k.
k = 10
precisions = []
recalls = []
maps = []
ndcgs = []

for qid, retrieved_docs in tqdm(results.items(), desc="📊 تقييم الاستعلامات"):
    # Each entry is a (doc_id, score) pair; only the ids are needed here.
    retrieved_doc_ids = [ranked_id for ranked_id, _score in retrieved_docs]
    relevant_doc_ids = qrels[qid]
    metric_args = (retrieved_doc_ids, relevant_doc_ids, k)

    precisions.append(precision_at_k(*metric_args))
    recalls.append(recall_at_k(*metric_args))
    maps.append(average_precision(*metric_args))
    ndcgs.append(ndcg_at_k(*metric_args))

# Final averages over all queries, rounded to four decimals.
evaluation_results = {
    label: round(np.mean(per_query_scores), 4)
    for label, per_query_scores in (
        ("Precision@10", precisions),
        ("Recall@10", recalls),
        ("MAP@10", maps),
        ("NDCG@10", ndcgs),
    )
}

print("📈 نتائج التقييم:")
for metric, value in evaluation_results.items():
    print(f"{metric}: {value}")


📊 تقييم الاستعلامات: 100%|██████████| 176/176 [00:00<00:00, 24401.61it/s]

📈 نتائج التقييم:
Precision@10: 0.2909
Recall@10: 0.0807
MAP@10: 0.2138
NDCG@10: 0.3375



