In [2]:
import joblib
import faiss
import numpy as np
import json
import os
from tqdm import tqdm

def search_with_faiss(
    doc_embedding_path: str,
    query_embedding_path: str,
    faiss_index_path: str,
    output_path: str = "faiss_results.json",
    top_k: int = 100,
    batch_size_queries: int = 100
):
    """Run a batched top-k FAISS search for every query and save the hits as JSON.

    Args:
        doc_embedding_path: joblib file holding a mapping with a "doc_ids" entry.
            Index positions returned by FAISS are looked up in this sequence, so it
            is assumed to be in the same order as the indexed vectors (TODO confirm).
        query_embedding_path: joblib file holding a mapping with "query_ids" and
            "embeddings" entries; the embeddings are stacked with np.vstack.
        faiss_index_path: index file read with faiss.read_index.
        output_path: JSON file to write. Its parent directory is created when the
            path has one; a bare file name is written to the working directory.
        top_k: number of neighbours requested per query.
        batch_size_queries: number of queries passed to each index.search call.

    Returns:
        None. The result {query_id: [(doc_id, score), ...]} is written to
        output_path, with score = 1 / (1 + distance).
    """
    # NOTE(review): joblib.load unpickles its input; only load files you trust.
    print("📥 تحميل تمثيلات الوثائق...")
    doc_data = joblib.load(doc_embedding_path)
    doc_ids = doc_data["doc_ids"]
    num_docs = len(doc_ids)

    print("📥 تحميل تمثيلات الاستعلامات...")
    query_data = joblib.load(query_embedding_path)
    query_ids = query_data["query_ids"]
    query_embeddings = np.vstack(query_data["embeddings"])

    print("📦 تحميل FAISS Index ...")
    index = faiss.read_index(faiss_index_path)

    results = {}
    num_queries = len(query_embeddings)
    print(f"📊 عدد الاستعلامات: {num_queries}, عدد الوثائق: {num_docs}")

    for start in tqdm(range(0, num_queries, batch_size_queries), desc="🔍 البحث في FAISS"):
        end = min(start + batch_size_queries, num_queries)
        batch_queries = query_embeddings[start:end].astype(np.float32)

        distances, indices = index.search(batch_queries, top_k)

        for row in range(end - start):
            query_id = query_ids[start + row]
            # Turn each distance into a score in which larger means closer.
            # Assumes non-negative distances (e.g. an L2 index) — TODO confirm metric.
            # Ids outside [0, num_docs) are dropped; FAISS reports missing
            # neighbours as -1.
            results[query_id] = [
                (doc_ids[int(doc_idx)], float(1 / (1 + dist)))
                for doc_idx, dist in zip(indices[row], distances[row])
                if 0 <= doc_idx < num_docs
            ]

    # os.path.dirname() is "" for a bare file name (including the default), and
    # os.makedirs("") raises FileNotFoundError, so only create a real directory.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"✅ تم حفظ نتائج FAISS في: {output_path}")


if __name__ == "__main__":
    # Machine-specific absolute paths for the BEIR Quora test run:
    # document embeddings, query embeddings, FAISS index, and the JSON to write.
    doc_embedding_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Data Representation\Bert\beir\quora\test\doc\bert_embedding.joblib"
    query_embedding_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\Bertquery\BEIR\quora\test\query_embeddings\bert_query_embeddings.joblib"
    faiss_index_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\vector_store\bert\beir_quora_test.faiss"
    output_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Matching & Ranking\Evaluation\faiss_results.json"

    # top_k and batch_size_queries keep their defaults.
    search_with_faiss(
        doc_embedding_path=doc_embedding_path,
        query_embedding_path=query_embedding_path,
        faiss_index_path=faiss_index_path,
        output_path=output_path,
    )


📥 تحميل تمثيلات الوثائق...
📥 تحميل تمثيلات الاستعلامات...
📦 تحميل FAISS Index ...
📊 عدد الاستعلامات: 10000, عدد الوثائق: 522931


🔍 البحث في FAISS: 100%|██████████| 100/100 [01:00<00:00,  1.65it/s]


✅ تم حفظ نتائج FAISS في: C:\Users\Azzam\PycharmProjects\PythonProject\Query Matching & Ranking\Evaluation\faiss_results.json


In [3]:
import json
import ir_datasets
from collections import defaultdict
import numpy as np
from tqdm import tqdm

# Collect, per query, the documents judged relevant in the BEIR Quora test qrels.
dataset = ir_datasets.load("beir/quora/test")
qrels = defaultdict(set)
for qrel in dataset.qrels_iter():
    if int(qrel.relevance) <= 0:
        continue  # only positively judged pairs count as relevant
    qrels[qrel.query_id].add(qrel.doc_id)

# Read the saved search results back from the JSON file.
with open(
    r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Matching & Ranking\Evaluation\faiss_results.json",
    "r",
    encoding="utf-8",
) as f:
    results = json.load(f)

# دوال التقييم
def precision_at_k(retrieved, relevant, k):
    """Return the number of relevant documents in the first k results, divided by k.

    The denominator is always k, even when fewer than k documents were retrieved.
    Returns 0.0 when the top-k slice is empty.
    """
    top_docs = retrieved[:k]
    if not top_docs:
        return 0.0
    hit_count = sum(1 for doc in top_docs if doc in relevant)
    return hit_count / k

def recall_at_k(retrieved, relevant, k):
    """Return the share of the relevant set that appears in the first k results.

    Returns 0.0 when there are no relevant documents.
    """
    top_docs = retrieved[:k]
    if not relevant:
        return 0.0
    found = sum(1 for doc in top_docs if doc in relevant)
    return found / len(relevant)

def average_precision(retrieved, relevant, k):
    """Return the average precision of the first k results.

    Precision is accumulated at every rank (1-based) that holds a relevant
    document, and the sum is divided by min(len(relevant), k).

    Returns 0.0 when there are no relevant documents or when k is not positive.
    Without the k guard, k == 0 would divide by zero.
    """
    if not relevant or k <= 0:
        return 0.0

    score = 0.0
    hits = 0
    for rank, doc_id in enumerate(retrieved[:k], start=1):
        if doc_id in relevant:
            hits += 1
            score += hits / rank
    return score / min(len(relevant), k)

def dcg(retrieved, relevant, k):
    """Return the discounted cumulative gain of the first k results.

    Gains are binary (1 when the document is in `relevant`, else 0), and the
    gain at zero-based position i is divided by log2(i + 2).
    """
    depth = min(len(retrieved), k)
    total = 0
    for position in range(depth):
        gain = 1 if retrieved[position] in relevant else 0
        total += gain / np.log2(position + 2)
    return total

def idcg(relevant, k):
    """Return the ideal DCG: min(len(relevant), k) relevant documents ranked first."""
    ideal_hits = min(len(relevant), k)
    return sum(1 / np.log2(position + 2) for position in range(ideal_hits))

def ndcg_at_k(retrieved, relevant, k):
    """Return DCG of the first k results divided by the ideal DCG.

    Returns 0.0 when the ideal DCG is not positive.
    """
    achieved = dcg(retrieved, relevant, k)
    ideal = idcg(relevant, k)
    if ideal > 0:
        return achieved / ideal
    return 0.0

# Evaluate every query in `results` at cutoff k.
k = 10
precisions, recalls, maps, ndcgs = [], [], [], []

for qid, retrieved_docs in tqdm(results.items(), desc="📊 تقييم الاستعلامات"):
    retrieved_doc_ids = [doc_id for doc_id, _ in retrieved_docs]
    # Use .get() rather than qrels[qid]: indexing a defaultdict inserts an empty
    # entry for every query without judgments and silently grows qrels.
    # Such queries still score 0 on every metric and count towards the means.
    relevant_doc_ids = qrels.get(qid, set())

    precisions.append(precision_at_k(retrieved_doc_ids, relevant_doc_ids, k))
    recalls.append(recall_at_k(retrieved_doc_ids, relevant_doc_ids, k))
    maps.append(average_precision(retrieved_doc_ids, relevant_doc_ids, k))
    ndcgs.append(ndcg_at_k(retrieved_doc_ids, relevant_doc_ids, k))

# Mean of each metric over all queries; the labels follow k so the report
# stays correct when the cutoff is changed.
evaluation_results = {
    f"Precision@{k}": round(np.mean(precisions), 4),
    f"Recall@{k}": round(np.mean(recalls), 4),
    f"MAP@{k}": round(np.mean(maps), 4),
    f"NDCG@{k}": round(np.mean(ndcgs), 4),
}

print("📈 نتائج التقييم:")
for metric, value in evaluation_results.items():
    print(f"{metric}: {value}")


📊 تقييم الاستعلامات: 100%|██████████| 10000/10000 [00:00<00:00, 30512.33it/s]

📈 نتائج التقييم:
Precision@10: 0.1211
Recall@10: 0.8789
MAP@10: 0.7493
NDCG@10: 0.7923





In [6]:
import joblib
import faiss
import numpy as np
import json
import os
from tqdm import tqdm

def search_with_faiss(
    doc_embedding_path: str,
    query_embedding_path: str,
    faiss_index_path: str,
    output_path: str = "faiss_antique_results.json",
    top_k: int = 100,
    batch_size_queries: int = 100
):
    """Run a batched top-k FAISS search for every query and save the hits as JSON.

    Args:
        doc_embedding_path: joblib file holding a mapping with a "doc_ids" entry.
            Index positions returned by FAISS are looked up in this sequence, so it
            is assumed to be in the same order as the indexed vectors (TODO confirm).
        query_embedding_path: joblib file holding a mapping with "query_ids" and
            "embeddings" entries; the embeddings are stacked with np.vstack.
        faiss_index_path: index file read with faiss.read_index.
        output_path: JSON file to write. Its parent directory is created when the
            path has one; a bare file name is written to the working directory.
        top_k: number of neighbours requested per query.
        batch_size_queries: number of queries passed to each index.search call.

    Returns:
        None. The result {query_id: [(doc_id, score), ...]} is written to
        output_path, with score = 1 / (1 + distance).
    """
    # NOTE(review): joblib.load unpickles its input; only load files you trust.
    print("📥 تحميل تمثيلات الوثائق...")
    doc_data = joblib.load(doc_embedding_path)
    doc_ids = doc_data["doc_ids"]
    num_docs = len(doc_ids)

    print("📥 تحميل تمثيلات الاستعلامات...")
    query_data = joblib.load(query_embedding_path)
    query_ids = query_data["query_ids"]
    query_embeddings = np.vstack(query_data["embeddings"])

    print("📦 تحميل FAISS Index ...")
    index = faiss.read_index(faiss_index_path)

    results = {}
    num_queries = len(query_embeddings)
    print(f"📊 عدد الاستعلامات: {num_queries}, عدد الوثائق: {num_docs}")

    for start in tqdm(range(0, num_queries, batch_size_queries), desc="🔍 البحث في FAISS"):
        end = min(start + batch_size_queries, num_queries)
        batch_queries = query_embeddings[start:end].astype(np.float32)

        distances, indices = index.search(batch_queries, top_k)

        for row in range(end - start):
            query_id = query_ids[start + row]
            # Turn each distance into a score in which larger means closer.
            # Assumes non-negative distances (e.g. an L2 index) — TODO confirm metric.
            # Ids outside [0, num_docs) are dropped; FAISS reports missing
            # neighbours as -1.
            results[query_id] = [
                (doc_ids[int(doc_idx)], float(1 / (1 + dist)))
                for doc_idx, dist in zip(indices[row], distances[row])
                if 0 <= doc_idx < num_docs
            ]

    # os.path.dirname() is "" for a bare file name (including the default), and
    # os.makedirs("") raises FileNotFoundError, so only create a real directory.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"✅ تم حفظ نتائج FAISS في: {output_path}")


if __name__ == "__main__":
    # Machine-specific absolute paths for the ANTIQUE run:
    # document embeddings, query embeddings, FAISS index, and the JSON to write.
    doc_embedding_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Data Representation\Bert\antique\train\doc\bert_embedding.joblib"
    query_embedding_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\Bertquery\antique\train\query_embeddings\bert_query_embeddings.joblib"
    faiss_index_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\vector_store\bert\antique_train.faiss"
    output_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Matching & Ranking\Evaluation\faiss_antique_results.json"

    # top_k and batch_size_queries keep their defaults.
    search_with_faiss(
        doc_embedding_path=doc_embedding_path,
        query_embedding_path=query_embedding_path,
        faiss_index_path=faiss_index_path,
        output_path=output_path,
    )


📥 تحميل تمثيلات الوثائق...
📥 تحميل تمثيلات الاستعلامات...
📦 تحميل FAISS Index ...
📊 عدد الاستعلامات: 176, عدد الوثائق: 401768


🔍 البحث في FAISS: 100%|██████████| 2/2 [00:00<00:00,  3.25it/s]


✅ تم حفظ نتائج FAISS في: C:\Users\Azzam\PycharmProjects\PythonProject\Query Matching & Ranking\Evaluation\faiss_antique_results.json


In [1]:
import json
import ir_datasets
from collections import defaultdict
import numpy as np
from tqdm import tqdm


# Parse the qrels file: whitespace-separated lines whose first, third and fourth
# fields are read as query id, document id and relevance label.
qrels_path = r"C:\Users\Azzam\.ir_datasets\antique\test\qrels"
qrels = defaultdict(set)

with open(qrels_path, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.split()
        if len(parts) < 4:
            continue  # blank or truncated line
        # Unpack only the first four fields: the length guard admits longer lines,
        # and unpacking all of `parts` would raise ValueError on them. A named
        # throwaway is used because binding `_` at notebook top level shadows
        # IPython's last-output variable.
        qid, _unused, docid, rel = parts[:4]
        if int(rel) > 0:
            qrels[qid].add(docid)


# Read the saved search results back from the JSON file.
with open(r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Matching & Ranking\Evaluation\faiss_antique_results.json", "r", encoding="utf-8") as f:
    results = json.load(f)

# دوال التقييم
def precision_at_k(retrieved, relevant, k):
    """Return the number of relevant documents in the first k results, divided by k.

    The denominator is always k, even when fewer than k documents were retrieved.
    Returns 0.0 when the top-k slice is empty.
    """
    top_docs = retrieved[:k]
    if not top_docs:
        return 0.0
    hit_count = sum(1 for doc in top_docs if doc in relevant)
    return hit_count / k

def recall_at_k(retrieved, relevant, k):
    """Return the share of the relevant set that appears in the first k results.

    Returns 0.0 when there are no relevant documents.
    """
    top_docs = retrieved[:k]
    if not relevant:
        return 0.0
    found = sum(1 for doc in top_docs if doc in relevant)
    return found / len(relevant)

def average_precision(retrieved, relevant, k):
    """Return the average precision of the first k results.

    Precision is accumulated at every rank (1-based) that holds a relevant
    document, and the sum is divided by min(len(relevant), k).

    Returns 0.0 when there are no relevant documents or when k is not positive.
    Without the k guard, k == 0 would divide by zero.
    """
    if not relevant or k <= 0:
        return 0.0

    score = 0.0
    hits = 0
    for rank, doc_id in enumerate(retrieved[:k], start=1):
        if doc_id in relevant:
            hits += 1
            score += hits / rank
    return score / min(len(relevant), k)

def dcg(retrieved, relevant, k):
    """Return the discounted cumulative gain of the first k results.

    Gains are binary (1 when the document is in `relevant`, else 0), and the
    gain at zero-based position i is divided by log2(i + 2).
    """
    depth = min(len(retrieved), k)
    total = 0
    for position in range(depth):
        gain = 1 if retrieved[position] in relevant else 0
        total += gain / np.log2(position + 2)
    return total

def idcg(relevant, k):
    """Return the ideal DCG: min(len(relevant), k) relevant documents ranked first."""
    ideal_hits = min(len(relevant), k)
    return sum(1 / np.log2(position + 2) for position in range(ideal_hits))

def ndcg_at_k(retrieved, relevant, k):
    """Return DCG of the first k results divided by the ideal DCG.

    Returns 0.0 when the ideal DCG is not positive.
    """
    achieved = dcg(retrieved, relevant, k)
    ideal = idcg(relevant, k)
    if ideal > 0:
        return achieved / ideal
    return 0.0

# Evaluate every query in `results` at cutoff k.
k = 10
precisions, recalls, maps, ndcgs = [], [], [], []

for qid, retrieved_docs in tqdm(results.items(), desc="📊 تقييم الاستعلامات"):
    retrieved_doc_ids = [doc_id for doc_id, _ in retrieved_docs]
    # Use .get() rather than qrels[qid]: indexing a defaultdict inserts an empty
    # entry for every query without judgments and silently grows qrels.
    # Such queries still score 0 on every metric and count towards the means.
    relevant_doc_ids = qrels.get(qid, set())

    precisions.append(precision_at_k(retrieved_doc_ids, relevant_doc_ids, k))
    recalls.append(recall_at_k(retrieved_doc_ids, relevant_doc_ids, k))
    maps.append(average_precision(retrieved_doc_ids, relevant_doc_ids, k))
    ndcgs.append(ndcg_at_k(retrieved_doc_ids, relevant_doc_ids, k))

# Mean of each metric over all queries; the labels follow k so the report
# stays correct when the cutoff is changed.
evaluation_results = {
    f"Precision@{k}": round(np.mean(precisions), 4),
    f"Recall@{k}": round(np.mean(recalls), 4),
    f"MAP@{k}": round(np.mean(maps), 4),
    f"NDCG@{k}": round(np.mean(ndcgs), 4),
}

print("📈 نتائج التقييم:")
for metric, value in evaluation_results.items():
    print(f"{metric}: {value}")


📊 تقييم الاستعلامات: 100%|██████████| 176/176 [00:00<00:00, 22070.00it/s]

📈 نتائج التقييم:
Precision@10: 0.4125
Recall@10: 0.1201
MAP@10: 0.3112
NDCG@10: 0.4653



