In [1]:
import os
import json
import joblib
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix



def custom_tokenizer(text):
    """Tokenize ``text`` by calling its no-argument ``split()``.

    For a ``str`` this splits on runs of whitespace and drops empty tokens.

    NOTE(review): not called anywhere in the visible code; presumably it is
    kept at module level so that a pickled vectorizer referencing it can be
    loaded — confirm before removing.
    """
    tokens = text.split()
    return tokens


def _ensure_parent_dir(path: str) -> None:
    """Create the directory that will contain ``path``, if it names one."""
    parent = os.path.dirname(path)
    # dirname() is "" for a bare file name; os.makedirs("") would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)


def _top_k_indices(scores, top_k: int):
    """Return indices of the ``top_k`` largest ``scores``, best first.

    ``top_k`` is clamped to ``len(scores)`` so that asking for more results
    than there are documents returns all of them instead of raising.
    """
    k = min(top_k, scores.shape[0])
    if k <= 0:
        return np.empty(0, dtype=np.intp)
    # argpartition selects the k best in O(n); only those k are fully sorted.
    candidates = np.argpartition(scores, -k)[-k:]
    return candidates[np.argsort(-scores[candidates])]


def process_and_match_queries(
    enhanced_queries_path: str,
    docs_joblib_path: str,
    tfidf_output_path: str,
    match_output_path: str,
    top_k: int = 100
):
    """Vectorize enhanced queries with a fitted TF-IDF model and rank documents.

    Steps:
      1. Load the enhanced queries and the document-side TF-IDF data.
      2. Transform the queries with the loaded vectorizer and save the result.
      3. For each query, compute cosine similarity against every document and
         keep the ``top_k`` highest-scoring documents, best first.
      4. Write the rankings to ``match_output_path`` as JSON.

    Args:
        enhanced_queries_path: joblib file holding a dict; its keys are used
            as query ids and its values are passed to ``vectorizer.transform``.
        docs_joblib_path: joblib file holding a mapping with the keys
            ``"doc_ids"``, ``"tfidf_matrix"`` and ``"vectorizer"``.
        tfidf_output_path: where the query TF-IDF representation is saved
            (joblib). Its parent directory is created if missing.
        match_output_path: where the ranking JSON is written. Its parent
            directory is created if missing.
        top_k: maximum number of documents kept per query. Values larger than
            the number of documents are clamped to that number.

    Returns:
        None. Results are written to ``tfidf_output_path`` and
        ``match_output_path``.

    Raises:
        ValueError: if ``top_k`` is smaller than 1.
    """
    # Fail before any I/O: with the slicing used for selection, top_k == 0
    # would silently mean "all documents" and negatives an arbitrary slice.
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k!r}")

    # SECURITY: joblib.load unpickles arbitrary objects; only point these
    # paths at files you trust.
    print("📥 تحميل الاستعلامات المحسنة...")
    enhanced_data: dict = joblib.load(enhanced_queries_path)
    query_ids = list(enhanced_data.keys())
    enhanced_texts = list(enhanced_data.values())

    print("📄 تحميل بيانات الوثائق (TF-IDF)...")
    docs_data = joblib.load(docs_joblib_path)
    doc_ids = docs_data["doc_ids"]
    tfidf_docs: csr_matrix = docs_data["tfidf_matrix"]
    vectorizer = docs_data["vectorizer"]

    print("🔢 تمثيل الاستعلامات المحسنة بـ TF-IDF...")
    query_tfidf_matrix = vectorizer.transform(enhanced_texts)

    # Save the query representation.
    _ensure_parent_dir(tfidf_output_path)
    joblib.dump({
        "query_ids": query_ids,
        "enhanced_queries": enhanced_texts,
        "query_tfidf_matrix": query_tfidf_matrix,
        "vectorizer": vectorizer
    }, tfidf_output_path)
    print(f"✅ تم حفظ تمثيل TF-IDF في: {tfidf_output_path}")

    print("🤝 بدء المطابقة بين الاستعلامات والوثائق...")
    results = {}

    # One query row at a time, so only a single similarity vector over the
    # documents is held in memory at once.
    for i, qid in tqdm(enumerate(query_ids), total=len(query_ids), desc="🔍 مطابقة الاستعلامات"):
        sims = cosine_similarity(query_tfidf_matrix[i], tfidf_docs).flatten()
        ranked = _top_k_indices(sims, top_k)
        # float() turns numpy scalars into plain floats that json can encode.
        results[qid] = [(doc_ids[idx], float(sims[idx])) for idx in ranked]

    # Save the matching results.
    _ensure_parent_dir(match_output_path)
    with open(match_output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    print(f"✅ تم حفظ نتائج المطابقة في: {match_output_path}")


In [6]:
if __name__ == "__main__":
    # Run configuration. The four paths stay as module-level names, exactly as
    # before, so anything else in the notebook that reads them keeps working.

    # Input: joblib file with the enhanced queries.
    enhanced_queries_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\vocabularies\Evaluation Query\enhanced_queries_corrected_part1.joblib"
    # Input: joblib file with the document-side TF-IDF data.
    docs_joblib_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Data Representation\TF-IDF\beir\quora\test\doc\tfidf_data.joblib"
    # Output: joblib file for the query TF-IDF representation.
    tfidf_output_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\tfidf\queries_tfidf_enhanced_qoura.joblib"
    # Output: JSON file with the per-query ranked documents.
    match_output_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Matching & Ranking\TfidfMatching\tfidf_results_enhanced_qoura.json"

    # Arguments are passed positionally, in the function's declared order.
    process_and_match_queries(
        enhanced_queries_path,
        docs_joblib_path,
        tfidf_output_path,
        match_output_path,
        top_k=100,
    )


📥 تحميل الاستعلامات المحسنة...
📄 تحميل بيانات الوثائق (TF-IDF)...
🔢 تمثيل الاستعلامات المحسنة بـ TF-IDF...
✅ تم حفظ تمثيل TF-IDF في: C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\tfidf\queries_tfidf_enhanced_qoura.joblib
🤝 بدء المطابقة بين الاستعلامات والوثائق...


🔍 مطابقة الاستعلامات: 100%|██████████| 100/100 [00:12<00:00,  7.93it/s]


✅ تم حفظ نتائج المطابقة في: C:\Users\Azzam\PycharmProjects\PythonProject\Query Matching & Ranking\TfidfMatching\tfidf_results_enhanced_qoura.json
