In [6]:
import os
import joblib
import ir_datasets
from tqdm import tqdm
from collections import Counter
from difflib import get_close_matches

# ==========================
# 🧠 QueryRefiner - تصحيح فقط
# ==========================
class QueryRefinerCorrectOnly:
    """Spell-correct queries against a fixed vocabulary (correction only).

    Each whitespace-separated word of a query is replaced by its closest
    vocabulary term when ``difflib.get_close_matches`` finds one with a
    similarity ratio of at least 0.8; otherwise the word is kept unchanged.

    Attributes:
        term_frequencies: ``Counter`` of the terms passed to the constructor.
        processed_terms: set of distinct vocabulary terms used for matching.
    """

    def __init__(self, processed_terms):
        """Build the refiner from an iterable of vocabulary terms.

        Args:
            processed_terms: iterable of strings (lists and generators both
                work; duplicates are allowed and counted).
        """
        # Materialize once: a generator would otherwise be exhausted by
        # Counter() and leave the matching set empty.
        terms = list(processed_terms)
        self.term_frequencies = Counter(terms)
        self.processed_terms = set(terms)
        # word -> correction memo. It assumes processed_terms is not mutated
        # after construction.
        self._correction_cache = {}

    def _correct_word(self, word):
        """Return the vocabulary term closest to ``word``, or ``word`` itself."""
        # A word that is already a vocabulary term is its own best match
        # (ratio 1.0), so the fuzzy scan over the whole vocabulary is skipped.
        if word in self.processed_terms:
            return word

        correction = self._correction_cache.get(word)
        if correction is None:
            matches = get_close_matches(word, self.processed_terms, n=1, cutoff=0.8)
            correction = matches[0] if matches else word
            self._correction_cache[word] = correction
        return correction

    def suggest_correction(self, query):
        """Return the corrected words of ``query`` as a list.

        Args:
            query: query text; it is split on whitespace and matched
                case-sensitively against the vocabulary.

        Returns:
            One entry per input word, in the original order.
        """
        return [self._correct_word(word) for word in query.split()]

    def enhance(self, query):
        """Return the corrected query as a single space-joined string."""
        return " ".join(self.suggest_correction(query))


# ==========================
# 🔄 تحسين الاستعلامات
# ==========================
def generate_corrected_queries(dataset_name, terms_path, output_path, batch_size=1000):
    """Spell-correct every query of an ir_datasets dataset and save the result.

    The vocabulary is read from ``terms_path`` (one term per line, stripped
    and lowercased, blank lines skipped). Query text is passed to the refiner
    as-is, i.e. it is not lowercased here.

    After each batch a checkpoint file is written next to ``output_path``
    with the suffix ``_corrected_part<N>`` inserted before the extension.
    Each checkpoint holds all queries corrected so far, not only the last
    batch. The complete mapping is finally written to ``output_path``.

    Args:
        dataset_name: identifier understood by ``ir_datasets.load``.
        terms_path: path of the UTF-8 vocabulary file.
        output_path: destination of the final joblib file
            (``{query_id: corrected_text}``).
        batch_size: number of queries processed between two checkpoints.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    print(f"📥 تحميل الاستعلامات من: {dataset_name}")
    dataset = ir_datasets.load(dataset_name)
    queries = list(dataset.queries_iter())

    print(f"📖 تحميل المفردات من: {terms_path}")
    with open(terms_path, "r", encoding="utf-8") as f:
        terms = [line.strip().lower() for line in f if line.strip()]
    refiner = QueryRefinerCorrectOnly(terms)

    # Create the target directory once, up front, so the final dump also
    # succeeds when there are no queries. A bare filename has no directory
    # part, and os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Split off the real extension instead of str.replace(".joblib", ...),
    # which would also rewrite a ".joblib" occurring elsewhere in the path.
    path_root, path_ext = os.path.splitext(output_path)

    enhanced_queries = {}

    print("🔧 تصحيح الاستعلامات فقط...")
    for batch_number, start in enumerate(range(0, len(queries), batch_size), start=1):
        batch = queries[start:start + batch_size]
        for q in tqdm(batch, desc=f"🔤 دفعة {batch_number}"):
            enhanced_queries[q.query_id] = refiner.enhance(q.text)

        # Checkpoint after every batch (cumulative content).
        partial_path = f"{path_root}_corrected_part{batch_number}{path_ext}"
        joblib.dump(enhanced_queries, partial_path)
        print(f"💾 تم حفظ الدفعة {batch_number} في: {partial_path}")

    joblib.dump(enhanced_queries, output_path)
    print(f"✅ تم حفظ الاستعلامات المصححة في: {output_path}")


In [7]:
# ==========================
# ▶️ تنفيذ السكربت
# ==========================
if __name__ == "__main__":
    # Run configuration for the BEIR/Quora test queries; adjust as needed.
    dataset_name = "beir/quora/test"
    terms_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\vocabularies\beir_quora_test_terms.txt"
    output_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\vocabularies\Evaluation Query\enhanced_queries.joblib"
    batch_size = 100

    # Arguments are passed positionally in the function's declared order.
    generate_corrected_queries(dataset_name, terms_path, output_path, batch_size)


📥 تحميل الاستعلامات من: beir/quora/test
📖 تحميل المفردات من: C:\Users\Azzam\PycharmProjects\PythonProject\vocabularies\beir_quora_test_terms.txt
🔧 تصحيح الاستعلامات فقط...


🔤 دفعة 1: 100%|██████████| 100/100 [01:51<00:00,  1.12s/it]


💾 تم حفظ الدفعة 1 في: C:\Users\Azzam\PycharmProjects\PythonProject\vocabularies\Evaluation Query\enhanced_queries_corrected_part1.joblib


🔤 دفعة 2:  30%|███       | 30/100 [00:31<01:12,  1.04s/it]


KeyboardInterrupt: 

In [9]:
# ==========================
# ▶️ تنفيذ السكربت
# ==========================
if __name__ == "__main__":
    # Run configuration for the ANTIQUE non-offensive test queries; adjust as needed.
    dataset_name = "antique/test/non-offensive"
    terms_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\vocabularies\antique_train_terms.txt"
    output_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\vocabularies\Evaluation Query\enhanced_queries_antique.joblib"

    # The call must sit at the same indentation as the assignments above;
    # it was previously indented by three spaces, which is an IndentationError.
    # batch_size is not passed, so the function's default applies.
    generate_corrected_queries(
        dataset_name=dataset_name,
        terms_path=terms_path,
        output_path=output_path
    )


📥 تحميل الاستعلامات من: antique/test/non-offensive
📖 تحميل المفردات من: C:\Users\Azzam\PycharmProjects\PythonProject\vocabularies\antique_train_terms.txt
🚀 تحسين الاستعلامات...


🔧 تحسين: 100%|██████████| 176/176 [30:44<00:00, 10.48s/it]

✅ تم حفظ الاستعلامات المحسنة في: C:\Users\Azzam\PycharmProjects\PythonProject\vocabularies\Evaluation Query\enhanced_queries_antique.joblib





In [10]:
import joblib
import os
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from typing import List

# ⚙️ إعدادات BERT
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load the encoder, move it to the chosen device and switch it to eval mode.
model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()

def embed_texts(texts: List[str], batch_size: int = 32, pooling: str = "cls") -> list:
    """Encode texts with the module-level ``tokenizer`` / ``model``.

    Args:
        texts: list of strings to encode.
        batch_size: number of texts sent through the model at once.
        pooling: how token states are reduced to one vector per text.
            ``"cls"`` (default, the original behaviour) takes the hidden
            state at position 0; ``"mean"`` averages the hidden states of
            non-padding tokens using the attention mask.
            NOTE(review): the sentence-transformers model card for this
            checkpoint pools with a masked mean. The default stays ``"cls"``
            so results remain comparable with vectors produced earlier by
            this function; switch only together with the document side.

    Returns:
        A list with one 1-D numpy array per input text, in input order
        (not nested Python lists; callers convert with ``.tolist()``).

    Raises:
        ValueError: if ``pooling`` is not ``"cls"`` or ``"mean"``.
    """
    if pooling not in ("cls", "mean"):
        raise ValueError(f"pooling must be 'cls' or 'mean', got {pooling!r}")

    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="🔢 تمثيل الاستعلامات"):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            token_states = model(**inputs).last_hidden_state
            if pooling == "mean":
                # Zero out padding positions, then divide by the number of
                # real tokens; clamp guards against division by zero.
                mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
                batch_embeds = (token_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
            else:
                batch_embeds = token_states[:, 0, :]  # hidden state of the first ([CLS]) token
        embeddings.extend(batch_embeds.cpu().numpy())
    return embeddings

# ✅ الدالة الرئيسية
def represent_enhanced_queries(enhanced_queries_path, output_path):
    """Embed the corrected queries and store ids, vectors and model name.

    Args:
        enhanced_queries_path: joblib file holding a mapping
            ``{query_id: corrected_text}``.
        output_path: destination joblib file; it receives a dict with the
            keys ``"query_ids"``, ``"embeddings"`` and ``"model_name"``.
    """
    print("📥 تحميل الاستعلامات المحسنة...")
    # NOTE: joblib.load unpickles the file; only load files you trust.
    enhanced_data = joblib.load(enhanced_queries_path)
    # keys() and values() of the same dict iterate in the same order, so ids
    # and texts stay aligned by position.
    query_ids = list(enhanced_data.keys())
    enhanced_texts = list(enhanced_data.values())

    print("🔄 تمثيل الاستعلامات باستخدام BERT...")
    embeddings = embed_texts(enhanced_texts, batch_size=32)

    print("💾 حفظ النتائج...")
    # A bare filename has no directory part, and os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    joblib.dump({
        "query_ids": query_ids,
        "embeddings": embeddings,
        "model_name": MODEL_NAME
    }, output_path)

    print(f"✅ تم حفظ التمثيلات في: {output_path}")


# === تنفيذ
if __name__ == "__main__":
    # Input: corrected ANTIQUE queries; output: their BERT representations.
    enhanced_queries_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\vocabularies\Evaluation Query\enhanced_queries_antique.joblib"
    output_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\Bertquery\antique\enhanced\bert_enhanced_queries.joblib"

    represent_enhanced_queries(
        enhanced_queries_path=enhanced_queries_path,
        output_path=output_path,
    )


  from .autonotebook import tqdm as notebook_tqdm


📥 تحميل الاستعلامات المحسنة...
🔄 تمثيل الاستعلامات باستخدام BERT...


🔢 تمثيل الاستعلامات: 100%|██████████| 6/6 [00:03<00:00,  1.65it/s]

💾 حفظ النتائج...
✅ تم حفظ التمثيلات في: C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\Bertquery\antique\enhanced\bert_enhanced_queries.joblib





In [11]:
import os
import re
import joblib
import nltk
import ir_datasets
from tqdm import tqdm
from pymongo import MongoClient
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK resources listed below, in this order.
for _nltk_resource in ("wordnet", "omw-1.4"):
    nltk.download(_nltk_resource)

# Shared lemmatizer instance used by the text-cleaning helpers.
lemmatizer = WordNetLemmatizer()

def tokenize(text):
    """Lowercase ``text`` and return its word tokens in order of appearance."""
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text.lower())

def lemmatize_tokens(tokens):
    """Return the lemma of every token, preserving order.

    Uses the module-level ``lemmatizer`` with its default settings.
    """
    lemmas = []
    for word in tokens:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

def clean_text(text):
    """Tokenize and lemmatize ``text``, returning the lemmas joined by spaces."""
    return " ".join(lemmatize_tokens(tokenize(text)))

def build_hybrid_enhanced_representation(
    enhanced_queries_path,
    vectorizer_path,
    bert_path,
    output_path,
    mongo_uri="mongodb://localhost:27017/",
    db_name="ir_project",
    collection_name="queries_enhanced_hybrid_antique",
):
    """Combine TF-IDF and BERT representations of the corrected queries.

    For every query that has a BERT embedding, the corrected text is cleaned
    with ``clean_text``, transformed with the stored TF-IDF vectorizer, and
    stored as a sparse (indices, values) pair next to its embedding. Queries
    without an embedding are skipped.

    The result is written to two places:
      * the MongoDB collection, whose previous contents are deleted first;
      * a joblib file at ``output_path`` holding parallel lists.

    Args:
        enhanced_queries_path: joblib file with ``{query_id: corrected_text}``.
        vectorizer_path: joblib file holding a dict with a ``"vectorizer"`` entry.
        bert_path: joblib file holding a dict with ``"query_ids"``,
            ``"embeddings"`` and ``"model_name"`` entries.
        output_path: destination joblib file.
        mongo_uri: MongoDB connection string.
        db_name: name of the MongoDB database.
        collection_name: name of the collection that is replaced.
    """
    print("📥 تحميل البيانات...")
    # NOTE: joblib.load unpickles these files; only load files you trust.
    enhanced_data = joblib.load(enhanced_queries_path)
    vectorizer_data = joblib.load(vectorizer_path)
    bert_data = joblib.load(bert_path)

    vectorizer: TfidfVectorizer = vectorizer_data["vectorizer"]
    bert_model_name = bert_data["model_name"]
    # query_id -> embedding; the embeddings are expected to expose .tolist().
    bert_map = dict(zip(bert_data["query_ids"], bert_data["embeddings"]))

    query_docs = []

    print("🧼 بناء تمثيلات هجينة للاستعلامات المحسنة...")
    for qid, enhanced_text in tqdm(enhanced_data.items()):
        if qid not in bert_map:
            continue

        cleaned = clean_text(enhanced_text)
        # COO form exposes the non-zero column indices and their weights.
        row = vectorizer.transform([cleaned]).getrow(0).tocoo()

        query_docs.append({
            "query_id": qid,
            "original_text": enhanced_text,
            "clean_text": cleaned,
            "bert_embedding": bert_map[qid].tolist(),
            "tfidf_indices": row.col.tolist(),
            "tfidf_values": row.data.tolist()
        })

    # Parallel lists for the joblib file, derived from the same documents so
    # both outputs are guaranteed to agree.
    joblib_payload = {
        "query_ids": [doc["query_id"] for doc in query_docs],
        "original_texts": [doc["original_text"] for doc in query_docs],
        "clean_texts": [doc["clean_text"] for doc in query_docs],
        "bert_embeddings": [doc["bert_embedding"] for doc in query_docs],
        "tfidf_indices": [doc["tfidf_indices"] for doc in query_docs],
        "tfidf_values": [doc["tfidf_values"] for doc in query_docs],
        "bert_model_name": bert_model_name
    }

    # MongoDB: replace the collection's contents with the new documents.
    client = MongoClient(mongo_uri)
    try:
        collection = client[db_name][collection_name]
        collection.delete_many({})
        # insert_many rejects an empty list, so skip it when nothing matched.
        if query_docs:
            collection.insert_many(query_docs)
        print(f"✅ تم تخزين {len(query_docs)} استعلام هجين محسّن في: {collection.name}")
    finally:
        # Release the connection even if a database call fails.
        client.close()

    # Joblib: a bare filename has no directory part, and os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    joblib.dump(joblib_payload, output_path)

    print(f"📦 تم حفظ التمثيلات الهجينة المحسنة في: {output_path}")


# 🟩 تنفيذ
if __name__ == "__main__":
    # Inputs: corrected queries, fitted TF-IDF data and BERT query embeddings.
    enhanced_queries_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\vocabularies\Evaluation Query\enhanced_queries_antique.joblib"
    vectorizer_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Data Representation\TF-IDF\antique\train\doc\tfidf_data.joblib"
    bert_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\Bertquery\antique\enhanced\bert_enhanced_queries.joblib"
    # Output: the combined (hybrid) representation.
    output_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\hybridQuery\Antique\enhanced\hybrid_enhanced_queries.joblib"

    # Arguments are passed positionally in the function's declared order.
    build_hybrid_enhanced_representation(enhanced_queries_path, vectorizer_path, bert_path, output_path)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


📥 تحميل البيانات...
🧼 بناء تمثيلات هجينة للاستعلامات المحسنة...


100%|██████████| 176/176 [00:00<00:00, 578.47it/s]


✅ تم تخزين 176 استعلام هجين محسّن في: queries_enhanced_hybrid_antique
📦 تم حفظ التمثيلات الهجينة المحسنة في: C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\hybridQuery\Antique\enhanced\hybrid_enhanced_queries.joblib
