In [2]:
import os
import re
import joblib
import ir_datasets
import nltk
import numpy as np
from tqdm import tqdm
from pymongo import MongoClient
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer



def custom_tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of pieces.

    NOTE(review): nothing visible in this notebook calls this function
    directly; it is presumably kept so joblib can resolve a tokenizer
    referenced by the pickled vectorizer — confirm before removing.
    """
    pieces = text.split()
    return pieces

# Download the NLTK resources (WordNet and the Open Multilingual WordNet)
# before building the lemmatizer.
for nltk_resource in ("wordnet", "omw-1.4"):
    nltk.download(nltk_resource)

lemmatizer = WordNetLemmatizer()

def tokenize(text):
    """Lowercase ``text`` and return its word tokens as a list of strings.

    A token is a maximal run of regex word characters (letters, digits and
    underscore); everything else acts as a separator and is dropped.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

def lemmatize_tokens(tokens):
    """Return a new list with the lemma of every token, in the same order.

    Each token is passed to the module-level ``lemmatizer``.
    """
    lemmas = []
    for word in tokens:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

def clean_text(text):
    """Normalize ``text`` into one space-separated string of lemmas.

    The text is tokenized with ``tokenize`` and each token is lemmatized with
    ``lemmatize_tokens``; the result is an empty string when no token is found.
    """
    return " ".join(lemmatize_tokens(tokenize(text)))

# ------------------ Input files ------------------
# NOTE(review): absolute, machine-specific paths; this cell only runs unchanged
# on the machine that owns them.
bert_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\Bertquery\BEIR\quora\test\query_embeddings\bert_query_embeddings.joblib"
vectorizer_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Data Representation\TF-IDF\beir\quora\test\doc\tfidf_data.joblib"

# Load the precomputed BERT query embeddings and the TF-IDF vectorizer that was
# fitted on the documents.
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
bert_data = joblib.load(bert_path)
vectorizer_data = joblib.load(vectorizer_path)
vectorizer = vectorizer_data["vectorizer"]

# assumes the embeddings are array-like with one row per query — TODO confirm
bert_embeddings = np.asarray(bert_data["embeddings"])
bert_model_name = bert_data["model_name"]

# Pair embeddings with queries by id whenever the BERT file records the ids.
# Pairing by position alone silently shifts every embedding as soon as one
# query is dropped by the filters below.
bert_row_by_query_id = None
if "query_ids" in bert_data:
    bert_row_by_query_id = {
        qid: row for row, qid in enumerate(bert_data["query_ids"])
    }

# ------------------ Load and clean the queries ------------------
dataset = ir_datasets.load("beir/quora/test")
query_ids = []
original_texts = []
clean_texts = []
bert_rows = []  # row of bert_embeddings paired with each kept query

print("🧼 تنظيف الاستعلامات...")
for query in tqdm(dataset.queries_iter()):
    cleaned = clean_text(query.text)
    if not cleaned.strip():
        continue  # nothing left after cleaning

    if bert_row_by_query_id is None:
        # No ids available: fall back to positional pairing, validated below.
        bert_row = len(query_ids)
    else:
        bert_row = bert_row_by_query_id.get(query.query_id)
        if bert_row is None:
            continue  # no BERT embedding stored for this query

    query_ids.append(query.query_id)
    original_texts.append(query.text)
    clean_texts.append(cleaned)
    bert_rows.append(bert_row)

if not query_ids:
    raise ValueError(
        "No query has both non-empty cleaned text and a BERT embedding; "
        "nothing to store."
    )
if bert_row_by_query_id is None and len(bert_embeddings) != len(query_ids):
    # Positional pairing is only trustworthy when both sides have equal length.
    raise ValueError(
        f"Cannot align BERT embeddings with queries by position: "
        f"{len(bert_embeddings)} embeddings vs {len(query_ids)} kept queries "
        f"and the embeddings file has no 'query_ids'."
    )

# ------------------ TF-IDF representation ------------------
# Reuse the document vectorizer so query and document vectors share a vocabulary.
print("🔢 تحويل الاستعلامات إلى TF-IDF باستخدام vectorizer الوثائق...")
tfidf_matrix = vectorizer.transform(clean_texts)

# Keep only the non-zero column indices and their weights for each query.
tfidf_indices_list = []
tfidf_values_list = []
for i in range(tfidf_matrix.shape[0]):
    row = tfidf_matrix.getrow(i).tocoo()
    tfidf_indices_list.append(row.col.tolist())
    tfidf_values_list.append(row.data.tolist())

# ------------------ Store in MongoDB ------------------
# Build every document before touching the collection, so a failure while
# assembling them cannot leave the collection emptied.
query_docs = []
for i in tqdm(range(len(query_ids)), desc="Mongo Insert"):
    doc = {
        "query_id": query_ids[i],
        "original_text": original_texts[i],
        "clean_text": clean_texts[i],
        "bert_embedding": bert_embeddings[bert_rows[i]].tolist(),
        "tfidf_indices": tfidf_indices_list[i],
        "tfidf_values": tfidf_values_list[i],
    }
    query_docs.append(doc)

client = MongoClient("mongodb://localhost:27017/")
db = client["ir_project"]
collection = db["queries_quora_test_hybrid_updated"]
collection.delete_many({})  # replace any previous contents of the collection
collection.insert_many(query_docs)
print(f"✅ تم تخزين {len(query_docs)} استعلام هجين في MongoDB داخل: {collection.name}")

# ------------------ Save as joblib ------------------
output_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\hybridQuery\BEIR\quora\test\hybird_query_data.joblib"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

joblib.dump({
    "query_ids": query_ids,
    "original_texts": original_texts,
    "clean_texts": clean_texts,
    # Only the rows that belong to the kept queries, in the same order.
    "bert_embeddings": bert_embeddings[bert_rows],
    "tfidf_indices": tfidf_indices_list,
    "tfidf_values": tfidf_values_list,
    "bert_model_name": bert_model_name
}, output_path)

print(f"📦 تم حفظ تمثيل الاستعلامات الهجين في: {output_path}")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


🧼 تنظيف الاستعلامات...


10000it [00:06, 1531.07it/s]


🔢 تحويل الاستعلامات إلى TF-IDF باستخدام vectorizer الوثائق...


Mongo Insert: 100%|██████████| 10000/10000 [00:00<00:00, 38759.92it/s]


✅ تم تخزين 10000 استعلام هجين في MongoDB داخل: queries_quora_test_hybrid_updated
📦 تم حفظ تمثيل الاستعلامات الهجين في: C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\hybridQuery\BEIR\quora\test\hybird_query_data.joblib


In [5]:
import os
import re
import json
import joblib
import ir_datasets
import nltk
import numpy as np
from tqdm import tqdm
from pymongo import MongoClient
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK resources (WordNet and the Open Multilingual WordNet)
# before building the lemmatizer.
for nltk_resource in ("wordnet", "omw-1.4"):
    nltk.download(nltk_resource)

lemmatizer = WordNetLemmatizer()

def tokenize(text):
    """Return the lowercase word tokens found in ``text``.

    Tokens are maximal runs of regex word characters; punctuation and
    whitespace are discarded.
    """
    word_pattern = r'\b\w+\b'
    return re.findall(word_pattern, text.lower())

def lemmatize_tokens(tokens):
    """Lemmatize each token with the module-level ``lemmatizer``.

    Returns a new list with one lemma per input token, in input order.
    """
    result = []
    for token in tokens:
        lemma = lemmatizer.lemmatize(token)
        result.append(lemma)
    return result

def clean_text(text):
    """Tokenize and lemmatize ``text``, returning the lemmas joined by spaces.

    Returns an empty string when ``tokenize`` finds no token in ``text``.
    """
    lemma_list = lemmatize_tokens(tokenize(text))
    separator = " "
    return separator.join(lemma_list)

# ------------------ Paths -------------------
# NOTE(review): these paths say "train" while the dataset loaded below is
# "antique/test/non-offensive", and the Mongo collection name contains
# "quora" — confirm the names are intended before relying on them.
# NOTE(review): absolute, machine-specific paths; this cell only runs unchanged
# on the machine that owns them.
bert_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\Bertquery\antique\train\query_embeddings\bert_query_embeddings.joblib"
vectorizer_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Data Representation\TF-IDF\antique\train\doc\tfidf_data.joblib"
output_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\hybridQuery\Antique\train\hybird_query_data.joblib"

# ------------------ Load data -------------------
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
bert_data = joblib.load(bert_path)
vectorizer_data = joblib.load(vectorizer_path)
vectorizer = vectorizer_data["vectorizer"]

bert_embeddings = bert_data["embeddings"]
bert_model_name = bert_data["model_name"]
bert_query_ids = bert_data["query_ids"]

# Look up each BERT embedding by its query id.
bert_map = {
    qid: emb for qid, emb in zip(bert_query_ids, bert_embeddings)
}

# ------------------ Load and clean the queries -------------------
dataset = ir_datasets.load("antique/test/non-offensive")

query_ids = []
original_texts = []
clean_texts = []

print("🧼 تنظيف الاستعلامات...")
for query in tqdm(dataset.queries_iter()):
    cleaned = clean_text(query.text)
    if not cleaned.strip():
        continue  # nothing left after cleaning

    if query.query_id not in bert_map:
        continue  # skip queries that have no BERT embedding

    query_ids.append(query.query_id)
    original_texts.append(query.text)
    clean_texts.append(cleaned)

if not query_ids:
    # Stop before the collection is cleared: insert_many rejects an empty list.
    raise ValueError(
        "No query has both non-empty cleaned text and a BERT embedding; "
        "nothing to store."
    )

# ------------------ TF-IDF representation -------------------
# One batched transform instead of one call per query; each row depends only
# on its own text, so the per-query vectors are unchanged.
tfidf_matrix = vectorizer.transform(clean_texts)

# Keep only the non-zero column indices and their weights for each query.
tfidf_indices_list = []
tfidf_values_list = []
for i in range(tfidf_matrix.shape[0]):
    row = tfidf_matrix.getrow(i).tocoo()
    tfidf_indices_list.append(row.col.tolist())
    tfidf_values_list.append(row.data.tolist())

# Plain-list copies of the embeddings, aligned with query_ids.
bert_embedding_lists = [bert_map[qid].tolist() for qid in query_ids]

# One MongoDB document per kept query.
query_docs = [
    {
        "query_id": qid,
        "original_text": text,
        "clean_text": cleaned,
        "bert_embedding": embedding,
        "tfidf_indices": indices,
        "tfidf_values": values
    }
    for qid, text, cleaned, embedding, indices, values in zip(
        query_ids,
        original_texts,
        clean_texts,
        bert_embedding_lists,
        tfidf_indices_list,
        tfidf_values_list,
    )
]

# ------------------ Store in MongoDB -------------------
client = MongoClient("mongodb://localhost:27017/")
db = client["ir_project"]
collection = db["queries_quora_test_hybrid_antique"]
collection.delete_many({})  # replace any previous contents of the collection
collection.insert_many(query_docs)

print(f"✅ تم تخزين {len(query_docs)} استعلام هجين في MongoDB داخل: {collection.name}")

# ------------------ Save as joblib -------------------
os.makedirs(os.path.dirname(output_path), exist_ok=True)

joblib.dump({
    "query_ids": query_ids,
    "original_texts": original_texts,
    "clean_texts": clean_texts,
    "bert_embeddings": bert_embedding_lists,
    "tfidf_indices": tfidf_indices_list,
    "tfidf_values": tfidf_values_list,
    "bert_model_name": bert_model_name
}, output_path)

print(f"📦 تم حفظ تمثيل الاستعلامات الهجين في: {output_path}")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


🧼 تنظيف الاستعلامات...


176it [00:00, 386.50it/s]


✅ تم تخزين 176 استعلام هجين في MongoDB داخل: queries_quora_test_hybrid_antique
📦 تم حفظ تمثيل الاستعلامات الهجين في: C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\hybridQuery\Antique\train\hybird_query_data.joblib
