In [5]:
import ir_datasets
import re
import joblib
import os
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from pymongo import MongoClient

# Lemmatization setup: fetch the NLTK resources and build the shared lemmatizer.
# NOTE(review): 'punkt' is not referenced by the regex-based tokenize() in this
# cell — confirm whether another cell still needs it before dropping the download.
import nltk
from nltk.stem import WordNetLemmatizer

for _nltk_resource in ('punkt', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

lemmatizer = WordNetLemmatizer()

def tokenize(text):
    """Lowercase ``text`` and return its word tokens as a list of strings.

    Tokens are the runs of word characters found by the regex; punctuation
    and whitespace act as separators and are not returned.
    """
    return re.findall(r'\b\w+\b', text.lower())

def lemmatize_tokens(tokens):
    """Return a new list with every token passed through the module-level ``lemmatizer``."""
    lemmas = []
    for tok in tokens:
        lemmas.append(lemmatizer.lemmatize(tok))
    return lemmas

def clean_text(text):
    """Tokenize ``text``, lemmatize each token, and join the lemmas with single spaces."""
    return ' '.join(lemmatize_tokens(tokenize(text)))

# Load the dataset.
# NOTE(review): this is the ANTIQUE *test* (non-offensive) split, while the MongoDB
# collection and output directory used later in this cell are named "train" — confirm.
dataset = ir_datasets.load("antique/test/non-offensive")

# Load the BERT tokenizer and model, placing the model on the GPU when one is
# available and switching it to evaluation mode.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()

def get_bert_embedding(text, pooling="cls"):
    """Embed ``text`` with the module-level ``tokenizer`` / ``model``.

    Args:
        text: Input handed to the tokenizer; it is truncated to at most 512 tokens.
        pooling: How the per-token hidden states are reduced to one vector.
            ``"cls"`` (default, the original behaviour) takes the hidden state
            at position 0. ``"mean"`` averages the hidden states of the
            non-padding tokens, weighted by the attention mask.

    Returns:
        The pooled embedding as a NumPy array on the CPU, with singleton
        dimensions squeezed out.

    Raises:
        ValueError: If ``pooling`` is neither ``"cls"`` nor ``"mean"``.

    NOTE(review): the model card for ``sentence-transformers/all-MiniLM-L6-v2``
    documents attention-mask mean pooling rather than the CLS token. The default
    is left at ``"cls"`` so existing stored vectors stay comparable; switching to
    ``"mean"`` requires re-embedding whatever these query vectors are matched against.
    """
    if pooling not in ("cls", "mean"):
        raise ValueError(f"pooling must be 'cls' or 'mean', got {pooling!r}")

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
        if pooling == "mean":
            # Zero out padding positions, then divide by the number of real tokens.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            emb = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        else:
            emb = hidden[:, 0, :]
    return emb.squeeze().cpu().numpy()

# MongoDB connection setup.
# NOTE(review): the collection name here (and output_dir further down) say "train",
# but the dataset loaded earlier in this cell is "antique/test/non-offensive".
# Confirm which one is intended: delete_many({}) below empties this collection
# before inserting, and the joblib file at output_dir is overwritten.
client = MongoClient("mongodb://localhost:27017/")
db = client["ir_project"]
collection = db["queries_antique_train"]

# Clean every query, embed it, and collect the results for MongoDB and joblib.
query_docs, query_ids, embeddings = [], [], []

print("🔄 Processing queries...")

for q in tqdm(dataset.queries_iter()):
    cleaned = clean_text(q.text)
    emb = get_bert_embedding(cleaned)
    query_ids.append(q.query_id)
    embeddings.append(emb)
    query_doc = {
        "query_id": q.query_id,
        "original_text": q.text,
        "clean_text": cleaned,
        "bert_embedding": emb.tolist()  # numpy array -> plain list so it can be stored in MongoDB
    }
    query_docs.append(query_doc)

# Store everything in MongoDB with a single bulk insert.
if len(query_docs) > 0:
    collection.delete_many({})  # clear the collection before inserting (optional)
    collection.insert_many(query_docs)
    print(f"✅ تم تخزين {len(query_docs)} استعلام في MongoDB في الكوليكشن: {collection.name}")

# Persist the ids, embeddings, and model name to a joblib file.
output_dir = r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\Bertquery\antique\train\query_embeddings"
os.makedirs(output_dir, exist_ok=True)

embeddings_payload = {
    "query_ids": query_ids,
    "embeddings": embeddings,
    "model_name": MODEL_NAME
}
joblib.dump(embeddings_payload, os.path.join(output_dir, "bert_query_embeddings.joblib"))

print(f"✅ تم حفظ تمثيلات الاستعلامات في {output_dir}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


🔄 Processing queries...


176it [00:02, 66.98it/s]


✅ تم تخزين 176 استعلام في MongoDB في الكوليكشن: queries_antique_train
✅ تم حفظ تمثيلات الاستعلامات في C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\Bertquery\antique\train\query_embeddings


In [4]:
import joblib
# Sanity check: reload the saved embeddings file and show how many query ids it
# holds plus the first ten.
# NOTE: joblib.load unpickles the file, so only load files you produced yourself.
# NOTE(review): the output recorded under this cell (2426 ids) comes from execution
# In [4], i.e. before the In [5] run above wrote 176 queries to the same path —
# re-run this cell to refresh it.
embeddings_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Query Processing\Bertquery\antique\train\query_embeddings\bert_query_embeddings.joblib"
data = joblib.load(embeddings_path)
print(len(data["query_ids"]))
print(data["query_ids"][:10])


2426
['3097310', '3910705', '237390', '2247892', '1078492', '782453', '3198658', '1907320', '10895', '992730']
