In [1]:
import os
import joblib
import torch
from tqdm import tqdm
from pymongo import MongoClient
from transformers import AutoTokenizer, AutoModel

# Checkpoint shared by the tokenizer and the encoder.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Use CUDA when torch reports it as available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the encoder, place it on the selected device, and switch it out of
# training mode; the notebook only runs inference.
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

def get_bert_embedding(text, pooling="cls"):
    """Encode text into an embedding with the module-level tokenizer and model.

    Args:
        text: The text to encode. Input longer than 512 tokens is truncated.
        pooling: How the per-token hidden states are reduced to one vector.
            "cls" (the default, and the original behaviour) takes the hidden
            state at position 0. "mean" averages the hidden states weighted by
            the attention mask so that padding positions do not contribute;
            the model card for this checkpoint uses mean pooling, so "mean"
            is likely the better choice when re-generating an index. Query
            and document embeddings must use the same setting.

    Returns:
        A NumPy array on the CPU with singleton dimensions squeezed out
        (a 1-D vector when a single string is passed).

    Raises:
        ValueError: If ``pooling`` is neither "cls" nor "mean".
    """
    if pooling not in ("cls", "mean"):
        raise ValueError(f"Unknown pooling strategy: {pooling!r} (expected 'cls' or 'mean')")

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
        if pooling == "cls":
            embeddings = hidden[:, 0, :]  # first-token (CLS) state
        else:
            # Zero out padded positions before averaging; the clamp guards
            # against division by zero for an all-zero mask.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return embeddings.squeeze().cpu().numpy()

def process_bert_embedding(dataset_name, collection_name,
                           mongo_uri="mongodb://localhost:27017/", db_name="ir_project"):
    """Embed every document of a MongoDB collection and save the result with joblib.

    Reads ``doc_id`` and ``original_text`` from each document, skips documents
    whose text is missing, not a string, or blank, embeds the remaining texts
    one by one with ``get_bert_embedding`` and writes a dict with the keys
    ``doc_ids``, ``embeddings_matrix`` (a list of per-document arrays) and
    ``model_name`` to ``<dataset_name>/doc/bert_embedding.joblib``.

    Args:
        dataset_name: Dataset identifier such as "beir/quora/test"; its "/"
            separators are turned into directory levels of the output path.
        collection_name: Name of the MongoDB collection to read.
        mongo_uri: Connection string of the MongoDB server.
        db_name: Name of the database that holds the collection.

    Returns:
        None. When no usable text is found, nothing is written.
    """
    print(f"🚀 Processing BERT embeddings from MongoDB collection: {collection_name}...")

    projection = {"_id": 0, "doc_id": 1, "original_text": 1}

    # The context manager closes the client even if the query fails, so the
    # connection does not outlive the (long) embedding loop below.
    with MongoClient(mongo_uri) as client:
        cursor = client[db_name][collection_name].find({}, projection)
        # A stored null / non-string text would crash on .strip(); treat it
        # the same as an empty text and skip the document.
        documents = [
            doc for doc in cursor
            if isinstance(doc.get("original_text"), str) and doc["original_text"].strip()
        ]

    if not documents:
        # Message text: "No original texts available for processing."
        print("❌ لا توجد نصوص أصلية متاحة للمعالجة.")
        return

    doc_ids = [doc["doc_id"] for doc in documents]
    texts = [doc["original_text"] for doc in documents]

    all_embeddings = [get_bert_embedding(text) for text in tqdm(texts, desc="Embedding")]

    embedding_data = {
        "doc_ids": doc_ids,
        "embeddings_matrix": all_embeddings,
        "model_name": MODEL_NAME
    }

    save_path = os.path.join(dataset_name.replace("/", os.sep), "doc")
    os.makedirs(save_path, exist_ok=True)
    save_file = os.path.join(save_path, "bert_embedding.joblib")
    joblib.dump(embedding_data, save_file)

    print(f"✅ BERT embeddings saved to: {save_file}")


  from .autonotebook import tqdm as notebook_tqdm


In [1]:
import os
import json
import string
import torch
import joblib
import nltk
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import ir_datasets

# Fetch the NLTK resources used by the preprocessing step below.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
# Recent NLTK releases make word_tokenize load the 'punkt_tab' tables instead
# of the pickled 'punkt' models; fetch both so the cell runs on either version.
nltk.download('punkt_tab')

# Preprocessing settings
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
punctuations = set(string.punctuation)
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Load the tokenizer and encoder, switch the encoder out of training mode,
# and move it to the GPU when torch reports CUDA as available.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Text-cleaning helper
def preprocess(text):
    """Normalise raw text into a space-joined string of lemmas.

    The text is lower-cased and tokenised with ``word_tokenize``. Tokens that
    consist only of punctuation, are in ``stop_words``, or are purely digits
    are dropped; the rest are lemmatised, and lemmas shorter than three
    characters are discarded.

    Args:
        text: The raw document text. ``None`` or an empty string yields "".

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    if not text:
        return ""

    cleaned_tokens = []
    for token in word_tokenize(text.lower()):
        # A plain membership test only matches single characters, which lets
        # multi-character punctuation tokens such as "..." slip through;
        # check every character instead.
        if all(ch in punctuations for ch in token):
            continue
        if token in stop_words or token.isdigit():
            continue
        lemma = lemmatizer.lemmatize(token)
        if len(lemma) < 3:
            continue
        cleaned_tokens.append(lemma)
    return ' '.join(cleaned_tokens)

# BERT embedding helper
def get_bert_embedding(text, pooling="cls"):
    """Encode text into an embedding with the module-level tokenizer and model.

    Args:
        text: The text to encode. Input longer than 512 tokens is truncated.
        pooling: How the per-token hidden states are reduced to one vector.
            "cls" (the default, and the original behaviour) takes the hidden
            state at position 0. "mean" averages the hidden states weighted by
            the attention mask so that padding positions do not contribute;
            the model card for this checkpoint uses mean pooling, so "mean"
            is likely the better choice when re-generating an index. Query
            and document embeddings must use the same setting.

    Returns:
        A NumPy array on the CPU with singleton dimensions squeezed out
        (a 1-D vector when a single string is passed).

    Raises:
        ValueError: If ``pooling`` is neither "cls" nor "mean".
    """
    if pooling not in ("cls", "mean"):
        raise ValueError(f"Unknown pooling strategy: {pooling!r} (expected 'cls' or 'mean')")

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
        if pooling == "cls":
            embeddings = hidden[:, 0, :]  # first-token (CLS) state
        else:
            # Zero out padded positions before averaging; the clamp guards
            # against division by zero for an all-zero mask.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return embeddings.squeeze().cpu().numpy()

# Dataset to encode and the location of the output file
dataset_name = "antique/train"
dataset = ir_datasets.load(dataset_name)
save_path = os.path.join(dataset_name.replace("/", os.sep), "doc")
save_file = os.path.join(save_path, "bert_embedding.joblib")
os.makedirs(save_path, exist_ok=True)

print(f"🚀 Loading and processing dataset: {dataset_name}")

doc_ids, embeddings = [], []

for doc in tqdm(dataset.docs_iter(), total=dataset.docs_count(), desc="🔄 Processing"):
    clean_text = preprocess(doc.text)
    # Only documents that still have content after cleaning are encoded;
    # doc_ids and embeddings stay aligned index by index.
    if clean_text.strip():
        emb = get_bert_embedding(clean_text)
        doc_ids.append(doc.doc_id)
        embeddings.append(emb)

# Persist the ids, the per-document vectors, and the model that produced them
embedding_data = dict(
    doc_ids=doc_ids,
    embeddings_matrix=embeddings,
    model_name=MODEL_NAME,
)
joblib.dump(embedding_data, save_file)

print(f"✅ BERT embeddings saved to: {save_file}")


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


🚀 Loading and processing dataset: antique/train


🔄 Processing: 100%|██████████| 403666/403666 [2:30:56<00:00, 44.57it/s]  


✅ BERT embeddings saved to: antique\train\doc\bert_embedding.joblib


In [2]:
# Embed the Quora test documents stored in MongoDB and save them under
# beir/quora/test/doc/.
process_bert_embedding(
    dataset_name="beir/quora/test",
    collection_name="documents_quora_test",
)

🚀 Processing BERT embeddings from MongoDB collection: documents_quora_test...


Embedding: 100%|██████████| 522931/522931 [1:59:47<00:00, 72.76it/s]  


✅ BERT embeddings saved to: beir\quora\test\doc\bert_embedding.joblib
