In [1]:
import os
import json
import string
import nltk
from pymongo import MongoClient
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import ir_datasets

# ------------- Download the required NLTK resources -------------
for resource_name in ('stopwords', 'wordnet', 'omw-1.4', 'punkt'):
    nltk.download(resource_name)

# Shared lookup structures and the lemmatizer used by the text-cleaning step.
punctuations = set(string.punctuation)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Clean a text and return it as a list of lemmatized tokens.

    The text is lower-cased and split with ``word_tokenize``. A token is
    dropped when it consists only of punctuation characters, is in the
    stop-word set, is made up solely of digits, or when its lemma is shorter
    than three characters.

    Args:
        text: The raw string to clean.

    Returns:
        A list holding the lemma of every token that was kept, in their
        original order.
    """
    cleaned_tokens = []
    for token in word_tokenize(text.lower()):
        # Check every character instead of testing the whole token against the
        # set of single punctuation characters: a multi-character punctuation
        # token such as "..." would otherwise pass all of the filters below.
        if all(char in punctuations for char in token):
            continue
        if token in stop_words or token.isdigit():
            continue
        lemma = lemmatizer.lemmatize(token)
        if len(lemma) >= 3:
            cleaned_tokens.append(lemma)
    return cleaned_tokens

# ---------------- Load the dataset ----------------
dataset_name = "antique/train"
dataset = ir_datasets.load(dataset_name)

# -------------- Database setup -------------------
client = MongoClient("mongodb://localhost:27017/")
db = client["ir_project"]
collection_docs = db["documents_test"]

# -------------- Output directory setup --------------------
# The dataset name doubles as the directory layout ("antique/train" -> antique/train/doc).
save_path = os.path.join(dataset_name.replace("/", os.sep), "doc")
os.makedirs(save_path, exist_ok=True)
docs_json_path = os.path.join(save_path, "docs.json")

# ------------ Process and store the documents ---------------------
print(f"🚀 Processing documents from {dataset_name}...")

# Documents are sent to MongoDB in batches: one insert_many call per batch
# replaces one insert_one round-trip per document.
MONGO_BATCH_SIZE = 1000

all_docs = []
pending_entries = []

for doc in tqdm(dataset.docs_iter(), total=dataset.docs_count()):
    clean_tokens = preprocess(doc.text)
    clean_text = ' '.join(clean_tokens)  # stored as a string, not as a list

    # Full record for MongoDB (original and cleaned text).
    pending_entries.append({
        "doc_id": doc.doc_id,
        "original_text": doc.text,
        "clean_text": clean_text
    })
    if len(pending_entries) >= MONGO_BATCH_SIZE:
        collection_docs.insert_many(pending_entries)
        pending_entries = []

    # Compact record for the JSON dump (doc_id + cleaned text only).
    all_docs.append({
        "doc_id": doc.doc_id,
        "clean_text": clean_text
    })

# Flush the last, partially filled batch; insert_many rejects an empty list.
if pending_entries:
    collection_docs.insert_many(pending_entries)

# Save the JSON copy to disk.
with open(docs_json_path, "w", encoding="utf-8") as f:
    json.dump(all_docs, f, ensure_ascii=False, indent=2)

print(f"✅ Saved cleaned documents to JSON file: {docs_json_path}")
print(f"✅ Stored cleaned documents in MongoDB collection: {collection_docs.name}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


🚀 Processing documents from antique/train...


100%|██████████| 403666/403666 [13:26<00:00, 500.82it/s]


✅ Saved cleaned documents to JSON file: antique\train\doc\docs.json
✅ Stored cleaned documents in MongoDB collection: documents_test
