In [1]:
import ir_datasets
import re
from collections import Counter
import os

# Maps each ir_datasets key to the name of the vocabulary file produced for it.
DATASETS = {
    "antique/train": "symspell_vocab_antique.txt",
    "beir/quora/test": "symspell_vocab_quora.txt",
}

# Directory that receives the vocabulary files; it is created up front
# (a no-op when it already exists) so later writes find it in place.
OUTPUT_DIR = "symspell_vocab"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def clean_and_tokenize(text):
    """Lowercase *text*, drop non-letter characters, and split into tokens.

    Every character other than ``a``-``z`` or whitespace is deleted (not
    replaced by a space), so ``"don't"`` becomes ``"dont"`` and digits vanish.
    The remainder is split on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase tokens made only of the letters ``a``-``z``.
    """
    letters_and_spaces = re.sub(r"[^a-z\s]", "", text.lower())
    tokens = letters_and_spaces.split()
    return tokens

def build_vocab_from_dataset(dataset_key, save_path):
    """Count token frequencies over a dataset's documents and save them.

    Loads ``dataset_key`` through ``ir_datasets``, tokenizes the ``text`` of
    every document with ``clean_and_tokenize``, and writes one
    ``word<TAB>count`` line per distinct token to ``save_path`` inside
    ``OUTPUT_DIR``. Progress and completion messages are printed to stdout.

    Args:
        dataset_key: Identifier passed to ``ir_datasets.load``.
        save_path: File name, joined onto ``OUTPUT_DIR``, for the output.
    """
    print(f"\n📦 Loading dataset: {dataset_key}")
    frequencies = Counter()
    documents = ir_datasets.load(dataset_key).docs_iter()

    for processed, document in enumerate(documents, start=1):
        frequencies.update(clean_and_tokenize(document.text))
        # Progress heartbeat every thousand documents.
        if processed % 1000 == 0:
            print(f"→ Processed {processed} docs")

    # len() of the counter is the number of distinct tokens, not occurrences.
    print(f"✅ Total words: {len(frequencies)} | Writing to {save_path}")

    target = os.path.join(OUTPUT_DIR, save_path)
    with open(target, "w", encoding="utf-8") as out_file:
        # Entries are written in the counter's iteration order.
        out_file.writelines(
            f"{token}\t{count}\n" for token, count in frequencies.items()
        )

    print(f"✅ Done: {save_path}\n")

# Build one vocabulary file for every configured dataset.
for dataset_key in DATASETS:
    filename = DATASETS[dataset_key]
    build_vocab_from_dataset(dataset_key, filename)



📦 Loading dataset: antique/train
→ Processed 1000 docs
→ Processed 2000 docs
→ Processed 3000 docs
→ Processed 4000 docs
→ Processed 5000 docs
→ Processed 6000 docs
→ Processed 7000 docs
→ Processed 8000 docs
→ Processed 9000 docs
→ Processed 10000 docs
→ Processed 11000 docs
→ Processed 12000 docs
→ Processed 13000 docs
→ Processed 14000 docs
→ Processed 15000 docs
→ Processed 16000 docs
→ Processed 17000 docs
→ Processed 18000 docs
→ Processed 19000 docs
→ Processed 20000 docs
→ Processed 21000 docs
→ Processed 22000 docs
→ Processed 23000 docs
→ Processed 24000 docs
→ Processed 25000 docs
→ Processed 26000 docs
→ Processed 27000 docs
→ Processed 28000 docs
→ Processed 29000 docs
→ Processed 30000 docs
→ Processed 31000 docs
→ Processed 32000 docs
→ Processed 33000 docs
→ Processed 34000 docs
→ Processed 35000 docs
→ Processed 36000 docs
→ Processed 37000 docs
→ Processed 38000 docs
→ Processed 39000 docs
→ Processed 40000 docs
→ Processed 41000 docs
→ Processed 42000 docs
→ Process