In [1]:
import os
import json
import string
import nltk
import joblib
from pymongo import MongoClient
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data packages used by the preprocessing code below
for resource in ('stopwords', 'wordnet', 'punkt', 'omw-1.4'):
    nltk.download(resource)

# Shared, build-once lookup objects consulted by preprocess()
punctuations = set(string.punctuation)      # single punctuation characters
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Clean raw text and return it as one space-separated string of lemmas.

    The text is lower-cased and tokenized with ``word_tokenize``. A token is
    dropped when it is a single punctuation character, an English stop word,
    or consists only of digits. Each surviving token is lemmatized, and the
    lemma is kept only if it is at least three characters long.

    Args:
        text: The raw document text (a string).

    Returns:
        The kept lemmas joined by single spaces; an empty string when nothing
        survives the filters.
    """
    kept = []
    for tok in word_tokenize(text.lower()):
        # Filters applied to the raw token, before lemmatization
        if tok in punctuations or tok in stop_words or tok.isdigit():
            continue
        lemma = lemmatizer.lemmatize(tok)
        # The length filter is applied to the lemma, not the raw token
        if len(lemma) >= 3:
            kept.append(lemma)
    return ' '.join(kept)

def custom_tokenizer(text):
    """Split already-cleaned text into tokens on runs of whitespace.

    This function is passed to ``TfidfVectorizer`` as its ``tokenizer``, and
    that vectorizer is later serialized with ``joblib.dump``.
    NOTE(review): presumably kept as a named module-level function so the
    serialized vectorizer can reference it by name - confirm before replacing
    it with a lambda or moving it.

    Args:
        text: A string of space-separated tokens.

    Returns:
        A list of the whitespace-delimited tokens (empty for blank input).
    """
    tokens = text.split()
    return tokens

def process_tfidf_with_cleaning(dataset_name, collection_name):
    """Compute TF-IDF vectors for a MongoDB collection and persist the results.

    Reads ``doc_id`` and ``original_text`` from ``collection_name`` in the
    ``ir_project`` database on ``mongodb://localhost:27017/``, cleans every
    text with ``preprocess``, fits a ``TfidfVectorizer`` on the cleaned texts,
    and then:

    * replaces the contents of the ``tfidf_<collection_name>_recleaned``
      collection with one ``{"doc_id", "tfidf"}`` record per document, where
      ``tfidf`` maps each term present in that document to its weight;
    * writes a dict holding the sparse matrix, the fitted vectorizer and the
      document ids to ``<dataset_name>/doc/tfidf_data.joblib`` (relative to
      the current working directory).

    Args:
        dataset_name: Slash-separated dataset identifier such as
            ``"antique/train"``; used only to build the output directory.
        collection_name: Name of the source MongoDB collection.

    Returns:
        None. When no document has a non-blank string ``original_text`` a
        message is printed and nothing is written.
    """
    print(f"🚀 Processing TF-IDF from MongoDB collection: {collection_name} using new cleaning...")

    # Connect to the database. The try/finally guarantees the client is closed
    # on every path (early return, exception, or success).
    client = MongoClient("mongodb://localhost:27017/")
    try:
        db = client["ir_project"]
        collection = db[collection_name]

        # Keep only documents whose text is a non-blank string. The isinstance
        # check also skips explicit nulls and non-string values, which would
        # otherwise raise AttributeError on ``.strip()``.
        documents = [
            doc
            for doc in collection.find({}, {"_id": 0, "doc_id": 1, "original_text": 1})
            if isinstance(doc.get("original_text"), str) and doc["original_text"].strip()
        ]

        if not documents:
            print("❌ لا توجد نصوص أصلية متاحة للمعالجة.")
            return

        doc_ids = [doc["doc_id"] for doc in documents]
        processed_texts = [preprocess(doc["original_text"]) for doc in tqdm(documents)]

        # The texts were already lower-cased and cleaned by preprocess(), so the
        # vectorizer only needs to split them on whitespace.
        vectorizer = TfidfVectorizer(
            tokenizer=custom_tokenizer,
            lowercase=False,
            preprocessor=None,
            token_pattern=None,
            min_df=1
        )
        tfidf_matrix = vectorizer.fit_transform(processed_texts)

        # Store the results in MongoDB (unchanged)
        feature_names = vectorizer.get_feature_names_out()
        tfidf_data = []
        for i, doc_id in enumerate(tqdm(doc_ids)):
            row = tfidf_matrix[i]
            # row.indices / row.data are read pairwise: column index -> weight
            tfidf_scores = {
                feature_names[idx]: float(val)
                for idx, val in zip(row.indices, row.data)
            }
            tfidf_data.append({
                "doc_id": doc_id,
                "tfidf": tfidf_scores
            })

        tfidf_collection_name = f"tfidf_{collection_name}_recleaned"
        tfidf_collection = db[tfidf_collection_name]
        # Clear the output of any previous run before inserting the new records
        tfidf_collection.delete_many({})
        tfidf_collection.insert_many(tfidf_data)
        print(f"✅ TF-IDF stored in MongoDB collection: {tfidf_collection_name}")
    finally:
        client.close()

    # -------------------------- #
    # Persist with joblib
    # -------------------------- #
    save_path = os.path.join(dataset_name.replace("/", os.sep), "doc")
    os.makedirs(save_path, exist_ok=True)
    joblib_path = os.path.join(save_path, "tfidf_data.joblib")

    # Save a dict containing the matrix, the vectorizer and the document ids
    joblib.dump({
        "tfidf_matrix": tfidf_matrix,
        "vectorizer": vectorizer,
        "doc_ids": doc_ids
    }, joblib_path)

    print(f"✅ TF-IDF matrix and vectorizer saved to joblib file: {joblib_path}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Azzam\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Quora dataset (BEIR, test split)
# NOTE(review): the recorded output of this cell reports that no documents with
# a non-empty `original_text` were found - confirm the collection name and field.
process_tfidf_with_cleaning(
    dataset_name="beir/quora/test",
    collection_name="documents_quora_test",
)

🚀 Processing TF-IDF from MongoDB collection: documents_quora_test using new cleaning...
❌ لا توجد نصوص أصلية متاحة للمعالجة.


In [2]:
# Antique dataset: build TF-IDF from the `documents_test` collection; the
# joblib artifact is written under antique/train/doc
process_tfidf_with_cleaning(
    dataset_name="antique/train",
    collection_name="documents_test",
)

🚀 Processing TF-IDF from MongoDB collection: documents_test using new cleaning...


100%|██████████| 403666/403666 [06:01<00:00, 1117.29it/s]
100%|██████████| 403666/403666 [00:56<00:00, 7200.09it/s]


✅ TF-IDF stored in MongoDB collection: tfidf_documents_test_recleaned
✅ TF-IDF matrix and vectorizer saved to joblib file: antique\train\doc\tfidf_data.joblib


In [2]:
import joblib

def custom_tokenizer(text):
    """Split text into tokens on runs of whitespace.

    NOTE(review): this repeats a definition from an earlier cell, presumably
    so that joblib files containing a vectorizer that references
    ``custom_tokenizer`` can be loaded in a fresh kernel. The recorded keys of
    the file loaded in this cell show no vectorizer entry - confirm whether
    the redefinition is still needed here.

    Args:
        text: A string of space-separated tokens.

    Returns:
        A list of the whitespace-delimited tokens (empty for blank input).
    """
    tokens = text.split()
    return tokens

# Load the stored embedding file for the Quora test split (per its path).
# NOTE(review): this is an absolute, machine-specific path, and joblib.load
# unpickles the file - only load files from a trusted source.
data = joblib.load(
    r"C:\Users\Azzam\PycharmProjects\PythonProject\Data Representation\Bert\beir\quora\test\doc\bert_embedding.joblib"
)

# List the top-level entries of the loaded dictionary
print(data.keys())



dict_keys(['doc_ids', 'embeddings_matrix', 'model_name'])
