In [1]:
import os
import json
import joblib
import numpy as np
from tqdm import tqdm
from bertopic import BERTopic

# ============================
# 🛠️ الإعدادات
# ============================

datasets = {
    "antique_train": r"C:\Users\Azzam\PycharmProjects\PythonProject\Data Pre-Processing\antique\train\doc\docs.json",
    "beir_quora_test": r"C:\Users\Azzam\PycharmProjects\PythonProject\Data Pre-Processing\beir\quora\test\doc\docs.json"
}

embeddings_base_dir = r"C:\Users\Azzam\PycharmProjects\PythonProject\Data Representation\Bert"
output_base_dir = r"C:\Users\Azzam\PycharmProjects\PythonProject\TopicResults"
os.makedirs(output_base_dir, exist_ok=True)

for dataset_name, json_path in datasets.items():
    if not os.path.exists(json_path):
        print(f"❌ لا يوجد ملف docs.json في {dataset_name} - سيتم تخطيه.")
        continue

    print(f"\n📂 معالجة الداتا: {dataset_name}")
    with open(json_path, "r", encoding="utf-8") as f:
        docs_list = json.load(f)

    docs, doc_ids = [], []
    for doc in tqdm(docs_list, desc="📄 تحميل الوثائق"):
        if "clean_text" in doc and doc["clean_text"].strip():
            docs.append(doc["clean_text"])
            doc_ids.append(doc["doc_id"])

    print(f"✅ Loaded {len(docs)} documents.")

    embedding_path = os.path.join(
        embeddings_base_dir,
        dataset_name.replace("_", os.sep),
        "doc",
        "bert_embedding.joblib"
    )

    if not os.path.exists(embedding_path):
        print(f"❌ لم يتم العثور على ملف التضمينات في {embedding_path}")
        continue

    embedding_data = joblib.load(embedding_path)
    if embedding_data["doc_ids"] != doc_ids:
        print(f"❌ ترتيب doc_ids لا يتطابق في {dataset_name}")
        continue

    embeddings = np.array(embedding_data["embeddings_matrix"])

    print("🧠 Fitting BERTopic model...")
    topic_model = BERTopic(language="english", verbose=True)
    topics, probs = topic_model.fit_transform(docs, embeddings)

    # ============================
    # 🔍 إنشاء تمثيل لكل توبيك
    # ============================

    topic_embeddings = {}
    for topic_id in set(topics):
        indices = [i for i, t in enumerate(topics) if t == topic_id]
        topic_embeds = embeddings[indices]
        topic_embeddings[topic_id] = topic_embeds.mean(axis=0)

    # 🧪 حفظ كل شيء
    output_path = os.path.join(output_base_dir, f"{dataset_name}_bertopic_results.joblib")
    joblib.dump({
        "topics": topics,
        "probs": probs,
        "doc_ids": doc_ids,
        "model": topic_model,
        "topic_embeddings": topic_embeddings  # ⬅️ التمثيلات الجديدة
    }, output_path)

    print(f"💾 Results saved to: {output_path}")

    try:
        print("📊 Visualizing topics...")
        fig = topic_model.visualize_topics()
        html_path = os.path.join(output_base_dir, f"{dataset_name}_topics_visualization.html")
        fig.write_html(html_path)
        print(f"🌐 Visualization saved as HTML: {html_path}")
    except Exception as e:
        print(f"⚠️ Visualization failed: {e}")

    print("\n🔎 First 10 document-topic pairs:")
    for i in range(min(10, len(doc_ids))):
        print(f"Doc ID: {doc_ids[i]}  --> Topic: {topics[i]}")


  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Azzam\PycharmProjects\PythonProject\.venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\Azzam\PycharmProjects\PythonProject\.venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\Azzam\PycharmProjects\PythonProject\.venv\Lib\s


📂 معالجة الداتا: antique_train


📄 تحميل الوثائق: 100%|██████████| 403666/403666 [00:00<00:00, 1446849.86it/s]


✅ Loaded 401768 documents.
🧠 Fitting BERTopic model...


2025-07-03 17:25:55,563 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-03 17:33:46,655 - BERTopic - Dimensionality - Completed ✓
2025-07-03 17:33:46,682 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-03 17:35:08,923 - BERTopic - Cluster - Completed ✓
2025-07-03 17:35:09,062 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-03 17:35:27,477 - BERTopic - Representation - Completed ✓


💾 Results saved to: C:\Users\Azzam\PycharmProjects\PythonProject\TopicResults\antique_train_bertopic_results.joblib
📊 Visualizing topics...
🌐 Visualization saved as HTML: C:\Users\Azzam\PycharmProjects\PythonProject\TopicResults\antique_train_topics_visualization.html

🔎 First 10 document-topic pairs:
Doc ID: 2020338_0  --> Topic: 1381
Doc ID: 2020338_1  --> Topic: 191
Doc ID: 2020338_2  --> Topic: -1
Doc ID: 2020338_3  --> Topic: -1
Doc ID: 2874684_0  --> Topic: 1171
Doc ID: 2874684_1  --> Topic: 1317
Doc ID: 4193114_0  --> Topic: -1
Doc ID: 4193114_1  --> Topic: -1
Doc ID: 1908421_0  --> Topic: 1572
Doc ID: 1908421_1  --> Topic: 1572

📂 معالجة الداتا: beir_quora_test


📄 تحميل الوثائق: 100%|██████████| 522931/522931 [00:00<00:00, 1778696.16it/s]


✅ Loaded 522719 documents.
❌ ترتيب doc_ids في ملف التضمين لا يطابق ملف الوثائق في beir_quora_test


In [1]:

import os
import json
import joblib
import numpy as np
from tqdm import tqdm
from bertopic import BERTopic

# ============================
# 🛠️ المسارات
# ============================

json_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Data Pre-Processing\beir\quora\test\doc\docs.json"
embedding_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\Data Representation\Bert\beir\quora\test\doc\bert_embedding.joblib"
output_dir = r"C:\Users\Azzam\PycharmProjects\PythonProject\TopicResults"
os.makedirs(output_dir, exist_ok=True)

# ============================
# 📄 تحميل الوثائق
# ============================

with open(json_path, "r", encoding="utf-8") as f:
    docs_list = json.load(f)

docs = []
doc_ids = []
for doc in tqdm(docs_list, desc="📄 تحميل الوثائق"):
    if "clean_text" in doc and doc["clean_text"].strip():
        docs.append(doc["clean_text"])
        doc_ids.append(doc["doc_id"])

print(f"✅ Loaded {len(docs)} documents.")

# ============================
# 🔌 تحميل التضمينات
# ============================

embedding_data = joblib.load(embedding_path)
embedding_doc_ids = embedding_data["doc_ids"]
embedding_matrix = embedding_data["embeddings_matrix"]

print("🔁 إعادة ترتيب التضمينات حسب ترتيب docs.json...")

# 🧠 إنشاء قاموس ID → embedding
id_to_embedding = {doc_id: emb for doc_id, emb in zip(embedding_doc_ids, embedding_matrix)}

# 🧩 إعادة ترتيب التضمينات
reordered_embeddings = []
new_docs = []
new_doc_ids = []
for i, doc_id in enumerate(doc_ids):
    if doc_id in id_to_embedding:
        reordered_embeddings.append(id_to_embedding[doc_id])
        new_docs.append(docs[i])
        new_doc_ids.append(doc_id)

docs = new_docs
doc_ids = new_doc_ids
reordered_embeddings = np.array(reordered_embeddings)

print(f"🔢 عدد التضمينات بعد الترتيب: {reordered_embeddings.shape[0]}")

# ============================
# 🧠 BERTopic
# ============================

print("🧠 Fitting BERTopic model...")
topic_model = BERTopic(language="english", verbose=True)
topics, probs = topic_model.fit_transform(docs, reordered_embeddings)

# 🔧 تقليص عدد التوبيكات
print("🔧 Reducing topics to 4500...")
topic_model.reduce_topics(docs, nr_topics=4500)

# ============================
# 🎯 توليد تمثيل لكل توبيك
# ============================

print("🧠 Generating topic embeddings...")
topic_embeddings = {}
for topic_id in set(topics):
    indices = [i for i, t in enumerate(topics) if t == topic_id]
    topic_vector = reordered_embeddings[indices].mean(axis=0)
    topic_embeddings[topic_id] = topic_vector

# ============================
# 💾 حفظ النتائج
# ============================

output_path = os.path.join(output_dir, "beir_quora_test_bertopic_results.joblib")
joblib.dump({
    "topics": topics,
    "probs": probs,
    "doc_ids": doc_ids,
    "model": topic_model,
    "topic_embeddings": topic_embeddings
}, output_path)

print(f"💾 Results saved to: {output_path}")

# ============================
# 🌐 حفظ التصوّر
# ============================

try:
    print("📊 Visualizing topics...")
    fig = topic_model.visualize_topics()
    html_path = os.path.join(output_dir, "beir_quora_test_topics_visualization.html")
    fig.write_html(html_path)
    print(f"🌐 Visualization saved as HTML.")
except Exception as e:
    print(f"⚠️ فشل في حفظ التصوّر: {e}")



print("\n🔎 First 10 document-topic pairs:")
for i in range(min(10, len(doc_ids))):
    print(f"Doc ID: {doc_ids[i]}  --> Topic: {topics[i]}")

  from .autonotebook import tqdm as notebook_tqdm
📄 تحميل الوثائق: 100%|██████████| 522931/522931 [00:00<00:00, 794727.97it/s] 


✅ Loaded 522719 documents.
🔁 إعادة ترتيب التضمينات حسب ترتيب docs.json...


2025-07-06 21:13:31,121 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


🔢 عدد التضمينات بعد الترتيب: 522719
🧠 Fitting BERTopic model...


2025-07-06 21:30:27,779 - BERTopic - Dimensionality - Completed ✓
2025-07-06 21:30:27,864 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-06 21:35:06,442 - BERTopic - Cluster - Completed ✓
2025-07-06 21:35:06,802 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-06 21:35:43,910 - BERTopic - Representation - Completed ✓


🔧 Reducing topics to 4500...


2025-07-06 21:36:59,162 - BERTopic - Topic reduction - Reducing number of topics
2025-07-06 21:37:07,121 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-06 21:37:48,333 - BERTopic - Representation - Completed ✓
2025-07-06 21:37:48,567 - BERTopic - Topic reduction - Reduced number of topics from 8926 to 4500


🧠 Generating topic embeddings...
💾 Results saved to: C:\Users\Azzam\PycharmProjects\PythonProject\TopicResults\beir_quora_test_bertopic_results.joblib
📊 Visualizing topics...
🌐 Visualization saved as HTML.

🔎 First 10 document-topic pairs:
Doc ID: 1  --> Topic: 384
Doc ID: 2  --> Topic: 2454
Doc ID: 3  --> Topic: -1
Doc ID: 4  --> Topic: -1
Doc ID: 5  --> Topic: 462
Doc ID: 6  --> Topic: -1
Doc ID: 7  --> Topic: 1002
Doc ID: 8  --> Topic: -1
Doc ID: 9  --> Topic: -1
Doc ID: 10  --> Topic: -1


In [2]:
import joblib
import numpy as np
from sklearn.cluster import KMeans
from tqdm import tqdm
import os

# ============================
# 🛠️ إعداد المسارات
# ============================
input_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\TopicResults\beir_quora_test_bertopic_results.joblib"
output_path = r"C:\Users\Azzam\PycharmProjects\PythonProject\TopicResults\merged_topics_1500.joblib"
TARGET_NUM_TOPICS = 5

# ============================
# 📦 تحميل التوبيكات
# ============================
data = joblib.load(input_path)

original_topics = data["topics"]  # قائمة: موضوع لكل وثيقة
doc_ids = data["doc_ids"]
probs = data["probs"]
topic_embeddings = data["topic_embeddings"]  # dict
model = data.get("model", None)

topic_ids = list(topic_embeddings.keys())
topic_vectors = np.vstack([topic_embeddings[tid] for tid in topic_ids])

# ============================
# 🤝 دمج التوبيكات عبر KMeans
# ============================
print(f"📉 Clustering {len(topic_ids)} topics → {TARGET_NUM_TOPICS} clusters...")
kmeans = KMeans(n_clusters=TARGET_NUM_TOPICS, random_state=42)
new_topic_assignments = kmeans.fit_predict(topic_vectors)

# mapping: old_topic_id → new_topic_id
old_to_new_topic = {old: int(new) for old, new in zip(topic_ids, new_topic_assignments)}

# ============================
# 🔁 تعديل التوبيك لكل وثيقة
# ============================
new_topics = [old_to_new_topic.get(tid, -1) for tid in original_topics]

# ============================
# 🧠 حساب التمثيل الجديد لكل توبيك
# ============================
print("📐 Recomputing merged topic embeddings...")
new_topic_embeddings = {}
for new_topic_id in range(TARGET_NUM_TOPICS):
    indices = [i for i, t in enumerate(new_topics) if t == new_topic_id]
    if indices:
        emb_matrix = np.array([topic_vectors[topic_ids.index(original_topics[i])] for i in indices])
        new_topic_embeddings[new_topic_id] = emb_matrix.mean(axis=0)

# ============================
# 💾 حفظ النتائج
# ============================
joblib.dump({
    "topics": new_topics,
    "probs": probs,
    "doc_ids": doc_ids,
    "model": model,
    "topic_embeddings": new_topic_embeddings
}, output_path)

print(f"✅ Merged topics saved to: {output_path}")


📉 Clustering 8926 topics → 5 clusters...
📐 Recomputing merged topic embeddings...
✅ Merged topics saved to: C:\Users\Azzam\PycharmProjects\PythonProject\TopicResults\merged_topics_1500.joblib


In [1]:
import bertopic
print(bertopic.__version__)


  from .autonotebook import tqdm as notebook_tqdm


0.17.0
