<a href="https://colab.research.google.com/github/FanmeiWang/AI-AND-MATHS/blob/main/20250331_BERTopic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
!pip install bertopic
!pip install sentence-transformers

import os
import re
import nltk
import pandas as pd
from hdbscan import HDBSCAN
from bertopic import BERTopic
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer

nltk.download('stopwords')

# === 配置项 ===
INPUT_CSV = "extracted_RS_2023-04_filtered.csv"
OUTPUT_TOPIC_INFO = "extracted_RS_2023-04_topic_info.csv"
OUTPUT_DF_WITH_TOPICS = "extracted_RS_2023-04_with_topics.csv"
TEXT_COLUMN = "selftext"
USE_STOPWORDS = True

# === 文本清洗函数 ===
def clean_text(text):
    if not isinstance(text, str):
        return ""
    if text.strip().lower() in ["[deleted]", "[removed]", ""]:
        return ""
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s\.\,\!\?\']", " ", text)
    text = text.lower()
    if USE_STOPWORDS:
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in text.split() if token not in stop_words]
        text = " ".join(tokens)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# === 主函数 ===
def main():
    print("[INFO] 读取原始数据...")
    if not os.path.exists(INPUT_CSV):
        print(f"[ERROR] 找不到输入文件: {INPUT_CSV}")
        return

    df = pd.read_csv(INPUT_CSV, encoding='utf-8', keep_default_na=False)
    if TEXT_COLUMN not in df.columns:
        print(f"[ERROR] DataFrame 不包含名为 '{TEXT_COLUMN}' 的列！")
        return

    print("[INFO] 开始清洗文本...")
    df["clean_text"] = df[TEXT_COLUMN].apply(clean_text)
    df = df[df["clean_text"].str.strip() != ""]  # 删除清洗后为空的行
    docs = df["clean_text"].tolist()

    print("[INFO] 加载嵌入模型: all-mpnet-base-v2")
    embedding_model = SentenceTransformer("all-mpnet-base-v2")

    print("[INFO] 初始化 BERTopic 模型...")
    hdbscan_model = HDBSCAN(
        min_cluster_size=5,
        min_samples=1,
        metric='euclidean',
        cluster_selection_epsilon=0.0,
        cluster_selection_method='eom'
    )
    topic_model = BERTopic(
        embedding_model=embedding_model,
        hdbscan_model=hdbscan_model,
        verbose=True,
        nr_topics=10
    )

    print("[INFO] 执行主题建模...")
    topics, probs = topic_model.fit_transform(docs)
    df["topic_id"] = topics

    # 保存结果
    topic_info = topic_model.get_topic_info()
    print("[INFO] 主题信息（前10行）：")
    print(topic_info.head(10))

    topic_info.to_csv(OUTPUT_TOPIC_INFO, index=False, encoding='utf-8')
    df.to_csv(OUTPUT_DF_WITH_TOPICS, index=False, encoding='utf-8')

    print(f"[DONE] 主题信息已保存到: {OUTPUT_TOPIC_INFO}")
    print(f"[DONE] 带主题标签的DataFrame已保存到: {OUTPUT_DF_WITH_TOPICS}")

if __name__ == "__main__":
    main()





[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[INFO] 读取原始数据...
[INFO] 开始清洗文本...
[INFO] 加载嵌入模型: all-mpnet-base-v2


2025-04-05 15:59:26,520 - BERTopic - Embedding - Transforming documents to embeddings.


[INFO] 初始化 BERTopic 模型...
[INFO] 执行主题建模...


Batches:   0%|          | 0/39 [00:00<?, ?it/s]

2025-04-05 16:05:56,868 - BERTopic - Embedding - Completed ✓
2025-04-05 16:05:56,870 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-05 16:06:02,125 - BERTopic - Dimensionality - Completed ✓
2025-04-05 16:06:02,126 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-05 16:06:02,178 - BERTopic - Cluster - Completed ✓
2025-04-05 16:06:02,179 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-05 16:06:02,343 - BERTopic - Representation - Completed ✓
2025-04-05 16:06:02,344 - BERTopic - Topic reduction - Reducing number of topics
2025-04-05 16:06:02,355 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-05 16:06:02,466 - BERTopic - Representation - Completed ✓
2025-04-05 16:06:02,471 - BERTopic - Topic reduction - Reduced number of topics from 86 to 10


[INFO] 主题信息（前10行）：
   Topic  Count                           Name  \
0     -1    233  -1_work_canada_application_pr   
1      0    351        0_canada_would_work_get   
2      1    281     1_application_pr_card_ircc   
3      2    278      2_permit_work_pgwp_letter   
4      3     23      3_medical_exam_years_year   
5      4     19       4_code_noc_data_business   
6      5     16         5_score_ielts_draw_crs   
7      6     10    6_title_enjoy_above_general   
8      7      6      7_car_import_export_goods   
9      8      6     8_bank_money_account_funds   

                                      Representation  \
0  [work, canada, application, pr, get, permit, w...   
1  [canada, would, work, get, years, visa, job, c...   
2  [application, pr, card, ircc, passport, need, ...   
3  [permit, work, pgwp, letter, extension, study,...   
4  [medical, exam, years, year, rehabilitation, t...   
5  [code, noc, data, business, management, custom...   
6  [score, ielts, draw, crs, 481, cut,

In [19]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
!pip install bertopic
!pip install sentence-transformers

import os
import re
import nltk
import pandas as pd
from hdbscan import HDBSCAN
from bertopic import BERTopic
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer

nltk.download('stopwords')

# === 配置项 ===
INPUT_CSV = "extracted_RS_2023-04_filtered.csv"
OUTPUT_TOPIC_INFO = "extracted_RS_2023-04_topic_info.csv"
OUTPUT_DF_WITH_TOPICS = "extracted_RS_2023-04_with_topics.csv"
TEXT_COLUMN = "selftext"
USE_STOPWORDS = True

# === 文本清洗函数 ===
def clean_text(text):
    if not isinstance(text, str):
        return ""
    if text.strip().lower() in ["[deleted]", "[removed]", ""]:
        return ""
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s\.\,\!\?\']", " ", text)
    text = text.lower()
    if USE_STOPWORDS:
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in text.split() if token not in stop_words]
        text = " ".join(tokens)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# === 主函数 ===
def main():
    print("[INFO] 读取原始数据...")
    if not os.path.exists(INPUT_CSV):
        print(f"[ERROR] 找不到输入文件: {INPUT_CSV}")
        return

    df = pd.read_csv(INPUT_CSV, encoding='utf-8', keep_default_na=False)
    if TEXT_COLUMN not in df.columns:
        print(f"[ERROR] DataFrame 不包含名为 '{TEXT_COLUMN}' 的列！")
        return

    print("[INFO] 开始清洗文本...")
    df["clean_text"] = df[TEXT_COLUMN].apply(clean_text)
    df = df[df["clean_text"].str.strip() != ""]  # 删除清洗后为空的行
    docs = df["clean_text"].tolist()

    print("[INFO] 加载嵌入模型: paraphrase-mpnet-base-v2")
    embedding_model = SentenceTransformer("paraphrase-mpnet-base-v2")

    print("[INFO] 初始化 BERTopic 模型...")
    hdbscan_model = HDBSCAN(
        min_cluster_size=5,
        min_samples=1,
        metric='euclidean',
        cluster_selection_epsilon=0.0,
        cluster_selection_method='eom'
    )
    topic_model = BERTopic(
        embedding_model=embedding_model,
        hdbscan_model=hdbscan_model,
        verbose=True,
        nr_topics=10
    )

    print("[INFO] 执行主题建模...")
    topics, probs = topic_model.fit_transform(docs)
    df["topic_id"] = topics

    # 保存结果
    topic_info = topic_model.get_topic_info()
    print("[INFO] 主题信息（前10行）：")
    print(topic_info.head(10))

    topic_info.to_csv(OUTPUT_TOPIC_INFO, index=False, encoding='utf-8')
    df.to_csv(OUTPUT_DF_WITH_TOPICS, index=False, encoding='utf-8')

    print(f"[DONE] 主题信息已保存到: {OUTPUT_TOPIC_INFO}")
    print(f"[DONE] 带主题标签的DataFrame已保存到: {OUTPUT_DF_WITH_TOPICS}")

if __name__ == "__main__":
    main()




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[INFO] 读取原始数据...
[INFO] 开始清洗文本...
[INFO] 加载嵌入模型: all-mpnet-base-v2


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2025-04-05 16:20:04,584 - BERTopic - Embedding - Transforming documents to embeddings.


[INFO] 初始化 BERTopic 模型...
[INFO] 执行主题建模...


Batches:   0%|          | 0/39 [00:00<?, ?it/s]

2025-04-05 16:27:07,230 - BERTopic - Embedding - Completed ✓
2025-04-05 16:27:07,231 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-05 16:27:12,669 - BERTopic - Dimensionality - Completed ✓
2025-04-05 16:27:12,670 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-05 16:27:12,747 - BERTopic - Cluster - Completed ✓
2025-04-05 16:27:12,748 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-04-05 16:27:12,909 - BERTopic - Representation - Completed ✓
2025-04-05 16:27:12,910 - BERTopic - Topic reduction - Reducing number of topics
2025-04-05 16:27:12,926 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-05 16:27:13,051 - BERTopic - Representation - Completed ✓
2025-04-05 16:27:13,055 - BERTopic - Topic reduction - Reduced number of topics from 87 to 10


[INFO] 主题信息（前10行）：
   Topic  Count                                          Name  \
0     -1    315             -1_canada_application_work_permit   
1      0    281                     0_pr_canada_passport_card   
2      1    195                    1_permit_work_canada_study   
3      2    113           2_biometrics_application_score_ircc   
4      3    111                       3_canada_get_job_degree   
5      4    107                        4_pgwp_work_apply_year   
6      5     57                   5_noc_letter_job_experience   
7      6     27         6_ceremony_oath_citizenship_scheduled   
8      7     11              7_title_neuroscience_unlucky_fyi   
9      8      6  8_rehabilitation_criminal_convictions_deemed   

                                      Representation  \
0  [canada, application, work, permit, pr, would,...   
1  [pr, canada, passport, card, application, visa...   
2  [permit, work, canada, study, visa, job, apply...   
3  [biometrics, application, score, ircc,