In [1]:
import jieba
import pandas as pd
import pyLDAvis.gensim
from gensim.models import LdaModel, CoherenceModel
from gensim import corpora
import matplotlib.pyplot as plt
from tqdm import tqdm



weibo_total_comments_path = "../result/task3/weibo_total_comments.csv"
bilibili_total_comments_path = "../result/task3/bilibili_total_comments.csv"
weibo_df = pd.read_csv(weibo_total_comments_path)
bilibili_df = pd.read_csv(bilibili_total_comments_path)

stops_path = "resources/stopword.txt"
user_dict_path = "resources/user_dict.txt"

output_path = "../result/task4"

In [2]:
def load_stops():
    stops = []
    with open(stops_path, encoding='utf-8') as fr:
        for line in fr:
            stops.append(line.strip().lower())
    return stops

def process_text(text, stopwords):
    words = jieba.cut(text)  # 使用 jieba 分词
    # 过滤停用词并去除空白词
    filtered_words = [word.strip() for word in words if word.strip() not in stopwords and len(word.strip()) > 1]
    return filtered_words


# 1. 加载停用词
STOPS = load_stops()
# 2. 加载自定义词典
jieba.load_userdict(user_dict_path)


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\s6_ti\AppData\Local\Temp\jieba.cache
Loading model cost 0.326 seconds.
Prefix dict has been built successfully.


In [3]:
def tokenize_comments(df, stopwords):
    """
    将 DataFrame 中的评论内容进行分词处理，并返回列表的列表形式。
    
    参数:
        df (pd.DataFrame): 包含文本数据的 DataFrame
        stopwords (set): 停用词集合
        text_column (str): 需要处理的列名，默认是 'content'

    返回:
        List[List[str]]: 分词后的评论列表
    """
    tokenized_docs = []
    for text in df['content'].fillna(''):  # 确保空值处理
        tokens = process_text(text, stopwords)
        if tokens:  # 排除空列表
            tokenized_docs.append(tokens)
    return tokenized_docs

weibo_tokenized = tokenize_comments(weibo_df, STOPS)

In [4]:
weibo_dictionary = corpora.Dictionary(weibo_tokenized)
weibo_corpus = [weibo_dictionary.doc2bow(text) for text in weibo_tokenized]

# 查看词袋表示
print(weibo_corpus)

[[(0, 1), (1, 1)], [(2, 1), (3, 1)], [(4, 1), (5, 1)], [(4, 27), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 2), (28, 1), (29, 1)], [(30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 2), (48, 1), (49, 1), (50, 1), (51, 1)], [(4, 1), (28, 1)], [(52, 1), (53, 1), (54, 1)], [(49, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1)], [(49, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1)], [(36, 1), (68, 1), (69, 1), (70, 1)], [(38, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 5), (82, 1), (83, 1), (84, 1), (85, 1)], [(38, 1), (71, 1), (72, 1)

In [5]:
# 设置主题数，例如 5 个主题
num_topics = 5

# 训练 LDA 模型
lda = LdaModel(weibo_corpus, num_topics=num_topics, id2word=weibo_dictionary, passes=15)

# 查看每个主题的 Top 10 词
topics = lda.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.330*"加强" + 0.044*"引星棘刺" + 0.031*"重写" + 0.028*"霍尔海雅" + 0.028*"伺夜" + 0.028*"出苍白海" + 0.019*"止颂" + 0.018*"黑键" + 0.016*"水月" + 0.015*"按闹分配"')
(1, '0.032*"干员" + 0.023*"笑哈哈" + 0.016*"强度" + 0.015*"厨子" + 0.015*"不改" + 0.013*"技能" + 0.011*"角色" + 0.010*"修改" + 0.010*"剧情" + 0.009*"模组"')
(2, '0.143*"重做" + 0.142*"弑君者" + 0.090*"微笑" + 0.076*"模组" + 0.050*"设计师" + 0.025*"星棘" + 0.020*"道歉" + 0.017*"明日方舟" + 0.012*"对待" + 0.011*"水陈"')
(3, '0.028*"dl" + 0.021*"我推" + 0.020*"语音" + 0.017*"超大杯" + 0.016*"对不起" + 0.015*"3.6" + 0.013*"爱姐" + 0.012*"鹰角网络" + 0.012*"柳德米拉" + 0.012*"凯尔希"')
(4, '0.083*"地铁" + 0.075*"拉普兰德" + 0.049*"宣传" + 0.038*"天赋" + 0.034*"疑问" + 0.034*"大屏" + 0.027*"跪下" + 0.023*"开心" + 0.019*"高清" + 0.019*"二皮"')


In [6]:
pyLDAvis.enable_notebook()

# 可视化 LDA 模型
vis = pyLDAvis.gensim.prepare(lda, weibo_corpus, weibo_dictionary)

# 保存成 HTML 文件
pyLDAvis.save_html(vis, f'{output_path}/weibo_lda_visualization.html')

In [7]:
def evaluate_lda_coherence(texts, dictionary, start=3, end=15, step=1, passes=10, plot=True):
    """
    尝试不同的 num_topics，计算 Coherence 得分，帮助选择最佳主题数。
    
    参数:
        texts: 分词后的文本列表（List[List[str]]）
        dictionary: gensim 生成的 Dictionary 对象
        start: 最小主题数（默认 3）
        end: 最大主题数（默认 15）
        step: 步长（默认 1）
        passes: LDA 模型训练的 passes 参数
        plot: 是否绘图（默认 True）
        
    返回:
        models: 所有 LDA 模型的列表
        coherence_scores: 对应的一致性得分列表
    """
    corpus = [dictionary.doc2bow(text) for text in texts]
    models = []
    coherence_scores = []
    topic_range = range(start, end + 1, step)

    print("正在尝试不同的主题数...")
    for num_topics in tqdm(topic_range):
        lda_model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            passes=passes,
            random_state=42
        )
        coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        score = coherence_model.get_coherence()
        models.append(lda_model)
        coherence_scores.append(score)

    if plot:
        plt.figure(figsize=(8, 5))
        plt.plot(topic_range, coherence_scores, marker='o')
        plt.title('主题数 vs 一致性得分 (Coherence)')
        plt.xlabel('主题数 (num_topics)')
        plt.ylabel('Coherence Score (c_v)')
        plt.xticks(topic_range)
        plt.grid(True, linestyle='--', alpha=0.6)
        plt.show()

    return models, coherence_scores

In [8]:
weibo_models, weibo_scores = evaluate_lda_coherence(
    texts=weibo_tokenized,           # 你的分词数据
    dictionary=weibo_dictionary,           # gensim Dictionary
    start=3,
    end=12,
    step=1,
    passes=10
)

正在尝试不同的主题数...


  0%|          | 0/10 [00:20<?, ?it/s]


KeyboardInterrupt: 

In [None]:
for i in range(len(weibo_models)):
    # 可视化 LDA 模型
    vis = pyLDAvis.gensim.prepare(weibo_models[i], weibo_corpus, weibo_dictionary)
    
    # 保存成 HTML 文件
    pyLDAvis.save_html(vis, f'{output_path}/weibo_lda_visualization_{i}.html')