# TASK 3 网络舆情热度口碑分析
## 一、数据预处理
### 1.1 相关模块导入 与 全局设置

In [1]:

import datetime
import os
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud

plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
weibo_comments_path = "../comments/weibo"
bilibili_comments_path = "../comments/bilibili"
stops_path = "../resources/stopword.txt"
user_dict_path = "../resources/user_dict.txt"

### 1.2 读取数据

In [2]:
def read_comments_from_path(_path):
    data_frames = []
    for root, dirs, files in os.walk(_path):
        for file in files:
            # 检查文件是否为 CSV 文件
            if file.endswith('.csv'):
                # 构建文件的完整路径
                file_path = os.path.join(root, file)
                try:
                    # 读取 CSV 文件并添加到数据框列表中
                    df = pd.read_csv(file_path)
                    data_frames.append(df)
                except Exception as e:
                    print(f"读取 {file_path} 时出错: {e}")
    df = pd.concat(data_frames, ignore_index=True)
    df.columns = ['ID', 'nickname', 'gender', 'level', 'followers', 'address', 'content', 'likes', 'time', 'hierarchy',
                  'rootID']
    return df


weibo_df = read_comments_from_path(weibo_comments_path)
bilibili_df = read_comments_from_path(bilibili_comments_path)
# B站获取的时间需要校准（UTC → 北京时间，再去除时区信息）
bilibili_df['time'] = pd.to_datetime(bilibili_df['time'], utc=True).dt.tz_convert('Asia/Shanghai').dt.tz_localize(None)

### 1.3 分别显示微博评论和B站评论的前几项

In [None]:
weibo_df.head()

In [None]:
bilibili_df.head()

### 1.4 筛选3月13日之前的评论

In [3]:
def filter_by_time(df):
    df['time'] = pd.to_datetime(df['time'])
    num1 = df.size
    df_filter = df[df['time'] < pd.to_datetime('2025-03-13')]
    num2 = df_filter.size
    print(f"删除的{num1 - num2}条记录")
    return df_filter


weibo_df = filter_by_time(weibo_df)
bilibili_df = filter_by_time(bilibili_df)

删除的2266条记录
删除的6897条记录


## 二、绘制密度图

In [None]:
def draw_density_plot(df: pd.DataFrame, platform=''):
    df['time'] = pd.to_datetime(df['time'])
    df['hour'] = df['time'].dt.hour
    # 绘制核密度图
    sns.kdeplot(x=df['hour'], linewidth=2)
    # 设置x轴和y轴的范围
    plt.xlim(0, 23)
    plt.ylim(0, None)
    # 修改横轴刻度显示，每间隔1个小时写一个数字，从0开始，到24
    plt.xticks(np.arange(0, 24))
    plt.xlabel('时间')
    plt.ylabel('评论数密度')
    plt.title(f'{platform}评论的时间密度函数图')
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)
    plt.savefig(fname=f"../result/task3/{platform}_kde_plot.png", bbox_inches='tight')
    plt.show()


In [None]:
draw_density_plot(weibo_df, platform='微博')

In [None]:
draw_density_plot(bilibili_df, platform='B站')

## 三、绘制柱状图

In [None]:
def draw_bar_chart(df: pd.DataFrame, platform=''):
    df['time'] = pd.to_datetime(df['time'])
    # 按天统计评论数
    df['date'] = df['time'].dt.date
    daily_data = df.groupby('date').size()
    # 绘制柱状图
    plt.figure(figsize=(12, 6))
    bars = plt.bar(daily_data.index, daily_data.values)
    # 在每根柱子上标注数值
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2, height, f'{height}',
                 ha='center', va='bottom')
    plt.title(f'{platform}评论数量的按日期分布')
    plt.xlabel('日期')
    plt.ylabel('评论数量')
    plt.savefig(fname=f"../result/task3/{platform}_bar_chart.png", bbox_inches='tight')
    plt.show()

In [None]:
draw_bar_chart(weibo_df, platform='微博')

In [None]:
draw_bar_chart(bilibili_df, platform='B站')

## 四、绘制词云图

In [4]:
def load_stops():
    stops = []
    with open(stops_path, encoding='utf-8') as fr:
        for line in fr:
            stops.append(line.strip().lower())
    return stops


def process_text(text, stopwords):
    words = jieba.cut(text)  # 使用 jieba 分词
    # 过滤停用词并去除空白词
    filtered_words = [word.strip() for word in words if word.strip() not in stopwords and len(word.strip()) > 1]
    return filtered_words


def draw_total_wordcloud(df: pd.DataFrame, stops=None, platform=''):
    df['time'] = pd.to_datetime(df['time'])
    # 按天统计评论数
    df['date'] = df['time'].dt.date
    # 过滤日期范围
    start_date_str = df['date'].min()
    end_date_str = df['date'].max()
    start_date = pd.to_datetime(start_date_str)
    end_date = pd.to_datetime(end_date_str)

    mask = (df['time'] >= start_date) & (df['time'] <= end_date)
    df_period = df[mask]
    total_text = ' '.join(df_period['content'].dropna())
    # 分词并去停用词
    total_words = process_text(total_text, stops)
    total_counts = Counter(total_words)
    # 生成总体词云图
    wordcloud_total = WordCloud(
        font_path='msyh.ttc',
        width=1200,
        height=600,
        background_color='white',
        colormap='Set2',
        max_words=400,
        min_font_size=10
    ).generate_from_frequencies(total_counts)
    # 保存总体词云图
    wordcloud_total.to_file(f'../result/task3/{platform}_total_wordcloud.png')
    # 显示词云图
    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud_total, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"{platform}总体词云图")
    plt.show()


def draw_daily_wordcloud(df: pd.DataFrame, stops=None, platform=''):
    df['time'] = pd.to_datetime(df['time'])
    # 按天统计评论数
    df['date'] = df['time'].dt.date
    daily_words = []
    daily_counts = []
    # 过滤日期范围
    start_date_str = df['date'].min()
    end_date_str = df['date'].max()
    start_date = pd.to_datetime(start_date_str)
    end_date = pd.to_datetime(end_date_str)

    day = start_date

    while day <= end_date:
        next_day = day + datetime.timedelta(days=1)
        day_text = ' '.join(df[(df['time'] >= day) & (df['time'] < next_day)]['content'].dropna())
        if day_text.strip():
            day_words = process_text(day_text, stops)
            day_counts = Counter(day_words)
            daily_words.append(day_words)
            daily_counts.append(day_counts)
        day += datetime.timedelta(days=1)
        
    # 创建一个新的图形（9个小图）
    fig, axes = plt.subplots(3, 3, figsize=(16, 12))
    fig.subplots_adjust(hspace=0.3, wspace=0.3)

    for idx in range(9):
        day_str = start_date + pd.to_timedelta(idx, unit='d')
        wc_day = WordCloud(
            font_path='msyh.ttc',
            width=400,
            height=300,
            background_color='white',
            colormap='Set2',
            max_words=200
        ).generate_from_frequencies(daily_counts[idx])
        row, col = divmod(idx, 3)
        axes[row, col].imshow(wc_day, interpolation='bilinear')
        axes[row, col].set_title(f"{day_str}", fontsize=10)
        axes[row, col].axis('off')
        # 保存每日词云图
    plt.tight_layout()
    plt.savefig(f"../result/task3/{platform}_daily_wordcloud.png", dpi=300, bbox_inches='tight')
    plt.show()


# 1. 加载停用词
STOPS = load_stops()
# 2. 加载自定义词典
jieba.load_userdict(user_dict_path)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\s6_ti\AppData\Local\Temp\jieba.cache
Loading model cost 0.452 seconds.
Prefix dict has been built successfully.


In [6]:
# weibo_words, weibo_counts = get_total_words(weibo_df, stops=STOPS)
# bilibili_words, bilibili_counts = get_total_words(bilibili_df, stops=STOPS)

In [None]:
draw_total_wordcloud(weibo_df, stops=STOPS, platform='微博')

In [None]:
draw_daily_wordcloud(weibo_df, stops=STOPS, platform='微博')

In [None]:
draw_total_wordcloud(bilibili_df, stops=STOPS, platform='B站')

In [None]:
draw_daily_wordcloud(bilibili_df, stops=STOPS, platform='B站')

In [8]:
def tokenize_comments(df, stopwords):
    """
    将 DataFrame 中的评论内容进行分词处理，并返回列表的列表形式。
    
    参数:
        df (pd.DataFrame): 包含文本数据的 DataFrame
        stopwords (set): 停用词集合
        text_column (str): 需要处理的列名，默认是 'content'

    返回:
        List[List[str]]: 分词后的评论列表
    """
    tokenized_docs = []
    for text in df['content'].fillna(''):  # 确保空值处理
        tokens = process_text(text, stopwords)
        if tokens:  # 排除空列表
            tokenized_docs.append(tokens)
    return tokenized_docs

weibo_tokenized = tokenize_comments(weibo_df, STOPS)


In [9]:
from gensim import corpora


weibo_dictionary = corpora.Dictionary(weibo_tokenized)
weibo_corpus = [weibo_dictionary.doc2bow(text) for text in weibo_tokenized]

# 查看词袋表示
print(weibo_corpus)

[[(0, 1), (1, 1)], [(2, 1), (3, 1)], [(4, 1)], [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 2), (28, 1), (29, 1)], [(30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 2), (43, 1), (44, 1), (45, 1), (46, 1)], [(28, 1)], [(47, 1), (48, 1)], [(44, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1)], [(44, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1)], [(34, 1), (59, 1), (60, 1), (61, 1)], [(62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 5), (73, 1), (74, 1), (75, 1)], [(62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 5), (73, 1), (74, 1), (75, 1)], [(35, 1), (72, 2), (76, 1), (77, 1), (78,

In [10]:
from gensim.models import LdaModel

# 设置主题数，例如 5 个主题
num_topics = 5

# 训练 LDA 模型
lda = LdaModel(weibo_corpus, num_topics=num_topics, id2word=weibo_dictionary, passes=15)

# 查看每个主题的 Top 10 词
topics = lda.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.030*"明日方舟" + 0.026*"干员" + 0.024*"强度" + 0.022*"跪下" + 0.016*"厨子" + 0.015*"设计师" + 0.014*"yj" + 0.014*"yjwl" + 0.013*"角色" + 0.011*"内部"')
(1, '0.033*"dl" + 0.026*"高清" + 0.022*"语音" + 0.021*"给力" + 0.021*"3.6" + 0.019*"陈晖洁" + 0.019*"干员" + 0.016*"设计师" + 0.014*"柳德米拉" + 0.014*"凯尔希"')
(2, '0.033*"大屏" + 0.031*"疑问" + 0.028*"天赋" + 0.026*"止颂" + 0.025*"黑键" + 0.022*"不改" + 0.021*"水月" + 0.021*"隐德来希" + 0.019*"拉普兰德" + 0.018*"技能"')
(3, '0.138*"弑君者" + 0.134*"重做" + 0.083*"地铁" + 0.081*"微笑" + 0.072*"拉普兰德" + 0.049*"宣传" + 0.026*"设计师" + 0.023*"开心" + 0.022*"分配" + 0.020*"按闹"')
(4, '0.103*"模组" + 0.067*"引星棘刺" + 0.046*"重写" + 0.042*"出苍白海" + 0.039*"霍尔海雅" + 0.039*"笑哈哈" + 0.032*"伺夜" + 0.030*"道歉" + 0.024*"剧情" + 0.023*"棘刺"')


In [12]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

# 可视化 LDA 模型
vis = pyLDAvis.gensim.prepare(lda, weibo_corpus, weibo_dictionary)
pyLDAvis.display(vis)

In [None]:


# 转换时间格式
# df['time'] = pd.to_datetime(df['time'])

# 选择热度高的时间段

# start = '2025-03-04'
# start = '2025-03-05'
# start = '2025-03-06'
# start = '2025-03-07'
# start = '2025-03-08'
# start = '2025-03-09'

# end = '2025-03-05'
# end = '2025-03-06'
# end = '2025-03-07'
# end = '2025-03-08'
# end = '2025-03-09'
# end = '2025-03-10'
# end = '2025-03-14'

# start_date = pd.to_datetime(start)
# end_date = pd.to_datetime(end)
# 
# # df_select = df[(df['time'] >= start_date) & (df['time'] <= end_date)]
# df_select = df[(df['time'] >= start_date) & (df['time'] < end_date)]
# 
# df_select.to_csv(path_or_buf=f"{start}_{end}.csv")

In [None]:




# # 3. 拼接所有评论内容为一个大字符串
# text = ' '.join(df_select['content'])
# # 4. 分词并过滤停用词
# filtered_words = process_text(text, stops)
# # 5. 获取词频统计
# word_counts = Counter(filtered_words)
# # 6. 序列化词频统计
# write_to_csv(word_counts, start, end)
# print(f"词频数据已保存到 word_counts_{start}_{end}.csv 文件")


In [None]:
# # 生成词云
# wordcloud = WordCloud(
#     max_words=400,
#     font_path='msyh.ttc',  # 改成你本机有的中文字体
#     width=1200,
#     height=800,
#     background_color='white',  # 背景颜色：可改为 'black' / 'white' / 'pink' / 'skyblue'
#     colormap='Set2',  # 颜色主题，见下方介绍
#     min_font_size=10
#     # scale=2                        # 放大图像质量
#     # contour_width=1,      # 边框宽度
#     # contour_color='steelblue'  # 边框颜色
# ).generate_from_frequencies(word_counts)

# # 显示词云
# plt.figure(figsize=(10, 6))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.savefig(fname=f"{start}_{end}", bbox_inches='tight')

In [None]:
# # 将df中的数据，按照用户名的出现次数，保存到user_counts.csv
# df['nickname'] = df['nickname'].astype(str)
# user_counts = Counter(df['nickname'])
# with open('user_counts.csv', mode='w', newline='', encoding='utf-8') as file:
#     writer = csv.writer(file)
#     # 写入标题行
#     writer.writerow(["nickname", "Count"])
#     # 写入词汇和词频
#     for nickname, count in user_counts.items():
#         writer.writerow([nickname, count])