In [40]:
# -*- coding: utf-8 -*-
"""
按 BV 抓取“全量弹幕”（segment_index 递增到空），
将每个分P保存为 CSV（content/video_title/timestamp），
并在 bili_danmu_results/ 目录下再生成合并 CSV/TXT。

用法：
python crawl_bili_danmu.py
粘贴 BV 或 URL，Cookie 可留空。
"""

import os, re, time, csv, datetime, sys
import requests
from requests.adapters import HTTPAdapter, Retry

# ---------- Constants ----------
# Base HTTP headers copied into every session created by build_session().
HEADERS_BASE = {
    "origin": "https://www.bilibili.com",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
}
# Extracts danmaku text from a raw seg.so payload: the shortest byte run
# between a ':' and the next '@' (re.S lets '.' span newlines).
DM_RE = re.compile(rb':(.*?)@', re.S)
# Matches a BV id: "BV" followed by at least 10 ASCII alphanumerics.
BV_RE = re.compile(r'(BV[0-9A-Za-z]{10,})')

# Output directory under the current working directory.
# NOTE: the directory is created as an import-time side effect.
RAW_DIR = os.path.join(os.getcwd(), "bili_danmu_results")
os.makedirs(RAW_DIR, exist_ok=True)

# ---------- 工具 ----------
def build_session(cookie=None, referer=None):
    """Create a requests session with default headers and HTTPS retries.

    Args:
        cookie: Optional raw Cookie header value; omitted when falsy.
        referer: Optional Referer header value; omitted when falsy.

    Returns:
        A ``requests.Session`` carrying HEADERS_BASE plus the optional
        headers, whose ``https://`` adapter retries up to 3 times
        (backoff factor 0.6) on status 412/429/500/502/503/504.
    """
    optional = {"cookie": cookie, "referer": referer}
    merged = {**HEADERS_BASE, **{key: val for key, val in optional.items() if val}}

    retry_policy = Retry(
        total=3,
        backoff_factor=0.6,
        status_forcelist=[412, 429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry_policy, pool_connections=16, pool_maxsize=16)

    session = requests.Session()
    session.mount("https://", adapter)
    session.headers.update(merged)
    return session

def extract_bvid(text: str) -> str:
    """Return the first BV id found in *text* (a bare id or a full URL).

    Raises:
        ValueError: if *text* contains nothing matching BV_RE.
    """
    found = BV_RE.search(text)
    if found is not None:
        return found.group(1)
    raise ValueError("未找到 BV 号，请检查输入。")

def get_pagelist_by_bvid(session, bvid):
    """Fetch the list of parts (pages) of a video from the pagelist API.

    Args:
        session: Object exposing ``get(url, timeout=...)`` (a requests session).
        bvid: The BV id of the video.

    Returns:
        The non-empty ``data`` list of the JSON response. Per the original
        author's note each entry looks like ``{cid, page, part, duration}``.

    Raises:
        RuntimeError: if the API ``code`` is not 0 or ``data`` is empty.
    """
    endpoint = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}"
    response = session.get(endpoint, timeout=10)
    response.raise_for_status()
    payload = response.json()

    # A missing "code" is treated as a failure (defaults to -1).
    if payload.get("code", -1) != 0:
        raise RuntimeError(f"pagelist 接口失败：code={payload.get('code')} msg={payload.get('message')}")

    page_entries = payload["data"] or []
    if page_entries:
        return page_entries
    raise RuntimeError("pagelist 返回空，可能 BV 无效或需要登录。")

def fetch_seg_bytes(session, cid, seg_idx):
    """Download one danmaku segment and return the raw response body.

    Args:
        session: Object exposing ``get(url, timeout=...)``.
        cid: Value sent as the ``oid`` query parameter.
        seg_idx: Value sent as the ``segment_index`` query parameter.

    Raises whatever ``raise_for_status()`` raises on an HTTP error status.
    """
    response = session.get(
        f"https://api.bilibili.com/x/v2/dm/web/seg.so?type=1&oid={cid}&segment_index={seg_idx}",
        timeout=10,
    )
    response.raise_for_status()
    return response.content

def parse_danmu_from_seg(seg_bytes):
    """Extract danmaku texts from a raw segment payload.

    Only the text captured by DM_RE is returned; richer fields would need a
    real protobuf decode. Bytes that are not valid UTF-8 are dropped.
    """
    texts = []
    for chunk in DM_RE.findall(seg_bytes):
        texts.append(chunk.decode('utf-8', errors='ignore'))
    return texts

# ---------- I/O ----------
def save_csv(rows, path):
    """Write *rows* to *path* as a UTF-8 (with BOM) CSV.

    Args:
        rows: Iterable of dicts with keys content, video_title, timestamp.
        path: Destination file path; overwritten if it exists.
    """
    columns = ["content", "video_title", "timestamp"]
    with open(path, "w", encoding="utf-8-sig", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)

def save_txt(lines, path):
    """Write each entry of *lines*, stripped of surrounding whitespace, as one UTF-8 line."""
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(entry.strip() + "\n" for entry in lines)

# ---------- 主流程 ----------
def crawl_all_by_cid(session, cid, video_title, sleep_s=0.05):
    """Crawl danmaku segments of one part until two consecutive segments are empty.

    Args:
        session: HTTP session handed to fetch_seg_bytes.
        cid: Id of the part to crawl.
        video_title: Label stored in every returned row.
        sleep_s: Pause between segment requests, in seconds.

    Returns:
        A list of dicts with keys content, video_title and timestamp
        (timestamp is always an empty placeholder: it is not parsed here).

    An HTTP error on a segment is retried once after one second; a second
    failure, or any non-HTTP error, propagates to the caller.
    """
    collected = []
    segment_index = 1
    consecutive_empty = 0
    while True:
        try:
            payload = fetch_seg_bytes(session, cid, segment_index)
        except requests.HTTPError:
            time.sleep(1.0)
            payload = fetch_seg_bytes(session, cid, segment_index)

        texts = parse_danmu_from_seg(payload)
        if texts:
            consecutive_empty = 0
            collected.extend(
                {
                    "content": text,
                    "video_title": video_title,
                    "timestamp": "",  # placeholder: seg.so timestamps are not decoded
                }
                for text in texts
            )
        else:
            consecutive_empty += 1
            if consecutive_empty >= 2:
                break

        segment_index += 1
        time.sleep(sleep_s)
    return collected

def main():
    """Interactive entry point: crawl every part of one video and save the results.

    Prompts for a BV id / URL and an optional cookie, crawls each part listed
    by the pagelist API, writes one CSV per part into RAW_DIR, then writes a
    timestamped combined CSV plus a plain-text file holding only the contents.
    """
    print("=" * 60)
    print("B站弹幕抓取（保存为 CSV，便于后续清洗）")
    print("=" * 60)

    raw_input_str = input("输入 BV 或完整链接：").strip()
    # SECURITY: a live login cookie (SESSDATA / bili_jct) used to be hard-coded
    # on this line. Credentials must never live in source; the leaked session
    # should be revoked. The cookie is now taken from the prompt, falling back
    # to the BILI_COOKIE environment variable, and may be left empty.
    cookie = input("可选：Cookie：").strip() or os.environ.get("BILI_COOKIE") or None

    bvid = extract_bvid(raw_input_str)
    referer = f"https://www.bilibili.com/video/{bvid}"
    sess = build_session(cookie=cookie, referer=referer)

    pages = get_pagelist_by_bvid(sess, bvid)
    print(f"共 {len(pages)} 个分P：")

    all_rows = []
    for p in pages:
        cid = p["cid"]
        part = p.get("part") or f"P{p.get('page', '?')}"
        print(f"- 抓取 {part} (cid={cid}) …")
        rows = crawl_all_by_cid(sess, cid, video_title=f"{bvid} | {part}")
        print(f"  {part} 抓到 {len(rows)} 条")
        all_rows.extend(rows)

        # Save each part on its own. Part titles are free text, so replace
        # characters that are path separators or illegal in file names
        # (notably on Windows) before using the title in the path.
        safe_part = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', "_", str(part)).strip(" .") or "part"
        csv_path = os.path.join(RAW_DIR, f"{bvid}_{cid}_{safe_part}.csv")
        save_csv(rows, csv_path)

    # Combined output (CSV + TXT), stamped to the minute.
    ts = datetime.datetime.now().strftime("%Y%m%d%H%M")
    combined_csv = os.path.join(RAW_DIR, f"combined_raw_{bvid}_{ts}.csv")
    combined_txt = os.path.join(RAW_DIR, f"combined_raw_{bvid}_{ts}.txt")
    save_csv(all_rows, combined_csv)
    save_txt([r["content"] for r in all_rows], combined_txt)

    print(f"\n合计抓到 {len(all_rows)} 条弹幕")
    print(f"已保存：{combined_csv}")
    print(f"也写了纯文本：{combined_txt}")
    print("完成。")

# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


B站弹幕抓取（保存为 CSV，便于后续清洗）
共 1 个分P：
- 抓取 2025年了，华为辅助驾驶有什么槽点？ (cid=29546709885) …
  2025年了，华为辅助驾驶有什么槽点？ 抓到 1193 条

合计抓到 1193 条弹幕
已保存：C:\Users\Andrew\Desktop\homework\bb\bili_danmu_results\combined_raw_BV1GD5DzHEJo_202509061618.csv
也写了纯文本：C:\Users\Andrew\Desktop\homework\bb\bili_danmu_results\combined_raw_BV1GD5DzHEJo_202509061618.txt
完成。


In [44]:
# -*- coding: utf-8 -*-
"""
B站弹幕批量清洗工具（增强版 v2）
- 读取 bili_danmu_results/*.csv （字段至少含 content）
- “先抽取，再清洗”：优先截取冒号后的正文；剥离前缀噪声块；去控制字符
- 输出 cleaned_danmu_results/：
    - combined_cleaned_danmu_*.txt / *.csv
    - removed_reasons_*.csv （被删除条目与原因）
    - debug_original_vs_cleaned_*.csv （抽样对照，便于人工复核）
"""

import re
import os
import glob
import datetime
import unicodedata
import pandas as pd
from collections import Counter

# Input directory (raw crawler CSVs) and output directory (cleaned results),
# both resolved against the current working directory.
RAW_DIR = os.path.join(os.getcwd(), "bili_danmu_results")
OUT_DIR = os.path.join(os.getcwd(), "cleaned_danmu_results")

# ---------- Precompiled regexes ----------
RE_CTRL = re.compile(r'[\x00-\x1F\x7F-\x9F]+')                    # control characters
RE_HTML = re.compile(r'<[^>]+>')                                  # anything shaped like an HTML tag
RE_URL  = re.compile(r'https?://\S+')                             # http/https URLs
# Matches runs of characters OUTSIDE the allowed set (word characters,
# CJK U+4E00-U+9FA5, the listed punctuation, hyphen).
RE_KEEP = re.compile(r'[^\w\u4e00-\u9fa5，。！？、；："\'()【】《》·—…\-]+')
RE_SP   = re.compile(r'\s+')                                      # whitespace runs
RE_MULTI_PUNCT = re.compile(r'([，。！？…])\1+')                   # a repeated punctuation mark
# Leading noise: repeated groups of brackets/symbols, a lone ASCII letter
# followed by whitespace, a 6-10 character hex-looking run, or
# punctuation/whitespace.
RE_NOISY_PREFIX = re.compile(
    r'^\s*(?:[\(\)\[\]{}<>\|\\/\-_=+~^`·•]+|[A-Za-z](?=\s)|[0-9A-Fa-f]{6,10}|[,.:;，。；：!！?？\s])+'
)

def _cut_after_colon(raw: str) -> str:
    """若含冒号，取最后一个冒号后的片段（更接近正文）"""
    if ':' in raw:
        return raw.split(':')[-1]
    return raw

def _strip_to_first_textual_char(s: str) -> str:
    """Drop leading noise so the result starts at the first textual character.

    RE_NOISY_PREFIX removes the bulk of the prefix first; the scan then finds
    the first alphanumeric / CJK character. Returns '' when none exists.
    """
    trimmed = RE_NOISY_PREFIX.sub('', s)
    for position, ch in enumerate(trimmed):
        if ch.isalnum() or ('\u4e00' <= ch <= '\u9fa5'):
            return trimmed[position:]
    return ''

def _normalize(s: str) -> str:
    """Normalize one danmaku string.

    Steps, in order: control characters -> space, NFKC normalization, strip
    HTML-like tags, strip URLs, replace disallowed characters with a space,
    collapse repeated punctuation, collapse whitespace and trim.
    """
    # NOTE(review): NFKC folds full-width punctuation such as "，！？：" to
    # ASCII, which RE_KEEP does not whitelist, so those marks end up removed.
    # Confirm whether that is intended.
    text = unicodedata.normalize('NFKC', RE_CTRL.sub(' ', s))
    substitutions = (
        (RE_HTML, ''),
        (RE_URL, ''),
        (RE_KEEP, ' '),
        (RE_MULTI_PUNCT, r'\1'),
        (RE_SP, ' '),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text.strip()

def _looks_like_noise(s: str) -> bool:
    """噪声判定：纯数字/单字母/太短/文字占比低"""
    if not s:
        return True
    if s.isdigit():
        return True
    if len(s) == 1 and s.isalnum():
        return True
    if len(s) < 2:
        return True
    # 文字占比（中英数字）
    total = len(s)
    keep = sum(1 for ch in s if ch.isalnum() or '\u4e00' <= ch <= '\u9fa5')
    if keep / max(1, total) < 0.3:
        return True
    return False

def clean_one(raw: str) -> str:
    """Clean one danmaku: extract, de-noise, normalize, cap at 100 characters."""
    text = _cut_after_colon(str(raw or ''))          # keep what follows the last colon
    # Control characters go first so they cannot disturb the prefix scan.
    text = _strip_to_first_textual_char(RE_CTRL.sub(' ', text))
    text = _normalize(text)
    return text[:100].rstrip() if len(text) > 100 else text

def clean_danmu_content(records):
    """Clean, filter and de-duplicate danmaku records.

    Args:
        records: Iterable of dicts holding at least a 'content' key.

    Returns:
        A ``(kept, removed)`` tuple:
          - kept: list of ``{original, cleaned, video_title, timestamp}``
            dicts, unique by ``cleaned`` (first occurrence wins);
          - removed: list of ``(original, reason)`` tuples, reason being
            "empty", "noise" or "duplicate".
    """
    cleaned, removed = [], []

    for dm in records:
        value = dm.get('content', '')
        # Empty CSV cells arrive from pandas as float NaN. NaN is truthy, so
        # `value or ''` would keep it and str() would produce the literal
        # text "nan", which then survives as a bogus danmaku. Treat it as empty.
        if value is None or (isinstance(value, float) and value != value):
            value = ''
        raw = str(value or '')
        if not raw:
            removed.append((raw, "empty"))
            continue

        out = clean_one(raw)

        if _looks_like_noise(out):
            removed.append((raw, "noise"))
            continue

        cleaned.append({
            "original": raw,
            "cleaned": out,
            "video_title": dm.get("video_title", "未知视频"),
            "timestamp": dm.get("timestamp", "")
        })

    # De-duplicate on the cleaned text, keeping the first occurrence.
    seen, dedup = set(), []
    for row in cleaned:
        key = row['cleaned']
        if key in seen:
            removed.append((row['original'], "duplicate"))
            continue
        seen.add(key)
        dedup.append(row)

    return dedup, removed

# ---------- 文件 I/O ----------
def find_csv_files(directory):
    """Return the raw per-part CSV files in *directory*, sorted by path.

    Aggregate and already-cleaned files are skipped: names containing
    '_cleaned.csv' and names starting with 'combined_cleaned_' or
    'combined_raw_'. The crawler writes its combined_raw_*.csv into this same
    directory, so reading it would feed every record in twice.
    Sorting makes the read order (and therefore which duplicate is kept
    downstream) deterministic.
    """
    skip_prefixes = ('combined_cleaned_', 'combined_raw_')
    selected = []
    for path in sorted(glob.glob(os.path.join(directory, '*.csv'))):
        base = os.path.basename(path)
        if '_cleaned.csv' in base or base.startswith(skip_prefixes):
            continue
        selected.append(path)
    return selected

def read_all_records(paths):
    """Read every CSV in *paths* into one list of record dicts.

    Each record has exactly the keys content, video_title and timestamp;
    columns missing from a file are filled with ''. Every value is a string.
    A file that cannot be read is reported and skipped (best effort).
    """
    allrows = []
    for p in paths:
        try:
            # dtype=str + keep_default_na=False: danmaku are free text, so
            # values such as "NA", "null" or "nan" must stay literal strings
            # and empty cells must stay '' instead of becoming float NaN.
            # utf-8-sig accepts files with or without a BOM.
            df = pd.read_csv(p, dtype=str, keep_default_na=False, encoding="utf-8-sig")
            for col in ["content", "video_title", "timestamp"]:
                if col not in df.columns:
                    df[col] = ""
            df = df[["content", "video_title", "timestamp"]]
            print(f"读取: {os.path.basename(p)}（{len(df)} 条）")
            allrows.extend(df.to_dict("records"))
        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole batch.
            print(f"读取失败: {p} - {e}")
    print(f"总计读取 {len(allrows)} 条")
    return allrows

def save_outputs(cleaned, removed):
    """Write the cleaning results into OUT_DIR, stamped to the minute.

    Args:
        cleaned: List of dicts with keys original, cleaned, video_title,
            timestamp (may be empty).
        removed: List of ``(original, reason)`` tuples (may be empty).

    Files written: combined_cleaned_danmu_<ts>.txt / .csv, a
    removed_reasons_<ts>.csv when anything was removed, and
    debug_original_vs_cleaned_<ts>.csv holding the first 200 kept rows.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    ts = datetime.datetime.now().strftime("%Y%m%d%H%M")

    # An empty list gives a column-less DataFrame, on which the
    # ["original", "cleaned"] selection below raises KeyError. Build the
    # frame with explicit columns in that case so the files still get headers.
    if cleaned:
        cleaned_df = pd.DataFrame(cleaned)
    else:
        cleaned_df = pd.DataFrame(columns=["original", "cleaned", "video_title", "timestamp"])

    # Main outputs
    txt_path = os.path.join(OUT_DIR, f"combined_cleaned_danmu_{ts}.txt")
    csv_path = os.path.join(OUT_DIR, f"combined_cleaned_danmu_{ts}.csv")

    with open(txt_path, "w", encoding="utf-8") as f:
        for r in cleaned:
            f.write(r["cleaned"] + "\n")
    cleaned_df.to_csv(csv_path, index=False, encoding="utf-8-sig")

    print(f"\n清洗后保存：\n- {txt_path}\n- {csv_path}")

    # Removed entries and a breakdown by reason
    if removed:
        rm_path = os.path.join(OUT_DIR, f"removed_reasons_{ts}.csv")
        pd.DataFrame(removed, columns=["original", "reason"]).to_csv(
            rm_path, index=False, encoding="utf-8-sig")
        cnt = Counter(reason for _, reason in removed)
        print("移除原因分布：", dict(cnt))
        print(f"移除明细：\n- {rm_path}")

    # Before/after sample for manual review (first 200 rows, not random)
    dbg_path = os.path.join(OUT_DIR, f"debug_original_vs_cleaned_{ts}.csv")
    dbg_df = cleaned_df[["original", "cleaned"]].head(200)
    dbg_df.to_csv(dbg_path, index=False, encoding="utf-8-sig")
    print(f"调试对照样本：\n- {dbg_path}")

# ---------- 主流程 ----------
def main():
    """Entry point of the cleaner: read raw CSVs, clean them, write the outputs."""
    banner = "="*50
    print(banner)
    print("B站弹幕批量清洗工具（增强版 v2）")
    print(banner)

    paths = find_csv_files(RAW_DIR)
    if not paths:
        print(f"未在 {RAW_DIR} 找到待处理 CSV")
        return

    records = read_all_records(paths)
    if not records:
        print("没有可处理数据")
        return

    cleaned, removed = clean_danmu_content(records)
    kept, total = len(cleaned), len(records)
    print(f"\n保留 {kept}/{total} "
          f"({kept/max(1,total)*100:.1f}%)")

    save_outputs(cleaned, removed)
    print("\n完成。")

# Run the cleaner only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


B站弹幕批量清洗工具（增强版 v2）
读取: BV17pZfYLEu6_29199436543_小米su7事件，4月1号信息分析🧐.csv（2582 条）
读取: BV1Bh3jzxE2q_30840851939_外国车评人来到武汉，感受无人驾驶，悬浮空轨，完全颠覆认知！.csv（3529 条）
读取: BV1bN9PYeEBm_28603977518_50多万1548匹小米SU7，智驾水平应该不怎么样吧？.csv（2851 条）
读取: BV1bxb1zPEUJ_31257921527_首次体验萝卜快跑无人驾驶出租车.csv（1724 条）
读取: BV1D197YCEsW_28692122756_全网唯一最真实对比，特斯拉、华为什么水平？「特斯拉VS华为」.csv（2666 条）
读取: BV1f7b6zPESj_31686987287_馆长8.14深圳行④，体验无人驾驶出租车：哎哟喂呀，第一次坐，真是吓死我了好不好？.csv（1962 条）
读取: BV1fcXQYUEik_28632024035_全网最全“特斯拉FSD对比华为ADS”，中美智驾有多大差距？【科技狐】.csv（2299 条）
读取: BV1fcXQYUEik_28696839809_特斯拉FSD夜间广州郊区到城区一镜到底.csv（8 条）
读取: BV1fcXQYUEik_28696905721_特斯拉FSD广州城中村一镜到底.csv（5 条）
读取: BV1GD5DzHEJo_29546709885_2025年了，华为辅助驾驶有什么槽点？.csv（1193 条）
读取: BV1h5fnYcEyC_28054652505_危险危险危险！车辆“自动驾驶”，司机盖被睡觉？.csv（1608 条）
读取: BV1hBuFzfEY5_31011965200_【大虾沉浸式试驾】岚图FREE+👉智能驾驶·底盘·百公里加速全知道！.csv（1943 条）
读取: BV1kNZ1YJEPH_29212607774_智能驾驶≠自动驾驶！岚图梦想家“智驾”要如何更安全？.csv（734 条）
读取: BV1TRLEzpE6k_29618405918_小米SU7事件，应该教会我们更多   【下尺报告】.csv（4598 条）
读取: BV1v7PjegENh_28605743916_一镜到底：特斯拉FSD挑战「智闯

In [45]:
# -*- coding: utf-8 -*-
"""
B站弹幕分析 Pro：词频 / 词云 / 自适应聚类 / 情感，适配 cleaned_danmu_results 输出
- 自动读取最新 cleaned 文件（csv 优先，其次 txt）
- 分词：词性过滤 + 行业词典（可选）
- 向量化：1-2gram TF-IDF、min_df/max_df 抑制口水词
- 自适应 KMeans（轮廓系数选 K）
- 主题命名：用非停用词关键词前2个
- 情感：SnowNLP（<4字视为中性），按主题汇总
- 可视化：词云、主题分布饼图 + 情感堆叠柱状 + 关键词表
- 额外导出：词频、主题关键词表、主题情感表、Top n-gram
"""

import os, re, glob, time, random
import numpy as np
import pandas as pd
import jieba
import jieba.posseg as pseg
from collections import Counter
from wordcloud import WordCloud
from snownlp import SnowNLP

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib.gridspec import GridSpec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# ===================== Tunable parameters =====================
CLEANED_DIR = "cleaned_danmu_results"
OUT_DIR     = "analysis_results"
STOP_PATH   = "cn_stopwords.txt"      # general stop-word list; may be absent
USER_DICT   = "user_dict.txt"         # domain dictionary (optional)

# Part-of-speech tags to keep: nouns / verbs / English / terms / organizations / person names, etc.
KEEP_POS = {"n","nr","ns","nt","nz","vn","v","eng","nw","an","i","j","ni","nl","ng"}

# Extra colloquial stop words (frequent danmaku words that carry no information)
DANMU_STOP_EXTRA = {
    "这个","那个","就是","什么","还有","然后","但是","所以","还是","已经","真的","感觉","觉得","知道",
    "可以","不可以","不会","不能","应该","可能","还是","有点","有些","怎么","为啥","为什么",
    "啊","呀","呢","吧","哦","哇","诶","嘛","哈","哈哈","哈哈哈","emm","嗯","啊啊","呜呜",
    "视频","弹幕","现在","今天","昨天","明天","这里","那里","这样","那样","很多","非常"
}

# TF-IDF parameters
MAX_FEATURES = 4000
MIN_DF       = 5       # drop features appearing in fewer than MIN_DF documents
MAX_DF       = 0.6     # drop features appearing in more than 60% of documents
NGRAM        = (1,2)

# Range searched for the adaptive K
K_MIN, K_MAX = 2, 8

# Number of top words shown in the word cloud
WORDCLOUD_TOPN = 150

# Short-sentence sentiment handling (length < 4 counts as neutral)
SHORT_NEUTRAL_LEN = 4
# ====================================================


# ---------------- 字体设置 ----------------
def setup_chinese_font():
    """Configure matplotlib for Chinese text and return a font file path.

    Scans the candidate families in order and picks the first one that
    matplotlib's font manager knows about. When one is found it becomes the
    default ``font.family``. ``axes.unicode_minus`` is always disabled.

    Returns:
        Path of the matched font file, for use as WordCloud's ``font_path``.
        Falls back to matplotlib's default sans-serif font when no candidate
        is installed.
    """
    candidates = ['SimHei','Microsoft YaHei','SimSun','KaiTi',
                  'Noto Sans CJK SC','Source Han Sans SC','Arial Unicode MS']
    picked, entry = None, None
    for name in candidates:
        entry = next((f for f in fm.fontManager.ttflist if name in f.name), None)
        if entry is not None:
            picked = name
            break
    if picked:
        plt.rcParams['font.family'] = picked
    plt.rcParams['axes.unicode_minus'] = False

    # Take the file path from the matched font entry itself. Searching system
    # font files for the family name in the *file name* misses fonts whose
    # file is named differently (e.g. "Microsoft YaHei" lives in msyh.ttc),
    # which silently dropped the word cloud back to a non-CJK font.
    font_path = entry.fname if entry is not None else None
    if not font_path or not os.path.exists(font_path):
        font_path = fm.findfont(fm.FontProperties(family='sans-serif'))
    return font_path

# Font file used for the word cloud; resolved once at import time.
WC_FONT = setup_chinese_font()


# ---------------- Utility functions ----------------
CTRL_RE  = re.compile(r'[\x00-\x1F\x7F-\x9F]+')                       # control characters
EMOJI_RE = re.compile(r'[\U00010000-\U0010ffff]', flags=re.UNICODE)   # any code point above U+FFFF

def load_stopwords(path=STOP_PATH):
    """Build the stop-word set.

    Combines the non-blank lines of *path* (when the file exists), a fixed
    set of punctuation characters, and DANMU_STOP_EXTRA.
    """
    words = set()
    if path and os.path.exists(path):
        with open(path,'r',encoding='utf-8') as fh:
            for raw_line in fh:
                entry = raw_line.strip()
                if entry:
                    words.add(entry)
    # Common punctuation, one entry per character.
    words.update("，。、！？；：”“‘‘（）()[]【】—…- ")
    # Colloquial danmaku filler words.
    words.update(DANMU_STOP_EXTRA)
    return words

def maybe_load_user_dict():
    """Load the jieba user dictionary at USER_DICT when that file exists."""
    if not USER_DICT or not os.path.exists(USER_DICT):
        return
    jieba.load_userdict(USER_DICT)
    print(f"[INFO] 已加载行业词典：{USER_DICT}")

def find_latest_cleaned_file():
    """Return the most recently modified cleaned CSV/TXT in CLEANED_DIR, or None."""
    patterns = ("combined_cleaned_danmu_*.csv", "combined_cleaned_danmu_*.txt")
    candidates = [hit
                  for pattern in patterns
                  for hit in glob.glob(os.path.join(CLEANED_DIR, pattern))]
    return max(candidates, key=os.path.getmtime, default=None)

def load_cleaned_lines(path):
    """Load cleaned danmaku lines from a CSV or text file.

    For a ``.csv`` path the ``cleaned`` column is used (or the first column
    when it is absent); any other path is read as one line per entry.
    Each line then has control characters replaced by a space, characters
    above U+FFFF removed and whitespace collapsed. Empty results and repeats
    are dropped; first-seen order is kept.
    """
    if path.endswith(".csv"):
        frame = pd.read_csv(path)
        column = "cleaned" if "cleaned" in frame.columns else frame.columns[0]
        raw_lines = [str(value) for value in frame[column].fillna("").tolist()]
    else:
        with open(path,'r',encoding='utf-8') as fh:
            raw_lines = [ln.strip() for ln in fh]

    # A dict doubles as an insertion-ordered set: light de-noising + de-duplication.
    unique = {}
    for text in raw_lines:
        text = EMOJI_RE.sub('', CTRL_RE.sub(' ', text))
        text = re.sub(r'\s+',' ', text).strip()
        if text:
            unique.setdefault(text, None)
    return list(unique)

def segment_docs(lines, stop):
    """Tokenize lines with jieba POS tagging and filter the tokens.

    A token is kept when it is not in *stop*, is longer than one character
    and its tag is in KEEP_POS. Lines shorter than two characters, or left
    without tokens, are skipped.

    Returns:
        ``(all_words, docs, kept_lines)``: the flat token list, one
        space-joined token string per kept line, and the matching original
        lines (used later for sentiment).
    """
    all_words, docs, kept_lines = [], [], []
    for sentence in lines:
        if len(sentence) < 2:
            continue
        tokens = [w for w, flag in pseg.cut(sentence)
                  if (w not in stop) and (len(w) > 1) and (flag in KEEP_POS)]
        if not tokens:
            continue
        all_words.extend(tokens)
        docs.append(" ".join(tokens))
        kept_lines.append(sentence)
    return all_words, docs, kept_lines

def save_wordfreq(counter):
    """Write the word frequencies, most common first, to a timestamped CSV in OUT_DIR.

    Returns the path of the written file.
    """
    ts = time.strftime("%Y%m%d%H%M")
    out = os.path.join(OUT_DIR, f"word_frequency_{ts}.csv")
    rows = [f"{rank},{word},{freq}\n"
            for rank, (word, freq) in enumerate(counter.most_common(), 1)]
    with open(out,'w',encoding='utf-8') as fw:
        fw.write("rank,word,freq\n")
        fw.writelines(rows)
    return out

def draw_wordcloud(counter, top_n=WORDCLOUD_TOPN):
    """Render the *top_n* most common words as a word cloud PNG in OUT_DIR.

    Returns the image path, or None when *counter* is empty.
    """
    if not counter:
        return None
    frequencies = dict(counter.most_common(top_n))
    cloud = WordCloud(width=1200, height=700, background_color='white', font_path=WC_FONT)
    image = cloud.generate_from_frequencies(frequencies)

    plt.figure(figsize=(12,7))
    plt.imshow(image, interpolation='bilinear')
    plt.axis('off')
    plt.title('弹幕高频词云', fontsize=16)

    ts = time.strftime("%Y%m%d%H%M")
    out = os.path.join(OUT_DIR, f"wordcloud_{ts}.png")
    plt.savefig(out, dpi=300, bbox_inches='tight')
    plt.close()
    return out

# --- n-gram 辅助（统计 top bigram） ---
def top_ngrams(docs, n=2, topk=50):
    """Count word n-grams over space-separated documents.

    Returns the *topk* most common ``(ngram, count)`` pairs, where each
    n-gram is its tokens joined by a single space.
    """
    counts = Counter()
    for doc in docs:
        tokens = doc.split()
        counts.update(" ".join(tokens[start:start+n])
                      for start in range(len(tokens)-n+1))
    return counts.most_common(topk)

# ---------------- 聚类（自适应K） ----------------
def auto_kmeans(docs, min_k=K_MIN, max_k=K_MAX, max_features=MAX_FEATURES):
    """Cluster documents with KMeans, picking K by silhouette score.

    Args:
        docs: Space-separated token strings.
        min_k, max_k: Inclusive range of K values to try.
        max_features: Vocabulary cap for the TF-IDF vectorizer.

    Returns:
        ``(model, vectorizer, score)``. With fewer than 60 documents a small
        fixed K is used and ``score`` is None; otherwise ``score`` is the
        best cosine silhouette (evaluated on a 6000-row sample when there
        are more than 6000 documents).
    """
    vec = TfidfVectorizer(max_features=max_features, ngram_range=NGRAM,
                          min_df=MIN_DF, max_df=MAX_DF)
    X = vec.fit_transform(docs)
    n_docs = X.shape[0]

    def fit(k):
        return KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)

    # Few samples: use a small fixed K and skip the silhouette search.
    if n_docs < 60:
        small_k = max(2, min(4, n_docs // 15 or 2))
        return fit(small_k), vec, None

    # The silhouette is evaluated on a fixed-seed sample for large inputs.
    sample_idx = None
    if n_docs > 6000:
        sample_idx = np.random.RandomState(42).choice(n_docs, 6000, replace=False)
    X_eval = X if sample_idx is None else X[sample_idx]

    best_model, best_score = None, -1
    for k in range(min_k, max_k+1):
        km = fit(k)
        labels_eval = km.labels_ if sample_idx is None else km.predict(X_eval)
        score = silhouette_score(X_eval, labels_eval, metric='cosine')
        if score > best_score:
            best_model, best_score = km, score
    return best_model, vec, best_score

def extract_cluster_keywords(model, vectorizer, stop_extra, topn=10):
    """Pick the top keywords of every cluster and derive a unique theme name.

    Args:
        model: Fitted clustering model exposing ``cluster_centers_`` and
            ``n_clusters``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``
            (or the older ``get_feature_names()``).
        stop_extra: Words to exclude; an n-gram is dropped when any of its
            space-separated tokens is in this collection.
        topn: Maximum number of keywords per cluster.

    Returns:
        ``(names, keywords)``: one name and one keyword list per cluster.
        A name is the first two keywords joined by a space, or "主题<i>" when
        the cluster has none. Names are guaranteed unique: callers key
        per-theme data by name, so two clusters sharing the same top
        keywords would otherwise overwrite each other.
    """
    terms = (vectorizer.get_feature_names_out()
             if hasattr(vectorizer, "get_feature_names_out")
             else vectorizer.get_feature_names())
    # Feature indices per cluster, heaviest centroid weight first.
    order = model.cluster_centers_.argsort()[:, ::-1]
    keywords, names, used = [], [], set()
    for i in range(model.n_clusters):
        keys = []
        for j in order[i, :topn*3]:  # over-fetch, then filter
            if j < len(terms):
                t = terms[j]
                if all(tok not in stop_extra for tok in t.split()):
                    keys.append(t)
            if len(keys) >= topn:
                break
        keywords.append(keys[:topn])

        base = " ".join(keys[:2]) if keys else f"主题{i+1}"
        name = base
        if name in used:
            # Disambiguate with the 1-based cluster number.
            name = f"{base} ({i+1})"
            while name in used:
                name += "'"
        used.add(name)
        names.append(name)
    return names, keywords

def sentiment_ratio(texts):
    """Return the (positive, neutral, negative) shares of *texts*.

    Texts shorter than SHORT_NEUTRAL_LEN after stripping count as neutral
    without being scored. Otherwise the SnowNLP sentiment decides:
    above 0.6 is positive, below 0.4 negative, anything else neutral.
    An empty input yields ``(0, 0, 0)``.
    """
    if not texts:
        return (0,0,0)
    tally = {"pos": 0, "neu": 0, "neg": 0}
    for text in texts:
        text = text.strip()
        if len(text) < SHORT_NEUTRAL_LEN:
            tally["neu"] += 1
            continue
        score = SnowNLP(text).sentiments
        if score > 0.6:
            bucket = "pos"
        elif score < 0.4:
            bucket = "neg"
        else:
            bucket = "neu"
        tally[bucket] += 1
    total = len(texts)
    return tally["pos"]/total, tally["neu"]/total, tally["neg"]/total

def visualize_clusters(theme_names, theme_counts, sentiments, key_table):
    """Draw the cluster overview figure and save it as a PNG in OUT_DIR.

    The figure has three panels: a pie chart of theme sizes, a stacked bar
    chart of sentiment shares per theme, and a table of theme keywords.

    Args:
        theme_names: One name per theme; also used to index *sentiments*.
        theme_counts: Number of items per theme, aligned with *theme_names*.
        sentiments: Mapping name -> dict with 'positive', 'neutral' and
            'negative' values.
        key_table: One keyword list per theme; rows may differ in length.

    Returns:
        Path of the saved image, or None when *theme_names* is empty.
    """
    if not theme_names: return None
    fig = plt.figure(figsize=(16,12))
    gs = GridSpec(2, 2, figure=fig)

    # Pie chart: theme distribution
    ax1 = fig.add_subplot(gs[0,0])
    wedges, _, _ = ax1.pie(theme_counts, labels=None, autopct='%1.1f%%',
                           startangle=90, pctdistance=0.8, labeldistance=1.4)
    legend_labels = [f"{nm}: {cnt}条 ({cnt/sum(theme_counts)*100:.1f}%)"
                     for nm, cnt in zip(theme_names, theme_counts)]
    ax1.legend(legend_labels, loc='center left', bbox_to_anchor=(-0.32, 0))
    ax1.set_title("主题分布", fontsize=15)

    # Bar chart: stacked sentiment (green / blue / red)
    ax2 = fig.add_subplot(gs[0,1])
    idx = np.arange(len(theme_names))
    pos = [sentiments[n]['positive'] for n in theme_names]
    neu = [sentiments[n]['neutral'] for n in theme_names]
    neg = [sentiments[n]['negative'] for n in theme_names]
    barw = 0.65
    ax2.bar(idx, pos, width=barw, color='#4CAF50', label='积极')
    ax2.bar(idx, neu, width=barw, bottom=pos, color='#2196F3', label='中性')
    # The negative segment sits on top of positive + neutral.
    ax2.bar(idx, neg, width=barw, bottom=[i+j for i,j in zip(pos,neu)],
            color='#F44336', label='消极')
    ax2.set_xticks(idx)
    ax2.set_xticklabels(theme_names, rotation=45, ha='right')
    ax2.set_ylim(0,1)
    ax2.set_ylabel("比例"); ax2.set_title("情感分布", fontsize=15)
    ax2.legend(loc='upper right', bbox_to_anchor=(1.15,1))

    # Theme keyword table (spans the whole bottom row)
    ax3 = fig.add_subplot(gs[1,:]); ax3.axis('off')
    cols = [f"关键词{i+1}" for i in range(max(len(r) for r in key_table) if key_table else 10)]
    # Pad rows of unequal length with empty cells
    data = [row + [""]*(len(cols)-len(row)) for row in key_table]
    table = ax3.table(cellText=data, rowLabels=theme_names, colLabels=cols, loc='center')
    table.auto_set_font_size(False); table.set_fontsize(10); table.scale(1,1.5)
    ax3.set_title("主题关键词", fontsize=15, y=0.98)

    plt.subplots_adjust(top=0.93, bottom=0.08, left=0.08, right=0.95,
                        hspace=0.6, wspace=0.35)
    ts = time.strftime("%Y%m%d%H%M")
    out = os.path.join(OUT_DIR, f"cluster_summary_{ts}.png")
    plt.savefig(out, dpi=300, bbox_inches='tight'); plt.close(fig)
    return out


# ---------------- 主流程 ----------------
def main():
    """Entry point of the analysis: word frequency, word cloud, clustering, sentiment.

    Loads the most recent cleaned file, tokenizes it, exports word
    frequencies, a word cloud and the top bigrams, then (with at least 20
    documents) clusters the documents, names the themes, computes sentiment
    per theme and exports the overview figure plus two CSV tables.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    print("="*64)
    print("弹幕文本分析 Pro（词频/词云/自适应聚类/情感）")
    print("="*64)

    latest = find_latest_cleaned_file()
    if not latest:
        print("未找到清洗后的文件，请先运行清洗。"); return
    print(f"[INFO] 分析文件：{latest}")

    maybe_load_user_dict()
    stop = load_stopwords(STOP_PATH)
    lines = load_cleaned_lines(latest)
    print(f"[INFO] 清洗后行数：{len(lines)}")

    jieba.initialize()
    all_words, docs, kept_lines = segment_docs(lines, stop)
    if not docs:
        print("[WARN] 分词后为空（可能停用词过多或文本太短）。"); return
    print(f"[INFO] 分词后文档数：{len(docs)} ；总词数：{len(all_words)}")

    # Word frequency, word cloud, top bigrams
    counter = Counter(all_words)
    freq_csv = save_wordfreq(counter)
    print(f"[OK] 词频表：{freq_csv}")

    wc_path = draw_wordcloud(counter)
    if wc_path: print(f"[OK] 词云：{wc_path}")

    bigram_top = top_ngrams(docs, n=2, topk=50)
    # This timestamp is reused for every CSV written below.
    ts = time.strftime("%Y%m%d%H%M")
    bigram_csv = os.path.join(OUT_DIR, f"top_bigram_{ts}.csv")
    pd.DataFrame(bigram_top, columns=["bigram","count"]).to_csv(bigram_csv, index=False, encoding='utf-8-sig')
    print(f"[OK] Top bigram：{bigram_csv}")

    # Clustering
    if len(docs) >= 20:
        print("[INFO] 开始自适应聚类 …")
        model, vec, s_score = auto_kmeans(docs)
        if s_score is not None:
            print(f"[INFO] 轮廓系数最优：K={model.n_clusters}，score={s_score:.4f}")
        labels = model.labels_

        # Theme keywords and names
        theme_names, key_table = extract_cluster_keywords(model, vec, DANMU_STOP_EXTRA, topn=10)

        # Original sentences per cluster (used for sentiment);
        # labels[i] belongs to docs[i], which was built from kept_lines[i].
        groups = {i: [] for i in range(model.n_clusters)}
        for i, lbl in enumerate(labels):
            groups[lbl].append(kept_lines[i])

        theme_counts = [len(groups[i]) for i in range(model.n_clusters)]
        # NOTE(review): keyed by theme name, so identical names would overwrite each other.
        sentiments = {}
        for i, nm in enumerate(theme_names):
            p,u,n = sentiment_ratio(groups[i])
            sentiments[nm] = {"positive": p, "neutral": u, "negative": n}

        # Visualization
        img = visualize_clusters(theme_names, theme_counts, sentiments, key_table)
        if img: print(f"[OK] 聚类总览图：{img}")

        # CSV exports (keyword rows are padded to 10 columns)
        kw_csv = os.path.join(OUT_DIR, f"cluster_keywords_{ts}.csv")
        pd.DataFrame({"theme": theme_names, **{f"kw{i+1}":[row[i] if i<len(row) else "" for row in key_table] for i in range(10)}})\
          .to_csv(kw_csv, index=False, encoding='utf-8-sig')
        print(f"[OK] 主题关键词表：{kw_csv}")

        sent_csv = os.path.join(OUT_DIR, f"cluster_sentiment_{ts}.csv")
        pd.DataFrame([{"theme": nm, **sentiments[nm], "count": cnt, "percentage": cnt/len(docs)}
                      for nm, cnt in zip(theme_names, theme_counts)])\
          .to_csv(sent_csv, index=False, encoding='utf-8-sig')
        print(f"[OK] 主题情感表：{sent_csv}")
    else:
        print("[WARN] 文档数 <20，跳过聚类/情感。")

    # Summary
    print("\n词频TOP10：")
    for i,(w,f) in enumerate(counter.most_common(10),1):
        print(f"{i}. {w}  {f}")
    print("\n完成。输出目录：analysis_results/")

# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


弹幕文本分析 Pro（词频/词云/自适应聚类/情感）
[INFO] 分析文件：cleaned_danmu_results\combined_cleaned_danmu_202509061626.csv
[INFO] 清洗后行数：36165
[INFO] 分词后文档数：32655 ；总词数：97466
[OK] 词频表：analysis_results\word_frequency_202509061630.csv
[OK] 词云：analysis_results\wordcloud_202509061630.png
[OK] Top bigram：analysis_results\top_bigram_202509061630.csv
[INFO] 开始自适应聚类 …
[INFO] 轮廓系数最优：K=8，score=0.0172
[OK] 聚类总览图：analysis_results\cluster_summary_202509061632.png
[OK] 主题关键词表：analysis_results\cluster_keywords_202509061630.csv
[OK] 主题情感表：analysis_results\cluster_sentiment_202509061630.csv

词频TOP10：
1. 驾驶  1849
2. 自动  1241
3. 没有  1066
4. 智驾  1042
5. 问题  975
6. 华为  860
7. 小米  691
8. 识别  651
9. 辅助  553
10. 司机  542

完成。输出目录：analysis_results/


In [46]:
#聚类 k=7

In [47]:
# -*- coding: utf-8 -*-
"""
B站弹幕分析（固定K=7版本）：词频 / 词云 / 聚类 / 情感
- 输入：cleaned_danmu_results/combined_cleaned_danmu_*.{csv,txt}
- 词性过滤 + 行业词典 + 强停用
- TF-IDF: 1~3gram, min_df=5, max_df=0.5
- 聚类：KMeans(K=7) 固定主题数，避免主题塌缩
- 可视化：词云、主题分布饼图、情感堆叠柱 + 关键词表
- 导出：词频、主题关键词表、主题情感表、Top bigram
"""

import os, re, glob, time
import numpy as np
import pandas as pd
import jieba
import jieba.posseg as pseg
from collections import Counter
from wordcloud import WordCloud
from snownlp import SnowNLP

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib.gridspec import GridSpec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# ---------------- Paths & parameters ----------------
CLEANED_DIR = "cleaned_danmu_results"
OUT_DIR     = "analysis_results"
STOP_PATH   = "cn_stopwords.txt"     # may be absent
USER_DICT   = "user_dict.txt"        # may be absent (adding domain terms is recommended)

# Allowed part-of-speech tags: nouns / verbs / English / terms / organizations / person names, etc.
KEEP_POS = {"n","nr","ns","nt","nz","vn","v","eng","nw","an","i","j","ni","nl","ng"}

# Extra generic low-information stop words (keeps words like
# "智能/驾驶/辅助/系统/功能" from dominating the themes)
GENERIC_WEAK = {
    "智能","驾驶","辅助","系统","功能","自动","车辆","设备","技术","模式","信息","体验","感觉","问题","情况",
    "方面","进行","实现","处理","比较","可以","不会","不能","应该","已经","还是","就是","什么","还有","然后","但是",
    "啊","呀","呢","吧","哦","哈","哈哈","哈哈哈","真的","觉得","感觉","我们","他们","自己","这个","那个","这样","那样"
}

# Suggested domain dictionary: put terms such as "自动驾驶/端到端/FSD/ADS/NOA/激光雷达/Robotaxi/车路协同/感知/规划/控制/
# 责任/法律/事故/保险/特斯拉/小鹏/理想/比亚迪/华为/萝卜快跑/NOH/NCA" into user_dict.txt, one term per line.

# TF-IDF
MAX_FEATURES = 5000
MIN_DF       = 5
MAX_DF       = 0.5
NGRAM        = (1,3)

# Number of cluster themes (fixed)
K_FIXED      = 7

# Word cloud
WORDCLOUD_TOPN = 150

# Short-sentence sentiment (fewer than 4 characters counts as neutral)
SHORT_NEUTRAL_LEN = 4

# ---------------- 字体 ----------------
def setup_chinese_font():
    """Configure matplotlib for Chinese text and return a font file path.

    Scans the candidate families in order and picks the first one that
    matplotlib's font manager knows about. When one is found it becomes the
    default ``font.family``. ``axes.unicode_minus`` is always disabled.

    Returns:
        Path of the matched font file, for use as WordCloud's ``font_path``.
        Falls back to matplotlib's default sans-serif font when no candidate
        is installed.
    """
    cands = ['SimHei','Microsoft YaHei','SimSun','KaiTi','Noto Sans CJK SC','Source Han Sans SC','Arial Unicode MS']
    picked, entry = None, None
    for n in cands:
        entry = next((f for f in fm.fontManager.ttflist if n in f.name), None)
        if entry is not None:
            picked = n
            break
    if picked:
        plt.rcParams['font.family'] = picked
    plt.rcParams['axes.unicode_minus'] = False

    # For the word cloud: take the file path from the matched font entry.
    # Matching the family name against font *file names* misses fonts whose
    # file is named differently (e.g. "Microsoft YaHei" lives in msyh.ttc),
    # which silently dropped the word cloud back to a non-CJK font.
    fp = entry.fname if entry is not None else None
    if not fp or not os.path.exists(fp):
        fp = fm.findfont(fm.FontProperties(family='sans-serif'))
    return fp

# Font file used for the word cloud; resolved once at import time.
WC_FONT = setup_chinese_font()

# ---------------- Basic utilities ----------------
CTRL_RE  = re.compile(r'[\x00-\x1F\x7F-\x9F]+')                       # control characters
EMOJI_RE = re.compile(r'[\U00010000-\U0010ffff]', flags=re.UNICODE)   # any code point above U+FFFF

def load_stopwords(path=STOP_PATH):
    """Build the stop-word set.

    Combines the non-blank lines of *path* (when the file exists), a fixed
    set of punctuation characters, and GENERIC_WEAK.
    """
    words = set()
    if path and os.path.exists(path):
        with open(path,'r',encoding='utf-8') as fh:
            for raw_line in fh:
                entry = raw_line.strip()
                if entry:
                    words.add(entry)
    # Common punctuation, one entry per character.
    words.update("，。、！？；：”“‘‘（）()[]【】—…- ")
    words.update(GENERIC_WEAK)
    return words

def maybe_load_user_dict():
    """Load the jieba user dictionary at USER_DICT when that file exists."""
    if not USER_DICT or not os.path.exists(USER_DICT):
        return
    jieba.load_userdict(USER_DICT)
    print(f"[INFO] 已加载行业词典：{USER_DICT}")

def find_latest_cleaned_file():
    """Return the most recently modified cleaned CSV/TXT in CLEANED_DIR, or None."""
    patterns = ("combined_cleaned_danmu_*.csv", "combined_cleaned_danmu_*.txt")
    candidates = []
    for pattern in patterns:
        candidates.extend(glob.glob(os.path.join(CLEANED_DIR, pattern)))
    if not candidates:
        return None
    return max(candidates, key=os.path.getmtime)

def load_cleaned_lines(path):
    """Load cleaned danmaku lines from a CSV or text file.

    For a ``.csv`` path the ``cleaned`` column is used (or the first column
    when it is absent); any other path is read as one line per entry.
    Each line then has control characters replaced by a space, characters
    above U+FFFF removed and whitespace collapsed. Empty results and repeats
    are dropped; first-seen order is kept.
    """
    if path.endswith(".csv"):
        frame = pd.read_csv(path)
        column = "cleaned" if "cleaned" in frame.columns else frame.columns[0]
        raw_lines = [str(value) for value in frame[column].fillna("").tolist()]
    else:
        with open(path,'r',encoding='utf-8') as fh:
            raw_lines = [ln.strip() for ln in fh]

    # A dict doubles as an insertion-ordered set.
    unique = {}
    for text in raw_lines:
        text = EMOJI_RE.sub('', CTRL_RE.sub(' ', text))
        text = re.sub(r'\s+',' ', text).strip()
        if text:
            unique.setdefault(text, None)
    return list(unique)

# 分词：词性过滤 + 强停用
def segment_docs(lines, stop):
    """Tokenize lines with jieba POS tagging and filter the tokens.

    A token is kept when it is not in *stop*, is longer than one character
    and its tag is in KEEP_POS. Lines shorter than two characters, or left
    without tokens, are skipped.

    Returns:
        ``(all_words, docs, kept_lines)``: the flat token list, one
        space-joined token string per kept line, and the matching original
        lines.
    """
    all_words, docs, kept_lines = [], [], []
    for sentence in lines:
        if len(sentence) < 2:
            continue
        tokens = [w for w, flag in pseg.cut(sentence)
                  if (w not in stop) and (len(w) > 1) and (flag in KEEP_POS)]
        if not tokens:
            continue
        all_words.extend(tokens)
        docs.append(" ".join(tokens))
        kept_lines.append(sentence)
    return all_words, docs, kept_lines

def save_wordfreq(counter):
    """Write the word-frequency ranking to a timestamped CSV in ``OUT_DIR``.

    The file has the header ``rank,word,freq`` followed by one row per word in
    descending frequency order (``counter.most_common()``), ranks starting at 1.

    Args:
        counter: ``collections.Counter`` mapping word -> frequency.

    Returns:
        Path of the CSV file that was written.
    """
    # Local import keeps this function self-contained.
    import csv

    ts = time.strftime("%Y%m%d%H%M")
    out = os.path.join(OUT_DIR, f"word_frequency_{ts}.csv")
    # csv.writer quotes words containing commas, quotes or newlines; the former
    # hand-built f-string rows would have produced a malformed file for them.
    # newline='' is what the csv module requires of the underlying file object;
    # rows are terminated with "\n".
    with open(out, 'w', encoding='utf-8', newline='') as fw:
        writer = csv.writer(fw, lineterminator="\n")
        writer.writerow(["rank", "word", "freq"])
        writer.writerows(
            (rank, word, freq)
            for rank, (word, freq) in enumerate(counter.most_common(), 1)
        )
    return out

def draw_wordcloud(counter):
    """Render the top ``WORDCLOUD_TOPN`` words as a word-cloud PNG in ``OUT_DIR``.

    Args:
        counter: ``collections.Counter`` mapping word -> frequency.

    Returns:
        Path of the saved image, or ``None`` when ``counter`` is empty.
    """
    if not counter:
        return None

    cloud = WordCloud(width=1200, height=700, background_color='white', font_path=WC_FONT)
    top_frequencies = dict(counter.most_common(WORDCLOUD_TOPN))
    image = cloud.generate_from_frequencies(top_frequencies)

    plt.figure(figsize=(12, 7))
    plt.imshow(image, interpolation='bilinear')
    plt.axis('off')
    plt.title('弹幕高频词云', fontsize=16)

    stamp = time.strftime("%Y%m%d%H%M")
    target = os.path.join(OUT_DIR, f"wordcloud_{stamp}.png")
    plt.savefig(target, dpi=300, bbox_inches='tight')
    plt.close()
    return target

def top_ngrams(docs, n=2, topk=50):
    """Count word n-grams across whitespace-tokenised documents.

    Args:
        docs: Iterable of strings whose tokens are separated by whitespace.
        n: Number of consecutive tokens per n-gram.
        topk: How many of the most frequent n-grams to return.

    Returns:
        List of ``(ngram, count)`` pairs, most frequent first, where ``ngram``
        is the tokens joined by a single space. N-grams never span two
        documents.
    """
    tally = Counter()
    for doc in docs:
        words = doc.split()
        # Feed the windows in left-to-right order so ties keep first-seen order.
        tally.update(
            " ".join(words[start:start + n])
            for start in range(len(words) - n + 1)
        )
    return tally.most_common(topk)

# ---------------- KMeans（固定K=7） ----------------
def kmeans_fixed(docs):
    """Vectorise ``docs`` with TF-IDF and cluster them with KMeans.

    The cluster count is ``K_FIXED``, capped at one cluster per 30 documents
    but never below 2, so very small corpora are not over-split.

    Args:
        docs: List of space-joined token strings.

    Returns:
        ``(model, vec)``: the fitted ``KMeans`` model and the fitted
        ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(
        max_features=MAX_FEATURES,
        ngram_range=NGRAM,
        min_df=MIN_DF,
        max_df=MAX_DF,
    )
    matrix = vectorizer.fit_transform(docs)
    n_docs = matrix.shape[0]
    n_clusters = min(K_FIXED, max(2, n_docs // 30))
    clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    clusterer.fit(matrix)
    return clusterer, vectorizer

def extract_cluster_keywords(model, vectorizer, stop_extra, topn=10):
    """Pick the top keywords of every cluster and derive a unique theme name.

    For each cluster the vocabulary terms are ranked by their weight in the
    cluster centre. The best ``topn * 3`` candidates are scanned and a term is
    kept only if none of its space-separated parts is in ``stop_extra``;
    scanning stops once ``topn`` terms are collected.

    The theme name is the first two keywords joined by a space, or
    ``主题<i>`` (1-based) when the cluster has no usable keyword. Callers use
    these names as dictionary keys, so a name already taken by an earlier
    cluster gets a `` (2)``, `` (3)``, ... suffix instead of silently
    colliding with it.

    Args:
        model: Fitted clustering model exposing ``cluster_centers_`` and
            ``n_clusters``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()`` (or
            the legacy ``get_feature_names()``).
        stop_extra: Container of words that disqualify a term.
        topn: Maximum number of keywords per cluster.

    Returns:
        ``(names, keywords)``: a list of unique theme names and a list of
        keyword lists, both indexed by cluster id.
    """
    terms = (vectorizer.get_feature_names_out()
             if hasattr(vectorizer, "get_feature_names_out")
             else vectorizer.get_feature_names())
    # Column indices of each centre sorted by descending weight.
    order = model.cluster_centers_.argsort()[:, ::-1]
    keywords, names = [], []
    used_names = set()
    for i in range(model.n_clusters):
        keys = []
        for j in order[i, :topn * 3]:
            if j < len(terms):
                t = terms[j]
                if all(tok not in stop_extra for tok in t.split()):
                    keys.append(t)
            if len(keys) >= topn:
                break
        keywords.append(keys[:topn])

        base_name = " ".join(keys[:2]) if keys else f"主题{i+1}"
        # Disambiguate clusters that share their two strongest keywords.
        name, suffix = base_name, 1
        while name in used_names:
            suffix += 1
            name = f"{base_name} ({suffix})"
        used_names.add(name)
        names.append(name)
    return names, keywords

def sentiment_ratio(texts):
    """Return the share of positive, neutral and negative texts.

    Each text is stripped first. Texts shorter than ``SHORT_NEUTRAL_LEN`` are
    counted as neutral without scoring. Otherwise the SnowNLP sentiment score
    decides: above 0.6 is positive, below 0.4 is negative, anything else is
    neutral.

    Args:
        texts: Sequence of strings.

    Returns:
        ``(positive, neutral, negative)`` fractions of ``len(texts)``;
        ``(0, 0, 0)`` for an empty input.
    """
    if not texts:
        return (0, 0, 0)

    tally = {"pos": 0, "neu": 0, "neg": 0}
    for raw in texts:
        text = raw.strip()
        if len(text) < SHORT_NEUTRAL_LEN:
            tally["neu"] += 1
            continue
        score = SnowNLP(text).sentiments
        if score > 0.6:
            bucket = "pos"
        elif score < 0.4:
            bucket = "neg"
        else:
            bucket = "neu"
        tally[bucket] += 1

    total = len(texts)
    return tally["pos"] / total, tally["neu"] / total, tally["neg"] / total

def visualize_clusters(theme_names, theme_counts, sentiments, key_table):
    """Draw the cluster overview figure and save it as a PNG in ``OUT_DIR``.

    The figure has three panels: a pie chart of cluster sizes (top-left), a
    stacked bar chart of sentiment shares per theme (top-right) and a table of
    each theme's keywords (bottom row).

    Args:
        theme_names: Theme label per cluster.
        theme_counts: Number of documents per cluster, aligned with
            ``theme_names``.
        sentiments: Mapping theme name -> dict with ``positive``, ``neutral``
            and ``negative`` fractions.
        key_table: List of keyword lists, one per theme.

    Returns:
        Path of the saved image, or ``None`` when ``theme_names`` is empty.
    """
    if not theme_names:
        return None

    figure = plt.figure(figsize=(16, 12))
    grid = GridSpec(2, 2, figure=figure)

    # Panel 1: pie chart of cluster sizes, labelled through the legend only.
    pie_ax = figure.add_subplot(grid[0, 0])
    _wedges, _texts, _autotexts = pie_ax.pie(
        theme_counts, labels=None, autopct='%1.1f%%',
        startangle=90, pctdistance=0.8, labeldistance=1.4,
    )
    total_count = sum(theme_counts)
    legend_entries = []
    for name, count in zip(theme_names, theme_counts):
        legend_entries.append(f"{name}: {count}条 ({count/total_count*100:.1f}%)")
    pie_ax.legend(legend_entries, loc='center left', bbox_to_anchor=(-0.32, 0))
    pie_ax.set_title("主题分布", fontsize=15)

    # Panel 2: stacked sentiment bars (positive, then neutral, then negative).
    bar_ax = figure.add_subplot(grid[0, 1])
    positions = np.arange(len(theme_names))
    positive = [sentiments[name]['positive'] for name in theme_names]
    neutral = [sentiments[name]['neutral'] for name in theme_names]
    negative = [sentiments[name]['negative'] for name in theme_names]
    bar_width = 0.65
    negative_base = [p + q for p, q in zip(positive, neutral)]
    bar_ax.bar(positions, positive, width=bar_width, color='#4CAF50', label='积极')
    bar_ax.bar(positions, neutral, width=bar_width, bottom=positive,
               color='#2196F3', label='中性')
    bar_ax.bar(positions, negative, width=bar_width, bottom=negative_base,
               color='#F44336', label='消极')
    bar_ax.set_xticks(positions)
    bar_ax.set_xticklabels(theme_names, rotation=45, ha='right')
    bar_ax.set_ylim(0, 1)
    bar_ax.set_ylabel("比例")
    bar_ax.set_title("情感分布", fontsize=15)
    bar_ax.legend(loc='upper right', bbox_to_anchor=(1.15, 1))

    # Panel 3: keyword table, short rows padded with empty cells.
    table_ax = figure.add_subplot(grid[1, :])
    table_ax.axis('off')
    n_columns = max(len(row) for row in key_table) if key_table else 10
    headers = [f"关键词{k+1}" for k in range(n_columns)]
    cells = [row + [""] * (len(headers) - len(row)) for row in key_table]
    keyword_table = table_ax.table(cellText=cells, rowLabels=theme_names,
                                   colLabels=headers, loc='center')
    keyword_table.auto_set_font_size(False)
    keyword_table.set_fontsize(10)
    keyword_table.scale(1, 1.5)
    table_ax.set_title("主题关键词", fontsize=15, y=0.98)

    plt.subplots_adjust(top=0.93, bottom=0.08, left=0.08, right=0.95,
                        hspace=0.6, wspace=0.35)
    stamp = time.strftime("%Y%m%d%H%M")
    target = os.path.join(OUT_DIR, f"cluster_summary_{stamp}.png")
    plt.savefig(target, dpi=300, bbox_inches='tight')
    plt.close(figure)
    return target

# ---------------- 主流程 ----------------
def main():
    """Run the danmu text-analysis pipeline end to end.

    Steps, in order:
      1. Ensure OUT_DIR exists and locate the newest cleaned danmu file;
         stop with a message if there is none.
      2. Load the optional user dictionary, the stop words and the cleaned
         lines, then segment the lines into token documents; stop with a
         warning if nothing survives segmentation.
      3. Write the word-frequency CSV, the word-cloud image and the top-50
         bigram CSV.
      4. With at least 20 documents: cluster them, derive theme names and
         keywords, compute per-cluster sentiment shares, save the overview
         figure and export the keyword and sentiment CSVs. Otherwise print a
         warning and skip this step.
      5. Print the ten most frequent words.

    Returns None; all results are files under OUT_DIR plus console output.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    print("="*64)
    print("弹幕文本分析（固定K=7版本）")
    print("="*64)

    latest = find_latest_cleaned_file()
    if not latest:
        print("未找到清洗后的文件。"); return
    print(f"[INFO] 分析文件：{latest}")

    maybe_load_user_dict()
    stop = load_stopwords(STOP_PATH)
    lines = load_cleaned_lines(latest)
    print(f"[INFO] 清洗后行数：{len(lines)}")

    jieba.initialize()
    # kept_lines[i] is the original text behind docs[i]; sentiment is scored on
    # the original text, clustering on the token documents.
    all_words, docs, kept_lines = segment_docs(lines, stop)
    if not docs:
        print("[WARN] 分词后为空。"); return
    print(f"[INFO] 分词后文档数：{len(docs)} ；总词数：{len(all_words)}")

    # Word frequency / word cloud / top bigrams
    counter = Counter(all_words)
    freq_csv = save_wordfreq(counter); print(f"[OK] 词频表：{freq_csv}")
    wc_path = draw_wordcloud(counter);
    if wc_path: print(f"[OK] 词云：{wc_path}")
    bigram = top_ngrams(docs, n=2, topk=50)
    # This timestamp is reused for the cluster CSV names further down.
    ts = time.strftime("%Y%m%d%H%M")
    bigram_csv = os.path.join(OUT_DIR, f"top_bigram_{ts}.csv")
    pd.DataFrame(bigram, columns=["bigram","count"]).to_csv(bigram_csv, index=False, encoding='utf-8-sig')
    print(f"[OK] Top bigram：{bigram_csv}")

    # Fixed-K clustering
    if len(docs) >= 20:
        model, vec = kmeans_fixed(docs)
        labels = model.labels_
        theme_names, key_table = extract_cluster_keywords(model, vec, GENERIC_WEAK, topn=10)

        # Collect the original lines of every cluster, keyed by cluster id.
        groups = {i: [] for i in range(model.n_clusters)}
        for i, lbl in enumerate(labels):
            groups[lbl].append(kept_lines[i])

        theme_counts = [len(groups[i]) for i in range(model.n_clusters)]
        # NOTE(review): keyed by theme name, so two clusters with the same name
        # would share one entry (the later cluster wins).
        sentiments = {theme_names[i]: dict(zip(["positive","neutral","negative"],
                          sentiment_ratio(groups[i]))) for i in range(model.n_clusters)}

        img = visualize_clusters(theme_names, theme_counts, sentiments, key_table)
        if img: print(f"[OK] 聚类总览图：{img}")

        # Export CSVs
        # The 10 keyword columns (kw1..kw10) match topn=10 above; short rows are
        # padded with empty strings.
        kw_csv = os.path.join(OUT_DIR, f"cluster_keywords_{ts}.csv")
        pd.DataFrame({"theme": theme_names, **{f"kw{i+1}":[row[i] if i<len(row) else "" for row in key_table] for i in range(10)}})\
          .to_csv(kw_csv, index=False, encoding='utf-8-sig')
        print(f"[OK] 主题关键词表：{kw_csv}")

        # "percentage" is the cluster's share of all segmented documents (0-1).
        sent_csv = os.path.join(OUT_DIR, f"cluster_sentiment_{ts}.csv")
        pd.DataFrame([{"theme": nm, **sentiments[nm], "count": cnt, "percentage": cnt/len(docs)}
                      for nm, cnt in zip(theme_names, theme_counts)])\
          .to_csv(sent_csv, index=False, encoding='utf-8-sig')
        print(f"[OK] 主题情感表：{sent_csv}")
    else:
        print("[WARN] 文档数 <20，跳过聚类/情感。")

    # Summary
    print("\n词频TOP10：")
    for i,(w,f) in enumerate(counter.most_common(10),1):
        print(f"{i}. {w}  {f}")
    # NOTE(review): the directory in this message is hard-coded rather than
    # taken from OUT_DIR - confirm they are meant to be the same.
    print("\n完成。输出目录：analysis_results/")

# Entry-point guard: run the pipeline only when this module is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()


弹幕文本分析（固定K=7版本）
[INFO] 分析文件：cleaned_danmu_results\combined_cleaned_danmu_202509061626.csv
[INFO] 清洗后行数：36165
[INFO] 分词后文档数：32492 ；总词数：92643
[OK] 词频表：analysis_results\word_frequency_202509061639.csv
[OK] 词云：analysis_results\wordcloud_202509061639.png
[OK] Top bigram：analysis_results\top_bigram_202509061639.csv
[OK] 聚类总览图：analysis_results\cluster_summary_202509061641.png
[OK] 主题关键词表：analysis_results\cluster_keywords_202509061639.csv
[OK] 主题情感表：analysis_results\cluster_sentiment_202509061639.csv

词频TOP10：
1. 没有  1066
2. 智驾  1042
3. 华为  860
4. 小米  691
5. 识别  651
6. 知道  602
7. 司机  542
8. 开车  488
9. 可能  479
10. 安全  455

完成。输出目录：analysis_results/
