In [5]:
# -*- coding: utf-8 -*-
"""
批量抓取 B 站弹幕（无需输入日期），从 url.txt 读取多个视频链接/BV，
对每条弹幕的 timestamp 统一写入该视频的“发布日期”（pubdate），
并在 bili_danmu_results/ 下保存分P CSV、每视频合并 CSV/TXT、以及全量汇总 CSV/TXT。

用法：
    python crawl_bili_danmu_batch.py
"""

import os, re, time, csv, datetime
import requests
from requests.adapters import HTTPAdapter, Retry

# ---------- 常量 ----------
HEADERS_BASE = {
    "origin": "https://www.bilibili.com",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
}
DM_RE = re.compile(rb':(.*?)@', re.S)            # 从 seg.so 提取纯文本弹幕
BV_RE = re.compile(r'(BV[0-9A-Za-z]{10,})')

RAW_DIR = os.path.join(os.getcwd(), "bili_danmu_results")
os.makedirs(RAW_DIR, exist_ok=True)

URL_LIST_FILE = "url.txt"  # 每行一个 BV 或视频链接，支持注释（以 # 开头）

# 可选：如需登录态稳定性，可在此处粘贴你的 Cookie（也可以留空）
DEFAULT_COOKIE = "enable_web_push=DISABLE; buvid4=441674D3-73C6-EE18-D7F6-0A9AA6A8764B10929-024061616-tMB8uhs7bNpfYIQVaVKjtQ%3D%3D; DedeUserID=499303036; DedeUserID__ckMd5=7c2e754fb5285a0b; buvid_fp_plain=undefined; enable_feed_channel=ENABLE; hit-dyn-v2=1; fingerprint=d30d288e57446a6e2075d79545bcdece; buvid_fp=d30d288e57446a6e2075d79545bcdece; buvid3=D8E29309-F4DF-1535-6A27-C5ADFB04F3B208505infoc; b_nut=1750090608; _uuid=EE91029AA-C3ED-2D10D-A31B-8610DA6DC1BEE28428infoc; header_theme_version=OPEN; theme-tip-show=SHOWED; theme-avatar-tip-show=SHOWED; rpdid=|(J~R~uR))~u0J'u~lJkJl)Y|; LIVE_BUVID=AUTO2217563800546588; PVID=2; CURRENT_QUALITY=80; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NTczMDE2NTEsImlhdCI6MTc1NzA0MjM5MSwicGx0IjotMX0.FWLzrbsTzZVO9pAvA2K-j8JpwDO4_00W2sqETwK91NA; bili_ticket_expires=1757301591; SESSDATA=fc10051c%2C1772610457%2C5b79d%2A91CjBHREFbDUZegrrFspSN0CJBiv6HiJF4vOUubDwDGrFJa2-yRA8gOqoqGpCwy29sjfwSVnVwNlBZR1hKb2VFaGI0NkZpcWZMYm5ubE5Kd2JFdkluaFp2Q1JfeFNLX1JqWWxxb19mUENfa193RkFUVDNPa3lZcUU4WmZrZ3ZlLXRpZ3RBRkQ5RnNRIIEC; bili_jct=e685ab8cae35933d20e223cb4b6a6d91; bmg_af_switch=1; bmg_src_def_domain=i2.hdslb.com; bsource=search_google; sid=7l86jgpi; b_lsid=D181510CA_1991E1E02C8; bp_t_offset_499303036=1109440247839588352; CURRENT_FNVAL=2000; home_feed_column=4; browser_resolution=1089-770"
# DEFAULT_COOKIE = "SESSDATA=xxx; bili_jct=yyy; ..."

if not DEFAULT_COOKIE:
    print("no cookie")

# ---------- 工具 ----------
def build_session(cookie=None, referer=None):
    s = requests.Session()
    headers = HEADERS_BASE.copy()
    if cookie:
        headers["cookie"] = cookie
    if referer:
        headers["referer"] = referer
    retries = Retry(
        total=3, backoff_factor=0.6,
        status_forcelist=[412, 429, 500, 502, 503, 504]
    )
    s.mount("https://", HTTPAdapter(max_retries=retries, pool_connections=16, pool_maxsize=16))
    s.headers.update(headers)
    return s

def extract_bvid(text: str) -> str:
    m = BV_RE.search(text)
    if not m:
        raise ValueError(f"未找到 BV 号：{text}")
    return m.group(1)

def read_urls(path=URL_LIST_FILE):
    if not os.path.exists(path):
        raise FileNotFoundError(f"未找到 {path}，请创建并写入视频链接或 BV 号")
    bvids = []
    seen = set()
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):  # 跳过注释/空行
                continue
            try:
                bvid = extract_bvid(line)
                if bvid not in seen:
                    seen.add(bvid)
                    bvids.append(bvid)
            except Exception as e:
                print(f"[WARN] 跳过无法解析的行：{line} ({e})")
    return bvids

def get_pagelist_by_bvid(session, bvid):
    """获取分P信息：[{cid, page, part, duration}, ...]"""
    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}"
    r = session.get(url, timeout=10)
    r.raise_for_status()
    j = r.json()
    if j.get("code", -1) != 0:
        raise RuntimeError(f"pagelist 接口失败：code={j.get('code')} msg={j.get('message')}")
    pages = j.get("data") or []
    if not pages:
        raise RuntimeError("pagelist 返回空，可能 BV 无效或需要登录。")
    return pages

def get_video_meta(session, bvid):
    """
    取视频基础信息（标题、发布日期）。
    x/web-interface/view 返回 data.pubdate（秒级 Unix 时间戳）
    """
    url = f"https://api.bilibili.com/x/web-interface/view?bvid={bvid}"
    r = session.get(url, timeout=10)
    r.raise_for_status()
    j = r.json()
    if j.get("code", -1) != 0 or not j.get("data"):
        raise RuntimeError(f"view 接口失败：code={j.get('code')} msg={j.get('message')}")
    data = j["data"]
    title = data.get("title", f"{bvid}")
    pub_ts = data.get("pubdate")  # int
    if not pub_ts:
        # 有些情况下可能返回 ctime；兜底一下
        pub_ts = data.get("ctime")
    if not pub_ts:
        # 再兜底为“今天”
        pub_dt = datetime.datetime.now()
    else:
        pub_dt = datetime.datetime.fromtimestamp(int(pub_ts))
    pubdate_str = pub_dt.strftime("%Y-%m-%d")  # 统一写“日期”到 timestamp
    return title, pubdate_str

def fetch_seg_bytes(session, cid, seg_idx):
    url = f"https://api.bilibili.com/x/v2/dm/web/seg.so?type=1&oid={cid}&segment_index={seg_idx}"
    r = session.get(url, timeout=10)
    r.raise_for_status()
    return r.content

def parse_danmu_from_seg(seg_bytes):
    # 仅提取“内容”，不解析更丰富字段（若需要可以改为 protobuf 解析）
    return [m.decode('utf-8', errors='ignore') for m in DM_RE.findall(seg_bytes)]

# ---------- I/O ----------
def save_csv(rows, path):
    with open(path, "w", encoding="utf-8-sig", newline="") as f:
        w = csv.DictWriter(f, fieldnames=["content", "video_title", "timestamp"])
        w.writeheader()
        for r in rows:
            w.writerow(r)

def save_txt(lines, path):
    with open(path, "w", encoding="utf-8") as f:
        for line in lines:
            f.write((line or "").strip() + "\n")

# ---------- 抓取一个视频 ----------
def crawl_one_bv(session, bvid, cookie=None, sleep_s=0.05):
    referer = f"https://www.bilibili.com/video/{bvid}"
    session.headers.update({"referer": referer})
    print(f"\n=== 处理 {bvid} ===")

    # 1) 元数据：标题 & 发布日期（timestamp 统一用它）
    video_title_main, pubdate_str = get_video_meta(session, bvid)
    print(f"[META] 标题: {video_title_main} | 发布日期: {pubdate_str}")

    # 2) 分 P 列表
    pages = get_pagelist_by_bvid(session, bvid)
    print(f"[META] 共 {len(pages)} 个分P")

    all_rows_video = []
    for p in pages:
        cid = p["cid"]
        part = p.get("part") or f"P{p.get('page', '?')}"
        title = f"{bvid} | {part} | {video_title_main}"
        print(f"  - 抓取 {part} (cid={cid}) …")

        # 3) seg.so 逐段抓
        all_items = []
        seg_idx = 1
        empty_runs = 0
        while True:
            try:
                seg_bytes = fetch_seg_bytes(session, cid, seg_idx)
            except requests.HTTPError:
                time.sleep(1.0)
                seg_bytes = fetch_seg_bytes(session, cid, seg_idx)

            contents = parse_danmu_from_seg(seg_bytes)
            if not contents:
                empty_runs += 1
                if empty_runs >= 2:
                    break
            else:
                empty_runs = 0
                for c in contents:
                    all_items.append({
                        "content": c,
                        "video_title": title,
                        # 统一写“视频发布日期”
                        "timestamp": pubdate_str
                    })
            seg_idx += 1
            time.sleep(sleep_s)

        print(f"    {part} 抓到 {len(all_items)} 条")
        all_rows_video.extend(all_items)

        # 分P各自保存
        csv_path = os.path.join(RAW_DIR, f"{bvid}_{cid}_{part}.csv")
        save_csv(all_items, csv_path)

    # 单视频合并输出
    ts = datetime.datetime.now().strftime("%Y%m%d%H%M")
    combined_csv = os.path.join(RAW_DIR, f"combined_raw_{bvid}_{ts}.csv")
    combined_txt = os.path.join(RAW_DIR, f"combined_raw_{bvid}_{ts}.txt")
    save_csv(all_rows_video, combined_csv)
    save_txt([r["content"] for r in all_rows_video], combined_txt)
    print(f"[OK] 单视频合计 {len(all_rows_video)} 条 → {os.path.basename(combined_csv)} / {os.path.basename(combined_txt)}")

    return all_rows_video  # 给总汇总用

# ---------- 主流程 ----------
def main():
    print("=" * 70)
    print("B站弹幕批量抓取（按 url.txt 列表；timestamp=视频发布日期）")
    print("=" * 70)

    bvids = read_urls(URL_LIST_FILE)
    if not bvids:
        print("未从 url.txt 读到任何 BV/链接。"); return

    sess = build_session(cookie=DEFAULT_COOKIE)

    grand_total_rows = []
    for bvid in bvids:
        try:
            rows = crawl_one_bv(sess, bvid, cookie=DEFAULT_COOKIE)
            grand_total_rows.extend(rows)
        except Exception as e:
            print(f"[ERROR] 处理 {bvid} 失败：{e}")

    # 全量汇总
    ts = datetime.datetime.now().strftime("%Y%m%d%H%M")
    all_csv = os.path.join(RAW_DIR, f"combined_all_{ts}.csv")
    all_txt = os.path.join(RAW_DIR, f"combined_all_{ts}.txt")
    save_csv(grand_total_rows, all_csv)
    save_txt([r["content"] for r in grand_total_rows], all_txt)

    print("\n====== 汇总完成 ======")
    print(f"总计抓到 {len(grand_total_rows)} 条弹幕")
    print(f"全量 CSV：{all_csv}")
    print(f"全量 TXT：{all_txt}")

if __name__ == "__main__":
    main()


B站弹幕批量抓取（按 url.txt 列表；timestamp=视频发布日期）

=== 处理 BV1iR8MzkEpc ===
[META] 标题: 问界/理想/小米/极氪/特斯拉 城市事故炼狱 26车都谁不能回家？ | 发布日期: 2025-07-24
[META] 共 1 个分P
  - 抓取 问界/理想/小米/极氪/特斯拉 城市事故炼狱 26车都谁不能回家？ (cid=31247631170) …
    问界/理想/小米/极氪/特斯拉 城市事故炼狱 26车都谁不能回家？ 抓到 28839 条
[ERROR] 处理 BV1iR8MzkEpc 失败：[Errno 2] No such file or directory: 'C:\\Users\\Andrew\\Desktop\\homework\\cc\\bili_danmu_results\\BV1iR8MzkEpc_31247631170_问界/理想/小米/极氪/特斯拉 城市事故炼狱 26车都谁不能回家？.csv'

=== 处理 BV1vz8FzDEyE ===
[META] 标题: 全球首次 问界/理想/小米/特斯拉 36辆辅助驾驶高速事故搏命 你敢把命交给车吗？ | 发布日期: 2025-07-23
[META] 共 1 个分P
  - 抓取 全球首次 问界/理想/小米/特斯拉 36辆辅助驾驶高速事故搏命 你敢把命交给车吗？ (cid=31226333587) …
    全球首次 问界/理想/小米/特斯拉 36辆辅助驾驶高速事故搏命 你敢把命交给车吗？ 抓到 15717 条
[ERROR] 处理 BV1vz8FzDEyE 失败：[Errno 2] No such file or directory: 'C:\\Users\\Andrew\\Desktop\\homework\\cc\\bili_danmu_results\\BV1vz8FzDEyE_31226333587_全球首次 问界/理想/小米/特斯拉 36辆辅助驾驶高速事故搏命 你敢把命交给车吗？.csv'

=== 处理 BV19T8wznEHB ===
[META] 标题: 复现高速施工驾驶辅助事故 问界/小米/理想等 谁会选择撞卡车自断A柱？ | 发布日期: 2025-07-23
[META] 共 1 个分P
  - 抓取 复现高速施

In [6]:
# -*- coding: utf-8 -*-
"""
B站弹幕批量清洗工具（仅处理汇总文件 combined_all_*）
- 只读取 bili_danmu_results/combined_all_*.csv（优先）或同名 .txt（兜底）
- 抽取→去噪→规范化→去重，保留 video_title/timestamp
- 归一化日期：YYYY/MM/DD、YYYY.M.D 等 → YYYY-MM-DD
- 输出 cleaned_danmu_results/：
    - combined_cleaned_danmu_*.txt / *.csv
    - removed_reasons_*.csv （被删除条目与原因）
    - debug_original_vs_cleaned_*.csv （抽样对照）
"""

import re
import os
import glob
import datetime
import unicodedata
import pandas as pd
from collections import Counter

RAW_DIR = os.path.join(os.getcwd(), "bili_danmu_results")
OUT_DIR = os.path.join(os.getcwd(), "cleaned_danmu_results")
os.makedirs(OUT_DIR, exist_ok=True)

# ---------- 预编译正则 ----------
RE_CTRL = re.compile(r'[\x00-\x1F\x7F-\x9F]+')                    # 控制字符
RE_HTML = re.compile(r'<[^>]+>')
RE_URL  = re.compile(r'https?://\S+')
# 仅保留：中英文、数字、常见中文标点、连字符
RE_KEEP = re.compile(r'[^\w\u4e00-\u9fa5，。！？、；："\'()【】《》·—…\-]+')
RE_SP   = re.compile(r'\s+')
RE_MULTI_PUNCT = re.compile(r'([，。！？…])\1+')
# 常见行首噪声块（括号/奇怪字节/短hex/孤立字母/标点）组合
RE_NOISY_PREFIX = re.compile(
    r'^\s*(?:[\(\)\[\]{}<>\|\\/\-_=+~^`·•]+|[A-Za-z](?=\s)|[0-9A-Fa-f]{6,10}|[,.:;，。；：!！?？\s])+'
)

def _cut_after_colon(raw: str) -> str:
    """如含冒号，取最后一个冒号后的片段"""
    if ':' in raw:
        return raw.split(':')[-1]
    return raw

def _strip_to_first_textual_char(s: str) -> str:
    """剥离前缀噪声块，定位第一个有效字符（中英数）"""
    s = RE_NOISY_PREFIX.sub('', s)
    idx = None
    for i, ch in enumerate(s):
        if ch.isalnum() or ('\u4e00' <= ch <= '\u9fa5'):
            idx = i
            break
    return s[idx:] if idx is not None else ''

def _normalize_text(s: str) -> str:
    s = RE_CTRL.sub(' ', s)                    # 删控制字符（含你样例里的奇怪前导字节）
    s = unicodedata.normalize('NFKC', s)       # 全角→半角
    s = RE_HTML.sub('', s)                     # 去HTML
    s = RE_URL.sub('', s)                      # 去URL
    s = RE_KEEP.sub(' ', s)                    # 过滤到允许字符集
    s = RE_MULTI_PUNCT.sub(r'\1', s)           # 连续标点压缩
    s = RE_SP.sub(' ', s).strip()              # 空白规整
    return s

def _looks_like_noise(s: str) -> bool:
    """噪声判定：纯数字/单字母/太短/文本占比低"""
    if not s:
        return True
    if s.isdigit():
        return True
    if len(s) == 1 and s.isalnum():
        return True
    if len(s) < 2:
        return True
    total = len(s)
    keep = sum(1 for ch in s if ch.isalnum() or '\u4e00' <= ch <= '\u9fa5')
    if keep / max(1, total) < 0.3:
        return True
    return False

def _normalize_date(ts: str) -> str:
    """
    归一化日期到 YYYY-MM-DD：
    支持：YYYY/M/D, YYYY-M-D, YYYY.M.D, YYYY_MM_DD（都可1位月日）
    其他情况原样返回（或空则返回空）
    """
    if not ts or not str(ts).strip():
        return ""
    s = str(ts).strip()
    s = RE_CTRL.sub(' ', s)         # 防止控制字符
    s = s.replace('.', '-').replace('/', '-').replace('_', '-')
    parts = [p for p in s.split('-') if p]
    if len(parts) >= 3 and parts[0].isdigit():
        y = parts[0]
        m = parts[1].zfill(2) if parts[1].isdigit() else parts[1]
        d = parts[2].zfill(2) if parts[2].isdigit() else parts[2]
        try:
            dt = datetime.date(int(y), int(m), int(d))
            return dt.strftime("%Y-%m-%d")
        except Exception:
            return s
    return s

def clean_one(raw: str) -> str:
    """单条弹幕清洗：抽取→去噪→规范化→长度保护"""
    s = str(raw or '')
    s = _cut_after_colon(s)
    s = RE_CTRL.sub(' ', s)              # 先去控制符，再定位文本起点
    s = _strip_to_first_textual_char(s)
    s = _normalize_text(s)
    if len(s) > 100:
        s = s[:100].rstrip()
    return s

# ---------- 文件选择：仅 combined_all_* ----------
def pick_combined_all_files():
    """
    优先 CSV：
      - 若存在多个 combined_all_*.csv，按修改时间取“最新的1个”
      - 若没有 CSV，则回退到最新的 combined_all_*.txt
    返回：文件路径列表（长度=1），并标注类型
    """
    csvs = glob.glob(os.path.join(RAW_DIR, "combined_all_*.csv"))
    txts = glob.glob(os.path.join(RAW_DIR, "combined_all_*.txt"))

    pick = None
    ftype = None

    if csvs:
        pick = max(csvs, key=os.path.getmtime)
        ftype = "csv"
    elif txts:
        pick = max(txts, key=os.path.getmtime)
        ftype = "txt"

    if not pick:
        return []
    print(f"将仅清洗：{os.path.basename(pick)}")
    return [(pick, ftype)]

def read_records_from_csv(path: str):
    df = pd.read_csv(path)
    for col in ["content", "video_title", "timestamp"]:
        if col not in df.columns:
            df[col] = ""
    df = df[["content", "video_title", "timestamp"]]
    # 归一化 timestamp
    df["timestamp"] = df["timestamp"].map(_normalize_date)
    return df.to_dict("records")

def read_records_from_txt(path: str):
    # 一行一条，仅有 content；video_title/timestamp 置空
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            rows.append({
                "content": line,
                "video_title": "",
                "timestamp": ""
            })
    return rows

def read_combined_all():
    picks = pick_combined_all_files()
    if not picks:
        print(f"未在 {RAW_DIR} 找到 combined_all_*.csv 或 .txt")
        return []
    path, ftype = picks[0]
    print(f"读取：{os.path.basename(path)}")
    if ftype == "csv":
        return read_records_from_csv(path)
    else:
        return read_records_from_txt(path)

# ---------- 清洗主流程 ----------
def clean_danmu_content(records):
    """
    输入：记录列表（至少含 content）
    输出：
      - cleaned: [{original, cleaned, video_title, timestamp}]
      - removed: [(original, reason)]
    """
    cleaned, removed = [], []

    for dm in records:
        raw = str(dm.get("content", "") or "")
        if not raw:
            removed.append((raw, "empty")); continue

        out = clean_one(raw)
        if _looks_like_noise(out):
            removed.append((raw, "noise")); continue

        cleaned.append({
            "original": raw,
            "cleaned": out,
            "video_title": dm.get("video_title", "未知视频"),
            "timestamp": _normalize_date(dm.get("timestamp", ""))  # 再保险
        })

    # 去重（按 cleaned）
    seen, dedup = set(), []
    for row in cleaned:
        key = row["cleaned"]
        if key in seen:
            removed.append((row["original"], "duplicate")); continue
        seen.add(key)
        dedup.append(row)

    return dedup, removed

# ---------- 输出 ----------
def save_outputs(cleaned, removed):
    ts = datetime.datetime.now().strftime("%Y%m%d%H%M")

    txt_path = os.path.join(OUT_DIR, f"combined_cleaned_danmu_{ts}.txt")
    csv_path = os.path.join(OUT_DIR, f"combined_cleaned_danmu_{ts}.csv")

    with open(txt_path, "w", encoding="utf-8") as f:
        for r in cleaned:
            f.write(r["cleaned"] + "\n")
    pd.DataFrame(cleaned).to_csv(csv_path, index=False, encoding="utf-8-sig")

    print(f"\n清洗后保存：\n- {txt_path}\n- {csv_path}")

    if removed:
        rm_path = os.path.join(OUT_DIR, f"removed_reasons_{ts}.csv")
        pd.DataFrame(removed, columns=["original", "reason"]).to_csv(
            rm_path, index=False, encoding="utf-8-sig")
        cnt = Counter([r for _, r in removed])
        print("移除原因分布：", dict(cnt))
        print(f"移除明细：\n- {rm_path}")

    dbg_path = os.path.join(OUT_DIR, f"debug_original_vs_cleaned_{ts}.csv")
    pd.DataFrame(cleaned)[["original", "cleaned"]].head(200)\
        .to_csv(dbg_path, index=False, encoding="utf-8-sig")
    print(f"调试对照样本：\n- {dbg_path}")

# ---------- 入口 ----------
def main():
    print("="*60)
    print("B站弹幕清洗（仅处理 combined_all_* 汇总文件）")
    print("="*60)

    records = read_combined_all()
    if not records:
        return
    print(f"读取记录数：{len(records)}")

    cleaned, removed = clean_danmu_content(records)
    keep_ratio = len(cleaned) / max(1, len(records)) * 100
    print(f"\n保留 {len(cleaned)}/{len(records)} ({keep_ratio:.1f}%)")

    save_outputs(cleaned, removed)
    print("\n完成。")

if __name__ == "__main__":
    main()


B站弹幕清洗（仅处理 combined_all_* 汇总文件）
将仅清洗：combined_all_202509061721.csv
读取：combined_all_202509061721.csv
读取记录数：113175

保留 86961/113175 (76.8%)

清洗后保存：
- C:\Users\Andrew\Desktop\homework\cc\cleaned_danmu_results\combined_cleaned_danmu_202509061738.txt
- C:\Users\Andrew\Desktop\homework\cc\cleaned_danmu_results\combined_cleaned_danmu_202509061738.csv
移除原因分布： {'noise': 13304, 'duplicate': 12910}
移除明细：
- C:\Users\Andrew\Desktop\homework\cc\cleaned_danmu_results\removed_reasons_202509061738.csv
调试对照样本：
- C:\Users\Andrew\Desktop\homework\cc\cleaned_danmu_results\debug_original_vs_cleaned_202509061738.csv

完成。


In [10]:
# -*- coding: utf-8 -*-
"""
B站弹幕分析 Pro（适配 cleaned_danmu_results）
- 只读取最新 cleaned CSV（combined_cleaned_danmu_*.csv）
- 分词：词性过滤 + 停用词（含口语补充） + 可选行业词典
- 向量化：1-2gram TF-IDF（min_df/max_df 抑制口水词）
- 自适应聚类：KMeans + 轮廓系数选 K（cosine）
- 主题命名：非停用词关键词 Top2
- 情感：SnowNLP（<4字判中性），汇总主题；并新增“按月情感走势”
- 可视化：词云、主题分布饼图 + 主题情感堆叠柱状 + 关键词表 + 月度情感折线
- 导出：词频、主题关键词/情感、Top n-gram、月度情感
"""

import os, re, glob, time, datetime
import numpy as np
import pandas as pd
import jieba, jieba.posseg as pseg
from collections import Counter
from wordcloud import WordCloud
from snownlp import SnowNLP
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib.gridspec import GridSpec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# ===================== 参数 =====================
CLEANED_DIR = "cleaned_danmu_results"
OUT_DIR     = "analysis_results"
STOP_PATH   = "cn_stopwords.txt"      # 可空
USER_DICT   = "user_dict.txt"         # 可空

# 词性保留（名词/动词/术语/英文等）
KEEP_POS = {"n","nr","ns","nt","nz","vn","v","eng","nw","an","i","j","ni","nl","ng"}

# 弹幕口语停用词补充
DANMU_STOP_EXTRA = {
    "这个","那个","就是","什么","还有","然后","但是","所以","还是","已经","真的","感觉","觉得","知道",
    "可以","不可以","不会","不能","应该","可能","有点","有些","怎么","为啥","为什么",
    "啊","呀","呢","吧","哦","哇","诶","嘛","哈","哈哈","哈哈哈","emm","嗯","啊啊","呜呜",
    "视频","弹幕","现在","今天","昨天","明天","这里","那里","这样","那样","很多","非常"
}

# TF-IDF
MAX_FEATURES = 4000
MIN_DF       = 5
MAX_DF       = 0.6
NGRAM        = (1,2)

# K 范围
K_MIN, K_MAX = 2, 8

# 词云 TopN
WORDCLOUD_TOPN = 150

# 小句情感中性阈
SHORT_NEUTRAL_LEN = 4
# =================================================

# ---------- 字体 ----------
def setup_chinese_font():
    candidates = ['SimHei','Microsoft YaHei','SimSun','KaiTi',
                  'Noto Sans CJK SC','Source Han Sans SC','Arial Unicode MS']
    picked = None
    for name in candidates:
        if any(name in f.name for f in fm.fontManager.ttflist):
            picked = name; break
    if picked:
        plt.rcParams['font.family'] = picked
    plt.rcParams['axes.unicode_minus'] = False
    font_path = None
    if picked:
        for f in fm.findSystemFonts():
            if picked.lower() in os.path.basename(f).lower():
                font_path = f; break
    if not font_path:
        font_path = fm.findfont(fm.FontProperties(family='sans-serif'))
    return font_path

WC_FONT = setup_chinese_font()

# ---------- 基础工具 ----------
CTRL_RE  = re.compile(r'[\x00-\x1F\x7F-\x9F]+')
EMOJI_RE = re.compile(r'[\U00010000-\U0010ffff]', flags=re.UNICODE)

def load_stopwords(path=STOP_PATH):
    base = set()
    if path and os.path.exists(path):
        with open(path,'r',encoding='utf-8') as f:
            base = {x.strip() for x in f if x.strip()}
    base |= set(list("，。、！？；：”“‘‘（）()[]【】—…- "))
    base |= DANMU_STOP_EXTRA
    return base

def maybe_load_user_dict():
    if USER_DICT and os.path.exists(USER_DICT):
        jieba.load_userdict(USER_DICT)
        print(f"[INFO] Loaded user dictionary: {USER_DICT}")

def find_latest_cleaned_csv():
    csvs = glob.glob(os.path.join(CLEANED_DIR, "combined_cleaned_danmu_*.csv"))
    if not csvs: return None
    return max(csvs, key=os.path.getmtime)

def normalize_date_str(s):
    """兼容 2025/8/12, 2025.8.12, 2025-8-12 等 → YYYY-MM-DD"""
    if not isinstance(s, str): s = str(s or "")
    s = CTRL_RE.sub(" ", s).strip().replace("/", "-").replace(".", "-").replace("_", "-")
    parts = [p for p in s.split("-") if p]
    if len(parts) >= 3 and parts[0].isdigit() and parts[1].isdigit() and parts[2].isdigit():
        try:
            dt = datetime.date(int(parts[0]), int(parts[1]), int(parts[2]))
            return dt.strftime("%Y-%m-%d")
        except Exception:
            return s
    return s

def load_cleaned_df(latest_csv):
    df = pd.read_csv(latest_csv)
    # 兼容列名
    if "cleaned" not in df.columns:
        raise ValueError("Column 'cleaned' not found in the cleaned CSV")
    for col in ["video_title","timestamp"]:
        if col not in df.columns:
            df[col] = ""
    # 轻量去噪
    df["cleaned"] = df["cleaned"].astype(str).map(lambda x: EMOJI_RE.sub("", CTRL_RE.sub(" ", x)).strip())
    # 日期归一化
    df["timestamp"] = df["timestamp"].astype(str).map(normalize_date_str)
    return df

def segment_docs(lines, stop):
    all_words, docs, kept_lines = [], [], []
    for s in lines:
        if len(s) < 2:
            continue
        tokens = []
        for w, flag in pseg.cut(s):
            if (w not in stop) and (len(w) > 1) and (flag in KEEP_POS):
                tokens.append(w)
        if tokens:
            all_words.extend(tokens)
            docs.append(" ".join(tokens))
            kept_lines.append(s)
    return all_words, docs, kept_lines

def save_wordfreq(counter):
    os.makedirs(OUT_DIR, exist_ok=True)
    ts = time.strftime("%Y%m%d%H%M")
    out = os.path.join(OUT_DIR, f"word_frequency_{ts}.csv")
    with open(out,'w',encoding='utf-8') as fw:
        fw.write("rank,word,freq\n")
        for i,(w,f) in enumerate(counter.most_common(),1):
            fw.write(f"{i},{w},{f}\n")
    return out

def draw_wordcloud(counter, top_n=WORDCLOUD_TOPN):
    if not counter: return None
    wc = WordCloud(width=1200, height=700, background_color='white', font_path=WC_FONT)
    img = wc.generate_from_frequencies(dict(counter.most_common(top_n)))
    plt.figure(figsize=(12,7))
    plt.imshow(img, interpolation='bilinear'); plt.axis('off'); plt.title('High-frequency Word Cloud', fontsize=16)
    ts = time.strftime("%Y%m%d%H%M")
    out = os.path.join(OUT_DIR, f"wordcloud_{ts}.png")
    plt.savefig(out, dpi=300, bbox_inches='tight'); plt.close()
    return out

def top_ngrams(docs, n=2, topk=50):
    c = Counter()
    for d in docs:
        toks = d.split()
        for i in range(len(toks)-n+1):
            c[" ".join(toks[i:i+n])] += 1
    return c.most_common(topk)

# ---------- 聚类 ----------
def auto_kmeans(docs, min_k=K_MIN, max_k=K_MAX, max_features=MAX_FEATURES):
    vec = TfidfVectorizer(max_features=max_features, ngram_range=NGRAM,
                          min_df=MIN_DF, max_df=MAX_DF)
    X = vec.fit_transform(docs)
    n = X.shape[0]

    if n < 60:  # 少样本保护
        k = max(2, min(4, n // 15 or 2))
        model = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
        return model, vec, None

    X_eval = X
    if n > 6000:  # 抽样评估
        idx = np.random.RandomState(42).choice(n, 6000, replace=False)
        X_eval = X[idx]

    best_k, best_score, best_model = None, -1, None
    for k in range(min_k, max_k+1):
        km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
        labels_eval = km.labels_ if X_eval is X else km.predict(X_eval)
        score = silhouette_score(X_eval, labels_eval, metric='cosine')
        if score > best_score:
            best_k, best_score, best_model = k, score, km
    return best_model, vec, best_score

def extract_cluster_keywords(model, vectorizer, stop_extra, topn=10):
    terms = (vectorizer.get_feature_names_out()
             if hasattr(vectorizer, "get_feature_names_out")
             else vectorizer.get_feature_names())
    order = model.cluster_centers_.argsort()[:, ::-1]
    keywords, names = [], []
    for i in range(model.n_clusters):
        keys = []
        for j in range(topn*3):
            idx = order[i, j]
            if idx < len(terms):
                t = terms[idx]
                if all(tok not in stop_extra for tok in t.split()):
                    keys.append(t)
            if len(keys) >= topn:
                break
        keywords.append(keys[:topn])
        names.append(" ".join(keys[:2]) if keys else f"Topic {i+1}")
    return names, keywords

def sentiment_ratio(texts):
    if not texts: return (0,0,0)
    pos = neu = neg = 0
    for t in texts:
        t = t.strip()
        if len(t) < SHORT_NEUTRAL_LEN:
            neu += 1; continue
        s = SnowNLP(t).sentiments
        if s > 0.6: pos += 1
        elif s < 0.4: neg += 1
        else: neu += 1
    total = len(texts)
    return pos/total, neu/total, neg/total

def visualize_clusters(theme_names, theme_counts, sentiments, key_table):
    if not theme_names: return None
    os.makedirs(OUT_DIR, exist_ok=True)
    fig = plt.figure(figsize=(16,12))
    gs = GridSpec(2, 2, figure=fig)

    # Pie: Topic distribution
    ax1 = fig.add_subplot(gs[0,0])
    wedges, _, _ = ax1.pie(theme_counts, labels=None, autopct='%1.1f%%',
                           startangle=90, pctdistance=0.8, labeldistance=1.4)
    legend_labels = [f"{nm}: {cnt} items ({cnt/sum(theme_counts)*100:.1f}%)"
                     for nm, cnt in zip(theme_names, theme_counts)]
    ax1.legend(legend_labels, loc='center left', bbox_to_anchor=(-0.32, 0))
    ax1.set_title("Topic Distribution", fontsize=15)

    # Stacked bars: Sentiment distribution
    ax2 = fig.add_subplot(gs[0,1])
    idx = np.arange(len(theme_names))
    pos = [sentiments[n]['positive'] for n in theme_names]
    neu = [sentiments[n]['neutral'] for n in theme_names]
    neg = [sentiments[n]['negative'] for n in theme_names]
    barw = 0.65
    ax2.bar(idx, pos, width=barw, color='#4CAF50', label='Positive')
    ax2.bar(idx, neu, width=barw, bottom=pos, color='#2196F3', label='Neutral')
    ax2.bar(idx, neg, width=barw, bottom=[i+j for i,j in zip(pos,neu)],
            color='#F44336', label='Negative')
    ax2.set_xticks(idx)
    ax2.set_xticklabels(theme_names, rotation=45, ha='right')
    ax2.set_ylim(0,1)
    ax2.set_ylabel("Ratio"); ax2.set_title("Sentiment Distribution", fontsize=15)
    ax2.legend(loc='upper right', bbox_to_anchor=(1.15,1))

    # Keywords table
    ax3 = fig.add_subplot(gs[1,:]); ax3.axis('off')
    cols = [f"Keyword {i+1}" for i in range(max(len(r) for r in key_table) if key_table else 10)]
    data = [row + [""]*(len(cols)-len(row)) for row in key_table]
    table = ax3.table(cellText=data, rowLabels=theme_names, colLabels=cols, loc='center')
    table.auto_set_font_size(False); table.set_fontsize(10); table.scale(1,1.5)
    ax3.set_title("Cluster Keywords", fontsize=15, y=0.98)

    plt.subplots_adjust(top=0.93, bottom=0.08, left=0.08, right=0.95,
                        hspace=0.6, wspace=0.35)
    ts = time.strftime("%Y%m%d%H%M")
    out = os.path.join(OUT_DIR, f"cluster_summary_{ts}.png")
    plt.savefig(out, dpi=300, bbox_inches='tight'); plt.close(fig)
    return out

# ---------- 月度情感 ----------
def monthly_sentiment(df):
    """
    读取 cleaned（原句）和 timestamp（日期），按月汇总积极/中性/消极比例
    """
    if "timestamp" not in df.columns:
        return None, None
    tmp = df.copy()
    tmp["date"] = tmp["timestamp"].map(normalize_date_str)
    tmp = tmp[tmp["date"].str.len() >= 8]
    if tmp.empty:
        return None, None

    tmp["month"] = tmp["date"].map(lambda x: x[:7])  # YYYY-MM
    recs = []
    for m, sub in tmp.groupby("month"):
        pos = neu = neg = 0
        total = 0
        for t in sub["cleaned"].astype(str):
            t = t.strip()
            if not t: continue
            if len(t) < SHORT_NEUTRAL_LEN:
                neu += 1
            else:
                s = SnowNLP(t).sentiments
                if s > 0.6: pos += 1
                elif s < 0.4: neg += 1
                else: neu += 1
            total += 1
        if total > 0:
            recs.append({"month": m,
                         "positive": pos/total,
                         "neutral": neu/total,
                         "negative": neg/total,
                         "count": total})
    if not recs:
        return None, None

    ms_df = pd.DataFrame(sorted(recs, key=lambda x: x["month"]))
    # 可视化
    plt.figure(figsize=(12,6))
    x = np.arange(len(ms_df))
    plt.plot(x, ms_df["positive"], marker="o", label="Positive")
    plt.plot(x, ms_df["neutral"], marker="o", label="Neutral")
    plt.plot(x, ms_df["negative"], marker="o", label="Negative")
    plt.xticks(x, ms_df["month"], rotation=45, ha="right")
    plt.ylim(0,1); plt.ylabel("Ratio"); plt.title("Monthly Sentiment Trends")
    plt.legend()
    ts = time.strftime("%Y%m%d%H%M")
    out = os.path.join(OUT_DIR, f"monthly_sentiment_{ts}.png")
    plt.tight_layout(); plt.savefig(out, dpi=300); plt.close()
    csv_out = os.path.join(OUT_DIR, f"monthly_sentiment_{ts}.csv")
    ms_df.to_csv(csv_out, index=False, encoding="utf-8-sig")
    return csv_out, out

# ---------- 主流程 ----------
def main():
    os.makedirs(OUT_DIR, exist_ok=True)
    print("="*64)
    print("Danmu Analysis Pro (Word Frequency / Word Cloud / Adaptive Clustering / Sentiment / Monthly Trends)")
    print("="*64)

    latest = find_latest_cleaned_csv()
    if not latest:
        print("No cleaned CSV found, please run preprocessing first."); return
    print(f"[INFO] Analyzing file: {latest}")

    maybe_load_user_dict()
    stop = load_stopwords(STOP_PATH)

    df = load_cleaned_df(latest)
    lines = df["cleaned"].astype(str).tolist()
    print(f"[INFO] Number of sentences after cleaning: {len(lines)}")

    jieba.initialize()
    all_words, docs, kept_lines = segment_docs(lines, stop)
    if not docs:
        print("[WARN] Empty after segmentation (stopwords too aggressive or texts too short)."); return
    print(f"[INFO] Number of documents after segmentation: {len(docs)}; total tokens: {len(all_words)}")

    # 词频、词云、Top bigram
    counter = Counter(all_words)
    freq_csv = save_wordfreq(counter); print(f"[OK] Word frequency file: {freq_csv}")
    wc_path = draw_wordcloud(counter)
    if wc_path: print(f"[OK] Word cloud: {wc_path}")
    ts = time.strftime("%Y%m%d%H%M")
    bigram = top_ngrams(docs, n=2, topk=50)
    bigram_csv = os.path.join(OUT_DIR, f"top_bigram_{ts}.csv")
    pd.DataFrame(bigram, columns=["bigram","count"]).to_csv(bigram_csv, index=False, encoding='utf-8-sig')
    print(f"[OK] Top bigram file: {bigram_csv}")

    # 聚类
    if len(docs) >= 20:
        print("[INFO] Adaptive clustering …")
        model, vec, s_score = auto_kmeans(docs)
        if s_score is not None:
            print(f"[INFO] Silhouette score: K={model.n_clusters}, score={s_score:.4f}")
        labels = model.labels_

        # 主题命名 + 关键词
        theme_names, key_table = extract_cluster_keywords(model, vec, DANMU_STOP_EXTRA, topn=10)

        # 每簇原句（情感用）
        groups = {i: [] for i in range(model.n_clusters)}
        for i, lbl in enumerate(labels):
            groups[lbl].append(kept_lines[i])

        theme_counts = [len(groups[i]) for i in range(model.n_clusters)]
        sentiments = {}
        for i, nm in enumerate(theme_names):
            p,u,n = sentiment_ratio(groups[i])
            sentiments[nm] = {"positive": p, "neutral": u, "negative": n}

        # 图表
        img = visualize_clusters(theme_names, theme_counts, sentiments, key_table)
        if img: print(f"[OK] Cluster overview: {img}")

        # 导出 CSV
        kw_csv = os.path.join(OUT_DIR, f"cluster_keywords_{ts}.csv")
        pd.DataFrame({"theme": theme_names, **{f"kw{i+1}":[row[i] if i<len(row) else "" for row in key_table] for i in range(10)}})\
          .to_csv(kw_csv, index=False, encoding='utf-8-sig')
        print(f"[OK] Cluster keywords file: {kw_csv}")

        sent_csv = os.path.join(OUT_DIR, f"cluster_sentiment_{ts}.csv")
        pd.DataFrame([{"theme": nm, **sentiments[nm], "count": cnt, "percentage": cnt/len(docs)}
                      for nm, cnt in zip(theme_names, theme_counts)])\
          .to_csv(sent_csv, index=False, encoding='utf-8-sig')
        print(f"[OK] Cluster sentiment file: {sent_csv}")
    else:
        print("[WARN] Documents < 20, skip clustering/sentiment.")

    # 月度情感走势
    csv_out, img_out = monthly_sentiment(df)
    if csv_out:
        print(f"[OK] Monthly sentiment file: {csv_out}")
        print(f"[OK] Monthly sentiment chart: {img_out}")
    else:
        print("[INFO] Monthly sentiment not generated (timestamp missing or <2 months of samples).")

    # 摘要
    print("\nTop 10 Words:")
    for i,(w,f) in enumerate(counter.most_common(10),1):
        print(f"{i}. {w}  {f}")
    print("\nDone. Output directory: analysis_results/")

if __name__ == "__main__":
    main()


Danmu Analysis Pro (Word Frequency / Word Cloud / Adaptive Clustering / Sentiment / Monthly Trends)
[INFO] Analyzing file: cleaned_danmu_results\combined_cleaned_danmu_202509061738.csv
[INFO] Number of sentences after cleaning: 86961
[INFO] Number of documents after segmentation: 78047; total tokens: 219504
[OK] Word frequency file: analysis_results\word_frequency_202509061957.csv
[OK] Word cloud: analysis_results\wordcloud_202509061957.png
[OK] Top bigram file: analysis_results\top_bigram_202509061957.csv
[INFO] Adaptive clustering …
[INFO] Silhouette score: K=7, score=0.0194
[OK] Cluster overview: analysis_results\cluster_summary_202509062001.png
[OK] Cluster keywords file: analysis_results\cluster_keywords_202509061957.csv
[OK] Cluster sentiment file: analysis_results\cluster_sentiment_202509061957.csv
[OK] Monthly sentiment file: analysis_results\monthly_sentiment_202509062005.csv
[OK] Monthly sentiment chart: analysis_results\monthly_sentiment_202509062005.png

Top 10 Words:
1. 驾驶

In [9]:
# -*- coding: utf-8 -*-
"""
Sentiment Analysis for Cleaned Bilibili Danmaku
- Input: latest cleaned file from cleaned_danmu_results (CSV preferred; TXT fallback)
- Scoring: SnowNLP (0..1, higher = more positive)
- Buckets (five-level):
    Very Negative (<0.3) | Negative (0.3-0.4) | Neutral (0.4-0.6) | Positive (0.6-0.7) | Very Positive (>0.7)
- Outputs (sentiment_results/):
    * Interactive bar chart (HTML, pyecharts)
    * Static histogram (PNG, matplotlib)
    * Scored table CSV (text, score, label)
    * Summary JSON
    * Extreme-sample snippets (most negative / most positive)
"""

import os, glob, time, json
import numpy as np
import pandas as pd
from snownlp import SnowNLP
from pyecharts.charts import Bar
from pyecharts import options as opts
import matplotlib.pyplot as plt
from tqdm import tqdm

# ----------------------- Paths -----------------------
CLEANED_DIR = "cleaned_danmu_results"
OUT_DIR     = "sentiment_results"
os.makedirs(OUT_DIR, exist_ok=True)

# ----------------------- Buckets -----------------------
# Boundaries reflect the spec precisely.
BUCKETS = [
    ("Very Negative", lambda s: s < 0.3),
    ("Negative",      lambda s: 0.3 <= s < 0.4),
    ("Neutral",       lambda s: 0.4 <= s <= 0.6),   # inclusive both ends
    ("Positive",      lambda s: 0.6 <  s <= 0.7),
    ("Very Positive", lambda s: s > 0.7),
]

def find_latest_cleaned_file():
    """Prefer CSV, fallback to TXT."""
    csvs = glob.glob(os.path.join(CLEANED_DIR, "combined_cleaned_danmu_*.csv"))
    if csvs:
        latest = max(csvs, key=os.path.getmtime)
        print(f"[INFO] Using cleaned CSV: {os.path.basename(latest)}")
        return latest
    txts = glob.glob(os.path.join(CLEANED_DIR, "combined_cleaned_danmu_*.txt"))
    if txts:
        latest = max(txts, key=os.path.getmtime)
        print(f"[INFO] Using cleaned TXT: {os.path.basename(latest)}")
        return latest
    print("[ERROR] No cleaned file found. Run the cleaning step first.")
    return None

def load_cleaned_lines(path):
    """Load 'cleaned' texts (CSV -> 'cleaned' column; TXT -> lines)."""
    if path.endswith(".csv"):
        df = pd.read_csv(path)
        if "cleaned" not in df.columns:
            raise ValueError("Column 'cleaned' not found in CSV.")
        lines = [str(x).strip() for x in df["cleaned"].fillna("")]
    else:
        with open(path, "r", encoding="utf-8") as f:
            lines = [line.strip() for line in f]
    # Filter out blanks
    return [s for s in lines if s]

def score_texts(lines, show_progress=True):
    """SnowNLP scoring with safety fallback."""
    scores = []
    iterator = tqdm(lines, desc="Scoring", unit="danmaku") if show_progress else lines
    for t in iterator:
        try:
            scores.append(float(SnowNLP(t).sentiments))
        except Exception:
            scores.append(0.5)  # neutral fallback
    return scores

def bucketize(scores):
    """Count per 5-level bucket."""
    labels = [b[0] for b in BUCKETS]
    counts = [0]*len(BUCKETS)
    for s in scores:
        for i, (_, cond) in enumerate(BUCKETS):
            if cond(s):
                counts[i] += 1
                break
    return labels, counts

def export_scored_table(lines, scores, ts):
    """Save per-text scores & labels to CSV for downstream analysis."""
    def label_of(s):
        for name, cond in BUCKETS:
            if cond(s): return name
        return "Unknown"
    df = pd.DataFrame({"text": lines, "score": scores, "label": [label_of(s) for s in scores]})
    out = os.path.join(OUT_DIR, f"sentiment_scored_{ts}.csv")
    df.to_csv(out, index=False, encoding="utf-8-sig")
    return out, df

def export_extremes(df, ts, k=30):
    """Save top-K most negative/positive samples (for qualitative quotes)."""
    neg = df.sort_values("score", ascending=True).head(k)
    pos = df.sort_values("score", ascending=False).head(k)
    out = os.path.join(OUT_DIR, f"sentiment_extremes_{ts}.txt")
    with open(out, "w", encoding="utf-8") as f:
        f.write("=== Most Negative Samples ===\n")
        for i, r in neg.iterrows():
            f.write(f"[{r['score']:.4f}] {r['text']}\n")
        f.write("\n=== Most Positive Samples ===\n")
        for i, r in pos.iterrows():
            f.write(f"[{r['score']:.4f}] {r['text']}\n")
    return out

def bar_chart_html(labels, counts, avg_score, ts):
    """Interactive pyecharts bar."""
    bar = (
        Bar()
        .add_xaxis(labels)
        .add_yaxis("Count", counts)
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="Danmaku Sentiment Distribution",
                subtitle=f"Average Sentiment: {avg_score:.4f} | N={sum(counts)}"
            ),
            xaxis_opts=opts.AxisOpts(name="Category"),
            yaxis_opts=opts.AxisOpts(name="Count"),
            toolbox_opts=opts.ToolboxOpts(),
            datazoom_opts=[opts.DataZoomOpts()]
        )
        .set_series_opts(
            label_opts=opts.LabelOpts(is_show=True),
            markpoint_opts=opts.MarkPointOpts(data=[opts.MarkPointItem(type_="max", name="Max")]),
        )
    )
    out = os.path.join(OUT_DIR, f"sentiment_distribution_{ts}.html")
    bar.render(out)
    return out

def histogram_png(scores, avg_score, ts):
    """Static histogram with region shading."""
    plt.figure(figsize=(12,7))
    n, bins, _ = plt.hist(scores, bins=30, edgecolor='black', alpha=0.75)
    plt.axvline(x=avg_score, color='red', linestyle='--', linewidth=2, label=f'Average = {avg_score:.4f}')
    # Shade regions for the five levels
    plt.axvspan(0.0, 0.3, color='red', alpha=0.10)
    plt.axvspan(0.3, 0.4, color='orange', alpha=0.10)
    plt.axvspan(0.4, 0.6, color='gold', alpha=0.10)
    plt.axvspan(0.6, 0.7, color='lightgreen', alpha=0.10)
    plt.axvspan(0.7, 1.0, color='green', alpha=0.10)

    plt.title("Distribution of Sentiment Scores", fontsize=14)
    plt.xlabel("Score (0=negative, 1=positive)", fontsize=12)
    plt.ylabel("Count", fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.legend()
    out = os.path.join(OUT_DIR, f"sentiment_histogram_{ts}.png")
    plt.tight_layout(); plt.savefig(out, dpi=300); plt.close()
    return out

def main():
    print("="*64)
    print("Danmaku Sentiment Analysis (SnowNLP, 5-level buckets)")
    print("="*64)

    path = find_latest_cleaned_file()
    if not path: return

    lines = load_cleaned_lines(path)
    print(f"[INFO] Loaded {len(lines)} cleaned danmaku lines")

    scores = score_texts(lines, show_progress=True)
    avg = float(np.mean(scores)) if scores else 0.5
    labels, counts = bucketize(scores)

    ts = time.strftime("%Y%m%d%H%M")

    # Exports
    scored_csv, df_scored = export_scored_table(lines, scores, ts)
    extremes_txt = export_extremes(df_scored, ts, k=30)
    html_bar = bar_chart_html(labels, counts, avg, ts)
    hist_png = histogram_png(scores, avg, ts)

    # Summary JSON (for paper/appendix reuse)
    summary = {
        "n": int(len(scores)),
        "average": round(avg, 6),
        "buckets": {labels[i]: int(counts[i]) for i in range(len(labels))},
        "ratios": {labels[i]: round(counts[i]/max(1,len(scores)), 6) for i in range(len(labels))},
        "files": {
            "scored_csv": os.path.basename(scored_csv),
            "extremes_txt": os.path.basename(extremes_txt),
            "bar_html": os.path.basename(html_bar),
            "hist_png": os.path.basename(hist_png),
        }
    }
    json_path = os.path.join(OUT_DIR, f"sentiment_summary_{ts}.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    # Console report
    print("\n[SUMMARY]")
    print(f"- N = {summary['n']} | Average = {summary['average']:.4f}")
    for k in labels:
        c = summary["buckets"][k]
        r = summary["ratios"][k]
        print(f"  {k:>14s}: {c:6d}  ({r*100:4.1f}%)")
    print("\n[FILES]")
    for k, v in summary["files"].items():
        print(f"- {k}: {os.path.join(OUT_DIR, v)}")
    print(f"- summary_json: {json_path}")
    print("\nDone.")

if __name__ == "__main__":
    main()


Danmaku Sentiment Analysis (SnowNLP, 5-level buckets)
[INFO] Using cleaned CSV: combined_cleaned_danmu_202509061738.csv
[INFO] Loaded 86959 cleaned danmaku lines


Scoring: 100%|██████████| 86959/86959 [04:18<00:00, 336.73danmaku/s]



[SUMMARY]
- N = 86959 | Average = 0.4837
   Very Negative:  27755  (31.9%)
        Negative:   8403  ( 9.7%)
         Neutral:  19240  (22.1%)
        Positive:   7003  ( 8.1%)
   Very Positive:  24558  (28.2%)

[FILES]
- scored_csv: sentiment_results\sentiment_scored_202509061806.csv
- extremes_txt: sentiment_results\sentiment_extremes_202509061806.txt
- bar_html: sentiment_results\sentiment_distribution_202509061806.html
- hist_png: sentiment_results\sentiment_histogram_202509061806.png
- summary_json: sentiment_results\sentiment_summary_202509061806.json

Done.
