<a href="https://colab.research.google.com/github/1849083010n-cell/gdp-dashboard/blob/main/journey_to_the_west_%E2%80%9C%E5%A6%82%E6%9D%A5%E4%BD%9B%E7%A5%96%E2%80%9D%EF%BC%8C%E2%80%9C%E7%8E%89%E7%9A%87%E5%A4%A7%E5%B8%9D%E2%80%9D%E8%AF%8D%E9%A2%91%E5%88%86%E6%9E%90.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so files stored there are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Import the required libraries
import re
import matplotlib.pyplot as plt
import numpy as np

# Preferred font families for Chinese text in charts
# (presumably at least one of them must be installed for labels to render — verify)
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]

def count_and_visualize(file_path, keywords=('如來', '佛祖')):
    """
    Count keyword occurrences in a text file and draw them as a bar chart.

    Prints the number of occurrences of each keyword together with up to three
    sample contexts, then shows a bar chart with one bar per keyword. Problems
    (a missing file or any other exception) are reported with ``print`` rather
    than raised.

    Args:
        file_path: Path of the UTF-8 text file to analyse.
        keywords: Keywords counted as literal substrings. Repeated keywords
            are ignored. Defaults to ('如來', '佛祖').

    Returns:
        None.
    """
    max_examples = 3    # the printed heading promises three examples
    context_chars = 15  # characters of context kept on each side of a match

    # Drop repeated keywords (keeping first-seen order) so each keyword is
    # counted once and the number of bars equals the number of counts.
    keywords = list(dict.fromkeys(keywords))

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        counts = {}
        contexts = {}
        for keyword in keywords:
            # re.escape makes the keyword match literally
            spans = [m.span() for m in re.finditer(re.escape(keyword), content)]
            counts[keyword] = len(spans)
            # Only the examples that will be printed are kept
            contexts[keyword] = [
                content[max(0, start - context_chars):end + context_chars]
                for start, end in spans[:max_examples]
            ]

        # Textual report
        print("关键词出现次数统计：")
        for keyword in keywords:
            print(f"'{keyword}' 共出现 {counts[keyword]} 次")
            if contexts[keyword]:
                print("前3次出现的上下文示例：")
                for i, ctx in enumerate(contexts[keyword], start=1):
                    print(f"  第{i}次：...{ctx}...")
            print("-" * 60)

        # Bar chart, one bar per keyword
        plt.figure(figsize=(10, 6))
        x = np.arange(len(keywords))
        values = [counts[keyword] for keyword in keywords]
        plt.bar(x, values, color=['#1f77b4', '#ff7f0e'])
        plt.xticks(x, keywords, fontsize=12)
        plt.ylabel('出现次数', fontsize=12)
        # Build the title from the keywords actually analysed
        quoted = '与'.join(f'"{keyword}"' for keyword in keywords)
        plt.title(f'《西游记》中{quoted}出现次数对比', fontsize=14)

        # Write the exact count above each bar
        for i, v in enumerate(values):
            plt.text(i, v + 5, str(v), ha='center', fontsize=10)

        plt.tight_layout()
        plt.show()

    except FileNotFoundError:
        print(f"错误：找不到文件 {file_path}")
    except Exception as e:
        print(f"发生错误：{str(e)}")

# Change this to the actual location of your file in Google Drive
file_path = '/content/drive/MyDrive/Colab Notebooks/西遊記.txt'

# Run the count and show the chart
count_and_visualize(file_path)

In [None]:
# Mount Google Drive so files stored there are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Import the required libraries
import re
import jieba
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# Preferred font families for Chinese text in charts
# (presumably at least one of them must be installed for labels to render — verify)
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]

def get_buddha_cooccurrence(file_path, target_words=['如來', '佛祖'], window=20, top_n=5):
    """
    Report the words that most often occur near each target word.

    For every occurrence of a target word, the text within ``window``
    characters on each side is tokenised with jieba; tokens that are stop
    words, blank, or a single character are discarded. The ``top_n`` most
    frequent remaining tokens are printed and shown as a bar chart, one chart
    per target word. Errors are printed rather than raised.

    Args:
        file_path: Path of the UTF-8 text file.
        target_words: Words whose surroundings are analysed
            (default ['如來', '佛祖']).
        window: Number of characters taken on each side of a match.
        top_n: Number of most frequent neighbouring words to report.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Function words plus the Buddha's own titles, which would otherwise
        # dominate the neighbour statistics
        stopwords = {'的', '了', '在', '是', '有', '就', '也', '着', '之', '于',
                     '便', '乃', '与', '对', '向', '即', '等', '为', '所', '更',
                     '如來', '佛祖', '佛', '世尊', '如来佛祖'}

        # One frequency table per target word, filled in text order
        neighbour_counts = {word: Counter() for word in target_words}
        text_length = len(content)

        for word in target_words:
            tally = neighbour_counts[word]
            for match in re.finditer(re.escape(word), content):
                left = max(0, match.start() - window)
                right = min(text_length, match.end() + window)
                for token in jieba.lcut(content[left:right]):
                    if len(token) <= 1 or token in stopwords or not token.strip():
                        continue
                    tally[token] += 1

        # Report and plot each target word separately
        for word in target_words:
            tally = neighbour_counts[word]
            if not tally:
                print(f"未找到'{word}'的共现词")
                continue

            ranked = tally.most_common(top_n)
            if ranked:
                top_words, counts = zip(*ranked)
            else:
                top_words, counts = [], []

            print(f"\n'{word}'出现时的前{top_n}高频共现词：")
            for neighbour, freq in zip(top_words, counts):
                print(f"  {neighbour}: {freq}次")

            plt.figure(figsize=(10, 6))
            x = np.arange(len(top_words))
            # Green for '如來', purple for every other target word
            bar_color = '#9467bd'
            if word == '如來':
                bar_color = '#2ca02c'
            plt.bar(x, counts, color=bar_color)
            plt.xticks(x, top_words, fontsize=12)
            plt.ylabel('共现次数', fontsize=12)
            plt.title(f"'{word}'的高频共现词（Top{top_n}）", fontsize=14)
            plt.tight_layout()
            plt.show()

    except FileNotFoundError:
        print(f"错误：找不到文件 {file_path}")
    except Exception as e:
        print(f"发生错误：{str(e)}")

# File path (make sure it matches the location of your "Journey to the West" text)
file_path = '/content/drive/MyDrive/Colab Notebooks/西遊記.txt'

# Run the analysis with the defaults (targets '如來' and '佛祖', top 5 co-occurring words)
get_buddha_cooccurrence(file_path)

In [None]:
# Mount Google Drive so files stored there are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Import the required libraries
import re
import matplotlib.pyplot as plt
import numpy as np

# Preferred font families for Chinese text in charts
# (presumably at least one of them must be installed for labels to render — verify)
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]

def count_emperor_terms(file_path, keywords=('玉皇大帝', '玉皇', '玉帝', '大帝')):
    """
    Count the forms of address for the Jade Emperor without double counting.

    Longer keywords are matched first and every character of the text can
    belong to at most one counted match, so e.g. the '玉皇' inside '玉皇大帝'
    is not counted again. The counts and up to three sample contexts per
    keyword are printed, then a bar chart is shown. Errors are printed rather
    than raised.

    Args:
        file_path: Path of the UTF-8 text file.
        keywords: Keywords to count as literal substrings; empty strings are
            skipped. Defaults to ('玉皇大帝', '玉皇', '玉帝', '大帝').

    Returns:
        None.
    """
    max_examples = 3    # the printed heading promises three examples
    context_chars = 20  # characters of context kept on each side of a match

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        counts = {kw: 0 for kw in keywords}
        contexts = {kw: [] for kw in keywords}
        # One flag per character of the text, set once a counted match covers
        # it. Checking the flags under a candidate span replaces a scan over
        # every previously accepted span.
        occupied = bytearray(len(content))

        # Longest keywords first so they claim their characters before any
        # shorter keyword contained in them
        for kw in sorted(keywords, key=len, reverse=True):
            if not kw:
                continue  # an empty keyword would match at every position
            for match in re.finditer(re.escape(kw), content):
                start, end = match.span()
                if any(occupied[start:end]):
                    continue  # overlaps a match that was already counted
                occupied[start:end] = b'\x01' * (end - start)
                counts[kw] += 1
                if len(contexts[kw]) < max_examples:
                    contexts[kw].append(
                        content[max(0, start - context_chars):end + context_chars]
                    )

        # Textual report, in the caller's keyword order
        print("关键词出现次数统计：")
        for kw in keywords:
            print(f"'{kw}' 共出现 {counts[kw]} 次")
            if counts[kw] > 0:
                print("  前3次上下文示例：")
                for i, ctx in enumerate(contexts[kw], start=1):
                    print(f"    第{i}次：...{ctx}...")
            print("-" * 70)

        # Bar chart, one bar per keyword
        plt.figure(figsize=(14, 7))
        x = np.arange(len(keywords))
        values = [counts[kw] for kw in keywords]
        bars = plt.bar(x, values, color=['#2ca02c', '#98df8a', '#ffbb78', '#9467bd'])

        plt.xticks(x, keywords, fontsize=12)
        plt.ylabel('出现次数', fontsize=12)
        plt.title('《西游记》中玉皇大帝相关称呼出现次数对比', fontsize=14)

        # Write the exact count above each bar
        for bar, value in zip(bars, values):
            plt.text(bar.get_x() + bar.get_width() / 2., value + 2,
                     str(value), ha='center', va='bottom', fontsize=10)

        plt.tight_layout()
        plt.show()

    except FileNotFoundError:
        print(f"错误：找不到文件 {file_path}")
    except Exception as e:
        print(f"发生错误：{str(e)}")

# File path (make sure it matches the location of your "Journey to the West" text)
file_path = '/content/drive/MyDrive/Colab Notebooks/西遊記.txt'

# Run the count and show the chart
count_emperor_terms(file_path)

In [None]:
# Mount Google Drive so files stored there are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Import the required libraries
import re
import jieba
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# Preferred font families for Chinese text in charts
# (presumably at least one of them must be installed for labels to render — verify)
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]

def get_top_cooccurrence(file_path, target_words=['玉皇', '玉帝'], window=20, top_n=5):
    """
    Report the words that most often occur near each target word.

    Every occurrence of a target word contributes the jieba tokens found
    within ``window`` characters on each side; stop words, blank tokens and
    single-character tokens are dropped. The ``top_n`` most frequent tokens
    are printed and charted per target word. Errors are printed rather than
    raised.

    Args:
        file_path: Path of the UTF-8 text file.
        target_words: Words whose surroundings are analysed
            (default ['玉皇', '玉帝']).
        window: Number of characters taken on each side of a match.
        top_n: Number of most frequent neighbouring words to report.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Function words plus the emperor's own titles
        stopwords = {'的', '了', '在', '是', '有', '就', '也', '着', '之', '于',
                     '便', '乃', '与', '对', '向', '即', '等', '为', '所', '更',
                     '玉皇', '玉帝', '玉皇大帝', '大帝'}

        def keep(token):
            """Return True for non-blank, multi-character tokens that are not stop words."""
            return bool(token.strip()) and token not in stopwords and len(token) > 1

        # Neighbouring tokens collected per target word, in text order
        cooccurrence = {word: [] for word in target_words}
        for word in target_words:
            for hit in re.compile(re.escape(word)).finditer(content):
                lo, hi = hit.span()
                snippet = content[max(0, lo - window):min(len(content), hi + window)]
                cooccurrence[word] += [token for token in jieba.lcut(snippet) if keep(token)]

        # Report and plot each target word separately
        for word in target_words:
            neighbours = cooccurrence[word]
            if not neighbours:
                print(f"未找到'{word}'的共现词")
                continue

            ranked = Counter(neighbours).most_common(top_n)
            if ranked:
                top_words, counts = zip(*ranked)
            else:
                top_words, counts = [], []

            print(f"\n'{word}'出现时的前{top_n}高频共现词：")
            for neighbour, freq in zip(top_words, counts):
                print(f"  {neighbour}: {freq}次")

            plt.figure(figsize=(10, 6))
            x = np.arange(len(top_words))
            # Blue for '玉皇', orange for every other target word
            bar_color = '#1f77b4' if word == '玉皇' else '#ff7f0e'
            plt.bar(x, counts, color=[bar_color])
            plt.xticks(x, top_words, fontsize=12)
            plt.ylabel('出现次数', fontsize=12)
            plt.title(f"'{word}'的高频共现词（Top{top_n}）", fontsize=14)
            plt.tight_layout()
            plt.show()

    except FileNotFoundError:
        print(f"错误：找不到文件 {file_path}")
    except Exception as e:
        print(f"发生错误：{str(e)}")

# File path (make sure it matches the location of your "Journey to the West" text)
file_path = '/content/drive/MyDrive/Colab Notebooks/西遊記.txt'

# Run the analysis with the defaults (targets '玉皇' and '玉帝', top 5 co-occurring words)
get_top_cooccurrence(file_path)

In [None]:
# Step 1: mount Google Drive (can be skipped if it is already mounted)
from google.colab import drive
drive.mount('/content/drive')

# Install dependencies (notebook shell commands)
!pip install pyvis -q
!pip install jieba -q

# Step 2: load the Traditional Chinese text of "Journey to the West"
import re

# Location of the text; `text` stays None if loading fails
file_path = '/content/drive/MyDrive/Colab Notebooks/西遊記.txt'
try:
    # Candidate encodings, tried in order until one decodes the whole file
    encodings = ['utf-8', 'big5', 'utf-8-sig']
    text = None
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                text = f.read()
        except UnicodeDecodeError:
            continue
        else:
            break
    if text is None:
        raise Exception("无法解析文件编码（繁体文件建议用utf-8或big5）")

    # Cleaning: keep only CJK characters, the listed punctuation, newlines and
    # spaces, then collapse runs of newlines and trim both ends
    text = re.sub(r'[^\u4e00-\u9fa5，。！？、,.:;!?\n ]', '', text)
    text = re.sub(r'\n+', '\n', text).strip()
    print(f"繁体文本读取成功！长度：{len(text)}字符")
    print("文本预览（前200字）：", text[:200])
except Exception as e:
    print(f"文本处理失败：{e}")

# Step 3: canonical character names mapped to the aliases searched for in the
# Traditional Chinese text
character_aliases = {
    '孫悟空': ['孫悟空', '孫行者', '大聖', '齊天大聖', '美猴王', '悟空'],
    '唐僧': ['唐僧', '唐三藏', '玄奘', '師父'],
    '豬八戒': ['豬八戒', '八戒', '豬悟能', '呆子'],
    '沙和尚': ['沙和尚', '沙僧', '沙悟淨', '沙師弟'],
    '白龍馬': ['白龍馬', '白馬'],
    '玉皇大帝': ['玉皇大帝', '玉帝', '玉皇'],
    '如來佛祖': ['如來佛祖', '如來', '佛祖'],
    '觀音菩薩': ['觀音菩薩', '觀音', '觀世音'],
    '牛魔王': ['牛魔王', '牛魔'],
    '鐵扇公主': ['鐵扇公主', '鐵扇仙', '羅剎女'],
    '紅孩兒': ['紅孩兒', '聖嬰大王'],
    '太上老君': ['太上老君', '老君'],
    '白骨精': ['白骨精', '白骨夫人'],
    '二郎神': ['二郎神', '楊戩'],
    '哪吒': ['哪吒', '三壇海會大神'],
    '菩提祖師': ['菩提祖師', '菩提老祖'],
    '四海龍王': ['四海龍王', '東海龍王', '西海龍王', '南海龍王', '北海龍王'],
    '鎮元子': ['鎮元子', '鎮元大仙'],
    '女兒國國王': ['女兒國國王', '女王']
}

# Reverse lookup: alias -> canonical name (a later entry wins if an alias repeats)
alias_to_name = {
    alias: std_name
    for std_name, aliases in character_aliases.items()
    for alias in aliases
}

# Every alias in declaration order
all_aliases = [alias for aliases in character_aliases.values() for alias in aliases]


# Step 4: relation extraction
def _classify_relation(target_name, para):
    """Label the relation between 孫悟空 and target_name from keywords found in para."""
    if target_name == '唐僧' and ('師父' in para or '三藏' in para):
        return '師徒（孫悟空→唐僧）'
    if target_name in ('豬八戒', '沙和尚') and '師弟' in para:
        return '師兄弟'
    if any(marker in para for marker in ('打', '殺', '敵', '妖')):
        return '敵對'
    if any(marker in para for marker in ('幫', '救', '助')):
        return '友好'
    if any(marker in target_name for marker in ('佛', '菩薩', '仙')):
        return '神仙-弟子'
    return '互動'


def extract_traditional_relations(text):
    """
    Collect the characters that share a paragraph with 孫悟空.

    The text is split into paragraphs on newlines. A paragraph that mentions
    孫悟空 (by any alias) counts as one interaction for each other character
    mentioned in it, however many of that character's aliases match; aliases
    such as '如來' and '佛祖' are substrings of '如來佛祖' and must not be
    counted separately. The relation label of a character is decided from the
    first such paragraph and kept afterwards.

    Args:
        text: The cleaned text, paragraphs separated by '\n'.

    Returns:
        A list of ('孫悟空', character, relation label, paragraph count)
        tuples sorted by count, largest first.
    """
    hero = '孫悟空'
    hero_aliases = character_aliases[hero]
    relations = {}

    for para in text.split('\n'):
        if not any(alias in para for alias in hero_aliases):
            continue

        for target_name, aliases in character_aliases.items():
            if target_name == hero:
                continue
            if not any(alias in para for alias in aliases):
                continue

            if target_name in relations:
                rel_type, count = relations[target_name]
                relations[target_name] = (rel_type, count + 1)
            else:
                relations[target_name] = (_classify_relation(target_name, para), 1)

    relation_list = [(hero, name, rel_type, count)
                     for name, (rel_type, count) in relations.items()]
    return sorted(relation_list, key=lambda rel: rel[3], reverse=True)

# Run the relation extraction only when a text was actually loaded. Checking
# that the name `text` exists is not enough: a failed load can leave it set to
# None, and passing None on would raise AttributeError.
if globals().get('text') is not None:
    sun_relations = extract_traditional_relations(text)
    print("\n提取到的繁体角色关系（按互动次数排序）：")
    if sun_relations:
        # Show at most the 15 most frequent relations
        for rel in sun_relations[:15]:
            print(f"{rel[0]} 與 {rel[1]}：{rel[2]}（{rel[3]}次）")
    else:
        print("未提取到关系，可能是繁体别名未覆盖文本中的表述")

# 步骤5：绘制关系网络图（修正字体设置错误）
from pyvis.network import Network
from IPython.display import HTML

# Build the interactive relation graph around 孫悟空
def draw_traditional_network(relations):
    """
    Draw the relations of 孫悟空 as a pyvis network and return it for display.

    Args:
        relations: Iterable of (source, target, relation label, count) tuples,
            as produced by extract_traditional_relations.

    Returns:
        The string "無關係數據可繪製" when ``relations`` is empty; otherwise an
        IPython ``HTML`` object loaded from the saved graph file
        "sun_wukong_traditional_network.html".
    """
    if not relations:
        return "無關係數據可繪製"

    # cdn_resources='in_line' is passed so the generated page does not depend
    # on externally hosted resources (per the original author's note, this
    # fixed display problems in Chrome/Safari)
    net = Network(
        height="700px", width="100%",
        bgcolor="#ffffff", font_color="#333333",
        notebook=True,
        cdn_resources='in_line'
    )

    # Central node for 孫悟空; the font is set per node through `font`
    net.add_node(
        "孫悟空",
        size=60,
        color="#FFD700",
        title="主角：孫悟空（美猴王、齊天大聖）",
        font={"size": 20, "color": "#000000", "face": "Microsoft YaHei"}
    )

    # One node and one edge per related character
    for rel in relations:
        source, target, rel_type, count = rel
        # Node size grows with the interaction count, capped at 50
        node_size = min(20 + count // 3, 50)

        # Node colour by relation category
        if "敵對" in rel_type:
            node_color = "#FF6347"
        elif "師徒" in rel_type or "師兄弟" in rel_type:
            node_color = "#32CD32"
        elif "神仙" in rel_type:
            node_color = "#1E90FF"
        else:
            node_color = "#9370DB"

        net.add_node(
            target,
            size=node_size,
            color=node_color,
            title=f"{target}\n互動次數：{count}",
            font={"size": 14, "face": "Microsoft YaHei"}
        )

        net.add_edge(
            source, target,
            label=rel_type,
            # Width grows with the count (capped at 10) but never drops below
            # 1, so relations with fewer than five interactions do not get a
            # zero-width edge
            width=max(1, min(count // 5, 10)),
            color="#888888",
            title=f"互動次數：{count}",
            font={"face": "Microsoft YaHei"}
        )

    # Physics and edge-shape options (vis.js option JSON)
    net.set_options("""
    {
        "physics": {
            "barnesHut": {
                "gravitationalConstant": -2200,
                "springLength": 180
            },
            "stabilization": {
                "iterations": 250,
                "fit": true
            }
        },
        "edges": {
            "smooth": {
                "type": "cubicBezier",
                "forceDirection": "horizontal"
            }
        }
    }
    """)

    output_file = "sun_wukong_traditional_network.html"
    net.save_graph(output_file)
    # HTML() treats its first positional argument as markup, so the saved page
    # has to be passed through the `filename` keyword to be loaded from disk
    return HTML(filename=output_file)

# Draw the network only when relations were extracted and the list is non-empty
if not ('sun_relations' in locals() and sun_relations):
    print("\n無法繪製網絡圖（無關係數據）")
else:
    print("\n正在繪製繁体角色關係網絡圖...")
    display(draw_traditional_network(sun_relations))

In [None]:
# Mount Google Drive, remounting even if it is already mounted
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Install a Chinese font manually (notebook shell commands)
!apt-get update -qq
!apt-get install -y fonts-wqy-microhei
!fc-cache -fv
# Import the required libraries
import re
import jieba
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import matplotlib.font_manager as fm
# Configure a font that can render Chinese text.
# The family list below is the fallback that stays in effect when no font file
# can be registered explicitly.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC", "Arial Unicode MS"]
plt.rcParams["axes.unicode_minus"] = False
# Look for a CJK-capable font file on disk. 'wqy' / 'microhei' are meant to
# match the fonts-wqy-microhei package installed earlier in this cell
# (presumably its files carry those names — verify on the target machine).
_cjk_font_files = [
    f for f in fm.findSystemFonts()
    if any(hint in f.lower() for hint in ('wqy', 'microhei', 'heiti', 'unicode'))
]
if _cjk_font_files:
    try:
        font_path = _cjk_font_files[0]
        # Register the file so matplotlib can resolve the family name even if
        # the font was installed after its font list was built
        fm.fontManager.addfont(font_path)
        font_prop = fm.FontProperties(fname=font_path)
        plt.rcParams["font.family"] = font_prop.get_name()
    except Exception as err:
        # Keep the fallback list instead of switching to a single font that
        # may not be installed
        print(f"警告：中文字体注册失败，沿用默认字体列表：{err}")
else:
    print("警告：未找到中文字体文件，沿用默认字体列表")

def analyze_buddha_sentiment(file_path):
    """Analyse where 如來/如来 (the Buddha) is mentioned and the sentiment around it.

    The text is split into chapters on "第N回" headings. For every chapter the
    mentions are counted and the jieba tokens inside a window of up to 30
    characters on each side of a mention are scored against a small weighted
    sentiment lexicon. The per-chapter table is printed and plotted, followed
    by charts and a printed summary for the whole text (windows of up to 50
    characters).

    Args:
        file_path: Path of the UTF-8 text file to analyse.

    Returns:
        None. Results are printed and plotted; a missing file or any other
        exception is reported with ``print`` instead of being raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Chapter headings: only numerals may stand between 第 and 回, so that
        # ordinary phrases containing both characters are not taken for headings
        chapter_pattern = re.compile(r'第[零〇一二三四五六七八九十百千两兩廿卅\d]+回[：: ]?')
        chapter_matches = list(chapter_pattern.finditer(content))

        if not chapter_matches:
            print("警告：未找到章节标记，将全文视为一个章节")
            chapters = [content]
            chapter_titles = ["全文"]
        else:
            chapter_titles = [match.group().strip() for match in chapter_matches]
            # A chapter runs from the end of its heading to the start of the
            # next heading (or to the end of the text)
            stops = [match.start() for match in chapter_matches[1:]] + [len(content)]
            chapters = [content[match.end():stop]
                        for match, stop in zip(chapter_matches, stops)]

        # Weighted sentiment lexicon. Entries are listed in both Simplified and
        # Traditional script so the scoring works for either edition of the text.
        sentiment_words = {
            'positive': {
                '善': 2, '慈悲': 3, '智慧': 3, '功德': 2, '救': 2,
                '助': 2, '圣': 3, '尊': 2, '贤': 2, '佛法': 3,
                '真经': 3, '觉悟': 2, '解脱': 2, '圆满': 3,
                '聖': 3, '賢': 2, '真經': 3, '覺悟': 2, '解脫': 2, '圓滿': 3
            },
            'negative': {
                '怪': 2, '妖': 2, '魔': 3, '难': 2, '苦': 2,
                '斗': 2, '敌': 2, '恶': 3, '孽': 3, '罚': 2,
                '恼': 1, '骗': 2, '害': 2, '乱': 2,
                '難': 2, '鬥': 2, '敵': 2, '惡': 3, '罰': 2,
                '惱': 1, '騙': 2, '亂': 2
            }
        }
        positive_weights = sentiment_words['positive']
        negative_weights = sentiment_words['negative']

        # Every mention in either script; '如來佛祖' contains one mention
        mention_pattern = re.compile(r'如[來来]')
        # '.' does not cross line breaks, so a context stays within one line
        context_pattern = re.compile(r'(.{0,30}如來.{0,30}|.{0,30}如来.{0,30})')

        # Per-chapter mention counts and sentiment scores
        buddha_data = []
        for idx, (title, chapter) in enumerate(zip(chapter_titles, chapters)):
            count = len(mention_pattern.findall(chapter))
            context_text = ' '.join(context_pattern.findall(chapter))

            pos_score = 0
            neg_score = 0
            pos_words = []
            neg_words = []
            for word in jieba.cut(context_text):
                if word in positive_weights:
                    pos_score += positive_weights[word]
                    pos_words.append(word)
                elif word in negative_weights:
                    neg_score += negative_weights[word]
                    neg_words.append(word)

            buddha_data.append({
                '章节': title,
                '章节索引': idx + 1,
                '出现次数': count,
                '正面得分': pos_score,
                '负面得分': neg_score,
                '净情感值': pos_score - neg_score,  # positive minus negative
                '正面词汇': Counter(pos_words).most_common(3),
                '负面词汇': Counter(neg_words).most_common(3)
            })

        # Table of the chapters that mention the Buddha
        df = pd.DataFrame(buddha_data)
        non_zero_df = df[df['出现次数'] > 0].sort_values(by='章节索引')
        print("如来出现章节及情感分析：")
        print(non_zero_df[['章节', '出现次数', '正面得分', '负面得分', '净情感值']])

        # Chart 1: mention count (bars) and net sentiment (line) per chapter
        if not non_zero_df.empty:
            fig, ax1 = plt.subplots(figsize=(15, 8))
            ax2 = ax1.twinx()

            ax1.bar(non_zero_df['章节索引'], non_zero_df['出现次数'], color='skyblue', alpha=0.6, label='出现次数')
            ax2.plot(non_zero_df['章节索引'], non_zero_df['净情感值'], color='red', marker='o', label='净情感值')

            ax1.set_xlabel('章节索引')
            ax1.set_ylabel('出现次数', color='blue')
            ax2.set_ylabel('净情感值', color='red')
            plt.title('如来出现次数与情感倾向变化趋势')
            fig.legend(loc='upper right')
            plt.tight_layout()
            plt.show()

        # Whole-text analysis over wider contexts
        pattern = re.compile(r'(.{0,50}如來.{0,50}|.{0,50}如来.{0,50})')
        all_contexts = pattern.findall(content)
        if not all_contexts:
            print("未找到包含'如来'的内容")
            return

        all_text = ' '.join(all_contexts)
        stopwords = {'的', '了', '在', '是', '他', '就', '有', '也', '和', '之', '为', '着', '于', '这', '那', '曰', '道', '说'}
        skipped = stopwords | {'如來', '如来'}

        # One-character tokens are deliberately kept: much of the lexicon
        # (including every negative entry) consists of single characters, and
        # the per-chapter scores above count them as well.
        total_pos = Counter()
        total_neg = Counter()
        for word in jieba.cut(all_text):
            if word in skipped:
                continue
            if word in positive_weights:
                total_pos[word] += positive_weights[word]
            elif word in negative_weights:
                total_neg[word] += negative_weights[word]

        # Chart 2: highest-scoring positive and negative words
        plt.figure(figsize=(14, 6))
        plt.subplot(1, 2, 1)
        pos_df = pd.DataFrame(total_pos.most_common(10), columns=['词汇', '权重得分'])
        plt.barh(pos_df['词汇'], pos_df['权重得分'], color='green')
        plt.title('高频正面情感词汇（带权重）')
        plt.gca().invert_yaxis()

        plt.subplot(1, 2, 2)
        neg_df = pd.DataFrame(total_neg.most_common(10), columns=['词汇', '权重得分'])
        plt.barh(neg_df['词汇'], neg_df['权重得分'], color='red')
        plt.title('高频负面情感词汇（带权重）')
        plt.gca().invert_yaxis()

        plt.tight_layout()
        plt.show()

        # Printed summary
        total_pos_score = sum(total_pos.values())
        total_neg_score = sum(total_neg.values())
        sentiment_ratio = total_pos_score / (total_neg_score + 1)  # +1 avoids division by zero

        print("\n===== 情感属性总结 =====")
        print(f"整体正面情感得分：{total_pos_score}")
        print(f"整体负面情感得分：{total_neg_score}")
        print(f"正负面情感比率：{sentiment_ratio:.2f}")
        print("\n主要正面词汇：", [w[0] for w in total_pos.most_common(5)])
        print("主要负面词汇：", [w[0] for w in total_neg.most_common(5)])

        # Chart 3: overall split; a pie chart needs a non-zero total
        if total_pos_score + total_neg_score == 0:
            print("未匹配到情感词汇，跳过整体情感倾向饼图")
            return

        plt.figure(figsize=(8, 6))
        plt.pie(
            [total_pos_score, total_neg_score],
            labels=['正面情感', '负面情感'],
            autopct='%1.1f%%',
            colors=['green', 'red'],
            startangle=90
        )
        plt.title('如来相关内容整体情感倾向分布')
        plt.show()

    except FileNotFoundError:
        print(f"错误：找不到文件 {file_path}")
    except Exception as e:
        print(f"发生错误：{str(e)}")

# File path (change it to the actual location of your text)
file_path = '/content/drive/MyDrive/Colab Notebooks/西遊記.txt'

# Run the analysis
analyze_buddha_sentiment(file_path)

In [None]:
# Mount Google Drive, remounting even if it is already mounted
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Install a Chinese font manually (notebook shell commands)
!apt-get update -qq
!apt-get install -y fonts-wqy-microhei
!fc-cache -fv
# Import the required libraries
import re
import jieba
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import matplotlib.font_manager as fm
# Configure a font that can render Chinese text.
# The family list below is the fallback that stays in effect when no font file
# can be registered explicitly.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC", "Arial Unicode MS"]
plt.rcParams["axes.unicode_minus"] = False
# Look for a CJK-capable font file on disk. 'wqy' / 'microhei' are meant to
# match the fonts-wqy-microhei package installed earlier in this cell
# (presumably its files carry those names — verify on the target machine).
_cjk_font_files = [
    f for f in fm.findSystemFonts()
    if any(hint in f.lower() for hint in ('wqy', 'microhei', 'heiti', 'unicode'))
]
if _cjk_font_files:
    try:
        font_path = _cjk_font_files[0]
        # Register the file so matplotlib can resolve the family name even if
        # the font was installed after its font list was built
        fm.fontManager.addfont(font_path)
        font_prop = fm.FontProperties(fname=font_path)
        plt.rcParams["font.family"] = font_prop.get_name()
    except Exception as err:
        # Keep the fallback list instead of switching to a single font that
        # may not be installed
        print(f"警告：中文字体注册失败，沿用默认字体列表：{err}")
else:
    print("警告：未找到中文字体文件，沿用默认字体列表")

def analyze_yuandi_sentiment(file_path):
    """Analyse where the Jade Emperor (玉帝) is mentioned and the sentiment around it.

    The text is split into chapters on "第N回" headings. For every chapter the
    forms of address (玉帝, 玉皇大帝, 大帝, 玉皇) are counted and the jieba
    tokens inside a window of up to 30 characters on each side of a mention
    are scored against a small weighted sentiment lexicon. The per-chapter
    table is printed and plotted, followed by charts and a printed summary for
    the whole text (windows of up to 50 characters).

    Args:
        file_path: Path of the UTF-8 text file to analyse.

    Returns:
        None. Results are printed and plotted; a missing file or any other
        exception is reported with ``print`` instead of being raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Chapter headings: only numerals may stand between 第 and 回, so that
        # ordinary phrases containing both characters are not taken for headings
        chapter_pattern = re.compile(r'第[零〇一二三四五六七八九十百千两兩廿卅\d]+回[：: ]?')
        chapter_matches = list(chapter_pattern.finditer(content))

        if not chapter_matches:
            print("警告：未找到章节标记，将全文视为一个章节")
            chapters = [content]
            chapter_titles = ["全文"]
        else:
            chapter_titles = [match.group().strip() for match in chapter_matches]
            # A chapter runs from the end of its heading to the start of the
            # next heading (or to the end of the text)
            stops = [match.start() for match in chapter_matches[1:]] + [len(content)]
            chapters = [content[match.end():stop]
                        for match, stop in zip(chapter_matches, stops)]

        # Weighted sentiment lexicon tailored to the Jade Emperor. Entries are
        # listed in both Simplified and Traditional script so the scoring works
        # for either edition of the text.
        sentiment_words = {
            'positive': {
                '仁': 3, '圣明': 3, '威严': 2, '公正': 3, '恩德': 2,
                '仁慈': 3, '英明': 3, '功德': 2, '敕令': 2, '天恩': 3,
                '神圣': 2, '尊贵': 3,
                '聖明': 3, '威嚴': 2, '神聖': 2, '尊貴': 3
            },
            'negative': {
                '怒': 2, '惊': 2, '慌': 2, '无奈': 3, '难': 2,
                '急': 2, '怕': 3, '愁': 2, '乱': 2, '束手无策': 3,
                '狼狈': 3, '失色': 2, '无能': 3, '窘': 2,
                '驚': 2, '無奈': 3, '難': 2, '亂': 2, '束手無策': 3,
                '狼狽': 3, '無能': 3
            }
        }
        positive_weights = sentiment_words['positive']
        negative_weights = sentiment_words['negative']

        # Forms of address counted as mentions.
        # NOTE(review): the bare '大帝' alternative also matches titles of
        # other figures that end in 大帝 — confirm this is intended.
        mention_pattern = re.compile(r'玉帝|玉皇大帝|大帝|玉皇|玉皇帝')
        # '.' does not cross line breaks, so a context stays within one line
        context_pattern = re.compile(r'(.{0,30}玉帝.{0,30}|.{0,30}玉皇大帝.{0,30}|.{0,30}大帝.{0,30}|.{0,30}玉皇.{0,30})')

        # Per-chapter mention counts and sentiment scores
        yuandi_data = []
        for idx, (title, chapter) in enumerate(zip(chapter_titles, chapters)):
            count = len(mention_pattern.findall(chapter))
            context_text = ' '.join(context_pattern.findall(chapter))

            pos_score = 0
            neg_score = 0
            pos_words = []
            neg_words = []
            for word in jieba.cut(context_text):
                if word in positive_weights:
                    pos_score += positive_weights[word]
                    pos_words.append(word)
                elif word in negative_weights:
                    neg_score += negative_weights[word]
                    neg_words.append(word)

            yuandi_data.append({
                '章节': title,
                '章节索引': idx + 1,
                '出现次数': count,
                '正面得分': pos_score,
                '负面得分': neg_score,
                '净情感值': pos_score - neg_score,  # positive minus negative
                '正面词汇': Counter(pos_words).most_common(3),
                '负面词汇': Counter(neg_words).most_common(3)
            })

        # Table of the chapters that mention the Jade Emperor
        df = pd.DataFrame(yuandi_data)
        non_zero_df = df[df['出现次数'] > 0].sort_values(by='章节索引')
        print("玉帝出现章节及情感分析：")
        print(non_zero_df[['章节', '出现次数', '正面得分', '负面得分', '净情感值']])

        # Chart 1: mention count (bars) and net sentiment (line) per chapter
        if not non_zero_df.empty:
            fig, ax1 = plt.subplots(figsize=(15, 8))
            ax2 = ax1.twinx()

            ax1.bar(non_zero_df['章节索引'], non_zero_df['出现次数'], color='gold', alpha=0.6, label='出现次数')
            ax2.plot(non_zero_df['章节索引'], non_zero_df['净情感值'], color='purple', marker='o', label='净情感值')

            ax1.set_xlabel('章节索引')
            ax1.set_ylabel('出现次数', color='gold')
            ax2.set_ylabel('净情感值', color='purple')
            plt.title('玉帝出现次数与情感倾向变化趋势')
            fig.legend(loc='upper right')
            plt.tight_layout()
            plt.show()

        # Whole-text analysis over wider contexts
        pattern = re.compile(r'(.{0,50}玉帝.{0,50}|.{0,50}玉皇大帝.{0,50}|.{0,50}大帝.{0,50}|.{0,50}玉皇.{0,50})')
        all_contexts = pattern.findall(content)
        if not all_contexts:
            print("未找到包含'玉帝'的内容")
            return

        all_text = ' '.join(all_contexts)
        stopwords = {'的', '了', '在', '是', '他', '就', '有', '也', '和', '之', '为', '着', '于', '这', '那', '曰', '道', '说', '天', '宫'}
        skipped = stopwords | {'玉帝', '玉皇大帝', '大帝', '玉皇'}

        # One-character tokens are deliberately kept: much of the lexicon
        # consists of single characters, and the per-chapter scores above
        # count them as well.
        total_pos = Counter()
        total_neg = Counter()
        for word in jieba.cut(all_text):
            if word in skipped:
                continue
            if word in positive_weights:
                total_pos[word] += positive_weights[word]
            elif word in negative_weights:
                total_neg[word] += negative_weights[word]

        # Chart 2: highest-scoring positive and negative words
        plt.figure(figsize=(14, 6))
        plt.subplot(1, 2, 1)
        pos_df = pd.DataFrame(total_pos.most_common(10), columns=['词汇', '权重得分'])
        plt.barh(pos_df['词汇'], pos_df['权重得分'], color='green')
        plt.title('高频正面情感词汇（带权重）')
        plt.gca().invert_yaxis()

        plt.subplot(1, 2, 2)
        neg_df = pd.DataFrame(total_neg.most_common(10), columns=['词汇', '权重得分'])
        plt.barh(neg_df['词汇'], neg_df['权重得分'], color='red')
        plt.title('高频负面情感词汇（带权重）')
        plt.gca().invert_yaxis()

        plt.tight_layout()
        plt.show()

        # Printed summary
        total_pos_score = sum(total_pos.values())
        total_neg_score = sum(total_neg.values())
        sentiment_ratio = total_pos_score / (total_neg_score + 1)  # +1 avoids division by zero

        print("\n===== 情感属性总结 =====")
        print(f"整体正面情感得分：{total_pos_score}")
        print(f"整体负面情感得分：{total_neg_score}")
        print(f"正负面情感比率：{sentiment_ratio:.2f}")
        print("\n主要正面词汇：", [w[0] for w in total_pos.most_common(5)])
        print("主要负面词汇：", [w[0] for w in total_neg.most_common(5)])

        # Chart 3: overall split; a pie chart needs a non-zero total
        if total_pos_score + total_neg_score == 0:
            print("未匹配到情感词汇，跳过整体情感倾向饼图")
            return

        plt.figure(figsize=(8, 6))
        plt.pie(
            [total_pos_score, total_neg_score],
            labels=['正面情感', '负面情感'],
            autopct='%1.1f%%',
            colors=['green', 'red'],
            startangle=90
        )
        plt.title('玉帝相关内容整体情感倾向分布')
        plt.show()

    except FileNotFoundError:
        print(f"错误：找不到文件 {file_path}")
    except Exception as e:
        print(f"发生错误：{str(e)}")

# File path (change it to the actual location of your text)
file_path = '/content/drive/MyDrive/Colab Notebooks/西遊記.txt'

# Run the analysis
analyze_yuandi_sentiment(file_path)