# 三国演义 分析

## 题干
### 数字三国:
- [x] 请显示三国演义所有的回数以及回数标题和该回数的字数
- [ ] 统计在三国演义中主要人物(自己拟定备选人物，至少10人，尽可能均衡)出现的次数，以及出场顺序，以此认定谁才是三国演义的第一主角
- [ ] 请输出在备选人物中，用数据说话谁和谁关系最铁
- [ ] 请输出备选人物中，战绩情况，赢几次，输几次
- 本次上交内容为PDF报告。针对每个步骤，使用贴图证明每个题目的结果，辅以文字说明解题的思路，可适当配部分代码。
- 每题25分，满分100分。附加分20分(题目的创意10分+整个报告美化10分)，时间1周

## 准备工作

### 安装依赖
- Python=3.11
- PyTorch

## Task 0

### 读取源数据

In [14]:
import os

# Read the raw novel text into a list of lines (line endings preserved).
# When the file is missing, report it and leave dataset_src empty.
source_path = "dataset/三国演义.txt"
dataset_src = []
if not os.path.exists(source_path):
    print("文件不存在")
else:
    # utf-8-sig transparently drops a leading BOM if the file has one.
    with open(source_path, "r", encoding="utf-8-sig") as src:
        dataset_src = src.readlines()

### 数据预处理

In [44]:
import json
import re

# Split the raw lines into chapters and persist them as JSON.
#
# Lines that are exactly "\n" are dropped and the rest are stripped; the first
# three and last two remaining lines are then discarded as extra information.
dataset = [raw.strip() for raw in dataset_src if raw != "\n"]
dataset = dataset[3:-2]

chapter_index = []        # positions in `dataset` of the chapter heading lines
chapter_content = [[]]    # slot 0 collects any lines that precede the first heading
chapter_cnt = 0
# Punctuation character class; it is not applied anywhere in this cell.
chinese_punctuation = r"[,.!;:，。！：；—　。，、》《？”“]"
for idx, text in enumerate(dataset):
    if not text.startswith("正文"):
        # Ordinary line: belongs to the chapter currently being collected.
        chapter_content[chapter_cnt].append(text)
        continue
    # Heading line: remember where it is and open a new chapter bucket.
    chapter_cnt += 1
    chapter_index.append(idx)
    chapter_content.append([])

# One record per heading; the title is whatever follows the first "回".
data = [
    {
        "chapter": dataset[pos][dataset[pos].find("回") + 1 :].strip(),
        "content": body,
    }
    for pos, body in zip(chapter_index, chapter_content[1:])
]
with open("dataset/dataset.json", "w", encoding="utf-8-sig") as out:
    json.dump(data, out, ensure_ascii=False, indent=4)

### 导入预处理数据集

In [1]:
import json
# Reload the per-chapter records from the JSON file and report how many there are.
data = []
with open("dataset/dataset.json", "r", encoding="utf-8-sig") as fp:
    raw_json = fp.read()
    data = json.loads(raw_json)
print("数据集大小：", len(data))

数据集大小： 120


## Task 1

### 获取章回

In [11]:
import csv
# Export one CSV row per chapter: [chapter title, character count].
chapters = []
# utf-8-sig is used to avoid garbled Chinese text when the CSV is opened.
# The `with` block guarantees the file is closed even if a row fails to write.
with open(
    "export/chapters.csv", "w", newline="", encoding="utf-8-sig"
) as csvfile:
    writer = csv.writer(csvfile)
    for entry in data:
        chapters.append(entry["chapter"])
        # "content" is a list of paragraph strings, so len() of it would be a
        # paragraph count; the character count is the sum of paragraph lengths
        # (punctuation included).
        char_count = sum(len(paragraph) for paragraph in entry["content"])
        writer.writerow([entry["chapter"], char_count])
print("共{}章".format(len(chapters)))

共120章


## Task 2

### 进行分析

- 加载hanlp模型

In [12]:
import hanlp
import tqdm
# Load a pretrained HanLP multi-task model; the resulting object is called once
# per paragraph in the analysis cell below.
# NOTE(review): judging by the constant's name this is the ELECTRA-base Chinese
# model covering tok/pos/ner/srl/dep/sdp/con -- confirm against the HanLP docs.
HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)

  from .autonotebook import tqdm as notebook_tqdm
                                   

- 使用hanlp进行命名实体识别并统计人名

In [None]:
# Count every PERSON entity found by HanLP's MSRA NER task.
#
# person maps an entity string to
#   {"position": [(chapter_idx, paragraph_idx, entity_idx), ...], "count": int}
# Every occurrence is appended to "position", so position[0] is still the first
# appearance and len(position) == count.
# (An earlier variant counted tokens tagged "NR" via tasks="pos"; NER is used instead.)
person = dict()
for i, entry in tqdm.tqdm(enumerate(data), total=len(data)):
    for j, paragraph in enumerate(entry["content"]):
        result = HanLP(paragraph, tasks="ner/msra")
        for k, entity in enumerate(result["ner/msra"]):
            # entity[0] is the entity text, entity[1] its label.
            if entity[1] != "PERSON":
                continue
            word = entity[0]
            if word not in person:
                # First sighting: create the record before counting.
                person[word] = {"position": [], "count": 0}
            person[word]["count"] += 1  # tally the occurrence
            person[word]["position"].append((i, j, k))  # record where it occurred

  1%|          | 1/120 [00:11<23:08, 11.67s/it]


### 建立人名映射

- 使用正则表达式

In [10]:
import re
import json

# Build a {full name: [aliases]} table from self-introduction phrases of the
# form 姓X名Y，字Z with an optional ，后改W suffix.
alias_mapping = {}

# Groups: 1 = surname, 2 = given name, 3 = courtesy name,
# 4 = later courtesy name (only present when the 后改 suffix matches).
pattern = r'姓(\w+)名(\w+)，字(\w+)(?:，后改(\w+))?'
intro_re = re.compile(pattern)  # compiled once, reused for every line

for chapter in data:
    for line in chapter["content"]:
        # finditer picks up every introduction in the line, not only the first.
        for match in intro_re.finditer(line):
            surname, given, courtesy, renamed = match.groups()
            full_name = surname + given
            alias = [
                courtesy,            # courtesy name
                surname,             # surname
                given,               # given name
                surname + given,     # surname + given name
                surname + courtesy,  # surname + courtesy name
            ]
            if renamed:
                # The changed courtesy name is an alias too.
                alias.append(renamed)
                alias.append(surname + renamed)
            # Merge into any aliases already collected for this person
            # instead of overwriting them, keeping first-seen order.
            known = alias_mapping.setdefault(full_name, [])
            for name in alias:
                if name not in known:
                    known.append(name)

with open("export/aliases.json", "w", encoding="utf-8-sig") as json_file:
    json.dump(alias_mapping, json_file, ensure_ascii=False, indent=2)

- 进行清洗

In [32]:
# Send the recognised names to the LLM in space-separated batches for cleaning.
# A batch is closed as soon as its text exceeds 100 characters; the final,
# shorter batch is sent as well so no names are silently skipped.
# NOTE(review): `llm` is not defined in the visible cells -- presumably an
# Ollama client created elsewhere; confirm.
batches = []
msg = ''
for name in person:
    msg += name + ' '
    if len(msg) > 100:
        batches.append(msg)
        msg = ''
if msg:
    # Leftover names that never reached the 100-character threshold.
    batches.append(msg)
    msg = ''

for batch in batches:
    res=llm.chat(model="llama3.1",messages=[{"role":"user","content":batch}])   # run the cleaning request
    print(res)

model='llama3.1' created_at='2024-12-24T02:12:41.415419264Z' done=True done_reason='stop' total_duration=434806963 load_duration=27806845 prompt_eval_count=113 prompt_eval_duration=17000000 eval_count=25 eval_duration=388000000 message=Message(role='assistant', content='此为汉朝光武帝、汉桓帝和汉灵帝的政变、继位史。', images=None, tool_calls=None)
model='llama3.1' created_at='2024-12-24T02:12:41.7160593Z' done=True done_reason='stop' total_duration=188443433 load_duration=26203745 prompt_eval_count=104 prompt_eval_duration=13000000 eval_count=9 eval_duration=148000000 message=Message(role='assistant', content='我不清楚你在说什么。', images=None, tool_calls=None)
model='llama3.1' created_at='2024-12-24T02:12:42.731425434Z' done=True done_reason='stop' total_duration=939704353 load_duration=25893170 prompt_eval_count=98 prompt_eval_duration=11000000 eval_count=66 eval_duration=901000000 message=Message(role='assistant', content='这个问题似乎是一个名单，没有明确的问题或答案。名字看起来像是历史人物，可能与中国三国时期有关。但是没有更多上下文的信息很难回答问题。\n\n如果您想问一个具体的问题或者想要了解这个名单

KeyboardInterrupt: 

### 输出结果

In [26]:
import csv
# Flatten the person table into [name, count, positions] rows, order them by
# descending count, and write them to CSV under a header row.
persons = [
    [name, stats['count'], stats['position']]
    for name, stats in person.items()
]
persons.sort(key=lambda row: row[1], reverse=True)

with open("export/persons.csv", "w", newline="", encoding="utf-8-sig") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["人物", "次数", "位置"])
    writer.writerows(persons)