<a href="https://colab.research.google.com/github/ChangQingxgg/Document-Answering-Robot/blob/main/%E6%96%87%E6%A1%A3%E9%97%AE%E7%AD%94%E6%9C%BA%E5%99%A8%E4%BA%BA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/linuxdeepin/wiki.deepin.org.git

Cloning into 'wiki.deepin.org'...
remote: Enumerating objects: 48037, done.[K
remote: Counting objects: 100% (181/181), done.[K
remote: Compressing objects: 100% (85/85), done.[K
remote: Total 48037 (delta 120), reused 129 (delta 89), pack-reused 47856 (from 4)[K
Receiving objects: 100% (48037/48037), 141.11 MiB | 22.62 MiB/s, done.
Resolving deltas: 100% (31465/31465), done.
Updating files: 100% (2249/2249), done.


In [None]:
!pip install markdown
!pip install bs4
!pip install whoosh
!pip install transformers



In [None]:
import os

# Sanity check: confirm the wiki checkout is visible from the current working directory.
wiki_root = 'wiki.deepin.org'
wiki_present = os.path.exists(wiki_root)
print(f"目录存在: {wiki_present}")
print(f"目录内容: {os.listdir(wiki_root) if wiki_present else '目录不存在'}")


目录存在: False
目录内容: 目录不存在


In [None]:
import os
import re

import markdown
from bs4 import BeautifulSoup
import whoosh.index as index
from whoosh import writing
from whoosh.fields import Schema, TEXT
from whoosh.qparser import MultifieldParser
from transformers import pipeline

# Convert a Markdown document to plain text
def md_to_text(md_content):
    """Render Markdown to HTML and return only its visible text.

    A leading YAML front-matter block (a ``---`` line, metadata lines, and a
    closing ``---`` line at the very start of the document) is removed before
    rendering, so page metadata such as ``title:`` or ``date:`` does not end
    up in the extracted text.

    Args:
        md_content: Raw Markdown source as a string.

    Returns:
        The document's text with all HTML markup stripped.
    """
    # Anchored with \A so only a block at the very top is treated as front matter;
    # a "---" horizontal rule further down the page is left alone.
    body = re.sub(
        r"\A\ufeff?---[ \t]*\r?\n(?:.*?\r?\n)?---[ \t]*(?:\r?\n|\Z)",
        "",
        md_content,
        count=1,
        flags=re.DOTALL,
    )
    html = markdown.markdown(body)
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text()

# Search index schema: every field is stored so hits can be shown without re-reading files.
# NOTE(review): in the recorded run a full Chinese sentence was parsed as one single term
# and matched nothing — the default TEXT analyzer presumably does not segment Chinese;
# consider an n-gram or Chinese-aware analyzer. TODO confirm.
schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True),
    path=TEXT(stored=True),
    url=TEXT(stored=True),
)

# Directory that holds the on-disk index; created on first use.
index_dir = "wiki_index"
if not os.path.exists(index_dir):
    os.mkdir(index_dir)

# Reuse an index that is already on disk, otherwise create a new one with the schema above.
if index.exists_in(index_dir):
    ix = index.open_dir(index_dir)
else:
    ix = index.create_in(index_dir, schema)

# Build the search index
def build_index(wiki_dir, base_url):
    """Index every ``.md`` file under ``wiki_dir`` into the module-level index ``ix``.

    The index is rebuilt from scratch on each call: previously indexed
    documents are discarded at commit time, so calling this repeatedly does
    not accumulate duplicate entries.

    Args:
        wiki_dir: Root directory of the local wiki checkout; walked recursively.
        base_url: Base URL of the online wiki. A trailing ``/`` is tolerated.

    Raises:
        Any error raised while reading or parsing a file is propagated after
        the pending index write has been cancelled.
    """
    md_ext = ".md"
    site_root = base_url.rstrip("/")
    writer = ix.writer()
    try:
        for root, _, files in os.walk(wiki_dir):
            for file in files:
                if not file.endswith(md_ext):
                    continue
                file_path = os.path.join(root, file)
                with open(file_path, "r", encoding="utf-8") as f:
                    md_content = f.read()
                text_content = md_to_text(md_content)
                # Drop only the trailing extension; str.replace would also eat any
                # ".md" that happens to appear earlier in a file or directory name.
                title = file[: -len(md_ext)]
                rel_path = os.path.relpath(file_path, wiki_dir).replace("\\", "/")
                wiki_url = f"{site_root}/{rel_path[: -len(md_ext)]}"
                writer.add_document(title=title, content=text_content, path=file_path, url=wiki_url)
    except BaseException:
        # Release the writer instead of leaving a half-finished write pending.
        writer.cancel()
        raise
    # CLEAR replaces whatever the index held before with the documents just added.
    writer.commit(mergetype=writing.CLEAR)

# Load the question-answering model: a transformers "question-answering" pipeline
# built from the "NchuNLP/Chinese-Question-Answering" checkpoint. It is created once
# at module level so that every question reuses the same loaded model.
qa_pipeline = pipeline("question-answering", model="NchuNLP/Chinese-Question-Answering")

# Answer a user question
def answer_question(question, top_k=5, min_score=0.5):
    """Search the index for ``question`` and extract an answer from the hits.

    The question is parsed against the ``title`` and ``content`` fields. Each
    of the top hits is passed, in ranking order, to the question-answering
    model as context; the first answer whose model score exceeds
    ``min_score`` is returned. Diagnostic information about the search and
    each hit is printed along the way.

    Args:
        question: The user's question text.
        top_k: Maximum number of search hits to try as context.
        min_score: Model confidence an answer must exceed to be accepted.

    Returns:
        A dict with keys ``answer``, ``url`` and ``score``. When no hit yields
        a sufficiently confident answer, ``answer`` is "未找到答案", ``url`` is
        empty and ``score`` is 0.
    """
    with ix.searcher() as searcher:
        query = MultifieldParser(["title", "content"], ix.schema).parse(question)
        print(f"搜索查询：{query}")
        results = searcher.search(query, limit=top_k)
        # NOTE(review): the recorded run reported 7 results while listing 5, so this
        # count appears to be the total number of matches, not the number returned.
        print(f"找到 {len(results)} 个结果")
        for i, hit in enumerate(results, 1):
            print(f"结果 {i}: {hit['title']} (得分: {hit.score})")
        for hit in results:
            print(f"标题：{hit['title']}")
            print(f"内容：{hit['content'][:200]}")
            print(f"路径：{hit['path']}")
            print(f"URL：{hit['url']}")
            context = hit["content"]
            if not context or not context.strip():
                # The QA pipeline rejects an empty context; a page without text
                # cannot contain the answer, so move on to the next hit.
                continue
            result = qa_pipeline(question=question, context=context)
            print(result["score"])
            if result["score"] > min_score:
                return {"answer": result["answer"], "url": hit["url"], "score": result["score"]}
    return {"answer": "未找到答案", "url": "", "score": 0}

# Entry point: interactive command-line loop
def main():
    """Build the index, then answer questions typed by the user until they quit.

    The loop ends when the input, ignoring surrounding whitespace and letter
    case, equals "退出". For every other input the answer is printed, followed
    by the reference link when one is available, and then a blank line.
    """
    local_wiki = "wiki.deepin.org"  # local checkout of the wiki
    online_root = "https://wiki.deepin.org"  # base URL of the online wiki
    print("正在构建索引，请稍候...")
    build_index(local_wiki, online_root)
    print("欢迎使用 Deepin Wiki 聊天机器人！输入问题或输入 '退出' 结束。")

    user_text = input("问题：")
    while user_text.strip().lower() != "退出":
        reply = answer_question(user_text)
        print(f"回答：{reply['answer']}")
        reference = reply["url"]
        if reference:
            print(f"参考链接：{reference}")
        print()
        user_text = input("问题：")
    print("感谢使用，再见！")

# Start the interactive chatbot when this code is executed directly
# (as a script or a notebook cell) rather than imported as a module.
if __name__ == "__main__":
    main()

Device set to use cuda:0


正在构建索引，请稍候...
欢迎使用 Deepin Wiki 聊天机器人！输入问题或输入 '退出' 结束。
问题：什么是深度壁纸？
搜索查询：(title:什么是深度壁纸 OR content:什么是深度壁纸)
找到 0 个结果
回答：未找到答案

问题：深度壁纸
搜索查询：(title:深度壁纸 OR content:深度壁纸)
找到 7 个结果
结果 1: 深度壁纸 (得分: 20.299823157510325)
结果 2: 深度壁纸 (得分: 20.299823157510325)
结果 3: 深度壁纸 (得分: 20.299823157510325)
结果 4: 深度壁纸 (得分: 20.299823157510325)
结果 5: 深度壁纸 (得分: 20.299823157510325)
标题：深度壁纸
内容：
title: 深度壁纸
description: 
published: true
date: 2023-02-22T09:00:26.950Z
tags: 
editor: markdown
dateCreated: 2022-04-21T03:47:31.467Z

简介
壁纸，又名电脑壁纸或桌面背景，是在桌面上可以随意个性化的图片。
随着电脑技术的发展，从黑白屏到彩屏，电脑的待机桌面不再由
路径：wiki.deepin.org/01_软件wiki/00_GUI软件/01_deepin开发的软件/深度壁纸.md
URL：https://wiki.deepin.org/01_软件wiki/00_GUI软件/01_deepin开发的软件/深度壁纸
0.00366831268183887
标题：深度壁纸
内容：
title: 深度壁纸
description: 
published: true
date: 2023-02-22T09:00:26.950Z
tags: 
editor: markdown
dateCreated: 2022-04-21T03:47:31.467Z

简介
壁纸，又名电脑壁纸或桌面背景，是在桌面上可以随意个性化的图片。
随着电脑技术的发展，从黑白屏到彩屏，电脑的待机桌面不再由
路径：wiki.deepin.org/01_软件wiki/00_GUI软件/01_deepin开发的软件/深度壁纸.md
URL：https:/