本文件为 jupyter 工程文件

### [官方文档](https://python.langchain.com/docs/get_started/introduction.html)
### [入门课程](https://learn.deeplearning.ai/langchain-chat-with-your-data)
![overview.jpg](./assets/overview.jpg)

In [None]:
#! apt-get update
#! apt-get install python3.10
#! pip install "torch>=2.0"
#! pip install langchain

## Loader
LangChain 提供了多种多样的 Loader 来加载数据, 并且支持实现自定义的 Loader
<br>
[预定义的 Loader](https://python.langchain.com/docs/modules/data_connection/document_loaders/)

In [None]:
#!pip install requests
#!pip install jq
#!pip install unstructured
# https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/markdown

In [None]:
import requests
import json
import re
import urllib3
import os
import os.path
from pprint import pprint
from langchain.document_loaders import UnstructuredMarkdownLoader

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def fetch(pid, wiki_url="https://172.16.14.66:8889/server/index.php?s=/api/page/info", timeout=30):
    """Fetch one wiki page through the wiki's page-info HTTP API.

    Args:
        pid: Page id sent to the API as the ``page_id`` form field.
        wiki_url: Endpoint that receives the POST request.
        timeout: Seconds to wait for the server before ``requests`` gives up;
            without it a stalled server would block the notebook forever.

    Returns:
        A ``(title, content)`` tuple of strings. Both are empty strings when
        the API does not report ``error_code == 0``.

    Raises:
        requests.RequestException: On connection problems or timeouts.
        ValueError: If the response body is not valid JSON.
    """
    # NOTE(security): certificate verification is disabled because the request
    # targets an HTTPS endpoint addressed by IP; do not reuse this for
    # untrusted hosts.
    response = requests.post(
        wiki_url,
        data={"page_id": pid},
        verify=False,
        timeout=timeout,
    )
    json_ret = response.json()
    if json_ret.get("error_code") != 0:
        return "", ""

    page = json_ret.get("data") or {}
    # Normalise missing/null fields to "" so callers can always write the
    # content to a text file.
    return page.get("page_title") or "", page.get("page_content") or ""

# Directory where downloaded wiki pages are stored as "<page id>.md".
wiki_dir = "/mnt/d/workspace/wiki"

# Page 189 serves as the index; only its content (last tuple element) is used.
index_content = fetch(189)[-1]

# Collect one (title, url, page id) tuple for every index line containing a
# markdown link target with a quoted title, e.g.
#   #### [飞腾环境下图形化界面问题](https://172.16.14.66:8889/web/#/6/23 "飞腾环境下图形化界面问题")
data = []
if len(index_content) > 0:
    pattern = r"\((.*?)\s+(.*?)\)"
    lines = index_content.split("\n")
    for line in lines:
        match = re.search(pattern, line)
        if match is None:
            continue
        url, quoted_title = match.groups()
        # The page id is the last path segment of the link URL.
        pid = url.rsplit("/", 1)[-1]
        # Drop the first and last character (the surrounding quotes).
        title = quoted_title[1:-1]
        data.append((title, url, pid))
# Uncomment to inspect the parsed index: pprint(data)

# Download every page listed in the index (reusing the on-disk copy when one
# exists) and load each non-empty file as LangChain documents tagged with the
# page's title and id.
documents = []
os.makedirs(wiki_dir, exist_ok=True)
for page_title, _page_url, page_id in data:
    path = f"{wiki_dir}/{page_id}.md"
    if not os.path.exists(path):
        _, content = fetch(page_id)
        if not content:
            # Do not create an empty cache file: it would prevent the page
            # from ever being fetched again on later runs.
            continue
        with open(path, "w", encoding="utf-8") as f:
            f.write(content)

    # Empty files left over from earlier runs are skipped.
    if os.stat(path).st_size > 0:
        docs = UnstructuredMarkdownLoader(path).load()
        for doc in docs:
            # Use the title that belongs to this index entry; a shared
            # module-level ``title`` would be stale for cached pages.
            doc.metadata["title"] = page_title
            doc.metadata["id"] = page_id
        documents.extend(docs)

print(len(documents))
if documents:
    pprint(documents[0])


## Splitter
不同模型支持的最大 token 数不一样, 为了更好地进行处理, 通常要先把文档拆分 (一般是拆分成句子), 再进行向量化.
<br>
[预定义的 Splitter](https://python.langchain.com/docs/modules/data_connection/document_transformers/)

In [None]:
def mapDocs(documents):
    """Return the ``page_content`` of every document, in input order."""
    return [doc.page_content for doc in documents]

基础的也是最常用的拆分器

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk length is measured with ``len``, i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    length_function=len,
    chunk_overlap=5,
    chunk_size=100,
)
txt_docs = text_splitter.split_documents(documents)

# Preview the text of the first five chunks.
pprint([chunk.page_content for chunk in txt_docs[:5]])

处理代码语法的拆分器

In [None]:
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

# Splitter configured for Markdown, with 64-unit chunks and no overlap.
md_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN,
    chunk_overlap=0,
    chunk_size=64,
)
md_docs = md_splitter.split_documents(documents)

# Preview the text of the first five chunks.
pprint([chunk.page_content for chunk in md_docs[:5]])

In [None]:
#!pip install sentence-transformers
#!apt-get install git-lfs
#!git lfs install
#!git clone https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2

模型语句 token 拆分器, 需要配合向量模型

In [None]:
from langchain.text_splitter import SentenceTransformersTokenTextSplitter

# Local path of the model handed to the splitter as ``model_name``.
pmmb_model = "/mnt/e/ai/models/paraphrase-multilingual-mpnet-base-v2"
st_splitter = SentenceTransformersTokenTextSplitter(
    model_name=pmmb_model,
    chunk_overlap=0,
)
st_docs = st_splitter.split_documents(documents)

# Preview the text of the first five chunks.
pprint([chunk.page_content for chunk in st_docs[:5]])

## Embedding
使用嵌入向量化模型将数据向量化
<br>
[Embedding 接口](https://python.langchain.com/docs/modules/data_connection/text_embedding/)

常规的 embedding 模型

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Alternative local model: "/mnt/e/ai/models/m3e-base"
embedding_model = "/mnt/e/ai/models/text2vec-large-chinese"
embeddings = SentenceTransformerEmbeddings(model_name=embedding_model)

# Sanity check: print the length of the vector returned for one query.
test_ret = embeddings.embed_query("hello, world")
pprint(len(test_ret))

# Embed the text of every chunk produced by the character splitter.
st_embedded_docs = embeddings.embed_documents(
    [chunk.page_content for chunk in txt_docs]
)
# Uncomment to inspect one vector: pprint(st_embedded_docs[0])

指定的 embedding 模型

In [None]:
from sentence_transformers import SentenceTransformer
# Load the model directly from its local directory; note that this rebinds
# ``pmmb_model`` from the path string used earlier to the model object.
pmmb_model = SentenceTransformer('/mnt/e/ai/models/paraphrase-multilingual-mpnet-base-v2')

In [None]:
import numpy as np

# Three short test sentences; the first is compared against all three.
test_text0 = "轻阅读无法启动"
test_text1 = "转换服务无法启动"
test_text2 = "字体安装问题"
tt0, tt1, tt2 = [
    embeddings.embed_query(text)
    for text in (test_text0, test_text1, test_text2)
]

# Raw dot products of the first embedding with each embedding.
for other in (tt0, tt1, tt2):
    print(np.dot(tt0, other))

# Cosine similarity: the dot product divided by the product of the norms.
for other in (tt0, tt1, tt2):
    cos_sim = np.array(tt0).dot(other) / (np.linalg.norm(tt0) * np.linalg.norm(other))
    print(cos_sim)

In [None]:
# Repeat the dot-product comparison with vectors from ``pmmb_model.encode``.
tt0, tt1, tt2 = [
    pmmb_model.encode(text)
    for text in (test_text0, test_text1, test_text2)
]
for other in (tt0, tt1, tt2):
    print(np.dot(tt0, other))

## Vector Stores
存储并查询 embedding 的向量
<br>
[Store 接口](https://python.langchain.com/docs/modules/data_connection/vectorstores/)

FAISS, 其他诸如 Chroma 等都可以使用

In [None]:
#!pip install faiss-gpu
#!pip install faiss-cpu

In [None]:
from langchain.vectorstores import FAISS

# Build the vector index from the character-split chunks.
db = FAISS.from_documents(txt_docs, embedding=embeddings)
# To persist / restore the index:
#   db.save_local("faiss_db")
#   db = FAISS.load_local("faiss_db", embeddings=embeddings)

# Run the same query twice: first unrestricted, then limited to the chunks
# whose "id" metadata equals "451".
for extra_kwargs in ({}, {"filter": {"id": "451"}}):
    search_ret = db.similarity_search_with_score("安装字体", k=3, **extra_kwargs)
    pprint(search_ret)

## Retriever
查询向量转换回数据
<br>
[Retriever 接口](https://python.langchain.com/docs/modules/data_connection/retrievers/)

In [None]:
#!pip install transformers
#!git clone https://github.com/THUDM/ChatGLM2-6B
#%cd ChatGLM2-6B  # use %cd: "!cd" runs in a subshell and does not change the notebook's directory
#!pip install -r requirements.txt

使用 ChatGLM2 LLM 模型

In [None]:
from transformers import AutoModel, AutoTokenizer

# The latest model revision (162b620e) fails to load locally; the previous
# revision loads fine.
# Models downloaded automatically from Hugging Face are stored under
# ~/.cache/huggingface/ and can be copied to other machines.
model_id = "THUDM/chatglm2-6b"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# ``eval()`` returns the module itself, so loading and switching to
# inference mode can be chained.
model = AutoModel.from_pretrained(model_id, trust_remote_code=True, device='cuda').eval()


需要实现一个自定义的 LLM 类

In [None]:
# https://github.com/hwchase17/langchain/discussions/6969
from typing import Any, List
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens


class Glm2LLM(LLM):
    """LangChain ``LLM`` wrapper around a ChatGLM2 model and its tokenizer.

    Every call is forwarded to ``model.chat`` together with the conversation
    history accumulated by this instance, and the new exchange is appended to
    that history afterwards.
    """

    # Model object exposing ``chat(tokenizer, prompt, history=..., ...)``.
    model: object = None
    # Tokenizer passed as the first argument of ``model.chat``.
    tokenizer: object = None
    # Forwarded to ``model.chat`` as ``max_length``.
    max_token: int = 10000
    # Sampling temperature forwarded to ``model.chat``.
    temperature: float = 0.01
    # Nucleus-sampling threshold forwarded to ``model.chat``.
    top_p: float = 0.9
    # ``[prompt, response]`` pairs of all previous calls on this instance.
    history: List[Any] = []

    @property
    def _llm_type(self) -> str:
        """Identifier of this LLM implementation."""
        return "ChatGLM2"

    def _call(self, prompt: str, stop: List[str] | None = None, run_manager: CallbackManagerForLLMRun | None = None, **kwargs: Any) -> str:
        """Generate a reply for ``prompt``.

        Args:
            prompt: Text sent to the model.
            stop: Optional stop sequences; the reply is cut at the first one.
            run_manager: Accepted for interface compatibility; unused.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The model's reply (truncated at a stop sequence when ``stop`` is
            given).
        """
        response, _ = self.model.chat(
            self.tokenizer,
            prompt,
            history=self.history,
            max_length=self.max_token,
            top_p=self.top_p,
            temperature=self.temperature,
        )
        if stop:
            response = enforce_stop_tokens(response, stop)
        # Build a new list instead of appending in place so the previous
        # history object is never mutated.
        self.history = self.history + [[prompt, response]]
        return response


llm = Glm2LLM(model=model, tokenizer=tokenizer)

In [None]:
# Smoke-test the wrapper with a simple prompt ("Who are you?").
llm("你是谁")

## Chat
由 LLM 来进行语料处理

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Prompt with two placeholders: the retrieved context and the user question.
template = """
{context}
做为运维人员, 根据上述已知信息，简洁和专业的来回答用户的问题。如果无法从中得到答案，请说 “根据已知信息无法回答该问题”，不允许在答案中添加编造成分，答案请使用中文。
问题: {question}
"""
qa_prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

# ``chain_type`` is left at its default; the original notes list
# "stuff", "refine" and "map_reduce" as the options.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=db.as_retriever(),
    chain_type_kwargs={"prompt": qa_prompt},
    return_source_documents=True,
)


In [None]:
# Alternative approach: retrieve the context manually and prompt the LLM directly.
def get_answer(question, k=3):
    """Answer ``question`` from the ``k`` most similar chunks in ``db``.

    Args:
        question: The user's question; also used as the similarity query.
        k: Number of chunks to retrieve as context.

    Returns:
        The first generation produced by ``llm`` for the assembled prompt, or
        ``None`` when the similarity search returns nothing.
    """
    docs_with_score = db.similarity_search_with_score(question, k=k)
    if not docs_with_score:
        return None

    # Each hit is indexed as (document, score); only the text is needed.
    context = "\n".join(hit[0].page_content for hit in docs_with_score)
    # ``str.format`` fills both placeholders in a single pass, so placeholder
    # text that happens to occur inside the retrieved context is left alone.
    prompt = """
{context}
做为运维人员, 根据上述已知信息，简洁和专业的来回答用户的问题。如果无法从中得到答案，请说 “根据已知信息无法回答该问题”，不允许在答案中添加编造成分，答案请使用中文。
问题: {question}
""".format(context=context, question=question)
    # ``generate`` takes a list of prompt strings.
    # NOTE(review): ``llm`` keeps its own chat history between calls; reset
    # ``llm.history`` first if a history-free answer is required.
    llm_ret = llm.generate([prompt])
    return llm_ret.generations[0][0]

In [None]:
# Free cached GPU memory (does nothing when CUDA is unavailable).
import torch
if torch.cuda.is_available():
    device = torch.cuda.current_device()
    print(f"Emptying gpu cache {device}...")
    # Run the cleanup with the current device selected.
    with torch.cuda.device(device):
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

In [None]:
# Ask the RetrievalQA chain how to install fonts and print the full result.
pprint(qa_chain({"query": "怎样安装字体?"}))

In [None]:
# Ask the RetrievalQA chain how to deploy the conversion service.
pprint(qa_chain({"query": "转换服务如何部署?"}))

In [None]:
# Ask the same font-installation question through the manual get_answer path.
pprint(get_answer("怎样安装字体?"))