In [45]:
import os
# The allocator option is exported before torch is imported.
# NOTE(review): presumably so PyTorch's CUDA caching allocator picks up
# max_split_size_mb:512 - confirm it still applies if this cell is re-run late.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
import torch

def torch_gc():
    """Ask PyTorch to release cached accelerator memory, if an accelerator is present.

    * CUDA available: prints ``"clear cache"``, then calls
      ``torch.cuda.empty_cache()`` and ``torch.cuda.ipc_collect()``.
    * Otherwise, MPS available: tries ``torch.mps.empty_cache()``; if that import
      or call fails, the error and an upgrade hint are printed instead of raised
      (best-effort cleanup).
    * Neither available: does nothing.

    Returns:
        None.
    """
    if torch.cuda.is_available():
        print("clear cache")
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        return

    # Older torch builds have no `torch.backends.mps` attribute at all; looking it
    # up defensively keeps CPU-only hosts on such builds from raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is None or not mps_backend.is_available():
        return

    try:
        # Imported lazily: the hint printed below indicates `torch.mps` may be
        # missing on older installs.
        from torch.mps import empty_cache
        empty_cache()
    except Exception as e:
        # Deliberately non-fatal: report and carry on.
        print(e)
        print("如果您使用的是 macOS 建议将 pytorch 版本升级至 2.0.0 或更高版本，以支持及时清理 torch 产生的内存占用。")

In [46]:
from langchain.text_splitter import CharacterTextSplitter
import re
from typing import List
SENTENCE_SIZE = 100  # default per-sentence length threshold, in characters


class ChineseTextSplitter(CharacterTextSplitter):
    """Sentence-oriented text splitter for Chinese (and mixed Chinese/English) text.

    ``split_text`` cuts the input at sentence terminators and then progressively
    breaks any sentence longer than ``sentence_size`` at commas/periods, at runs
    of two or more spaces, and finally at single spaces. ``split_text1`` is a
    simpler terminator-only variant.

    Attributes:
        pdf: When true, whitespace is normalised before splitting (see
            ``_normalize_pdf_text``).
        sentence_size: Length (in characters) above which ``split_text`` tries to
            break a sentence further. Pieces with no remaining split point may
            still exceed it.
    """

    def __init__(self, pdf: bool = False, sentence_size: int = SENTENCE_SIZE, **kwargs):
        """Store the splitter options; ``**kwargs`` are forwarded to the base class."""
        super().__init__(**kwargs)
        self.pdf = pdf
        self.sentence_size = sentence_size

    @staticmethod
    def _normalize_pdf_text(text: str) -> str:
        """Collapse runs of 3+ newlines to one, then turn every whitespace char into a space.

        The original code followed this with a removal of "\\n\\n"; no newline can
        survive the whitespace substitution, so that step never matched and is
        omitted here without changing the result.
        """
        text = re.sub(r"\n{3,}", "\n", text)
        return re.sub(r"\s", " ", text)

    def split_text1(self, text: str) -> List[str]:
        """Split ``text`` at full-width sentence terminators only.

        Terminators (optionally followed by up to two closing quotes) are glued to
        the sentence that precedes them. Unlike ``split_text``, no length limit is
        applied.

        Args:
            text: Text to split.

        Returns:
            The non-empty sentence fragments, in order.
        """
        if self.pdf:
            text = self._normalize_pdf_text(text)
        # Full-width colon and semicolon were removed from the terminator set.
        sent_sep_pattern = re.compile('([﹒﹔﹖﹗．。！？]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))')
        sent_list = []
        for ele in sent_sep_pattern.split(text):
            # A piece that the separator pattern matches at its start is appended to
            # the previous sentence rather than starting a new one.
            if sent_sep_pattern.match(ele) and sent_list:
                sent_list[-1] += ele
            elif ele:
                sent_list.append(ele)
        return sent_list

    def _split_long_sentence(self, sentence: str) -> List[str]:
        """Break one over-long sentence into smaller non-empty pieces.

        Three passes, each applied only to pieces still longer than
        ``self.sentence_size``: (1) after ``,``/``，``/``.``; (2) after a run of two
        or more spaces; (3) after any single space. Closing quotes that directly
        follow a split point stay with the left-hand piece.
        """
        limit = self.sentence_size
        pieces: List[str] = []
        by_comma = re.sub(r'([,，.]["’”」』]{0,2})([^,，.])', r'\1\n\2', sentence)
        for clause in by_comma.split("\n"):
            if len(clause) <= limit:
                if clause:
                    pieces.append(clause)
                continue
            # When called from split_text the clause holds no newline, so only the
            # "two or more spaces" alternative of this pattern can match.
            by_gap = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', clause)
            for chunk in by_gap.split("\n"):
                if len(chunk) <= limit:
                    if chunk:
                        pieces.append(chunk)
                    continue
                by_space = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', chunk)
                pieces.extend(part for part in by_space.split("\n") if part)
        return pieces

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` into sentences, breaking up those longer than ``sentence_size``.

        TODO (carried over from the original author): this logic needs further
        optimisation.

        Note that ``.`` and ``;`` count as sentence terminators here, so e.g. a
        decimal such as ``3.14`` is split after the dot. Dashes are not handled.

        Args:
            text: Text to split.

        Returns:
            Non-empty sentence fragments in document order. A fragment can still
            be longer than ``sentence_size`` when it contains no comma, period or
            space to break at.
        """
        if self.pdf:
            text = self._normalize_pdf_text(text)

        # Single-character terminators not followed by a closing curly quote.
        text = re.sub(r'([;；.!?。！？\?])([^”’])', r"\1\n\2", text)
        # English ellipsis (six dots).
        text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)
        # Chinese ellipsis (two "…" characters).
        text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text)
        # When a terminator is followed by closing quotes, the quotes end the
        # sentence, so the newline goes after them; the rules above left the
        # quotes attached for exactly this reason.
        text = re.sub(r'([;；!?。！？\?]["’”」』]{0,2})([^;；!?，。！？\?])', r'\1\n\2', text)
        # Drop trailing whitespace (including any newline added at the very end).
        text = text.rstrip()

        # Build the result in one forward pass. The previous implementation spliced
        # sub-splits back into the list via `list.index(value)` while iterating,
        # which was quadratic and ambiguous when the same string occurred twice.
        sentences: List[str] = []
        for sentence in text.split("\n"):
            if not sentence:
                continue
            if len(sentence) > self.sentence_size:
                sentences.extend(self._split_long_sentence(sentence))
            else:
                sentences.append(sentence)
        return sentences

In [None]:
from langchain.document_loaders import UnstructuredFileLoader, TextLoader

# Source document to index (absolute path on the authoring machine).
filepath = "/root/caoduanxin/docs/tairQA_cn.txt"

# Text loader for that file, created with autodetect_encoding=True.
loader = TextLoader(filepath,autodetect_encoding=True)
# Sentence splitter defined earlier in this notebook; sentences over 100 characters get broken up further.
textsplitter = ChineseTextSplitter(pdf=False, sentence_size=100)
print(f'textsplitter:{textsplitter}')

# Load and split in one call; `docs` is consumed by the later indexing cell.
docs = loader.load_and_split(textsplitter)
print(docs)
# Print each resulting document on its own line for inspection.
for doc in docs:
    print(f'解析后的文本：{doc}')

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import torch
# Choose the embedding device: CUDA first, then Apple MPS, otherwise CPU.
# `torch.backends.mps` is looked up defensively because older torch builds do not
# expose it, which would otherwise raise AttributeError on CPU-only hosts.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    EMBEDDING_DEVICE = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    EMBEDDING_DEVICE = "mps"
else:
    EMBEDDING_DEVICE = "cpu"

# Sentence-embedding model placed on the chosen device.
embeddings = HuggingFaceEmbeddings(
    model_name='GanymedeNil/text2vec-large-chinese',
    model_kwargs={'device': EMBEDDING_DEVICE},
)

# Release cached accelerator memory after the model has been loaded.
torch_gc()

In [None]:
from langchain.vectorstores import Tair
# One key per document chunk: doc0, doc1, ...
keys = [f'doc{doc_index}' for doc_index in range(len(docs))]
for key in keys:
    print(key)

# Embed the chunks and store them in Tair under those keys (`docs` is a list of Document).
vector_store = Tair.from_documents(
    docs,
    embeddings,
    tair_url="redis://120.27.213.45:6380",
    keys=keys,
)

In [None]:
from pypinyin import lazy_pinyin
import datetime

# '__file__' is passed as a literal string, so abspath resolves it against the
# current working directory; the root is therefore "<parent of cwd>/vector_store".
_anchor_path = os.path.abspath('__file__')
_project_root = os.path.dirname(os.path.dirname(_anchor_path))
VS_ROOT_PATH = os.path.join(_project_root, "vector_store")
print(VS_ROOT_PATH)

# File name (with extension) of the indexed document.
file = os.path.basename(filepath)
print(file)

# "<pinyin of the file stem>_FAISS_<timestamp>" under the vector-store root.
_stem_pinyin = "".join(lazy_pinyin(os.path.splitext(file)[0]))
_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
vs_path = os.path.join(VS_ROOT_PATH, f"{_stem_pinyin}_FAISS_{_timestamp}")
print(vs_path)

In [None]:
query = "Tair是什么？"
# Fetch the single closest chunk and flatten the hits into one newline-separated string.
context = "\n".join(
    hit.page_content for hit in vector_store.similarity_search(query, k=1)
)
print(context)

In [None]:
# Prompt skeleton: retrieved context first, then the answering rules and the question.
PROMPT_TEMPLATE = """已知信息：
{context} 

根据上述已知信息，简洁和专业的来回答用户的问题。如果无法从中得到答案，请说 “根据已知信息无法回答该问题” 或 “没有提供足够的相关信息”，不允许在答案中添加编造成分，答案请使用中文。 问题是：{question}"""
# Fill both placeholders in a single pass. Chained .replace() calls rescanned the
# already-inserted question, so a question containing the literal text "{context}"
# would have had the retrieved context spliced into it.
prompt = PROMPT_TEMPLATE.format(question=query, context=context)
print(f'prompt:{prompt}')

In [None]:
from transformers import AutoTokenizer, AutoModel
# The tokenizer is loaded by the id "THUDM/chatglm-6b" while the weights come from a
# local "chatglm-6b-int4-qe" directory.
# NOTE(review): confirm this tokenizer/checkpoint pairing is intended.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
# trust_remote_code=True is passed to both loaders; weights are cast with .half() and moved with .cuda().
model = AutoModel.from_pretrained("/root/ChatGLM-6B/THUDM/chatglm-6b-int4-qe", trust_remote_code=True).half().cuda()
# Put the model in evaluation mode before generating.
model = model.eval()
# Ask with the retrieval-augmented prompt built earlier, starting from an empty history.
response, history = model.chat(tokenizer, prompt, history=[])
print(response)

In [None]:
# Ask about tairvector directly: the bare question is sent with no retrieved context and an empty history.
response, history = model.chat(tokenizer, "tairvector是什么", history=[])
print(response)

In [None]:
import tair
# Direct Tair client. In the visible cells it is only referenced from commented-out code.
client = tair.Tair(host="120.27.213.45", port=6380)

In [None]:
# One hand-written fact to add to the index.
# (Alternative source left by the author: str(client.get("tairvector")).)
tairvector_text = ["tairvector是向量数据库"]
tairvector_meta = [{"source": "tair"}]
key_tairvector = ['tairvector0']

# The three positional strings after the metadata list are passed through unchanged.
# NOTE(review): presumably index name, content key and metadata key - confirm
# against the Tair.from_texts signature.
Tair.from_texts(
    tairvector_text,
    embeddings,
    tairvector_meta,
    "langchain",
    "content",
    "metadata",
    tair_url="redis://120.27.213.45:6380",
    keys=key_tairvector,
)

In [None]:
query = "tairvector是什么？"
# Retrieve up to 10 nearest chunks and merge their text into one newline-separated context.
retrieved_docs = vector_store.similarity_search(query, k=10)
context = "\n".join(doc.page_content for doc in retrieved_docs)


# Prompt skeleton: retrieved context first, then the answering rules and the question.
PROMPT_TEMPLATE = """已知信息：
{context} 

根据上述已知信息，简洁和专业的来回答用户的问题。如果无法从中得到答案，请说 “根据已知信息无法回答该问题” 或 “没有提供足够的相关信息”，不允许在答案中添加编造成分，答案请使用中文。 问题是：{question}"""
# Fill both placeholders in a single pass. Chained .replace() calls rescanned the
# already-inserted question, so a question containing the literal text "{context}"
# would have had the retrieved context spliced into it.
prompt = PROMPT_TEMPLATE.format(question=query, context=context)
print(f'prompt:{prompt}')

# Ask the model with the context-augmented prompt, starting from an empty history.
response, history = model.chat(tokenizer, prompt, history=[])
print(response)


In [55]:
# Ask about tairvector directly: the bare question is sent with no retrieved context and an empty history.
response, history = model.chat(tokenizer, "tairvector是什么", history=[])
print(response)

TairVector是一个基于Python的开源机器学习库，主要用于实现机器学习任务，如分类、聚类、降维等。TairVector使用Python编程语言，并包含了丰富的函数和工具箱，可以方便地完成各种机器学习任务。

TairVector的优点包括：

- 简单易用的界面，可以方便地安装和使用
- 支持多种机器学习算法，包括SVM、KNN、Random Forest等
- 可以方便地导入和导出数据集
- 可以方便地实现自定义的机器学习算法

TairVector也存在一些局限性，例如性能可能不如一些专门的机器学习库，尤其是在处理大型数据集时。但是，TairVector是一个开源的、免费的库，任何人都可以对其进行修改和扩展，这对于实现自己的机器学习算法非常有帮助。


In [56]:
import tair
# Direct Tair client. In the visible cells it is only referenced from commented-out code.
client = tair.Tair(host="120.27.213.45", port=6380)

In [63]:
# One hand-written fact to add to the index.
# (Alternative source left by the author: str(client.get("tairvector")).)
tairvector_text = ["tairvector是向量数据库"]
tairvector_meta = [{"source": "tair"}]
key_tairvector = ['tairvector0']

# The three positional strings after the metadata list are passed through unchanged.
# NOTE(review): presumably index name, content key and metadata key - confirm
# against the Tair.from_texts signature.
Tair.from_texts(
    tairvector_text,
    embeddings,
    tairvector_meta,
    "langchain",
    "content",
    "metadata",
    tair_url="redis://120.27.213.45:6380",
    keys=key_tairvector,
)

<langchain.vectorstores.tair.Tair at 0x7f71b8f880a0>

In [64]:
query = "tairvector是什么？"
# Retrieve up to 10 nearest chunks and merge their text into one newline-separated context.
retrieved_docs = vector_store.similarity_search(query, k=10)
context = "\n".join(doc.page_content for doc in retrieved_docs)


# Prompt skeleton: retrieved context first, then the answering rules and the question.
PROMPT_TEMPLATE = """已知信息：
{context} 

根据上述已知信息，简洁和专业的来回答用户的问题。如果无法从中得到答案，请说 “根据已知信息无法回答该问题” 或 “没有提供足够的相关信息”，不允许在答案中添加编造成分，答案请使用中文。 问题是：{question}"""
# Fill both placeholders in a single pass. Chained .replace() calls rescanned the
# already-inserted question, so a question containing the literal text "{context}"
# would have had the retrieved context spliced into it.
prompt = PROMPT_TEMPLATE.format(question=query, context=context)
print(f'prompt:{prompt}')

# Ask the model with the context-augmented prompt, starting from an empty history.
response, history = model.chat(tokenizer, prompt, history=[])
print(response)


[Document(page_content='tairvector是向量数据库', metadata={'source': 'tair'}), Document(page_content='Tair是阿里云国产自研的云原生内存数据库。', metadata={'source': '/root/caoduanxin/docs/tairQA_cn.txt'}), Document(page_content='提供TairString，TairHash，TairGIS，TairCpc，TairBloom等多种扩展数据结构，极大降低用户的开发成本，更有利于业务创新。', metadata={'source': '/root/caoduanxin/docs/tairQA_cn.txt'}), Document(page_content='为什么选择云原生内存数据库Tair', metadata={'source': '/root/caoduanxin/docs/tairQA_cn.txt'}), Document(page_content='同时，Tair与新型存储介质——持久内存的高效结合，相比内存，成本降低30%以上，并能做到数据持久化和提供近似于内存的性能。', metadata={'source': '/root/caoduanxin/docs/tairQA_cn.txt'}), Document(page_content='目前，Tair已广泛应用于政务、金融、制造、医疗和泛互联网等各行业客户，满足客户的高速查询和计算场景。', metadata={'source': '/root/caoduanxin/docs/tairQA_cn.txt'}), Document(page_content='同时推出Tair集群版无感扩缩容方案，解决当前业界扩缩容方案对业务有损的问题。', metadata={'source': '/root/caoduanxin/docs/tairQA_cn.txt'}), Document(page_content='在完全兼容Redis的基础上，提供了丰富的数据模型和企业级能力来帮助客户构建实时在线场景。', metadata={'source': '/root/caoduanxin/docs/tairQA_cn.txt'}), Docu