In [7]:
import os
import json
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import PyMuPDFLoader
from embedding.dashscope_embedding import dashscope_embedding
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader, UnstructuredFileLoader

In [8]:
def load_document_from_file(loaders:list, file_path:str):
    """
    从一个 pdf、txt 或 md 文档中加载内容。
    :param loaders: 已经加载的文件
    :param file_path: 目标文件地址
    """
    loader = None
    filename, extension = os.path.splitext(file_path)
    print("current file:", filename + extension)
    if extension == '.pdf':
        loader = PyMuPDFLoader(file_path).load()
    elif extension == '.md':
        loader = UnstructuredMarkdownLoader(file_path).load()
    elif extension == '.txt' or extension == 'yaml':
        loader = UnstructuredFileLoader(file_path).load()
    loaders.append(loader)

In [9]:
def load_document_from_dir_without_repetition(dir_path:str, saved_file_name_list:list, loaders:list = [] ):
    """
    从文件夹无重复的加载文件：检索已构建数据库的文件列表，匹配不重复的文件名，并加载到 loaders 返回
    :param dir_path: 文件夹路径
    :param saved_file_name_list: 已构建数据库的文件名列表
    :param loaders: 已存在的 loaders，文件内容会加载进其中
    :return: loaders
    """
    for root, dirs, files in os.walk(dir_path):
        for file in files:
            if file not in saved_file_name_list:
                # os.path.join()用于组合目录和文件名得到完整路径
                file_path = os.path.join(root, file)
                load_document_from_file(loaders, file_path)
                add_to_saved_files(path='../data_store/saved_files.json', saved_file_name_list=saved_file_name_list,filename= file)
    return loaders

In [10]:
def load_saved_files(path:str ='../data_store/saved_files.json'):
    # 加载已构建数据库的文件名列表
    with open(path, 'r') as f:
        # 使用json.load()函数读取并解析文件内容
        data = json.load(f)
        return data

In [11]:
def add_to_saved_files(saved_file_name_list:list, filename:str, path:str = '../data_store/saved_files.json'):
    # 将 filename 添加进已构建数据库的文件名列表，并保存至 json
    saved_file_name_list.append(filename)
    with open(path, 'w') as f:
        json.dump(saved_file_name_list,f)

In [12]:
saved_file_name_list = load_saved_files()

# 加载并切分文档
loaders = load_document_from_dir_without_repetition(dir_path='../data_store/documents',saved_file_name_list=saved_file_name_list)
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
print(loaders[0:10])
split_docs = text_splitter.split_documents(loaders)

current file: ../data_store/documents/Plant Simulation中级培训.pdf
current file: ../data_store/documents/PlantSimulationENU.pdf
current file: ../data_store/documents/Plant Simulation基础培训.pdf
current file: ../data_store/documents/Plant Simulation三维仿真培训.pdf
current file: ../data_store/documents/模型/.DS_Store
current file: ../data_store/documents/模型/模型/$.yaml
current file: ../data_store/documents/模型/模型/Test/生产信息表_测试行.yaml
current file: ../data_store/documents/模型/模型/Test/测试桁架/物料终结1.yaml
current file: ../data_store/documents/模型/模型/Test/测试桁架/GantryLoader.yaml
current file: ../data_store/documents/模型/模型/Test/测试桁架/拆卸前缓冲区.yaml
current file: ../data_store/documents/模型/模型/Test/测试桁架/数控床1.yaml
current file: ../data_store/documents/模型/模型/Test/测试桁架/事件控制器.yaml
current file: ../data_store/documents/模型/模型/Test/测试桁架/桁架拆卸区.yaml
current file: ../data_store/documents/模型/模型/Test/测试桁架/库存超市A.yaml
current file: ../data_store/documents/模型/模型/Test/测试桁架/物料终结.yaml
current file: ../data_store/documents/模型/模型/Test/测试桁架/显示

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



AttributeError: 'list' object has no attribute 'page_content'

In [None]:
len(saved_file_name_list)

In [None]:
# 构建文本存储
with open('../data_store/bm25_store.txt', 'a') as f:
    for line in split_docs:
        f.write(str(line))
        f.write("\n")

In [None]:
# 构建向量存储
# 定义 Embeddings
embedding = dashscope_embedding()
# 加载数据库
vectordb = Chroma.from_documents(split_docs, embedding, persist_directory='../data_store/chroma_db')
# 向量持久化
vectordb.persist()