In [1]:
#import
#.txt
from langchain.document_loaders import TextLoader
import os
from langchain.document_loaders import CSVLoader
# .pdf
from langchain_community.document_loaders import PyPDFLoader
# .json 
from langchain_community.document_loaders import JSONLoader
from langchain.document_loaders import UnstructuredExcelLoader


In [18]:
class multi_FileLoader:
    """Load a document file with a loader chosen from its file extension.

    Extension matching is case-insensitive.  Supported extensions and the
    loader class used for each:

    * ``.md`` / ``.txt`` -> ``TextLoader``
    * ``.json``          -> ``JSONLoader``
    * ``.csv``           -> ``CSVLoader``
    * ``.xlsx``          -> ``UnstructuredExcelLoader``
    * ``.pdf``           -> ``PyPDFLoader``

    Errors are reported on stdout instead of being raised; ``load`` returns
    ``None`` when the file could not be loaded.
    """

    def __init__(self, file_path: str):
        """Store the path; nothing is opened or read here.

        :param file_path: path of the file to load
        """
        self.file_path = file_path
        self.file = None
        self.loader = None

    def _get_loader(self):
        """Select the loader for ``self.file_path`` and store it in ``self.loader``.

        On any failure (including an unsupported extension) the error is
        printed and ``self.loader`` is left as ``None``.
        """
        # Reset first so a failed selection can never leave a stale loader.
        self.loader = None
        try:
            file_extension = os.path.splitext(self.file_path)[1].lower()
            # Extension -> factory method; the factory is only called for the
            # matching extension, so unrelated loaders are never constructed.
            factories = {
                '.md': self._text_loder,
                '.txt': self._text_loder,
                '.json': self._json_loder,
                '.csv': self._csv_loder,
                '.xlsx': self._xlsx_loder,
                '.pdf': self._pdf_loder,
            }
            factory = factories.get(file_extension)
            if factory is None:
                raise ValueError(f"不支持的文件类型: {file_extension}")
            self.loader = factory()
        except Exception as e:
            print(f"加载文件失败: {e}")

    def _text_loder(self):
        """Return a ``TextLoader`` for the stored path."""
        return TextLoader(self.file_path)

    def _json_loder(self):
        """Return a ``JSONLoader`` that selects the whole document (jq ``.``)."""
        return JSONLoader(
            file_path=self.file_path,
            jq_schema=".",
            text_content=False,
        )

    def _csv_loder(self):
        """Return a ``CSVLoader`` for the stored path."""
        return CSVLoader(self.file_path)

    def _xlsx_loder(self):
        """Return an ``UnstructuredExcelLoader`` in ``"elements"`` mode."""
        return UnstructuredExcelLoader(
            self.file_path,
            mode="elements")

    def _pdf_loder(self):
        """Return a ``PyPDFLoader`` for the stored path."""
        return PyPDFLoader(self.file_path)

    def load(self):
        """Load the file and return whatever the selected loader's ``load()`` returns.

        :return: the loader's result, or ``None`` if no loader could be
            selected or loading failed (the error is printed in both cases)
        """
        self._get_loader()
        if self.loader is None:
            # _get_loader already printed the real cause; calling
            # self.loader.load() here would only add a misleading
            # "'NoneType' object has no attribute 'load'" message.
            return None
        try:
            return self.loader.load()
        except Exception as e:
            print(f"加载文件失败: {e}")
            return None


# Example 1: load a JSON file
file_loader_json = multi_FileLoader('../data/test_json.json')  # path to the JSON file
data_json = file_loader_json.load()
print("JSON数据:", data_json)

# Example 2: load a plain-text file
file_loader_txt = multi_FileLoader('../data/test_txt.txt')  # path to the text file
data_txt = file_loader_txt.load()
print("文本数据:", data_txt)

# Example 3: load a CSV file
file_loader_csv = multi_FileLoader('../data/test_csv.csv')  # path to the CSV file
data_csv = file_loader_csv.load()
# load() returns None when loading fails (it only prints the error), and the
# file may hold fewer than three rows, so only index when row 2 exists;
# otherwise show whatever was returned instead of crashing the demo.
print("CSV数据:", data_csv[2] if data_csv is not None and len(data_csv) > 2 else data_csv)


# Example 4: load an Excel file
file_loader_xls = multi_FileLoader('../data/test_xlsx.xlsx')  # path to the Excel file
data_xls = file_loader_xls.load()
print("Excel数据:", data_xls)

# Example 5: load a PDF file
file_loader_pdf = multi_FileLoader('../data/test_pdf.pdf')  # path to the PDF file
data_pdf = file_loader_pdf.load()
print("PDF数据:", data_pdf)

JSON数据: [Document(metadata={'source': '/Users/dorisen/PycharmProjects/rag_test/AI-note/Dorisen/data/test_json.json', 'seq_num': 1}, page_content='{"_type": "prompt", "input_variables": ["name", "what"], "template": "\\u7ed9\\u6211\\u8bb2\\u4e00\\u4e2a\\u5173\\u4e8e{name}\\u7684{what}\\u7684\\u6545\\u4e8b"}')]
文本数据: [Document(metadata={'source': '../data/test_txt.txt'}, page_content='这是一个txt测试文件')]
CSV数据: page_content='name: The Lord of the Rings: The Return of the King
year: 2003
movie_rated: PG-13
run_length: 3h 21min
genres: Adventure; Drama; Fantasy;
release_date: 17 December 2003 (USA)
rating: 8.9
num_raters: 1593859
num_reviews: 3681' metadata={'source': '../data/test_csv.csv', 'row': 2}
Excel数据: [Document(metadata={'source': '../data/test_xlsx.xlsx', 'file_directory': '../data', 'filename': 'test_xlsx.xlsx', 'last_modified': '2025-02-26T17:12:03', 'page_name': 'Sheet1', 'page_number': 1, 'text_as_html': '<table><tr><td>Name</td><td>Age</td><td>City</td></tr><tr><td>Alice</td><td>