检索

Text文本方式读取

In [1]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("./examples/sql.md")
loader.load()

[Document(metadata={'source': './examples/sql.md'}, page_content="## 创建表\n\n```sql\n# 分区表\ncreate table test_t2(words string,frequency string) partitioned by (partdate string) row format delimited fields terminated by ',';\n\n# orc表\nCREATE TABLE IF NOT EXISTS bank.account_orc (\n  `id_card` int,\n  `tran_time` string,\n  `name` string,\n  `cash` int\n  )\nstored as orc;\n```\n\n# 插入数据\n\n```sql\ninsert into tablename values('col1', 'col2');\n\n\nINSERT INTO table_name (column1, column2, column3)\nVALUES\n(value1, value2, value3),\n(value4, value5, value6),\n(value7, value8, value9);\n\n\nINSERT OVERWRITE TABLE tb\nselect * from tb2\n;\n```")]

CSV文件的读取

In [2]:
from langchain_community.document_loaders.csv_loader import CSVLoader


loader = CSVLoader(file_path='./examples/test.csv')
loader.load()

[Document(metadata={'source': './examples/test.csv', 'row': 0}, page_content='id: 1\nname: 张三\ndegree: 本科'),
 Document(metadata={'source': './examples/test.csv', 'row': 1}, page_content='id: 2\nname: 李四\ndegree: 硕士')]

In [3]:
loader = CSVLoader(file_path='./examples/no_fields_name.csv', csv_args={
    'delimiter': ',',
    'quotechar': '"',
    'fieldnames': ['id', 'name', 'degree']
    }, 
    source_column='id'
)

loader.load()

[Document(metadata={'source': '1', 'row': 0}, page_content='id: 1\nname: 张三\ndegree: 本科'),
 Document(metadata={'source': '2', 'row': 1}, page_content='id: 2\nname: 李四\ndegree: 硕士')]

pdf读取器

In [6]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("examples/sql.pdf")
pages = loader.load()
pages

Previous trailer cannot be read: ("invalid literal for int() with base 10: b'/Root'",)
Object 14 0 found
Object 3 0 found
Object 2 0 found
Object 5 0 found
Object 7 0 found
Object 21 0 found
Object 20 0 found
Object 22 0 found
Object 8 0 found
Object 25 0 found
Object 9 0 found
Object 27 0 found
Object 10 0 found
Object 30 0 found
Object 29 0 found
Object 31 0 found
Object 12 0 found
Object 35 0 found
Object 34 0 found
Object 4 0 found


[Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'examples/sql.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}, page_content="创建表  \n插⼊数据  \n# 分区表\ncreate table test_t2(words string,frequency string) partitioned by (partdate string) row \nformat delimited fields terminated by ',';\n# orc表\nCREATE TABLE IF NOT EXISTS bank.account_orc (\n \xa0`id_card` int,\n \xa0`tran_time` string,\n \xa0`name` string,\n \xa0`cash` int\n \xa0)\nstored as orc;\ninsert into tablename values('col1', 'col2');\nINSERT INTO table_name (column1, column2, column3)\nVALUES\n(value1, value2, value3),\n(value4, value5, value6),\n(value7, value8, value9);\nINSERT OVERWRITE TABLE tb\nselect * from tb2\n;")]

loader可以自己定义

In [7]:
from typing import AsyncIterator, Iterator

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document

class CustomDocumentLoader(BaseLoader):
    """An example document loader that reads a file line by line."""

    def __init__(self, file_path: str) -> None:
        """Initialize the loader with a file path.

        Args:
            file_path: The path to the file to load.
        """
        self.file_path = file_path
    # 实现 lazy_load 和 alazy_load 方法，分别为load和异步load方式
    def lazy_load(self) -> Iterator[Document]:  # <-- Does not take any arguments
        """A lazy loader that reads a file line by line.

        When you're implementing lazy load methods, you should use a generator
        to yield documents one by one.
        """
        with open(self.file_path, encoding="utf-8") as f:
            line_number = 0
            for line in f:
                if not line.strip():
                    continue
                
                yield Document(
                    page_content=line,
                    metadata={"line_number": line_number, "source": self.file_path},
                )
                line_number += 1

    # alazy_load is OPTIONAL.
    # If you leave out the implementation, a default implementation which delegates to lazy_load will be used!
    async def alazy_load(
        self,
    ) -> AsyncIterator[Document]:  # <-- Does not take any arguments
        """An async lazy loader that reads a file line by line."""
        # Requires aiofiles
        # Install with `pip install aiofiles`
        # https://github.com/Tinche/aiofiles
        import aiofiles

        async with aiofiles.open(self.file_path, encoding="utf-8") as f:
            line_number = 0
            async for line in f:
                yield Document(
                    page_content=line,
                    metadata={"line_number": line_number, "source": self.file_path},
                )
                line_number += 1

文本分割

In [8]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=35,
    chunk_overlap=4,
)

text = "This is the text I would like to chunk up. It is the example text for this exercise"
text_splitter.create_documents([text])

[Document(metadata={}, page_content='This is the text I would like to'),
 Document(metadata={}, page_content='to chunk up. It is the example text'),
 Document(metadata={}, page_content='text for this exercise')]

In [9]:
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=35,
    chunk_overlap=4,
)

text = "This is\n the text I would\n like to chunk up.It is the example text for this exercise"
text_splitter.create_documents([text])

[Document(metadata={}, page_content='This is\n the text I would'),
 Document(metadata={}, page_content='like to chunk up.It is the example text for this exercise')]

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

#首选分隔符是\n\n，然后是\n，接着是空格，最后是空字符串
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=35,
    chunk_overlap=4,
)

text = "This is\n\n the\n\n text\n I would like\n to chunk up. It is the example text\n for\n\n this exercise"
text_splitter.create_documents([text])

[Document(metadata={}, page_content='This is\n\n the'),
 Document(metadata={}, page_content='text\n I would like'),
 Document(metadata={}, page_content='to chunk up. It is the example'),
 Document(metadata={}, page_content='text'),
 Document(metadata={}, page_content='for'),
 Document(metadata={}, page_content='this exercise')]

In [10]:

text = "text\n for\n\n this exercise"
text_splitter.create_documents([text])

[Document(metadata={}, page_content='text\n for\n this exercise')]

以上能看出，先对\n\n分割，然后组合chunk，然后对剩余长于chunk_size的进行\n分割再组合，然后是‘ ’，最后再分割‘’

对代码进行分割

In [11]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

PYTHON_CODE = """
def hello_world():
    print("Hello, World!")

# Call the function
hello_world()
"""
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=50, chunk_overlap=0
)
python_docs = python_splitter.create_documents([PYTHON_CODE])
python_docs

[Document(metadata={}, page_content='def hello_world():\n    print("Hello, World!")'),
 Document(metadata={}, page_content='# Call the function\nhello_world()')]

In [12]:
RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON)

['\nclass ', '\ndef ', '\n\tdef ', '\n\n', '\n', ' ', '']

In [15]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

markdown_document = "# Intro \n\n    ## History \n\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \n\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. \n\n ## Rise and divergence \n\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \n\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \n\n #### Standardization \n\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. \n\n ## Implementations \n\n Implementations of Markdown are available for over a dozen programming languages."

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_document)
md_header_splits

[Document(metadata={'Header 1': 'Intro', 'Header 2': 'History'}, page_content='Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]  \nMarkdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.'),
 Document(metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}, page_content='As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for  \nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.  \n#### Standardization  \nFrom 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.'),
 Document(metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'}, page_content=

In [16]:
# MD splits
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, strip_headers=False
)
md_header_splits = markdown_splitter.split_text(markdown_document)

# Char-level splits
from langchain_text_splitters import RecursiveCharacterTextSplitter

chunk_size = 250
chunk_overlap = 30
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

# Split
splits = text_splitter.split_documents(md_header_splits)
splits

[Document(metadata={'Header 1': 'Intro', 'Header 2': 'History'}, page_content='# Intro  \n## History  \nMarkdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]'),
 Document(metadata={'Header 1': 'Intro', 'Header 2': 'History'}, page_content='Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.'),
 Document(metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}, page_content='## Rise and divergence  \nAs Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for  \nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.'),
 Document(metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}, page_content='#### Standardization  \nFrom 

可以看出上面再次分割了内容

根据语义分割

In [18]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import OpenAIEmbeddings

# This is a long document we can split up.
with open("../../state_of_the_union.txt") as f:
    state_of_the_union = f.read()
    
text_splitter = SemanticChunker(OpenAIEmbeddings())

docs = text_splitter.create_documents([state_of_the_union])

#根据语义进行分割，本质上要用embeddings模型来计算相似度，高于某个阈值的邻域chunk可以合并到一起

FileNotFoundError: [Errno 2] No such file or directory: '../../state_of_the_union.txt'

嵌入

In [20]:
from langchain_openai import OpenAIEmbeddings
import os
# Ensure you have the environment variable set for the API key
os.environ["OpenAI_API_BASE"] = "https://api.zhizengzeng.com/v1"
os.environ["OpenAI_API_KEY"] = "sk-zk22167acdc980aa5bd5dd04774d59f30f0684991a3a2fb2"

embeddings_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
    chunk_size=1000,  # Optional, default is 1000
)

In [21]:
texts = [
        "你好吗",
        "你的名字是什么",
        "我的肚子好痛啊",
        "肠胃不舒服",
        "我在吃东西"
    ]
embeddings = embeddings_model.embed_documents(texts)

len(embeddings), len(embeddings[0])

(5, 1536)

In [24]:
import numpy as np

def normalize(x):
    x = np.asarray(x)
    norms = np.sum(np.multiply(x, x))
    norms = np.sqrt(norms)
    return x / norms

for i in range(5):
    similarity = np.dot(normalize(embeddings[2]), normalize(embeddings[i]))
    print(f'"{texts[2]}"与"{texts[i]}"的语义相似度为：{similarity}')

"我的肚子好痛啊"与"你好吗"的语义相似度为：0.3594570606893717
"我的肚子好痛啊"与"你的名字是什么"的语义相似度为：0.23817256999431716
"我的肚子好痛啊"与"我的肚子好痛啊"的语义相似度为：1.0
"我的肚子好痛啊"与"肠胃不舒服"的语义相似度为：0.55998006426397
"我的肚子好痛啊"与"我在吃东西"的语义相似度为：0.4246934047589308


Embedding也可以Cache

In [27]:
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore

store = LocalFileStore("./cache/")

cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    embeddings_model, store, namespace=embeddings_model.tiktoken_model_name
)

In [29]:
%%time
cached_embedder.embed_documents(texts)
#两次调用会让第一次调用的结果被缓存起来，第二次调用会直接从缓存中读取，从而加速第二次

TypeError: unsupported operand type(s) for +: 'NoneType' and 'str'

In [31]:
from langchain.storage import InMemoryByteStore

store = InMemoryByteStore()

cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    embeddings_model, store, namespace=embeddings_model.tiktoken_model_name
)

cached_embedder.embed_documents

<bound method CacheBackedEmbeddings.embed_documents of <langchain.embeddings.cache.CacheBackedEmbeddings object at 0x000001D9A57CF650>>

In [33]:
from langchain.storage import RedisStore

store = RedisStore()

cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    embeddings_model, store, namespace=embeddings_model.tiktoken_model_name
)

cached_embedder.embed_documents

ValueError: Either a Redis client or a redis_url must be provided.