# 分块

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import nltk
nltk.download('punkt')

text = "这是一段较长的文本，我们将对其进行分块和向量化操作。分块向量化可以帮助我们更好地存储和检索信息。"
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=20,
    chunk_overlap=0,
    separators=["。"]
)
chunks = text_splitter.split_text(text)
print(chunks)

['这是一段较长的文本，我们将对其进行分块和向量化操作', '。分块向量化可以帮助我们更好地存储和检索信息', '。']


[nltk_data] Downloading package punkt to /Users/hezhidong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
len(chunks)

3

# 使用 OpenAI 的 Embedding API 进行向量化

In [3]:
import os
from dotenv import load_dotenv

# 加载 .env 文件中的OpenAI API环境变量
load_dotenv()

True

In [4]:
from langchain.embeddings import OpenAIEmbeddings
from openai import OpenAI

client = OpenAI()

# 将每个分块转换为向量
chunk_vectors_openai = []
for chunk in chunks:
    response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=chunk
    )
    vector = response.data[0].embedding
    chunk_vectors_openai.append(vector)
print(chunk_vectors_openai)

[[-0.019183307886123657, -0.0021908513735979795, -0.005423692986369133, -0.005283425096422434, -0.0026918083894997835, 0.016057338565587997, -0.010927539318799973, 0.01766039989888668, -0.029068857431411743, -0.02112034149467945, -0.007200420368462801, 0.04063762351870537, -0.019664227962493896, -0.011889376677572727, 0.016498180106282234, 0.007754812482744455, 0.014120304957032204, -0.012964763678610325, 0.014320687390863895, -0.01982453279197216, -0.007674659602344036, 0.012236706912517548, -0.012370294891297817, -0.0008441123645752668, -0.023845547810196877, 0.004542008973658085, 0.019063079729676247, -0.02671770006418228, -0.006001463625580072, -0.004869300872087479, -0.005463769659399986, -0.006539157126098871, -0.022629892453551292, -0.008850238285958767, -0.009511501528322697, 0.0015287534333765507, 0.005697549786418676, 0.002093999646604061, 0.001831832341849804, -0.03489331528544426, 0.03748493269085884, 0.006629329174757004, 0.010192803107202053, 0.010840706527233124, 0.00477

# 向量存储
## 使用openai接口向量化

In [5]:
import faiss
import numpy as np

# 假设使用 OpenAI 的 Embedding向量化结果
chunk_vectors = np.array(chunk_vectors_openai).astype('float32')

# 构建 Faiss 索引
dimension = chunk_vectors.shape[1]  # 向量的维度
index = faiss.IndexFlatL2(dimension)


# 将向量添加到 Faiss 索引中
index.add(chunk_vectors)


# 保存 Faiss 索引到文件
faiss.write_index(index, 'chunk_vectors.index')


# 从文件中读取 Faiss 索引
loaded_index = faiss.read_index('chunk_vectors.index')


# 假设我们有一个查询向量，这里简单地将第一个分块的向量作为查询向量
query_vector = chunk_vectors[0]


# 查找最相似的两个分块
k = 2
distances, indices = loaded_index.search(np.array([query_vector]).astype('float32'), k)


print("最相似的分块的索引:", indices)
print("最相似的分块的距离:", distances)

最相似的分块的索引: [[0 1]]
最相似的分块的距离: [[0.        0.2624845]]


## 使用SentenceTransformer向量化

In [6]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer


# 假设我们已经有了分块后的文本块，存储在一个列表中
chunks = ["这是第一个分块", "这是第二个分块", "这是第三个分块", "这是第四个分块"]


# 使用 Sentence Transformers 进行向量化
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
chunk_vectors = [model.encode(chunk) for chunk in chunks]


# 将 chunk 向量转换为 numpy 数组
chunk_vectors = np.array(chunk_vectors).astype('float32')


# 构建 Faiss 索引
dimension = chunk_vectors.shape[1]  # 向量的维度
index = faiss.IndexFlatL2(dimension)


# 将向量添加到 Faiss 索引中
index.add(chunk_vectors)


# 保存 Faiss 索引到文件
faiss.write_index(index, 'chunk_vectors.index')


# 从文件中读取 Faiss 索引
loaded_index = faiss.read_index('chunk_vectors.index')


# 假设我们有一个查询向量，这里简单地将第一个分块的向量作为查询向量
query_vector = chunk_vectors[0]


# 查找最相似的两个分块
k = 2
distances, indices = loaded_index.search(np.array([query_vector]).astype('float32'), k)


print("最相似的分块的索引:", indices)
print("最相似的分块的距离:", distances)

  from .autonotebook import tqdm as notebook_tqdm


最相似的分块的索引: [[0 1]]
最相似的分块的距离: [[0.        1.4836268]]
