# FAISS

In [1]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np


# 假设我们有分块后的文本块
chunks = ["这是一个分块", "这是另一个分块", "还有一个分块"]


# 使用 Sentence Transformers 进行向量化
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
chunk_vectors = [model.encode(chunk) for chunk in chunks]
chunk_vectors = np.array(chunk_vectors).astype('float32')


# 构建 Faiss 索引
dimension = chunk_vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_vectors)


# 假设我们有一个用户问题
query = "一个问题"
query_vector = model.encode(query).astype('float32')


# 搜索最相似的分块
k = 5
distances, indices = index.search(np.array([query_vector]), k)
print(indices)

  from .autonotebook import tqdm as notebook_tqdm


[[ 0  1  2 -1 -1]]


# Chroma

In [2]:
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter


# 假设我们有一段长文本
text = "这是一段很长的文本，我们将使用 Chroma 和 Langchain 对其进行处理。首先将其分块，然后将分块向量化存储在 Chroma 中。"


# 分块
text_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=50,
    chunk_overlap=0
)
chunks = text_splitter.split_text(text)


# 向量化
embeddings = SentenceTransformerEmbeddings(model_name="paraphrase-MiniLM-L6-v2")


# 创建 Chroma 存储
vectorstore = Chroma.from_texts(
    texts=chunks,
    embedding=embeddings,
    persist_directory=".chromadb"
)


# 持久化存储
vectorstore.persist()


# 检索
results = vectorstore.similarity_search("查询问题", k=2)
print(results)

  embeddings = SentenceTransformerEmbeddings(model_name="paraphrase-MiniLM-L6-v2")


[Document(metadata={}, page_content='对其进行处理。首先将其分块，然后将分块向量化存储在 Chroma 中。'), Document(metadata={}, page_content='对其进行处理。首先将其分块，然后将分块向量化存储在 Chroma 中。')]


  vectorstore.persist()


# Milvus
注意：关于milvus代码运行需要在python脚本中独立运行，读者可以把如下单元格的代码粘贴到python脚本中运行

In [3]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
from sentence_transformers import SentenceTransformer


# 连接 Milvus 服务
connections.connect(host='localhost', port='19530')


# 定义集合的字段
fields = [
    FieldSchema(name='id', dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name='vector', dtype=DataType.FLOAT_VECTOR, dim=384)
]
schema = CollectionSchema(fields=fields)


# 创建集合
collection = Collection(name='text_vectors', schema=schema)


# 假设我们有分块后的文本块
chunks = ["这是一个分块", "这是另一个分块", "还有一个分块"]


# 使用 Sentence Transformers 进行向量化
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
chunk_vectors = [model.encode(chunk) for chunk in chunks]


# 插入向量到集合
data = [
    [vector.tolist()] for vector in chunk_vectors
]
collection.insert(data)


# 构建索引
index_params = {
    'metric_type': 'L2',
    'index_type': 'IVF_FLAT',
    'params': {"nlist": 128}
}
collection.create_index(field_name="vector", index_params=index_params)


# 假设我们有一个用户问题
query = "一个问题"
query_vector = model.encode(query)


# 搜索最相似的分块
search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
results = collection.search(
    data=[query_vector],
    anns_field="vector",
    param=search_params,
    limit=5
)
print(results)