# 1. 学习Chroma

In [None]:
import os

# Resolve the directory the notebook is running from.
current_directory = os.getcwd()

# Chroma data lives in a "ChromaVDB" folder one level above the working directory.
persist_directory = os.path.join(current_directory, os.pardir, "ChromaVDB")
print(persist_directory)

In [None]:
import os
from dotenv import load_dotenv # type: ignore

# Route outbound HTTP(S) traffic through the local proxy on port 7890.
# The values carry an explicit "http://" scheme so that every HTTP client parses
# them the same way instead of guessing the scheme of a bare "host:port".
os.environ["http_proxy"] = "http://127.0.0.1:7890"
os.environ["https_proxy"] = "http://127.0.0.1:7890"

# Load variables from the .env file into the process environment.
load_dotenv()

In [3]:
# Read the API key and base URL from the environment; each is None when unset.
api_key = os.environ.get("ZETATECHS_API_KEY")
base_url = os.environ.get("ZETATECHS_API_BASE")

In [4]:
from langchain_openai import OpenAIEmbeddings # type: ignore

# Embedding client used by every vector store created in this notebook.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=api_key,
    base_url=base_url,
)


### 1. 初始化

In [5]:
from langchain_chroma import Chroma # type: ignore

# Resolve the directory the notebook is running from.
current_directory = os.getcwd()
# Chroma data lives in a "ChromaVDB" folder one level above the working directory.
persist_directory = os.path.join(current_directory, os.pardir, "ChromaVDB")

# Open (or create) the "example_collection" collection backed by that folder.
vector_store = Chroma(
    persist_directory=persist_directory,  # Where to save data locally, remove if not necessary
    embedding_function=embeddings,
    collection_name="example_collection",
)

### 2. 添加新的文档

In [9]:
from langchain.document_loaders import PyPDFLoader # type: ignore
from langchain.text_splitter import RecursiveCharacterTextSplitter # type: ignore

# PDF to ingest, given relative to the notebook's working directory.
file_path = "../files/UnderstandingDeepLearning-ZH-CN-240721.pdf"

# Load the PDF into LangChain documents.
loader = PyPDFLoader(file_path)
documents = loader.load()

# Split the documents into chunks (chunk_size=1000, chunk_overlap=100).
splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
documents_chunks = splitter.split_documents(documents)

In [None]:
# Display the chunk list as the cell output for a quick visual check.
documents_chunks

In [None]:
from uuid import uuid4

# One random UUID string per chunk, used as the chunk's ID in the vector store.
uuids = [str(uuid4()) for _ in documents_chunks]

vector_store.add_documents(ids=uuids, documents=documents_chunks)

### 3. 删除

In [None]:
# Remove the last chunk that was added. `delete` expects a list of IDs, so the
# single ID is wrapped in a list rather than passed as a bare string.
vector_store.delete(ids=[uuids[-1]])

### 4. 查询向量

In [12]:
# Query text reused by the similarity-search cells in this section.
query = "深度强化学习有哪些一般的方法？"

In [None]:
# Retrieve the top 2 hits for the query and print each hit's text and metadata.
results = vector_store.similarity_search(query, k=2)
for res in results:
    print(f"* {res.page_content} \n\n**********\n\n [{res.metadata}] \n\n")

In [None]:
# Retrieve the top 10 hits together with their scores.
results = vector_store.similarity_search_with_score(query, k=10)
for res, score in results:
    # Verbose alternative that also prints the content and metadata:
    # print(f"* [SIM={score:3f}] \n\n*********\n\n {res.page_content} \n\n##########\n\n [{res.metadata}] \n\n")
    # Only the score's type and value are printed here.
    print(type(score), score)

### 5. 假如我们换一个collection_name

In [23]:
# Resolve the directory the notebook is running from.
current_directory = os.getcwd()
# Chroma data lives in a "ChromaVDB" folder one level above the working directory.
persist_directory = os.path.join(current_directory, os.pardir, "ChromaVDB")

# Same persist directory as before, but a different collection name ("test").
vector_store = Chroma(
    persist_directory=persist_directory,  # Where to save data locally, remove if not necessary
    embedding_function=embeddings,
    collection_name="test",
)

In [25]:
# Query text used to search the "test" collection.
query = "深度强化学习有哪些一般的方法？"

In [26]:
# Retrieve the top 10 hits with scores from the currently selected collection.
results = vector_store.similarity_search_with_score(query, k=10)
for res, score in results:
    # Verbose alternative that also prints the content and metadata:
    # print(f"* [SIM={score:3f}] \n\n*********\n\n {res.page_content} \n\n##########\n\n [{res.metadata}] \n\n")
    # Only the score's type and value are printed here.
    print(type(score), score)

# 2. 测试自己编写的 ChromaManager

In [2]:
import sys
# Add the parent directory to the module search path so that the project's
# `src` package can be imported from this notebook.
sys.path.append('..')
from src.vdb_managers.chroma_manager import ChromaManager

In [3]:
# Create the ChromaManager instance with its default settings.
# NOTE(review): the original note says collection_name and persist_directory can
# be customized at construction time — confirm against ChromaManager's signature.
chroma_store = ChromaManager()

In [4]:
# Fetch the vector store held by the manager and show its type as the cell output.
chroma_vector_store = chroma_store.get_vector_store()
type(chroma_vector_store)

langchain_chroma.vectorstores.Chroma

In [5]:
# Upload a PDF document through the manager.
# NOTE(review): this path has no "../" prefix, unlike the other PDF paths in this
# notebook — confirm it resolves from the intended directory.
chroma_store.upload_pdf_file("files/UnderstandingDeepLearning-ZH-CN-240721.pdf")

InternalServerError: Error code: 503 - {'error': {'message': '当前分组 default 下对于模型 text-embedding-3-large 无可用渠道 (request id: 20241224225139490570597ABge1nCw)', 'type': 'new_api_error'}}

In [5]:
# Upload a PDF document through the manager.
chroma_store.upload_pdf_file("../files/论文 - GraphRAG.pdf")

In [6]:
# Query text for the similarity searches run through the manager below.
query = "深度强化学习有哪些一般的方法？"

In [None]:
# Run a similarity search through the manager and print every hit.
# Each result is indexed by the 'content' and 'metadata' keys.
results = chroma_store.similarity_search(query)
for result in results:
    print(f"Content: {result['content']}\n\nMetadata: {result['metadata']}\n")

In [None]:
# Run a similarity search that also returns a score for every hit.
# Each result is indexed by the 'content', 'metadata' and 'score' keys.
results_with_score = chroma_store.similarity_search_with_score(query)
for result in results_with_score:
    print(f"Content: {result['content']}\nMetadata: {result['metadata']}\nScore: {result['score']}\n")

### Chroma检索出来的结果是按score升序排列的（这里的score是距离，越小表示越相似），这与Pinecone不同，需要注意一下