In [28]:
import os

from dotenv import load_dotenv

load_dotenv()

# 读取LangSmith Key
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = "lang-env"
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
GPT_GOD_API_KEY = os.getenv("GPT_GOD_API_KEY")

In [29]:
# from langchain_openai import ChatOpenAI
# 
# model = ChatOpenAI(model="glm-4-flash",
#                    base_url="https://api.gptgod.online/v1/",
#                    api_key=GPT_GOD_API_KEY
#                    )

In [30]:
from langchain_core.documents import Document

documents = [
    Document(
        page_content="Dogs are great companions, known for their loyalty and friendliness.",
        metadata={"source": "mammal-pets-doc"},
    ),
    Document(
        page_content="Cats are independent pets that often enjoy their own space.",
        metadata={"source": "mammal-pets-doc"},
    ),
    Document(
        page_content="Goldfish are popular pets for beginners, requiring relatively simple care.",
        metadata={"source": "fish-pets-doc"},
    ),
    Document(
        page_content="Parrots are intelligent birds capable of mimicking human speech.",
        metadata={"source": "bird-pets-doc"},
    ),
    Document(
        page_content="Rabbits are social animals that need plenty of space to hop around.",
        metadata={"source": "mammal-pets-doc"},
    ),
]

In [31]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

vectorstore = Chroma.from_documents(
    documents,
    embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
)


In [32]:
vectorstore.similarity_search("cat")

[Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Dogs are great companions, known for their loyalty and friendliness.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Rabbits are social animals that need plenty of space to hop around.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Motorcycles are popular in many cities for their agility and fuel efficiency.', metadata={'source': 'motorcycles-doc'})]

In [33]:
vectorstore.similarity_search_with_score("群居动物，会跳跃 需要足够的空间", k=2)

[(Document(page_content='Rabbits are social animals that need plenty of space to hop around.', metadata={'source': 'mammal-pets-doc'}),
  0.8025619983673096),
 (Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'}),
  1.2437764406204224)]

In [34]:
embedding = OpenAIEmbeddings(model="text-embedding-3-small").embed_query("兔子")

vectorstore.similarity_search_by_vector(embedding, k=2)

[Document(page_content='Rabbits are social animals that need plenty of space to hop around.', metadata={'source': 'mammal-pets-doc'}),
 Document(page_content='Dogs are great companions, known for their loyalty and friendliness.', metadata={'source': 'mammal-pets-doc'})]

In [35]:
# 检索器

# from typing import List
# from langchain_core.documents import Document
# from langchain_core.runnables import RunnableLambda
# retriever = RunnableLambda(vectorstore.similarity_search).bind(k=1)  # 选择顶部结果
# retriever.batch(["cat", "shark"])

In [36]:
# 换个写法
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 1},
)
retriever.batch(["cat", "dog"])

[[Document(page_content='Cats are independent pets that often enjoy their own space.', metadata={'source': 'mammal-pets-doc'})],
 [Document(page_content='Dogs are great companions, known for their loyalty and friendliness.', metadata={'source': 'mammal-pets-doc'})]]

## 一个example充分的使用embedding 以及retriever

In [37]:
transportation_documents = [
    Document(
        page_content="The electric car is becoming increasingly popular due to its low emissions.",
        metadata={"source": "environmental-tech-doc"},
    ),
    Document(
        page_content="Trains offer a convenient and eco-friendly way to travel long distances.",
        metadata={"source": "public-transport-doc"},
    ),
    Document(
        page_content="Bicycles are a great way to get around the city, reducing traffic congestion and pollution.",
        metadata={"source": "urban-mobility-doc"},
    ),
    Document(
        page_content="Motorcycles are popular in many cities for their agility and fuel efficiency.",
        metadata={"source": "motorcycles-doc"},
    ),
    Document(
        page_content="Boats are essential for travel on waterways and are a common mode of transportation in coastal regions.",
        metadata={"source": "water-transport-doc"},
    ),
    Document(
        page_content="Airplanes allow people to travel across the globe in a relatively short amount of time.",
        metadata={"source": "aviation-doc"},
    ),
    Document(
        page_content="Horses have been used for transportation for thousands of years, especially in rural areas.",
        metadata={"source": "traditional-transport-doc"},
    ),
]

In [38]:
# 删除之前的第一个向量存储
vectorstore.delete_collection()
# vectorstore.get()
# 1. 存储向量 
vectorstore = Chroma.from_documents(
    transportation_documents,
    embedding=OpenAIEmbeddings(
        model="text-embedding-3-small",
    )
)

In [39]:
# 2. search
vectorstore.get()
# vectorstore2.similarity_search("自行车")

{'ids': ['072e805d-c39e-49ea-b775-3aeea655718e',
  '1df41362-ef61-4809-a5be-413bf24c2716',
  '7bbe7b37-7a9a-4323-8711-f875eed132c4',
  '96150631-94df-4365-b5a9-688d6b8a07b9',
  'cc7b4ce9-37a2-4e7c-8718-47d086061a61',
  'd1388029-a0a0-4e68-9025-80b4d4334f0a',
  'f600c61f-7ba9-4b11-98a5-0df85c791040'],
 'embeddings': None,
 'metadatas': [{'source': 'urban-mobility-doc'},
  {'source': 'aviation-doc'},
  {'source': 'traditional-transport-doc'},
  {'source': 'water-transport-doc'},
  {'source': 'motorcycles-doc'},
  {'source': 'environmental-tech-doc'},
  {'source': 'public-transport-doc'}],
 'documents': ['Bicycles are a great way to get around the city, reducing traffic congestion and pollution.',
  'Airplanes allow people to travel across the globe in a relatively short amount of time.',
  'Horses have been used for transportation for thousands of years, especially in rural areas.',
  'Boats are essential for travel on waterways and are a common mode of transportation in coastal regions.

In [71]:
# 上面的操作无法集成到Runnable，所以我们需要建立检索器使其能够与chain链式调用
# 过滤器
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 1},
)
# 单个
# retriever.invoke("dog")
# 批量
# retriever.batch(["速度最快", "水陆两栖"])

In [72]:
from langchain_openai import ChatOpenAI

model = ChatOpenAI(model="glm-4-flash",
                   base_url="https://api.gptgod.online/v1/",
                   api_key=GPT_GOD_API_KEY
                   )


In [73]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda,RunnableParallel

message = """
你先介绍一下自己叫什么，然后根据上下文回答问题
{question}

上下文：
{context}
"""
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful AI bot. "),
    ("human", message)
])

In [74]:
chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | model

In [76]:
chain.invoke("Tell us about the slowest mode of transportation you know.")

AIMessage(content="Hello, I'm an AI assistant named ChatGLM. The slowest mode of transportation mentioned in the provided context is horses. Horses have been used for transportation for thousands of years, especially in rural areas, and they are slower compared to modern modes of transport such as cars, trains, or airplanes.", response_metadata={'token_usage': {'completion_tokens': 62, 'prompt_tokens': 94, 'total_tokens': 156}, 'model_name': 'glm-4-flash', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-b6ff6174-66d3-40f2-9e06-debb4bbe0b9a-0', usage_metadata={'input_tokens': 94, 'output_tokens': 62, 'total_tokens': 156})