# 添加scores到retriever结果


In [1]:
!pip install pypdfium2 backoff openai langchain_openai langchain langchain_community faiss-cpu rank_bm25

Collecting pypdfium2
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/48.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting backoff
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.3.14-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.22-py3-none-any.whl.metadata (2.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting tiktoken<1,>=0.7 (from langchain_openai)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.

配置OpenAI 的 GPT-4o 模型

In [2]:
# 设置OpenAI KEY环境变量
import os
import getpass
os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

OpenAI API Key:··········


In [3]:
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

docs = [
    Document(
        page_content="A bunch of scientists bring back dinosaurs and mayhem breaks loose",
        metadata={"year": 1993, "rating": 7.7, "genre": "science fiction"},
    ),
    Document(
        page_content="Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
        metadata={"year": 2010, "director": "Christopher Nolan", "rating": 8.2},
    ),
    Document(
        page_content="A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea",
        metadata={"year": 2006, "director": "Satoshi Kon", "rating": 8.6},
    ),
    Document(
        page_content="A bunch of normal-sized women are supremely wholesome and some men pine after them",
        metadata={"year": 2019, "director": "Greta Gerwig", "rating": 8.3},
    ),
    Document(
        page_content="Toys come alive and have a blast doing so",
        metadata={"year": 1995, "genre": "animated"},
    ),
    Document(
        page_content="Three men walk into the Zone, three men walk out of the Zone",
        metadata={
            "year": 1979,
            "director": "Andrei Tarkovsky",
            "genre": "thriller",
            "rating": 9.9,
        },
    ),
]

embedding = OpenAIEmbeddings()
faiss_vectorstore = FAISS.from_documents(
    docs, embedding=OpenAIEmbeddings()
)

In [7]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import chain

# 下面在元数据中增加相似度
@chain
def retriever(query: str) -> List[Document]:
    docs, scores = zip(*faiss_vectorstore.similarity_search_with_score(query))
    for doc, score in zip(docs, scores):
        doc.metadata["score"] = score

    return docs

In [8]:
result = retriever.invoke("dinosaur")
result

(Document(id='ac56bc3c-7f3d-471b-ae9f-5728e6cfba02', metadata={'year': 1993, 'rating': 7.7, 'genre': 'science fiction', 'score': np.float32(0.3112905)}, page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose'),
 Document(id='77d242db-0fc0-4df5-b5cb-7499115d7ea3', metadata={'year': 1995, 'genre': 'animated', 'score': np.float32(0.41597342)}, page_content='Toys come alive and have a blast doing so'),
 Document(id='61fbd099-5d02-4a00-ad90-2a98effbda9f', metadata={'year': 1979, 'director': 'Andrei Tarkovsky', 'genre': 'thriller', 'rating': 9.9, 'score': np.float32(0.4970406)}, page_content='Three men walk into the Zone, three men walk out of the Zone'),
 Document(id='1b5828b4-d972-44c1-8f7b-21f414279b33', metadata={'year': 2006, 'director': 'Satoshi Kon', 'rating': 8.6, 'score': np.float32(0.50489014)}, page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea'))

In [10]:
# 查询向量存储
query = "dinosaur"
retriever = faiss_vectorstore.as_retriever(
    search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.0}
)
docs = retriever.get_relevant_documents(query)
# 打印结果
for doc in docs:
    print(doc)

page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose' metadata={'year': 1993, 'rating': 7.7, 'genre': 'science fiction', 'score': np.float32(0.3112905)}
page_content='Toys come alive and have a blast doing so' metadata={'year': 1995, 'genre': 'animated', 'score': np.float32(0.41597342)}
page_content='Three men walk into the Zone, three men walk out of the Zone' metadata={'year': 1979, 'director': 'Andrei Tarkovsky', 'genre': 'thriller', 'rating': 9.9, 'score': np.float32(0.4970406)}
page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea' metadata={'year': 2006, 'director': 'Satoshi Kon', 'rating': 8.6, 'score': np.float32(0.50489014)}


In [11]:
from langchain.retrievers import EnsembleRetriever
from langchain.vectorstores import VectorStore
from langchain_community.retrievers import BM25Retriever
from langchain_core.retrievers import BaseRetriever
import jieba
bm25_retriever = BM25Retriever.from_documents(
            docs,
            preprocess_func=jieba.lcut_for_search,
        )
bm25_retriever.k = 2
ensemble_retriever = EnsembleRetriever(
            retrievers=[bm25_retriever, retriever], weights=[0.8, 0.2]
        )

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.645 seconds.
DEBUG:jieba:Loading model cost 0.645 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


In [12]:
docs1 = ensemble_retriever.get_relevant_documents(query)
# 打印结果
for doc in docs1:
    print(doc)

page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea' metadata={'year': 2006, 'director': 'Satoshi Kon', 'rating': 8.6, 'score': np.float32(0.50489014)}
page_content='Three men walk into the Zone, three men walk out of the Zone' metadata={'year': 1979, 'director': 'Andrei Tarkovsky', 'genre': 'thriller', 'rating': 9.9, 'score': np.float32(0.4970406)}
page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose' metadata={'year': 1993, 'rating': 7.7, 'genre': 'science fiction', 'score': np.float32(0.3112905)}
page_content='Toys come alive and have a blast doing so' metadata={'year': 1995, 'genre': 'animated', 'score': np.float32(0.41597342)}


此时混合检索器输出带有**相似度**值了