In [19]:
!pip install -U langchain-community
!pip install --upgrade --quiet lark langchain-chroma
!pip install tiktoken
!pip install langchain_openai
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [2]:
import getpass
import os

# Ask for the key only when it is not configured yet. Re-running the cell (or
# running where OPENAI_API_KEY is already exported) then neither blocks on an
# interactive prompt nor overwrites the key that is already in place.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

OpenAI API Key:··········


In [6]:
# Mount Google Drive. The google.colab package is only importable inside
# Google Colab, so anywhere else the mount is skipped instead of failing the cell.
try:
    from google.colab import drive
except ImportError:
    # Not on Colab: there is no Drive to mount. Keep the name defined so a
    # later reference to `drive` does not raise NameError.
    drive = None
    print("google.colab is not available; skipping the Google Drive mount.")
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Switch to the project folder on the mounted Google Drive.
# When the folder does not exist (for example outside Google Colab, or when
# Drive was not mounted) the step is skipped with a notice instead of raising
# FileNotFoundError, so the cell no longer has to be skipped by hand.
import os

path = "/content/drive/My Drive/Colab/langchain/RAG/retrieval"
if os.path.isdir(path):
    os.chdir(path)
else:
    print(f"{path} not found; keeping working directory {os.getcwd()}")

In [20]:
from datetime import datetime, timedelta

import faiss
from langchain.retrievers import TimeWeightedVectorStoreRetriever
from langchain_community.docstore import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

# 算法公式如下：
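`semantic_similarity + (1.0 - decay_rate) ^ hours_passed`

Here `hours_passed` is the number of hours since the document was last accessed (not since it was created), so documents that are retrieved often stay "fresh".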

# Low decay rate
A low decay rate (in this example, to be extreme, we set it close to 0) means memories will be "remembered" for longer. A decay rate of 0 means memories will never be forgotten, making this retriever equivalent to a plain vector lookup.

In [21]:
# Embedding model used to vectorize documents and queries.
embeddings_model = OpenAIEmbeddings()

# Build an empty FAISS vector store: a flat L2 index of dimension
# `embedding_size`, an in-memory docstore with no entries, and an empty
# index-to-docstore-id mapping.
# NOTE(review): 1536 is assumed to match the embedding model's output size - confirm.
embedding_size = 1536
index = faiss.IndexFlatL2(embedding_size)
vectorstore = FAISS(
    embedding_function=embeddings_model,
    index=index,
    docstore=InMemoryDocstore({}),
    index_to_docstore_id={},
)

# Retriever with a decay rate that is practically 0, returning a single document.
retriever = TimeWeightedVectorStoreRetriever(
    vectorstore=vectorstore,
    decay_rate=0.0000000000000000000000001,
    k=1,
)

In [22]:
# Add two documents: "hello world", marked as last accessed one day ago, and
# "hello foo", added without any explicit timestamp metadata.
yesterday = datetime.now() - timedelta(days=1)
stale_doc = Document(
    page_content="hello world",
    metadata={"last_accessed_at": yesterday},
)
fresh_doc = Document(page_content="hello foo")

retriever.add_documents([stale_doc])
# Last expression of the cell, so the ids it returns are displayed as the output.
retriever.add_documents([fresh_doc])

['fd0b1f78-b2ec-44dc-b432-20ddfd29cd51']

In [23]:
# "hello world" is returned first: it is the most salient match for the query,
# and with a decay rate close to 0 it still counts as recent enough.
query = "hello world"
retriever.invoke(query)

[Document(metadata={'last_accessed_at': datetime.datetime(2024, 12, 31, 3, 33, 13, 369270), 'created_at': datetime.datetime(2024, 12, 31, 3, 33, 0, 573601), 'buffer_idx': 0}, page_content='hello world')]

# High decay rate
With a high decay rate (e.g., several 9's), the recency score quickly goes to 0! If you set this all the way to 1, recency is 0 for all objects, once again making this equivalent to a vector lookup.

In [24]:
# Embedding model used to vectorize documents and queries.
embeddings_model = OpenAIEmbeddings()

# Start again from an empty FAISS vector store: a flat L2 index of dimension
# `embedding_size`, an in-memory docstore with no entries, and an empty
# index-to-docstore-id mapping.
# NOTE(review): 1536 is assumed to match the embedding model's output size - confirm.
embedding_size = 1536
index = faiss.IndexFlatL2(embedding_size)
vectorstore = FAISS(
    embedding_function=embeddings_model,
    index=index,
    docstore=InMemoryDocstore({}),
    index_to_docstore_id={},
)

# Retriever with a high decay rate (0.999), returning a single document.
retriever = TimeWeightedVectorStoreRetriever(
    vectorstore=vectorstore,
    decay_rate=0.999,
    k=1,
)

In [25]:
# Add the same two documents to the high-decay retriever: "hello world", marked
# as last accessed one day ago, and "hello foo" without timestamp metadata.
yesterday = datetime.now() - timedelta(days=1)
stale_doc = Document(
    page_content="hello world",
    metadata={"last_accessed_at": yesterday},
)
fresh_doc = Document(page_content="hello foo")

retriever.add_documents([stale_doc])
# Last expression of the cell, so the ids it returns are displayed as the output.
retriever.add_documents([fresh_doc])

['6aeb3eea-3b8d-403f-a155-79506266a487']

In [26]:
# "hello foo" is returned first: with the high decay rate, "hello world"
# (last accessed a day ago) is mostly forgotten.
query = "hello world"
retriever.invoke(query)

[Document(metadata={'last_accessed_at': datetime.datetime(2024, 12, 31, 3, 34, 24, 152462), 'created_at': datetime.datetime(2024, 12, 31, 3, 34, 14, 65086), 'buffer_idx': 1}, page_content='hello foo')]