Contextual Compression Retrievers

In [9]:
from google.colab import userdata
import os

# Copy the Groq key from the Colab `userdata` store into the process
# environment, where the ChatGroq setup further down reads it via os.getenv.
os.environ.update({"GROQ_API_KEY": userdata.get('GROQ_API_KEY')})

In [None]:
pip install langchain-community langchain-huggingface faiss-cpu langchain-groq

In [19]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_core.documents import Document
import os

In [22]:
# Two small sample documents. Each is mostly about an unrelated topic (the
# Grand Canyon, medieval castles) but contains one sentence that mentions
# photosynthesis, which is what the query later in the notebook asks about.
# The continuation lines inside the triple-quoted strings keep their leading
# spaces because that whitespace is part of the text that gets embedded.
docs = [
    Document(
        metadata={"source": "Doc1"},
        page_content="""The Grand Canyon is one of the most visited natural wonders in the world.
    Photosynthesis is the process by which green plants converts sunlight into energy.
    Millions of tourists travel to seet it every year. The rocks data millions of year.""",
    ),
    Document(
        metadata={"source": "Doc2"},
        page_content="""In medieval europe, castles were built primarily for defense.
    The chlorophyll in plant cells captures sunlight during photosynthesis.
    Knight wore armor of metal. Siege weapons were often used to bveeach castle walls.""",
    ),
]

In [23]:
# Build the embedding model, then embed the sample documents and index the
# resulting vectors in a FAISS vector store.
EmbeddingModels = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
vectorstores = FAISS.from_documents(documents=docs, embedding=EmbeddingModels)

In [24]:
# Retriever view over the FAISS store; search_kwargs k=2 requests two
# documents per query (the store holds exactly two sample documents).
base_retriever = vectorstores.as_retriever(
    search_kwargs=dict(k=2),
)

In [25]:
# Chat model that drives the compressor.
# temperature=0 makes the extraction as repeatable as the hosted model allows,
# so re-running the notebook yields comparable compressed documents instead of
# freshly sampled ones.
llm = ChatGroq(
    api_key=os.getenv("GROQ_API_KEY"),
    model="llama-3.1-8b-instant",
    temperature=0,
)

# LLM-backed document compressor built from the chat model above; it is used
# to reduce each retrieved document to the text relevant to the query.
compressor = LLMChainExtractor.from_llm(llm)

In [26]:
# Combine the two pieces: the base retriever fetches candidate documents and
# the LLM-based compressor is applied to what it returns.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=base_retriever,
    base_compressor=compressor,
)

In [27]:
# Run the sample question through the compression retriever.
query = "What is photosynthesis?"
compressed_docs = compression_retriever.invoke(query)

# End on the bare name so the cell displays the returned documents; an
# assignment alone produces no cell output.
compressed_docs


[Document(metadata={'source': 'Doc1'}, page_content='Photosynthesis is the process by which green plants converts sunlight into energy.'), Document(metadata={'source': 'Doc2'}, page_content='The chlorophyll in plant cells captures sunlight during photosynthesis.')]


In [29]:
# Print the text of each compressed document under a 1-based numbered header.
for position, compressed in enumerate(compressed_docs, start=1):
    print(f"\n--- result {position}---")
    print(compressed.page_content)


--- result 1---
Photosynthesis is the process by which green plants converts sunlight into energy.

--- result 2---
The chlorophyll in plant cells captures sunlight during photosynthesis.
