In [3]:
from langchain_huggingface import HuggingFaceEmbeddings

In [4]:
from langchain.schema import Document

In [6]:
# Toy corpus: two short facts for each of four unrelated topics. Every entry
# becomes a Document whose metadata carries only its topic label.
docs = [
    Document(page_content=text, metadata={"topic": topic})
    for topic, text in [
        # Cricket
        ("cricket", "Virat Kohli is the captain of RCB and a top international batsman."),
        ("cricket", "Chris Gayle holds records for fastest centuries and most sixes in T20 cricket."),
        # Technology
        ("technology", "Python is a popular programming language for data science and machine learning."),
        ("technology", "FAISS is a library for efficient similarity search and clustering of dense vectors."),
        # Space
        ("space", "The James Webb Space Telescope can observe galaxies formed over 13 billion years ago."),
        ("space", "Mars has two moons named Phobos and Deimos."),
        # Cooking
        ("cooking", "Pasta carbonara is an Italian dish made with eggs, cheese, pancetta, and pepper."),
        ("cooking", "Sourdough bread uses natural yeast and bacteria for fermentation."),
    ]
]


In [7]:
# Natural-language question used for retrieval and compression.
# The text must not be wrapped in an extra pair of literal double quotes:
# those characters would become part of the string that is embedded and
# passed to the LLM extractor.
query = "Which technology helps in finding similar documents efficiently?"

In [8]:
from langchain_community.vectorstores import FAISS
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [11]:
# Embed every document in `docs` with the all-MiniLM-L6-v2 model and index
# the resulting vectors in a FAISS vector store.
vs = FAISS.from_documents(docs, HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"))

In [12]:
# Base retriever over the vector store; asks for 5 candidates per query.
ret = vs.as_retriever(search_kwargs=dict(k=5))

In [13]:
from langchain_ollama import ChatOllama

In [14]:
# Chat model (served by Ollama) that powers the LLM-based extractor below.
# temperature=0 keeps decoding as repeatable as possible, so re-running the
# notebook yields comparable extractions instead of freshly sampled text.
model = ChatOllama(
    model='llama3',
    temperature=0,
)

In [15]:
# Document compressor that uses the chat model to extract query-relevant text.
compressor = LLMChainExtractor.from_llm(llm=model)

In [17]:
# Combine the LLM extractor with the FAISS-backed base retriever into a
# single contextual-compression retriever.
compression_ret = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=ret,
)

In [18]:
# Run the question through the compression retriever and keep the results.
com_res = compression_ret.invoke(input=query)

In [19]:
# Print each compressed result under a 1-based "Result N" header.
for position, result in enumerate(com_res, start=1):
    print(f"Result {position} -------")
    print(result.page_content)

Result 1 -------
Extracted relevant parts:

FAISS is a library for efficient similarity search and clustering of dense vectors.

These extracted parts are directly relevant to the question about finding similar documents efficiently.
Result 2 -------
NO_OUTPUT. None of the provided context is relevant to answering the question about a technology that helps find similar documents efficiently. The context appears to be discussing the James Webb Space Telescope, which has no relation to document searching or technology for finding similar documents.
