In [33]:
from dotenv import load_dotenv
# Load settings from a local .env file into the process environment.
# Presumably this supplies API credentials for the hosted services used
# further down (the Cohere client is built without an explicit key) —
# confirm against the project's .env file.
load_dotenv()


True

In [1]:
from langchain_huggingface import HuggingFaceEmbeddings

# Dense sentence-embedding model shared by every vector store in this notebook.
# NOTE(review): the Qdrant collection created below hard-codes size=768, which
# presumably matches this model's output dimension — re-check if the model changes.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

In [3]:
# Single source of truth for the collection used by both the raw client
# and the LangChain wrapper below.
COLLECTION_NAME = "demo_collection"

# In-process Qdrant instance addressed via the special ":memory:" location.
client = QdrantClient(":memory:")

# The collection has to exist before the vector store is attached to it.
# size=768 presumably mirrors the embedding model's output width — TODO confirm.
dense_vector_params = VectorParams(size=768, distance=Distance.COSINE)
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=dense_vector_params,
)

# LangChain-facing store: embeds text with `embeddings` and keeps the
# vectors in the collection created above.
vector_store = QdrantVectorStore(
    embedding=embeddings,
    collection_name=COLLECTION_NAME,
    client=client,
)

In [4]:
from uuid import uuid4

from langchain_core.documents import Document

# Sample corpus as (text, source) pairs. Order matters: the individual
# document_N names further down are bound positionally.
_SAMPLE_POSTS = [
    ("I had chocalate chip pancakes and scrambled eggs for breakfast this morning.", "tweet"),
    ("The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.", "news"),
    ("Building an exciting new project with LangChain - come check it out!", "tweet"),
    ("Robbers broke into the city bank and stole $1 million in cash.", "news"),
    ("Wow! That was an amazing movie. I can't wait to see it again.", "tweet"),
    ("Is the new iPhone worth the price? Read this review to find out.", "website"),
    ("The top 10 soccer players in the world right now.", "website"),
    ("LangGraph is the best framework for building stateful, agentic applications!", "tweet"),
    ("The stock market is down 500 points today due to fears of a recession.", "news"),
    ("I have a bad feeling I am going to get deleted :(", "tweet"),
]

# One Document per sample, each with its own {"source": ...} metadata dict.
documents = [
    Document(page_content=text, metadata={"source": source})
    for text, source in _SAMPLE_POSTS
]

# Keep the per-document names available for any cell that refers to them.
(
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
) = documents

# One random UUID string per document; passed as the ids of the stored entries.
uuids = [str(uuid4()) for _ in documents]

# Last expression of the cell: the returned list of ids is shown as output.
vector_store.add_documents(documents=documents, ids=uuids)

['d3462483-5229-430f-8a1b-98e03224a61f',
 'efe54c2a-c562-48c1-9632-43f4d2668706',
 '54087d00-05a8-430c-b5ef-1ffc24f96d61',
 'acd4e427-93d4-40a8-a076-6fee96279aa3',
 'fc6d08f7-002c-4057-856c-d16725c80c66',
 '86970e96-51d6-4d5a-b296-55603deb7d3b',
 '03be5f5b-2509-46dd-92bd-99b4cd6685ea',
 'ce588e73-208a-4230-a2f9-ebf3ee2a9c2c',
 'c48aa894-c56f-4a7e-af8a-f7fb26b5076b',
 'fc55a772-fe88-4ca6-b046-baa3029ab309']

In [5]:
# Plain similarity search against the demo collection: request the two
# closest documents (k=2) for the query text.
results = vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy", k=2
)
# Print each hit as "* <text> [<metadata dict>]".
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* Building an exciting new project with LangChain - come check it out! [{'source': 'tweet', '_id': '54087d00-05a8-430c-b5ef-1ffc24f96d61', '_collection_name': 'demo_collection'}]
* LangGraph is the best framework for building stateful, agentic applications! [{'source': 'tweet', '_id': 'ce588e73-208a-4230-a2f9-ebf3ee2a9c2c', '_collection_name': 'demo_collection'}]


In [10]:
from langchain_qdrant import RetrievalMode

# Build a separate in-memory store from the sample documents, configured
# with RetrievalMode.DENSE and only the dense `embeddings` model.
qdrant = QdrantVectorStore.from_documents(
    documents,
    embedding=embeddings,
    location=":memory:",
    collection_name="dense_collection",
    retrieval_mode=RetrievalMode.DENSE,
)

# NOTE(review): none of the sample documents mention this topic, so the hits
# are simply whatever happens to be nearest — useful as a smoke test only.
query = "What did the president say about Ketanji Brown Jackson"
# No k is passed, so the method's default result count applies.
found_docs = qdrant.similarity_search(query)

In [11]:
# Display the current `found_docs` (assigned by the dense-mode search cell just above).
found_docs

[Document(metadata={'source': 'tweet', '_id': 'd9c4bba8c42448f894138bb0eaaf19e2', '_collection_name': 'dense_collection'}, page_content='I have a bad feeling I am going to get deleted :('),
 Document(metadata={'source': 'news', '_id': 'd9b7e1f0b6ad43518d24531ec2ce9921', '_collection_name': 'dense_collection'}, page_content='The stock market is down 500 points today due to fears of a recession.'),
 Document(metadata={'source': 'news', '_id': 'a9833d9b1d2243bfb4e88a908d84644a', '_collection_name': 'dense_collection'}, page_content='Robbers broke into the city bank and stole $1 million in cash.'),
 Document(metadata={'source': 'tweet', '_id': '4198386af02c4b8eb861a32f1fa3105e', '_collection_name': 'dense_collection'}, page_content="Wow! That was an amazing movie. I can't wait to see it again.")]

In [14]:
from langchain_qdrant import FastEmbedSparse, RetrievalMode

# Sparse embedding model, identified by the "Qdrant/bm25" model name.
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

# Same sample documents, now in a store configured with RetrievalMode.SPARSE.
# The dense `embeddings` model is still passed alongside the sparse one.
# NOTE(review): `qdrant` is rebound here, replacing the dense-mode store.
qdrant = QdrantVectorStore.from_documents(
    documents,
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    location=":memory:",
    collection_name="sparse_collection",
    retrieval_mode=RetrievalMode.SPARSE,
)

query = "What did the president say about Ketanji Brown Jackson"
# NOTE(review): this overwrites `found_docs`; the next cell overwrites it again
# before it is displayed, so the sparse-mode hits are never shown.
found_docs = qdrant.similarity_search(query)

Fetching 29 files: 100%|██████████| 29/29 [00:00<00:00, 29120.14it/s]


In [15]:
from langchain_qdrant import FastEmbedSparse, RetrievalMode

# Re-create the sparse model so this cell does not depend on the previous one.
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

# Same sample documents, now in a store configured with RetrievalMode.HYBRID
# and given both the dense and the sparse embedding models.
# NOTE(review): `qdrant` is rebound again, replacing the sparse-mode store.
qdrant = QdrantVectorStore.from_documents(
    documents,
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    location=":memory:",
    collection_name="hybrid_collection",
    retrieval_mode=RetrievalMode.HYBRID,
)

query = "What did the president say about Ketanji Brown Jackson"
# These hybrid-mode hits are what later cells see as `found_docs`.
found_docs = qdrant.similarity_search(query)

Fetching 29 files: 100%|██████████| 29/29 [00:00<?, ?it/s]


In [16]:
# Display the current `found_docs` (assigned by the hybrid-mode search cell just above).
found_docs

[Document(metadata={'source': 'tweet', '_id': 'b8c3567d54d8461ebdad1e1dc74db816', '_collection_name': 'hybrid_collection'}, page_content='I have a bad feeling I am going to get deleted :('),
 Document(metadata={'source': 'news', '_id': '8ed287d5ba454d6bb22cf495d29dbe97', '_collection_name': 'hybrid_collection'}, page_content='The stock market is down 500 points today due to fears of a recession.'),
 Document(metadata={'source': 'news', '_id': '240e1d96e59540b594c807f1c6c67983', '_collection_name': 'hybrid_collection'}, page_content='Robbers broke into the city bank and stole $1 million in cash.'),
 Document(metadata={'source': 'tweet', '_id': '1ccd30535c104341ab02c3ef12a98beb', '_collection_name': 'hybrid_collection'}, page_content="Wow! That was an amazing movie. I can't wait to see it again.")]

In [17]:
# Inspect the similarity score returned alongside the best match.
results = vector_store.similarity_search_with_score(
    query="Will it be hot tomorrow", k=1
)
for doc, score in results:
    # ".3f" prints three decimal places; a bare "3f" would only set a
    # minimum field width of 3 and keep the default six decimals.
    print(f"* [SIM={score:.3f}] {doc.page_content} [{doc.metadata}]")

* [SIM=0.611811] The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees. [{'source': 'news', '_id': 'efe54c2a-c562-48c1-9632-43f4d2668706', '_collection_name': 'demo_collection'}]


Metadata filtering
Qdrant has an extensive filtering system with rich type support. The same filters can be used from LangChain by passing an additional `filter` parameter to either the `similarity_search_with_score` or the `similarity_search` method.

In [18]:
from qdrant_client import models

# Condition on the stored `page_content` field: it must equal this exact sentence.
exact_text_condition = models.FieldCondition(
    key="page_content",
    match=models.MatchValue(
        value="The top 10 soccer players in the world right now."
    ),
)
# A `should` clause holding that single condition.
payload_filter = models.Filter(should=[exact_text_condition])

# Ask for the single best match (k=1) among the points that pass the filter.
results = vector_store.similarity_search(
    query="Who are the best soccer players in the world?",
    k=1,
    filter=payload_filter,
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

* The top 10 soccer players in the world right now. [{'source': 'website', '_id': '03be5f5b-2509-46dd-92bd-99b4cd6685ea', '_collection_name': 'demo_collection'}]


Query by turning into retriever

You can also transform the vector store into a retriever for easier usage in your chains.

In [31]:
# Expose the demo vector store through the retriever interface, configured
# with search_type="mmr" and k=10 results per query.
# NOTE(review): the demo collection holds only the 10 sample documents, so
# k=10 presumably returns all of them, just reordered — confirm that is intended.
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 10})
# Last expression of the cell: the retrieved documents are shown as output.
retriever.invoke("Stealing from the bank is a crime")

[Document(metadata={'source': 'news', '_id': 'acd4e427-93d4-40a8-a076-6fee96279aa3', '_collection_name': 'demo_collection'}, page_content='Robbers broke into the city bank and stole $1 million in cash.'),
 Document(metadata={'source': 'tweet', '_id': 'fc55a772-fe88-4ca6-b046-baa3029ab309', '_collection_name': 'demo_collection'}, page_content='I have a bad feeling I am going to get deleted :('),
 Document(metadata={'source': 'tweet', '_id': 'ce588e73-208a-4230-a2f9-ebf3ee2a9c2c', '_collection_name': 'demo_collection'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(metadata={'source': 'tweet', '_id': 'd3462483-5229-430f-8a1b-98e03224a61f', '_collection_name': 'demo_collection'}, page_content='I had chocalate chip pancakes and scrambled eggs for breakfast this morning.'),
 Document(metadata={'source': 'website', '_id': '86970e96-51d6-4d5a-b296-55603deb7d3b', '_collection_name': 'demo_collection'}, page_content='Is the new iPhone wo

# Reranker

In [22]:
from dotenv import load_dotenv
# Load the .env file again so this section can be run on its own.
# NOTE(review): this repeats the call made in the notebook's first cell.
load_dotenv()
import cohere

# No API key is passed explicitly; presumably the client reads it from the
# environment populated by load_dotenv() — confirm the expected variable name.
co = cohere.ClientV2()

In [26]:
# Show the candidates about to be reranked. `found_docs` is whatever the last
# search cell assigned (the recorded output shows the hybrid-collection hits).
found_docs


[Document(metadata={'source': 'tweet', '_id': 'b8c3567d54d8461ebdad1e1dc74db816', '_collection_name': 'hybrid_collection'}, page_content='I have a bad feeling I am going to get deleted :('),
 Document(metadata={'source': 'news', '_id': '8ed287d5ba454d6bb22cf495d29dbe97', '_collection_name': 'hybrid_collection'}, page_content='The stock market is down 500 points today due to fears of a recession.'),
 Document(metadata={'source': 'news', '_id': '240e1d96e59540b594c807f1c6c67983', '_collection_name': 'hybrid_collection'}, page_content='Robbers broke into the city bank and stole $1 million in cash.'),
 Document(metadata={'source': 'tweet', '_id': '1ccd30535c104341ab02c3ef12a98beb', '_collection_name': 'hybrid_collection'}, page_content="Wow! That was an amazing movie. I can't wait to see it again.")]

In [28]:
# Rerank every candidate from the previous search against the query.
# Sending only a single document would give the reranker nothing to order,
# and the requested top_n=3 could never be filled.
response = co.rerank(
    model="rerank-v3.5",
    query="What did the president say about Ketanji Brown Jackson",
    documents=[doc.page_content for doc in found_docs],
    top_n=3,
    return_documents=True,
)
print(response)



In [30]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain_community.llms import Cohere

In [35]:
# Helper function for printing docs


def pretty_print_docs(docs):
    """Print each document's text under a numbered heading.

    Entries are labelled ``Document 1``, ``Document 2``, ... in iteration
    order, and consecutive entries are separated by a line of 100 dashes.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.
    """
    divider = f"\n{'-' * 100}\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(divider.join(sections))

In [36]:
# NOTE(review): `llm` is not referenced by any cell visible here — confirm it
# is still needed before keeping this extra client construction.
llm = Cohere(temperature=0)
# Reranking compressor. It names a different Cohere model
# ("rerank-english-v3.0") than the direct co.rerank call ("rerank-v3.5").
compressor = CohereRerank(model="rerank-english-v3.0")
# Combine the MMR retriever with the compressor; presumably the retriever's
# hits are passed through the reranker before being returned.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

# NOTE(review): the name order in this query ("Jackson Brown") differs from the
# query used in the earlier search cells ("Brown Jackson") — confirm which is intended.
compressed_docs = compression_retriever.invoke(
    "What did the president say about Ketanji Jackson Brown"
)
pretty_print_docs(compressed_docs)

Document 1:

I have a bad feeling I am going to get deleted :(
----------------------------------------------------------------------------------------------------
Document 2:

Robbers broke into the city bank and stole $1 million in cash.
----------------------------------------------------------------------------------------------------
Document 3:

The stock market is down 500 points today due to fears of a recession.
