Hybrid Search with LangChain and Pinecone

In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process
# environment, then read the Pinecone key; it is None when the variable is unset.
load_dotenv()

pc_api_key = os.environ.get("PINECONE_API_KEY")

In [2]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
from pinecone import Pinecone, ServerlessSpec

# Name of the Pinecone index used throughout the notebook (keep it lowercase).
index_name = "hybrid-search-langchain-pinecone"

# Pinecone client, authenticated with the key read from the environment.
pc = Pinecone(api_key=pc_api_key)

# Create the index only when it does not exist yet, so re-running this cell
# does not attempt a second creation.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,        # must equal the output size of the dense model
        metric="dotproduct",  # sparse values are supported only with dotproduct
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


In [3]:
# Check if the index was created successfully.
# The list of index names is fetched once and reused: the original repeated
# the pc.list_indexes() lookup three times, so the two printed lines and the
# `if` below could be based on different snapshots.
available_indexes = pc.list_indexes().names()
index_exists = index_name in available_indexes

print(f"Available indexes: {available_indexes}")
print(f"Index '{index_name}' exists: {index_exists}")

# Get index details (only when the index is actually there).
if index_exists:
    index = pc.Index(index_name)
    stats = index.describe_index_stats()
    print(f"Index stats: {stats}")

Available indexes: ['hybrid-search', 'hybrid-search-langchain-pinecone']
Index 'hybrid-search-langchain-pinecone' exists: True


  from .autonotebook import tqdm as notebook_tqdm


Index stats: {'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'dotproduct',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [4]:
# Bind `index` unconditionally: the status-check cell above assigns it only
# inside an `if`, and the retriever built later needs it to exist.
index = pc.Index(index_name)
index

<pinecone.db_data.index.Index at 0x298394df820>

In [None]:
# Vector embedding and sparse matrix
from langchain_huggingface import HuggingFaceEmbeddings

# dense vectors
# os.environ only accepts str values, so assigning os.getenv("HF_TOKEN")
# directly raises TypeError whenever the variable is not defined.
# Re-export the token only when one is actually present.
hf_token = os.getenv("HF_TOKEN")
if hf_token is not None:
    os.environ["HF_TOKEN"] = hf_token

# The 384-dimensional index created earlier is sized for this model.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
embeddings

HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [7]:
# sparse matrix
from pinecone_text.sparse import BM25Encoder

# `default` is a static factory, so call it on the class. Going through
# `BM25Encoder()` first builds an extra encoder that is discarded immediately.
bm25_encoder = BM25Encoder.default()
bm25_encoder

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhananj1\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x29856fd3be0>

In [8]:
# Toy corpus: one travel statement per line. The same list is used below to
# fit the BM25 encoder and as the texts added to the index.
sentences = """\
In 2021 I visited the Grand Canyon for the first time.
In 2022 I went to the beach in Florida.
In 2023 I traveled to New York City.
In 2024 I plan to visit Europe.
In 2025 I want to go to Japan.
""".splitlines()

In [9]:
# Fit the BM25 encoder on these sentences. This is a BM25 sparse encoder,
# not a TF-IDF vectorizer; the fitted values are corpus statistics.
# NOTE(review): this presumably replaces the parameters the encoder was
# created with above — confirm against the pinecone-text docs.
bm25_encoder.fit(sentences)

# Store the fitted values in a JSON file (relative path, so it is written to
# the current working directory).
bm25_encoder.dump("bm25_encoder.json")

# Load the stored values into a fresh BM25Encoder and rebind the name to it,
# demonstrating that the encoder can be restored from the file.
bm25_encoder = BM25Encoder().load("bm25_encoder.json")

100%|██████████| 5/5 [00:00<00:00, 39.49it/s]


In [10]:
# Hybrid retriever: pairs the dense embedding model with the fitted BM25
# sparse encoder and runs both against the Pinecone index.
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=index,
)
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x0000029839B76530>, index=<pinecone.db_data.index.Index object at 0x00000298394DF820>)

In [11]:
# Add the example sentences to the Pinecone index through the retriever.
# NOTE(review): presumably each text is encoded with both `embeddings` and
# `bm25_encoder` before upload — confirm against the retriever docs.
retriever.add_texts(sentences)

100%|██████████| 1/1 [00:02<00:00,  2.30s/it]


In [16]:
# Run a hybrid query. The saved output shows the matches as Documents ordered
# by a `score` stored in their metadata.
retriever.invoke("What city I visited in last?")

[Document(metadata={'score': 0.352451891}, page_content='In 2023 I traveled to New York City.'),
 Document(metadata={'score': 0.257683396}, page_content='In 2021 I visited the Grand Canyon for the first time.'),
 Document(metadata={'score': 0.216974497}, page_content='In 2024 I plan to visit Europe.'),
 Document(metadata={'score': 0.193143845}, page_content='In 2022 I went to the beach in Florida.')]