In [1]:
from dotenv import load_dotenv
# Load settings from a local .env file before any client is constructed.
# Pinecone() is later called with no explicit API key, so it presumably relies on
# PINECONE_API_KEY being present in the environment -- TODO confirm.
load_dotenv()

from langchain_community.retrievers import PineconeHybridSearchRetriever

In [None]:
from pinecone import Pinecone, ServerlessSpec

index_name = "hybrid-search-langchain-pinecone"
pc = Pinecone()

# Create the index only when it is missing so this cell is safe to re-run.
# list_indexes() yields index description objects rather than plain strings, so a
# bare `index_name not in pc.list_indexes()` never matches and would attempt to
# re-create an index that already exists. Compare against the index names instead.
existing_index_names = pc.list_indexes().names()

if index_name not in existing_index_names:
    pc.create_index(
        name=index_name,
        # Must equal the output size of the dense embedding model used in this
        # notebook (all-MiniLM-L6-v2 produces 384-dimensional vectors).
        dimension=384,
        # Pinecone sparse-dense (hybrid) queries are only supported with dotproduct.
        metric="dotproduct",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1',
        ),
    )

In [8]:
# Open a handle to the index by name; the retriever built later in this notebook
# is given this object as its `index`.
index = pc.Index(index_name)
index

<pinecone.db_data.index.Index at 0x72cf39929550>

In [9]:
## Dense embedding model (the sparse BM25 encoder is configured in a later cell)
from langchain_huggingface import HuggingFaceEmbeddings

# Kept as a named constant so the model choice is easy to find and change.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
embeddings

HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [11]:
from pinecone_text.sparse import BM25Encoder

# `default()` is a static factory that returns an encoder with pre-computed
# parameters, so call it on the class; `BM25Encoder().default()` would first build
# an extra encoder instance that is immediately discarded.
# NOTE: the encoder is re-fitted on the notebook's own sentences in a later cell.
bm25_encoder = BM25Encoder.default()
bm25_encoder

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x72ce0ab7d730>

In [14]:
# Toy corpus used both to fit the sparse encoder and as the documents to index.
sentences = [
    'In 2023, I Visited Paris',
    'In 2021, I Visited Berlin',
    'In 2024, I Visited Germany',
]

# Where the fitted BM25 parameters are persisted.
BM25_PARAMS_PATH = 'bm25_values.json'

# Fit the encoder on the corpus above.
bm25_encoder.fit(sentences)

# Write the fitted parameters out as JSON ...
bm25_encoder.dump(BM25_PARAMS_PATH)

# ... and read them back into a fresh encoder, rebinding `bm25_encoder` to it.
bm25_encoder = BM25Encoder().load(BM25_PARAMS_PATH)

  0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
# Hybrid retriever: dense vectors from `embeddings`, sparse vectors from
# `bm25_encoder`, both stored in and queried from the Pinecone `index`.
retriever = PineconeHybridSearchRetriever(
    index=index,
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
)

In [16]:
# Show the retriever's repr to confirm which embeddings, encoder and index it holds.
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x72ce0aa8e450>, index=<pinecone.db_data.index.Index object at 0x72cf39929550>)

In [17]:
# Index the same corpus the BM25 encoder was fitted on. Reusing `sentences`
# (defined in the fitting cell) instead of repeating the literal keeps the indexed
# documents and the encoder's fitted corpus from drifting apart.
retriever.add_texts(sentences)

  0%|          | 0/1 [00:00<?, ?it/s]

In [20]:
# NOTE: in the recorded output the 2021 Berlin sentence ranks first, so the order
# reflects the retriever's similarity scores, not any reasoning about "last".
query = "what city did i visit last"
retriever.invoke(query)

[Document(metadata={'score': 0.258933753}, page_content='In 2021, I Visited Berlin'),
 Document(metadata={'score': 0.234627947}, page_content='In 2023, I Visited Paris'),
 Document(metadata={'score': 0.229364142}, page_content='In 2024, I Visited Germany')]