# Hybrid Search with LangChain and Pinecone

In [4]:
from dotenv import load_dotenv
import os
load_dotenv()

# API keys are read from the process environment after load_dotenv() has run above.
pine_api = os.getenv("PINECONE_API")
groq_api = os.getenv("GROQ_API")

# os.environ only accepts str values, so assigning os.getenv(...) directly raises
# TypeError when HF_TOKEN is not set. Export the token only when it is present.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token is not None:
    os.environ["HF_TOKEN"] = _hf_token


In [13]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
from pinecone import Pinecone, ServerlessSpec

# Name of the Pinecone index used throughout this notebook.
index_name = "hybrid-search-langchain-pinecone"

# Pinecone client, authenticated with the API key loaded earlier.
pc = Pinecone(api_key=pine_api)

# Create the index only when it does not exist yet, so this cell can be re-run.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(
        name=index_name,
        dimension=384,  # dimension of the dense embedding model
        metric="dotproduct",  # sparse values are supported only with dot product
        spec=serverless_spec,
    )

In [6]:
# Handle to the index named above; it is passed to the hybrid retriever later on.
index = pc.Index(index_name)

In [8]:
## Dense vector embeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Sentence-embedding model used for the dense half of the hybrid search.
dense_model_name = "all-MiniLM-L6-v2"
Embeddings = HuggingFaceEmbeddings(model_name=dense_model_name)

# Bare expression so the notebook renders the configured embedder.
Embeddings



HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [9]:
# Sparse embedding.
from pinecone_text.sparse import BM25Encoder

# default() is a static constructor that builds and returns its own encoder,
# so call it on the class rather than on a throwaway BM25Encoder() instance.
bm25 = BM25Encoder.default()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AKM\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AKM\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [20]:
# Small example corpus: the sparse encoder is fitted on it here and the same
# sentences are added to the retriever further down.
sentences = [
    "IN 2024, I done nothing.",
    "In 2023, I visited paris",
    "In 2027, I visited USA.",
]

# File the fitted encoder values are written to and read back from.
bm25_values_path = "bm25_values.json"

# Fit the encoder on the sentences (tf-idf values), then store the values as JSON.
bm25.fit(sentences)
bm25.dump(bm25_values_path)

# Load the stored values into a fresh BM25Encoder object.
bm25 = BM25Encoder().load(bm25_values_path)


100%|██████████| 3/3 [00:00<?, ?it/s]


In [25]:
# Hybrid retriever wired to the dense embedder, the sparse BM25 encoder and the Pinecone index.
retirver = PineconeHybridSearchRetriever(
    embeddings=Embeddings,
    sparse_encoder=bm25,
    index=index,
)

In [26]:
# Bare expression: renders the retriever's repr so its configuration can be inspected.
retirver

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x0000029FA4D1A990>, index=<pinecone.data.index.Index object at 0x0000029FB75923D0>)

In [27]:
# Add the example sentences to the retriever so they can be searched below.
retirver.add_texts(sentences)

100%|██████████| 1/1 [00:07<00:00,  7.26s/it]


In [28]:
# Run a query; the output below shows Documents carrying a 'score' in their metadata.
retirver.invoke("What i did in 2024")

[Document(metadata={'score': 0.649642825}, page_content='IN 2024, I done nothing.'),
 Document(metadata={'score': 0.31829527}, page_content='In 2027, I visited USA.'),
 Document(metadata={'score': 0.284357786}, page_content='In 2023, I visited paris')]

: 