In [1]:
!pip install --upgrade --quiet pinecone-client pinecone-text pinecone-notebooks

In [2]:
# Pinecone API key used to build the client in a later cell.
# Never commit the key to the notebook: read it from the PINECONE_API_KEY
# environment variable, and fall back to a hidden interactive prompt when the
# variable is unset or empty.
import os
from getpass import getpass

api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

In [3]:
pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.2.7-py3-none-any.whl.metadata (2.5 kB)
Collecting PyYAML>=5.3 (from langchain_community)
  Using cached PyYAML-6.0.1-cp310-cp310-win_amd64.whl.metadata (2.1 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain_community)
  Using cached SQLAlchemy-2.0.31-cp310-cp310-win_amd64.whl.metadata (9.9 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain_community)
  Using cached aiohttp-3.9.5-cp310-cp310-win_amd64.whl.metadata (7.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain<0.3.0,>=0.2.7 (from langchain_community)
  Downloading langchain-0.2.9-py3-none-any.whl.metadata (6.9 kB)
Collecting langchain-core<0.3.0,>=0.2.12 (from langchain_community)
  Downloading langchain_core-0.2.20-py3-none-any.whl.metadata (6.0 kB)
Collecting langsmith<0.2.0,>=0.1.0 (from langchain_community)
  Downloading langsmith-0.1.88-py3-non

In [16]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
# PineconeHybridSearchRetriever performs hybrid search: it combines semantic
# (dense-vector similarity) search with keyword (sparse-vector) search.
# The retriever built from this class is therefore given both a dense
# embedding model and a sparse encoder.

In [4]:
# Create the Pinecone index used for hybrid search (only if it is missing).

import os
from pinecone import Pinecone, ServerlessSpec
index_name = "hybridsearch"

# Initialize the Pinecone client with the key defined in an earlier cell.
pc = Pinecone(api_key = api_key)

# Create the index on the first run; later runs find it and skip creation.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Dimension of the dense vectors. It must match the embedding model:
        # the sentence-transformers model used in this notebook produces
        # 384-dimensional vectors.
        dimension=384,
        # Sparse values are supported only with the dotproduct metric.
        metric='dotproduct',
        # The deployment target is passed as `spec`. Shard/replica counts are
        # pod-based settings and are not arguments of create_index, so they
        # are not passed for this serverless index.
        spec=ServerlessSpec(cloud='aws', region="us-east-1"),
    )

In [6]:
# Get a handle to the index named by index_name from the Pinecone client.
index = pc.Index(index_name)
# Bare expression so the notebook displays the object's repr.
index

<pinecone.data.index.Index at 0x1adca8f9a50>

In [9]:
pip install langchain_huggingface


Collecting langchain_huggingface
  Downloading langchain_huggingface-0.0.3-py3-none-any.whl.metadata (1.2 kB)
Collecting huggingface-hub>=0.23.0 (from langchain_huggingface)
  Downloading huggingface_hub-0.23.5-py3-none-any.whl.metadata (12 kB)
Collecting sentence-transformers>=2.6.0 (from langchain_huggingface)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting tokenizers>=0.19.1 (from langchain_huggingface)
  Using cached tokenizers-0.19.1-cp310-none-win_amd64.whl.metadata (6.9 kB)
Collecting transformers>=4.39.0 (from langchain_huggingface)
  Downloading transformers-4.42.4-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.6 kB ? eta -:--:--
     -------------------------------------- 43.6/43.6 kB 710.4 kB/s eta 0:00:00
Collecting safetensors>=0.4.1 (from transformers>=4.39.0->langchain_huggingface)
  Using cached safetensors-0.4.3-cp310-none-win_amd64.whl.metadata (3.9 kB)
Downloading langchain_huggingface-0.0

In [12]:
# Dense vector embeddings (the sparse encoder is created in a separate cell).

import os
import warnings
from dotenv import load_dotenv

# Load variables from a local .env file into os.environ.
load_dotenv()

# The original re-assigned os.environ[name] = os.getenv(name). That is a no-op
# when the variable exists and raises TypeError when it does not, because
# os.environ cannot store None. Check for the variable explicitly instead.
if 'hybrid_lang_pinecone_token' not in os.environ:
    warnings.warn(
        "Environment variable 'hybrid_lang_pinecone_token' is not set "
        "(checked the process environment and the .env file)."
    )

# Use a HuggingFace sentence-transformers model to produce the dense vectors.
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")
# Bare expression so the notebook displays the embedding object's repr.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [13]:
# The HuggingFace embeddings produce the dense vectors; the sparse vectors
# come from a BM25 encoder (a TF-IDF-style term-weighting scheme).

from pinecone_text.sparse import BM25Encoder

# default() is a static factory, so call it on the class directly instead of
# constructing a throwaway BM25Encoder() instance first.
bm25_encoder = BM25Encoder.default()
# Bare expression so the notebook displays the encoder's repr.
bm25_encoder

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x1ada2ec8880>

#### Sparse Matrix

In [14]:
# Small sample corpus used to fit the sparse encoder.
sentences = [
    "In 2023, I visited Paris",
    "In 2022, I visited Newyork",
    "In 2021, I  Visited Orleans",
]
bm25_params_path = "bm25_values.json"

# Fit the BM25 term statistics on the sample sentences.
bm25_encoder.fit(sentences)

# Persist the fitted parameters to disk ...
bm25_encoder.dump(bm25_params_path)

# ... and read them back into a fresh encoder, which replaces the fitted one.
bm25_encoder = BM25Encoder().load(bm25_params_path)

# The sparse encoder is now ready to use.

100%|██████████| 3/3 [00:00<00:00, 86.27it/s]


#### Retriever

In [17]:
# Build the hybrid retriever: it is given the dense embedding model, the
# sparse encoder, and the Pinecone index to search.
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=index,
)

In [18]:
# Bare expression: display the retriever's repr to inspect its configuration.
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x000001ADA337CF70>, index=<pinecone.data.index.Index object at 0x000001ADCA8F9A50>)

In [20]:
# Texts to store in the index through the retriever.
travel_notes = [
    "In 2023, I visited Paris",
    "In 2022, I visited Newyork",
    "In 2021, I  Visited Orleans",
    "In 2020, I visited Tokyo",
    "In 2019, I visited Sydney",
    "In 2018, I visited Rome",
    "In 2017, I visited London",
    "In 2016, I visited Berlin",
    "In 2015, I visited Moscow",
    "In 2014, I visited Dublin",
    "In 2013, I visited Vienna",
    "In 2012, I visited Budapest",
    "In 2011, I visited Amsterdam",
]

# Add every text to the index in one call.
retriever.add_texts(travel_notes)

  0%|          | 0/1 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 1/1 [00:05<00:00,  5.16s/it]


In [30]:
# Query the retriever with a natural-language question; the matching stored
# texts come back as a list of Document objects.
retriever.invoke("which city i visited recently")

[Document(page_content='In 2017, I visited London'),
 Document(page_content='In 2016, I visited Berlin'),
 Document(page_content='In 2011, I visited Amsterdam'),
 Document(page_content='In 2015, I visited Moscow')]