In [18]:
# installing langchain , pinecone , openAI  libraries quietly
! pip install langchain langchain-core langchain-community langgraph langsmith "langserve[all]" -q
! pip install pinecone-client -q
! pip install langchain-openai -q
! pip install python-dotenv -q
! pip install config -q

In [21]:
! pip install weaviate-client cohere torch transformers -q
! pip install nltk -q
! pip install InstructorEmbedding sentence-transformers -q

True

# Vector Database creation

In [26]:
import os
from config import Config

config = Config(os.path.join(os.getcwd(), 'configurationData.json'))

# # Accessing configuration values
# db_host = config.get('database', 'host')
# log_level = config.get('logging', 'level')
# feature_x_enabled = config.get('feature_flags', 'enable_feature_x')

# print(f"Database Host: {db_host}")
# print(f"Log Level: {log_level}")
# print(f"Feature X Enabled: {feature_x_enabled}")/

In [None]:
import pinecone
from langchain.vectorstores import Pinecone
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import CohereEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from tqdm import tqdm
import pandas as pd


class PineconeVectorDb:
    def __init__(self, config_filename , embedding_model , embeddings , api_key = None, env = None, index_name = None, embed_api_key = None):
        from config import Config
        #load the configuration file
        
        config = Config(os.path.join(os.getcwd() , config_filename))
        if embedding_model is not None or embedding_model not in config.get('embeddings', 'names'):
            print("EMBEDDING : Provide Embedding model name is not valid")
            return

        if api_key is None:
            api_key = os.environ.get("PINECONE_API_KEY")
            print('PINECONE: Loaded API key from environment variables.')
        if env is None:
            env = os.environ.get("PINECONE_ENV")
            print('PINECONE: Loaded environment from environment variables.')
        if index_name is None:
            index_name = os.environ.get("PINECONE_INDEX")
            print('PINECONE: Loaded index name from environment variables.')
        if embed_api_key is None:
            embed_api_key = os.environ.get(config.get('embeddings', embedding_model ))
            print(f"{embedding_model} : Loaded API key from environment variables.")

        pinecone.init(api_key=api_key, environment=env)
        print('PINECONE: initialized')
        
        self.index = pinecone.Index(index_name)
        print('PINECONE: Set index to - ', index_name)

        self.embeddings = embeddings
        print('Embedding model Loaded')

        self.vector_search = Pinecone(self.index, self.embeddings.embed_query, "text")
    
    def search(self, query, top_k=20):
        return self.vector_search.similarity_search(query, k=top_k)
    
    def upsert(self, data_path: str):
        """Upserts data into the vector database.
        Args:
            data_path (str): Path to the data file.
        """

        if '.csv' in data_path:
            data = pd.read_csv(data_path)
        elif '.parquet' in data_path:
            data = pd.read_parquet(data_path)        
        else:
            raise Exception('Data format not supported. Please provide a csv or parquet file.')
               
        for item in tqdm(data.values , desc = "Upserting" , unit = "row" , ncols = 100):
            # item - some ID , product id , merged product description  , metadata
            product_id = item[1]
            product_vector = self.embedding.embed_query(item[2])
            product_metadata = item[3]
            record_metadata = {"description" : item[2], "product_source" : str(product_metadata) }

            self.index.upsert(vectors =[{'product_id': product_id , 'values' : product_vector , 'metadata': record_metadata}])

        print("Product Data has been Upsered into Pinecone")
        

In [3]:
from dotenv import load_dotenv
import os
import config

# Load environment variables from .env file
load_dotenv()

pinecone_api_key = os.environ.get('PINECONE_API_KEY')
pinecone_index_name = os.environ.get('PINECONE_INDEX')
# pinecone_environment = os.environ.get('PINECONE_ENV')
# cohere_key = os.environ.get('COHERE_API_KEY')

In [4]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
from pinecone  import Pinecone , ServerlessSpec


index_name = "hybrid-product-search-langchain-pinecone"
pc = Pinecone(api_key= pinecone_api_key)

In [5]:
import tiktoken

openai_embedding_model = 'text-embedding-ada-002'
embedding_encoding = "cl100k_base"
max_tokens = 1536

encoding = tiktoken.get_encoding(embedding_encoding)
df

# pinecone

In [10]:
! pip install pinecone-client pinecone-text pinecone-notebooks -q

In [16]:
from dotenv import load_dotenv
load_dotenv()

In [None]:
from langchain_community.retrievers import PineconeHybridSearchRetriever


In [22]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")
embeddings

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [23]:
from pinecone_text.sparse import BM25Encoder

bm25_encoder = BM25Encoder().default()
bm25_encoder

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prade\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prade\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x230d5f48f50>

In [26]:
sentences = ["In 2023, I visited Paris" , "In 2022 , I visited New York" , "In 2021 , I visited New Orleans",]

bm25_encoder.fit([text for text in sentences])
bm25_encoder.dump("bm25_values.json")

  0%|          | 0/3 [00:00<?, ?it/s]


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\prade/nltk_data'
    - 'c:\\ProgramData\\anaconda3\\envs\\amzenv\\nltk_data'
    - 'c:\\ProgramData\\anaconda3\\envs\\amzenv\\share\\nltk_data'
    - 'c:\\ProgramData\\anaconda3\\envs\\amzenv\\lib\\nltk_data'
    - 'C:\\Users\\prade\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
