### Library installations:

In [1]:
!pip install -q langchain langchain-community \
pypdf2 pypdf pinecone-client python-dotenv \
langchain-groq langchain-openai sentence_transformers \
protoc_gen_openapiv2 langchain-pinecone faiss-cpu


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
# Relative path of the PDF that is loaded, chunked and indexed further down
path_to_pdf: str = "./docs/Einstein_Field_Equations.pdf"

In [4]:
# local libraries for system runtime
import os,getpass,time
import numpy as np
# from dotenv import load_dotenv
# load_dotenv()
# load pdfreader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
# load docreader
from langchain_community.document_loaders import Docx2txtLoader
# load textreader
from langchain_community.document_loaders import TextLoader
# load document splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
# load embedding model
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_openai import OpenAIEmbeddings
# load LLM
from langchain_openai import ChatOpenAI
from langchain_groq import ChatGroq
# load vector database
import pinecone
from langchain_pinecone import PineconeVectorStore
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec, PodSpec
from langchain_community.vectorstores import FAISS
# load chain
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain

In [5]:
# Credentials are read from the environment so that no secret is stored in the
# notebook; when a variable is unset (or empty) the user is prompted without echo.
# NOTE(review): the keys that used to be hardcoded in this cell must be treated
# as compromised and rotated with the respective providers.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: ")
groq_api_key = os.environ.get("GROQ_API_KEY") or getpass.getpass("Groq API key: ")

In [6]:
# Display name for the chatbot built in this notebook
chatbot_name: str = "pinecone RAG"

### Load and process document:

In [7]:
# Build a loader for the PDF at `path_to_pdf` and read it into a list of documents.
loader = PyPDFLoader(path_to_pdf)
# presumably one Document per PDF page (the chunks shown below carry 'page' metadata) — verify
docs_before_split = loader.load()

In [8]:
# Break the loaded pages into small, slightly overlapping chunks for embedding
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=500,
)
documents = text_splitter.split_documents(docs_before_split)
# Display the first chunk as a quick sanity check
documents[0]

Document(metadata={'source': './docs/Einstein_Field_Equations.pdf', 'page': 0}, page_content="Einstein's Field Equations - Explanations\nIntroduction to Einsteins Field Equations\nEinstein's Field Equations are the cornerstone of General Relativity, a theory that describes the\ngravitational interaction as a curvature of spacetime. These equations establish a relationship\nbetween the geometry of spacetime and the energy-momentum of whatever matter and radiation\nare present. The equations are complex, tensorial, and highly non-linear, meaning they can be")

In [9]:
# Alternative embedding model, kept for reference but currently disabled:
# huggingface_embeddings = HuggingFaceBgeEmbeddings(
#     model_name="sentence-transformers/all-MiniLM-l6-v2",
#     model_kwargs={'device':'cpu'},
#     encode_kwargs={'normalize_embeddings': True}
# )

# Embedding model used for both document chunks and queries. It runs on CPU and
# returns normalised vectors; the Pinecone index below uses the 'dotproduct'
# metric, which ranks unit-length vectors the same way cosine similarity does.
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5", 
    model_kwargs={'device':'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

In [10]:
# Embed the first chunk to eyeball the model output and check its dimensionality
first_chunk_text = documents[0].page_content
sample_embedding = np.array(
    huggingface_embeddings.embed_query(first_chunk_text)
)
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [-2.57196296e-02  5.61678642e-03 -4.17459942e-02 -2.03188742e-03
 -1.19265709e-02 -2.38850955e-02  1.80438452e-03  3.45708914e-02
 -4.19951379e-02 -7.01188445e-02  4.61646877e-02 -1.22090057e-01
 -3.90262678e-02  2.57404149e-02 -1.81447342e-02 -6.88697472e-02
 -2.91969553e-02  4.17362414e-02 -8.81220028e-02  2.08869129e-02
  9.87989679e-02 -2.67759431e-02 -4.79759388e-02 -2.20333878e-02
 -3.84740159e-03  3.31196263e-02  2.22840183e-03  4.95486334e-03
 -3.64686698e-02 -1.39846593e-01 -3.24269608e-02 -1.34811271e-02
  5.22301048e-02  4.92836535e-02 -3.68084647e-02  4.45043389e-03
  8.81441683e-02 -2.61307135e-02 -4.14519347e-02  7.41159320e-02
  5.33964932e-02  1.70158688e-02  1.01144565e-02 -1.83858741e-02
  2.81684194e-02 -1.52100883e-02  1.49226533e-02  3.81897874e-02
 -4.50172834e-02 -5.51118776e-02  2.28613764e-02 -3.07494570e-02
 -1.07446492e-01  3.40125076e-02 -1.62826814e-02  2.65529547e-02
 -5.82772633e-03 -9.51422658e-03 -8.40124581e-03  3

### Loading the Vector DB:

In [15]:
# Configure the Pinecone client and (re)create the index used by this notebook.
use_serverless = True
pc = Pinecone(api_key=pinecone_api_key)
if use_serverless:
    spec = ServerlessSpec(cloud='aws', region='us-east-1')
else:
    # PodSpec cannot be constructed without an environment, so take it from
    # PINECONE_ENVIRONMENT and fall back to the starter environment.
    # If not using a starter index, you should specify a pod_type too.
    spec = PodSpec(environment=os.environ.get('PINECONE_ENVIRONMENT', 'gcp-starter'))

# Size the index from the embedding model itself rather than a hardcoded number,
# so that swapping the model cannot silently produce a dimension mismatch.
embedding_dimension = len(huggingface_embeddings.embed_query("dimension probe"))

# Start from a clean slate: drop any index left over from a previous run.
index_name = 'pinecone-rag'
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# Create a new, empty index.
pc.create_index(
    index_name,
    dimension=embedding_dimension,
    metric='dotproduct',  # embeddings are normalised, so this ranks like cosine similarity
    spec=spec
)

# Wait for the index to be initialised, but fail loudly instead of hanging
# forever if it never reports ready.
index_ready_timeout_s = 300
index_ready_deadline = time.monotonic() + index_ready_timeout_s
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() > index_ready_deadline:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after {index_ready_timeout_s} seconds"
        )
    time.sleep(1)


In [16]:
# Verify the index was created: open a handle to it and display its statistics.
# A freshly created index is expected to report a vector count of zero.
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

In [18]:
# Wrap the existing index in a LangChain vector store that uses the same embedding model.
# NOTE(review): `vector_store` is not referenced again in the cells visible here — the
# later cells use `vectordb` instead; confirm whether this object is still needed.
vector_store = PineconeVectorStore(index=index, embedding=huggingface_embeddings)


In [21]:
from uuid import uuid4


In [24]:
# Expose the key through the environment without repeating the secret as a literal.
# presumably needed because the `from_documents` call below is given only an index
# name and has to locate the credentials itself — verify against langchain-pinecone.
os.environ['PINECONE_API_KEY'] = pinecone_api_key


In [25]:
# Embed every chunk and upsert the resulting vectors into the Pinecone index
vectordb = PineconeVectorStore.from_documents(
    documents,
    embedding=huggingface_embeddings,
    index_name=index_name,
)

In [26]:
# Re-check the index statistics after the upsert; the vector count should now be non-zero.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 16}},
 'total_vector_count': 16}

In [28]:
# Ask a sample question and fetch the three most similar chunks from the index
query = "What are the field equations?"
k_3 = vectordb.similarity_search(query, k=3)

In [29]:
# Print the first of the three retrieved chunks (text and metadata).
print(k_3[0])

page_content='Einstein's Field Equations - Explanations
Introduction to Einsteins Field Equations
Einstein's Field Equations are the cornerstone of General Relativity, a theory that describes the
gravitational interaction as a curvature of spacetime. These equations establish a relationship
between the geometry of spacetime and the energy-momentum of whatever matter and radiation
are present. The equations are complex, tensorial, and highly non-linear, meaning they can be' metadata={'page': 0.0, 'source': './docs/Einstein_Field_Equations.pdf'}
