In [10]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import os
os.environ['HF_TOKEN']=os.getenv("HUGGINGFACEHUB_API_TOKEN")

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings
# This code works for both first run and subsequent runs -> it first caches code and then in the future it checks and loads the model
embeddings = HuggingFaceEmbeddings(
  model_name="all-MiniLM-L6-v2",
  cache_folder="../../../hugging_face_embedding",
  model_kwargs={'device': 'cpu'}  # or 'cuda' if you have GPU
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [4]:
embeddings.embed_query("hello AI")

[-0.033388201147317886,
 0.034539803862571716,
 0.059474579989910126,
 0.05928615480661392,
 -0.06353538483381271,
 -0.06819586455821991,
 0.08823323249816895,
 0.0344407856464386,
 -0.03278515860438347,
 -0.015814995393157005,
 0.020981667563319206,
 -0.018340280279517174,
 -0.03983214125037193,
 -0.08047076314687729,
 -0.014469259418547153,
 0.03326486423611641,
 0.01425926387310028,
 -0.03404999524354935,
 -0.142915740609169,
 -0.023083265870809555,
 -0.02138010412454605,
 0.0026335634756833315,
 -0.04729275032877922,
 -0.010752726346254349,
 -0.06866799294948578,
 0.03112499974668026,
 0.07594592869281769,
 0.0011283049825578928,
 0.011631982401013374,
 -0.036039214581251144,
 0.04483765363693237,
 0.01839079149067402,
 0.12672799825668335,
 -0.001359735382720828,
 0.008206715807318687,
 0.06909963488578796,
 -0.0807635709643364,
 -0.05841311439871788,
 0.05375451594591141,
 0.026227623224258423,
 -0.00682859867811203,
 -0.05635840445756912,
 0.0032929950393736362,
 -0.072501808404

In [6]:
len(embeddings.embed_query("hello AI"))

384

reload your jupyter

In [None]:
# %load_ext autoreload # do it once then comment out
%autoreload 2
load_dotenv()

True

In [11]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001") # it takes env key directly from environment variables

In [12]:
len(embeddings.embed_query("Hello AI"))

768

In [13]:
from pinecone import Pinecone

In [16]:
import os
pinecone_api_key=os.getenv("PINECONE_API_KEY")

In [17]:
pc=Pinecone(api_key=pinecone_api_key)

In [18]:
from pinecone import ServerlessSpec
#Serverless: Server will be Managed by the cloud provider

In [19]:
index_name="agenticbatch2"

In [None]:
pc.has_index(index_name) # no index inside

False

In [22]:
#creating an index
if not pc.has_index(index_name):
    pc.create_index( # right now we use flat index
    name=index_name,
    dimension=768, # we use google gemini model, so we use 768 as dimension
    metric="cosine",
    spec=ServerlessSpec(cloud="aws",region="us-east-1")    
)

In [23]:
pc.has_index(index_name) # index is added now
# index is added: https://app.pinecone.io/organizations/-ORjEfC6X56LCBuIAroE/projects/060ef57d-4171-4b0e-9656-19818cbf0419/indexes

True

In [24]:
#loading the index
index=pc.Index(index_name)

In [25]:
from langchain_pinecone import PineconeVectorStore # now we can store the data under the index

In [26]:
vector_store=PineconeVectorStore(index=index,embedding=embeddings) # we define a vector store under this specific index and using defined embedding. It's important to use
# correct embedding -> the one which was assumed during index creation. Otherwise, dimensions won't match.

In [27]:
results = vector_store.similarity_search("what is a langchain?") # currently vectore store is empty

In [28]:
results

[]

In [29]:
from uuid import uuid4
from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},#additional info
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
)


In [30]:
documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]

In [31]:
documents

[Document(metadata={'source': 'tweet'}, page_content='I had chocolate chip pancakes and scrambled eggs for breakfast this morning.'),
 Document(metadata={'source': 'news'}, page_content='The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.'),
 Document(metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.'),
 Document(metadata={'source': 'tweet'}, page_content="Wow! That was an amazing movie. I can't wait to see it again."),
 Document(metadata={'source': 'website'}, page_content='Is the new iPhone worth the price? Read this review to find out.'),
 Document(metadata={'source': 'website'}, page_content='The top 10 soccer players in the world right now.'),
 Document(metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic application

In [32]:
len(documents)

10

In [33]:
range(len(documents))

range(0, 10)

In [34]:
for _ in range(len(documents)):
    print(_)
    print(str(uuid4()))
    

0
fb38bde8-baac-4129-8d23-41eed85c8bd6
1
20c20d88-3f1d-4b76-90a0-e07bdaacbe84
2
d5cfa931-78a1-463b-bd23-b41336dada7f
3
6e1b62f7-7410-41f5-9a88-4f60607c36da
4
ae185029-702a-4056-931e-bd877933bcb4
5
adae158f-7c74-41ff-8f6c-d4a023852ad2
6
d7f80285-4ef2-4df1-9f35-26b6fd40864b
7
548dbfa7-f21d-470f-8fcd-1f5f7ed4830a
8
b6cc0619-9cfd-4a7d-b626-591b53de4617
9
0feb7a96-de86-4e99-a79c-5a2625f9c39e


In [35]:
#universal indentification number
uuids = [str(uuid4()) for _ in range(len(documents))] # -> it's automatically generated with FAISS

In [36]:
uuids

['ea3353a8-df38-4043-8eb5-271a37e88bad',
 'c072a87d-74cf-49b1-85dd-a4fd3092084c',
 '8c80af33-db61-493b-968e-9d9520cdb82b',
 '7ed2644a-7180-4cb8-87e2-a46e9148df1c',
 '28810865-4b0d-432b-a0fb-8b76ea9255b5',
 '921eddd8-252d-4439-bb49-8305fd83ef98',
 '4a13f02d-ac41-42cb-8003-7e4ad95fac8a',
 '5c092720-9853-47e7-a7ed-6f8548ce9b1a',
 'e1c0ee54-8b99-47ec-92cd-9c25e0c7a36c',
 'daffc073-692e-4521-a2fa-5e24268b9a72']

In [37]:
vector_store.add_documents(documents=documents, ids=uuids) # we add documents to vector_store with defined universal indentifaction number

['ea3353a8-df38-4043-8eb5-271a37e88bad',
 'c072a87d-74cf-49b1-85dd-a4fd3092084c',
 '8c80af33-db61-493b-968e-9d9520cdb82b',
 '7ed2644a-7180-4cb8-87e2-a46e9148df1c',
 '28810865-4b0d-432b-a0fb-8b76ea9255b5',
 '921eddd8-252d-4439-bb49-8305fd83ef98',
 '4a13f02d-ac41-42cb-8003-7e4ad95fac8a',
 '5c092720-9853-47e7-a7ed-6f8548ce9b1a',
 'e1c0ee54-8b99-47ec-92cd-9c25e0c7a36c',
 'daffc073-692e-4521-a2fa-5e24268b9a72']

In [38]:
results = vector_store.similarity_search("what langchain provides to us?",k=1)

In [39]:
results

[Document(id='8c80af33-db61-493b-968e-9d9520cdb82b', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!')]

In [42]:
results = vector_store.similarity_search("what langchain provides to us?",k=2,filter={"source": "tweet"})

In [43]:
results

[Document(id='8c80af33-db61-493b-968e-9d9520cdb82b', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='5c092720-9853-47e7-a7ed-6f8548ce9b1a', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!')]

In [47]:
retriever=vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k":3,"score_threshold": 0.7} #hyperparameter -> similarity level based on cos similarity + k closest matches
)

In [48]:
retriever.invoke("langchain")

[Document(id='8c80af33-db61-493b-968e-9d9520cdb82b', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='5c092720-9853-47e7-a7ed-6f8548ce9b1a', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='daffc073-692e-4521-a2fa-5e24268b9a72', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :(')]

In [49]:
retriever.invoke("google")

[Document(id='8c80af33-db61-493b-968e-9d9520cdb82b', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='daffc073-692e-4521-a2fa-5e24268b9a72', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :('),
 Document(id='5c092720-9853-47e7-a7ed-6f8548ce9b1a', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!')]

In [50]:
from langchain_google_genai import ChatGoogleGenerativeAI
model=ChatGoogleGenerativeAI(model='gemini-1.5-flash')

In [None]:
from langchain import hub
prompt = hub.pull("rlm/rag-prompt") # we use rag-prompt

In [52]:
import pprint
pprint.pprint(prompt.messages)

[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]

In [53]:
from langchain_core.prompts import PromptTemplate

In [54]:
prompt=PromptTemplate(
    template="""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:""",
    input_variables=['context', 'question']
)

In [55]:
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:")

In [56]:
prompt.invoke({"question":"what is a langchain?","context":"langchain is very super framework for LLM."}) # question and context are needed parameters as langchain picks and submits them into Prompt

StringPromptValue(text="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: what is a langchain? \nContext: langchain is very super framework for LLM. \nAnswer:")

StringPromptValue(text="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: what is a langchain? \nContext: langchain is very super framework for LLM. \nAnswer:")

In [57]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [58]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [59]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [60]:
rag_chain.invoke("what is llama model?")

"I don't know.  The provided text does not contain information about Llama models."

# second assisgnment is: take a multiple pdf with text,image,table
1. fetch the data from pdf
2. at lesat there should be 200 pages
3. if chunking(use the sementic chunking technique) required do chunking and then embedding
4. store it inside the vector database(use any of them 1. mongodb 2. astradb 3. opensearch 4.milvus) ## i have not discuss then you need to explore
5. create a index with all three index machnism(Flat, HNSW, IVF) ## i have not discuss then you need to explore
6. create a retriever pipeline
7. check the retriever time(which one is fastet)
8. print the accuray score of every similarity search
9. perform the reranking either using BM25 or MMR ## i have not discuss then you need to explore
10. then write a prompt template
11. generte a oputput through llm
12. render that output over the DOCx ## i have not discuss then you need to explore
as a additional tip: you can follow rag playlist from my youtube

after completing it keep it on your github and share that link on my  mail id:
snshrivas3365@gmail.com

and share the assignment in your community chat as well by tagging krish and sunny

deadline is: till firday 9PM
   