In [32]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_groq import ChatGroq
import os, requests
from dotenv import load_dotenv

In [31]:
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail fast with a readable message when a key is absent. The previous
# `os.environ[k] = os.getenv(k)` form raised an opaque
# "TypeError: str expected, not NoneType" in that case, and was a no-op
# whenever the key was present (the value was already in os.environ).
_missing_keys = [
    key for key in ("GROQ_API_KEY", "GEMINI_API_KEY") if os.getenv(key) is None
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in your shell or in a .env file before running this notebook."
    )

In [17]:
# Download the sample text and store it next to the notebook as UTF-8.
url = "https://raw.githubusercontent.com/hwchase17/chroma-langchain/refs/heads/master/state_of_the_union.txt"

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Abort on HTTP errors (404, 500, ...) so an error page is never written to
# disk and then indexed as if it were the document.
response.raise_for_status()

with open("state_of_the_union.txt", "wb") as f:
    f.write(response.text.encode("utf-8"))

In [None]:
## 1. Load the document
# Read the local text file (decoded as UTF-8) through a LangChain loader.
loader = TextLoader(
    file_path="state_of_the_union.txt",
    encoding="utf-8",
)
docs = loader.load()

1

In [None]:
## 2. Split the document into chunks
# Chunks are capped at size 500 and consecutive chunks share an overlap of 100.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
splitted_docs = text_splitter.split_documents(docs)

In [34]:
## 3. Create embeddings
# Embedding model used to vectorise the chunks (and, later, the queries).
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [36]:
## 4. Store the embeddings in a vector store
# Embed every chunk and build a FAISS index over the resulting vectors.
vectorstore = FAISS.from_documents(
    documents=splitted_docs,
    embedding=embeddings,
)

In [40]:
# Expose the index as a retriever; "k": 3 is passed through as the search setting.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 3},
)

In [41]:
from langchain.prompts import PromptTemplate

# Prompt layout: instructions, then the retrieved context, then the question.
# The question is placed last so that the instruction "answer the question at
# the end" matches the actual layout (it used to precede the context).
# Placeholders are unchanged: {context} and {question}.
template = """You are a helpful assistant that answers questions based on the provided context for Question Answering Tasks.
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
Use ten sentences maximum and keep the answer concise.
Context: {context}
Question: {question}
Answer:
"""
prompt = PromptTemplate.from_template(template)

In [42]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

In [46]:
# Chat model served by Groq that generates the final answer.
llm = ChatGroq(
    temperature=0.5,
    model="gemma2-9b-it",
)

In [47]:
def _format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    Args:
        docs: Iterable of retrieved documents, each exposing ``page_content``.

    Returns:
        The ``page_content`` of every document, separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Retrieval-augmented generation pipeline:
#   question -> {context, question} -> prompt -> llm -> plain string
# The retriever output goes through `_format_docs` so the prompt receives the
# passage text only. Piping the retriever in directly would substitute the
# Python representation of the Document list (brackets, metadata, ...) into
# `{context}`.
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Sanity check: ask for a high-level summary of the indexed document.
# (Question reworded: the original text was ungrammatical, and the string is
# sent verbatim to both the retriever and the LLM.)
rag_chain.invoke("What is the document about?")

'The document is a collection of excerpts from a State of the Union address.  \n\nThe excerpts highlight themes of unity, responsibility, and the strength of American democracy. \n\nThey also touch upon overcoming challenges like COVID-19 and the importance of American values. \n'

In [50]:
# Second probe: ask the chain who is speaking in the document.
rag_chain.invoke(input="Who is the speaker?")

'The speaker is the President of the United States. \n\nThis is indicated by the opening salutation "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman." \n'