In [5]:
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Load variables from the project-level .env file; override=True lets values
# from the file replace ones already present in the process environment.
load_dotenv("../.env", override=True)

# Fail fast when the key is missing: os.getenv returns None for an unset
# variable, and handing None to the client hides the real cause of the failure.
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; add it to ../.env or export it before running this notebook."
    )

# ChatOpenAI is pointed at Groq's OpenAI-style endpoint through base_url.
llm = ChatOpenAI(
    model="llama-3.3-70b-versatile",
    base_url="https://api.groq.com/openai/v1",
    openai_api_key=groq_api_key,
    temperature=0
)

# Smoke test: one short request to confirm the key and endpoint work.
response = llm.invoke("Say hello in one short sentence.")
print(response.content)

Hello, it's nice to meet you.


In [6]:
def load_pdf(pdf_path: str):
    """Open the PDF at ``pdf_path`` with PyPDFLoader and return the result of its ``load()``."""
    return PyPDFLoader(pdf_path).load()
# Relative path instead of a machine-specific absolute one; it resolves against
# the kernel's working directory, like the "../.env" path used earlier.
docs = load_pdf("../data/POST-SESSION RESOURCES - S4.pdf")

# Preview the first 300 characters of the first loaded document.
docs[0].page_content[:300]


"POST-SESSION  RESOURCES  \nSession  4:  The  AI  Prototyper's  Sandbox:  Mastering  Google  AI  Studio  \n  \nThis\n \nresource\n \npack\n \nprovides\n \nthe\n \ntools,\n \ntemplates,\n \nand\n \nreference\n \nmaterial\n \nto\n \nhelp\n \nyou\n \ndeepen\n \nyour\n \nunderstanding\n \nof\n \ntodayâ€™s\n \nconcepts\n \nand\n \napply\n \nthem\n \ndir"

In [7]:

def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split ``documents`` into chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: The loaded documents to split.
        chunk_size: Passed to the splitter as ``chunk_size``. Defaults to 1000,
            the value this notebook previously hard-coded.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``. Defaults to
            200, the value this notebook previously hard-coded.

    Returns:
        Whatever the splitter's ``split_documents`` returns for ``documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
# Split the loaded documents, then display how many chunks came out.
chunks = chunk_documents(documents=docs)
len(chunks)


11

In [8]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Name of the sentence-transformers model used to embed every chunk.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Embed the chunks and build the FAISS vector store from them.
vector_db = FAISS.from_documents(chunks, embeddings)


In [10]:
def retrieve_context(vector_db, query, k=4):
    """Return the documents a retriever built from ``vector_db`` yields for ``query``.

    ``k`` is forwarded to ``as_retriever`` as the ``"k"`` entry of ``search_kwargs``.
    """
    search_options = {"k": k}
    return vector_db.as_retriever(search_kwargs=search_options).invoke(query)


In [11]:

# Sample question used to exercise retrieval against the indexed PDF.
query = "What is the main topic of the document?"
retrieved_docs = retrieve_context(vector_db=vector_db, query=query)

# Preview the first 300 characters of the first returned chunk.
retrieved_docs[0].page_content[:300]


'3.  Prompt  Templates    Session 4 Prompt Templates.pdf Leverage  these  prompting  resources  for  a  head  start  on  structured,  high-quality  outputs  \n4.  Demo  Inputs  Used  in  Session'

In [12]:
def generate_answer(query, retrieved_docs):
    """Answer ``query`` with the module-level ``llm``, using only the retrieved documents.

    Args:
        query: The user's question, inserted verbatim into the prompt.
        retrieved_docs: Iterable of objects exposing ``page_content``; their
            texts are joined with blank lines to form the prompt context.

    Returns:
        The ``content`` of the model response, or the fixed string
        "Answer not found in the document." when ``retrieved_docs`` is empty.
    """
    passages = [doc.page_content for doc in retrieved_docs]
    if not passages:
        # Nothing was retrieved, so the fallback is the only permitted answer;
        # return it directly rather than calling the model with an empty context.
        return "Answer not found in the document."

    context = "\n\n".join(passages)

    prompt = f"""
You are a helpful assistant.
Answer ONLY using the provided context.
If the answer is not present, say:
"Answer not found in the document."

Context:
{context}

Question:
{query}
"""

    response = llm.invoke(prompt)
    return response.content


In [13]:
# Generate an answer from the chunks retrieved for `query` and print it.
answer = generate_answer(query=query, retrieved_docs=retrieved_docs)
print(answer)


The main topic of the document is Session 4 of a course or training program, specifically focused on "The AI Prototyper's Sandbox: Mastering Google AI Studio" and using Gemini AI Studio for AI product engineering.


In [14]:
def rag_pipeline(pdf_path, query, k=4):
    """Run load -> chunk -> index -> retrieve -> answer for one PDF and one question.

    A new FAISS index is built from the PDF on every call, using the
    module-level ``embeddings`` object.

    Args:
        pdf_path: Path of the PDF, passed to ``load_pdf``.
        query: Question used for both retrieval and answer generation.
        k: Forwarded to ``retrieve_context`` as its ``k`` argument. Defaults to
            4, which is ``retrieve_context``'s own default.

    Returns:
        The value returned by ``generate_answer`` for the retrieved documents.
    """
    pages = load_pdf(pdf_path)
    page_chunks = chunk_documents(pages)
    index = FAISS.from_documents(page_chunks, embeddings)
    hits = retrieve_context(index, query, k=k)
    return generate_answer(query, hits)


In [15]:
# End-to-end run: index the PDF and answer one question about it.
rag_pipeline(
    pdf_path="../data/POST-SESSION RESOURCES - S4.pdf",
    query="Explain the objective of this document",
)


'The objective of this document is to provide guidance on leveraging prompting resources for high-quality outputs, managing AI behavior, and implementing best practices for AI development, including transparency, security, and human oversight, to ensure responsible and effective AI usage.'