In [19]:
from langchain_community.document_loaders import PyPDFLoader

# Read the source PDF (path is relative to the notebook's working directory)
# into a list of LangChain documents.
loader = PyPDFLoader(file_path="all-of-statistics.pdf")
data = loader.load()

In [None]:
# Preview only the first few loaded documents: echoing the full list would
# write the text of the entire PDF into the cell output and bloat the notebook.
data[:3]

In [None]:
# Number of entries the loader returned.
# NOTE(review): presumably one entry per PDF page — confirm against the loader docs.
len(data)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into chunks of at most 1000 characters, with a
# 200-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
docs = text_splitter.split_documents(data)

# Report how many chunks the split produced.
print(f"Number of documents: {len(docs)}")

In [None]:
# Inspect the first chunk produced by the splitter as a sanity check.
docs[0]

In [None]:
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from dotenv import load_dotenv
# Load variables from a local .env file into the process environment before
# constructing any Google client.
# NOTE(review): presumably this is where the Google API key comes from — confirm.
load_dotenv()

# Embedding model used both for indexing the chunks and for embedding queries.
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

# Smoke test: embed a trivial string and show the first five components.
vector = embeddings.embed_query("hello")
vector[:5]

In [28]:
# Embed every chunk in `docs` and index the resulting vectors in Chroma.
# NOTE(review): no persist directory is passed, so the index presumably lives
# only for this kernel session and re-running this cell re-embeds — confirm.
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
)

In [37]:
# Similarity-search retriever that returns the 10 closest chunks per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 10},
)

# Retrievers implement the Runnable interface; `invoke` is the supported entry
# point and replaces the deprecated `get_relevant_documents`.
retrieved_docs = retriever.invoke("Summarize the content of these documents.")

In [38]:
# Sanity check: the count should equal the `k` configured on the retriever.
len(retrieved_docs)

10

In [None]:
# Display every retrieved chunk to eyeball retrieval quality.
retrieved_docs

In [None]:
# Print the raw text of the sixth retrieved chunk (index 5); print() is used so
# newlines in the text render as line breaks rather than escaped characters.
print(retrieved_docs[5].page_content)

In [69]:
from langchain_google_genai import ChatGoogleGenerativeAI
# Chat model that generates the final answer from the retrieved context.
# NOTE(review): on Gemini 2.5 models, internal "thinking" tokens may count
# against the 500-token output cap and truncate answers — confirm.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.3,
    max_tokens=500,
)

In [70]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate

# System instructions for the answering model. The `{context}` placeholder is
# the slot the stuff-documents chain fills with the retrieved chunk text.
# The wording is domain-neutral: the earlier "information about courses"
# phrasing did not match the indexed document or the questions asked of it.
system_prompt = """You are a helpful AI assistant that answers questions using only the provided context.
If you don't know the answer, just say that you don't know. DO NOT try to make up an answer.
Use the following pieces of context to answer the user's question.
{context}
Answer the question truthfully and as best as you can and keep it concise.
"""

# Two-message chat prompt: the system instructions above, followed by the
# user's question supplied under the `input` key at invocation time.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("user", "{input}"),
    ]
)

In [71]:
# Chain that stuffs the retrieved chunks into the prompt and calls the model.
question_answering_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full RAG pipeline: retrieve chunks for the input, then answer from them.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answering_chain,
)

In [None]:
# Ask a question through the RAG pipeline; the generated text is returned
# under the "answer" key of the result.
response = rag_chain.invoke(
    {"input": "What is Chebyshev's inequality?"}
)
print(response["answer"])