In [19]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("all-of-statistics.pdf")
data = loader.load()

In [20]:
data

[Document(metadata={'producer': 'A-PDF Merger 3.0.4 (http://www.a-pdf.com)', 'creator': 'A-PDF Merger(Infix)', 'creationdate': '2005-08-15T22:23:50-04:00', 'moddate': 'D:20080525100447', 'source': 'all-of-statistics.pdf', 'total_pages': 458, 'page': 0, 'page_label': '1'}, page_content='To Isa'),
 Document(metadata={'producer': 'A-PDF Merger 3.0.4 (http://www.a-pdf.com)', 'creator': 'A-PDF Merger(Infix)', 'creationdate': '2005-08-15T22:23:50-04:00', 'moddate': 'D:20080525100447', 'source': 'all-of-statistics.pdf', 'total_pages': 458, 'page': 1, 'page_label': '2'}, page_content=''),
 Document(metadata={'producer': 'A-PDF Merger 3.0.4 (http://www.a-pdf.com)', 'creator': 'A-PDF Merger(Infix)', 'creationdate': '2005-08-15T22:23:50-04:00', 'moddate': 'D:20080525100447', 'source': 'all-of-statistics.pdf', 'total_pages': 458, 'page': 2, 'page_label': '3'}, page_content='Preface\nTaken literally, the title “All of Statistics” is an exaggeration. But in spirit,\nthe title is apt, as the book doe

In [21]:
len(data)

458

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(data)

print(f"Number of documents: {len(docs)}")

Number of documents: 838


In [23]:
docs[0]

Document(metadata={'producer': 'A-PDF Merger 3.0.4 (http://www.a-pdf.com)', 'creator': 'A-PDF Merger(Infix)', 'creationdate': '2005-08-15T22:23:50-04:00', 'moddate': 'D:20080525100447', 'source': 'all-of-statistics.pdf', 'total_pages': 458, 'page': 0, 'page_label': '1'}, page_content='To Isa')

In [None]:
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from dotenv import load_dotenv
load_dotenv()

embeddings = GoogleGenerativeAIEmbeddings(model='models/text-embedding-004')

vector = embeddings.embed_query('hello')
vector[:5]

[0.01805581897497177,
 0.00474091200158,
 -0.04882135987281799,
 -0.024462424218654633,
 -0.005917029920965433]

In [28]:
vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)

In [37]:
retriever = vectorstore.as_retriever(search_type='similarity',search_kwargs={"k": 10})

retrieved_docs = retriever.get_relevant_documents("Summarize the content of these documents.")

In [38]:
len(retrieved_docs)

10

In [39]:
retrieved_docs

[Document(id='1f170c0f-03e7-4cff-8988-53c52be16157', metadata={'producer': 'A-PDF Merger 3.0.4 (http://www.a-pdf.com)', 'creationdate': '2005-08-15T22:23:50-04:00', 'creator': 'A-PDF Merger(Infix)', 'page_label': '17', 'total_pages': 458, 'moddate': 'D:20080525100447', 'source': 'all-of-statistics.pdf', 'page': 16}, page_content='Part I\nProbability'),
 Document(id='1943c8ca-f264-4508-a266-bc2d92035446', metadata={'total_pages': 458, 'page_label': '1', 'source': 'all-of-statistics.pdf', 'page': 0, 'moddate': 'D:20080525100447', 'creationdate': '2005-08-15T22:23:50-04:00', 'creator': 'A-PDF Merger(Infix)', 'producer': 'A-PDF Merger 3.0.4 (http://www.a-pdf.com)'}, page_content='To Isa'),
 Document(id='d038731e-8d7f-4f00-9038-2da1ec97c7f0', metadata={'creator': 'A-PDF Merger(Infix)', 'creationdate': '2005-08-15T22:23:50-04:00', 'total_pages': 458, 'source': 'all-of-statistics.pdf', 'page': 222, 'moddate': 'D:20080525100447', 'producer': 'A-PDF Merger 3.0.4 (http://www.a-pdf.com)', 'page_l

In [43]:
print(retrieved_docs[5].page_content)

website for this book. Estimate the cdf F(x). Compute and plot a 95
percent conﬁdence envelope for F (as described in the appendix). Find
an approximate 95 percent conﬁdence interval for F(4.9) − F(4.3).


In [69]:
from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.3, max_tokens=500)

In [70]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate

system_prompt = ("""You are a helpful AI assistant that helps people find information about courses from the provided context.
If you don't know the answer, just say that you don't know. DO NOT try to make up an answer.
Use the following pieces of context to answer the question at the end.
{context}  
Answer the question truthfully and as best as you can and keep it concise.
""")


prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("user", "{input}")
])

In [71]:
question_answering_chain = create_stuff_documents_chain(
    llm,
    prompt,   
)

rag_chain = create_retrieval_chain(retriever, question_answering_chain)

In [None]:
response = rag_chain.invoke({"input":"What is Chebyshev's inequality?"})
print(response['answer'])

The documents appear to be excerpts from a textbook or course material covering topics in probability and statistics. The content is divided into three main parts: "Probability," "Statistical Inference," and "Statistical Models and Methods."

Key topics discussed include:
*   **Probability:** Bivariate, Marginal, Independent, and Conditional Distributions.
*   **Statistical Inference:** Confidence intervals (for differences of means, CDFs, and correlation coefficients ρ), permutation tests, bootstrap methods, Fisher's method, large sample methods, and confidence envelopes.
*   **References:** Mentions various authors and their works related to advanced convergence theory and problems.

The text also includes acknowledgments to students and colleagues for their feedback and contributions to the book.
