## Packages & Paths

In [3]:
import os
import numpy as np 

from typing import List, Tuple  
from pathlib import Path
from dotenv import load_dotenv

from langchain_community.document_loaders import PyPDFLoader            # loads PDFs page-by-page and stores page metadata
from langchain_text_splitters import RecursiveCharacterTextSplitter     # Text splitter for documents
from langchain_openai import ChatOpenAI, OpenAIEmbeddings               # LLM for answering & Embedding tool to turn text into vectors
from langchain_chroma import Chroma 

In [4]:
BASE_DIR = Path.cwd()
PDF_DIR = BASE_DIR / "data"             # Store data PDFs 
CHROMA_DIR = BASE_DIR / "chroma_db"     # Store Chroma DB

## Get data

In [5]:
pdf_paths = sorted(PDF_DIR.glob("*.pdf"))

for path in pdf_paths:
    print(path)

/Users/andrealunghini/Desktop/data_science/rag_lab_solita/data/foundations_of_data_science.pdf


## Step 1: Ingest & Parse

In [6]:
from langchain_core.documents import Document
import re


def clean_pdf_text(text:str) -> str: 
    "We want pure text in chunks." 
    "Cleaning blanks."
    text = re.sub(r'\n \n', ' ', text) 
    text = re.sub(r'\n\n+', '\n\n', text) 
    text = re.sub(r' +', ' ', text) 
    text = text.strip()
    return text

def load_pdfs(paths: List[Path]) -> List[Document]:
    all_docs: List[Document] = [] 
    for path in paths:
        loader = PyPDFLoader(str(path))
        docs = loader.load() 
        for d in docs:
            d.metadata["source"] = Path(d.metadata.get("source",path)).name 
            d.page_content = clean_pdf_text(d.page_content)
        all_docs.extend(docs)
    return all_docs

docs = load_pdfs(pdf_paths)
print(f"Loaded {len(docs)} page-documents.")

Loaded 479 page-documents.


## Step 2: Chunking & Embedding

### Embedding strategies
- Fixed-size chunking with overlap
- Semantic chunking: avoiding splitting a sentence in a semantically important part of the text
- Recursive chunking: recurservly splitting into chunk
- Document structured-based chunk: inherent structure of the text
- LLM-based chunking: input to LLM and LLM generated chunks.

In [7]:
load_dotenv(override=True)

if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY not found. Create a .env file (copy from .env.example) and set OPENAI_API_KEY."
    )

CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "gpt-4o-mini")
EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")

print("OPENAI_API_KEY found.")              # never print your API keys :)
print(f"Using chat model: {CHAT_MODEL}")
print(f"Using embed model: {EMBED_MODEL}")

OPENAI_API_KEY found.
Using chat model: gpt-4o-mini
Using embed model: text-embedding-3-small


In [8]:
from langchain_text_splitters import CharacterTextSplitter
# Simplest and dumbest approach

# Common sense: "gambling" that overlap with important sentence is 
# not longer than 150 characters. 
splitter = CharacterTextSplitter(chunk_size=3000, 
                                 chunk_overlap=150, 
                                 separator=" ")

chunks = splitter.split_documents(docs)

# Foundation of DS: this is a good candidate for Document-structured based chunking

# Step 3: Build Vector DB & Create Embeddings

In [9]:
ids = []
for i, d in enumerate(chunks):
    src = d.metadata.get("source", "unknown")
    page = d.metadata.get("page", "na")
    ids.append(f"{src}::p{page}::c{i}")

In [11]:
embeddings = OpenAIEmbeddings(
    model=EMBED_MODEL,
    chunk_size=150,     
    )

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=ids,
    persist_directory=str(CHROMA_DIR),
)

try:
    vectorstore.persist()
except Exception:
    pass

print("Chroma vector DB is ready")
print(f"Stored chunks: {vectorstore._collection.count()}")

Chroma vector DB is ready
Stored chunks: 510


In [14]:
col = vectorstore._collection

data = col.get(
    include=["embeddings", "documents", "metadatas"],
    limit=3,
)

print(data.keys())
print("embedding length:", len(data["embeddings"][0]))
print("first 10 dims:", data["embeddings"][0][:10])

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas'])
embedding length: 1536
first 10 dims: [-0.01326403  0.01361025  0.07392517 -0.00787365  0.03622238 -0.03068277
 -0.01536526  0.00055478 -0.08509991  0.02956052]


In [15]:
print(len(data["embeddings"][0]))
print(data["embeddings"][0][:50])

1536
[-0.01326403  0.01361025  0.07392517 -0.00787365  0.03622238 -0.03068277
 -0.01536526  0.00055478 -0.08509991  0.02956052  0.08376276 -0.00732447
 -0.00537545 -0.02195549  0.00729462  0.04443629  0.02827112  0.02034375
  0.02397315  0.06494718 -0.00453675 -0.05066835  0.00295635  0.01362219
 -0.00732447 -0.00786171 -0.00829151  0.02767418  0.01807537  0.0017759
  0.04811344 -0.02223008 -0.01578312 -0.01622486  0.00956299  0.03786993
 -0.0195916   0.0212511  -0.04178587  0.02256437  0.02120334 -0.0796558
 -0.01353862  0.04087852  0.01760976 -0.03139909 -0.02633704 -0.03495687
  0.01228505  0.04059198]


# Step 4: Retrieval

In [21]:
question = "Linear Programming"
question_embedding = embeddings.embed_query(question)

print("Question embedding length:", len(question_embedding))
print("First 10 dims:", question_embedding[:10])

Question embedding length: 1536
First 10 dims: [-0.05963408946990967, -0.013415073044598103, 0.022505635395646095, -0.055322565138339996, 0.011720332317054272, -0.015272144228219986, 0.03407662361860275, 0.0024154921993613243, -0.02147969976067543, 0.04215423762798309]


In [40]:
question_embedding

[-0.05963408946990967,
 -0.013415073044598103,
 0.022505635395646095,
 -0.055322565138339996,
 0.011720332317054272,
 -0.015272144228219986,
 0.03407662361860275,
 0.0024154921993613243,
 -0.02147969976067543,
 0.04215423762798309,
 0.01815515197813511,
 0.0042790574952960014,
 -0.009064589627087116,
 -0.031505290418863297,
 0.02659638784825802,
 -0.042777590453624725,
 0.03194683417677879,
 -0.03623238578438759,
 -0.03270005062222481,
 0.028414500877261162,
 0.015622780658304691,
 0.03495970368385315,
 0.0031800735741853714,
 0.010746343061327934,
 0.0072205038741230965,
 0.017830489203333855,
 0.038881633430719376,
 -0.002274263883009553,
 -0.0031589705031365156,
 -0.016908446326851845,
 0.031661130487918854,
 -0.024427639320492744,
 -0.017986327409744263,
 -0.006960773374885321,
 0.00992819294333458,
 -0.025219816714525223,
 0.0061815823428332806,
 0.0476345457136631,
 0.003717390587553382,
 0.03127153590321541,
 0.005804973188787699,
 0.009661968797445297,
 -0.02771322801709175,
 -

In [22]:
data = col.get(include=["embeddings", "documents"], limit=10)

chunk_embeddings = data["embeddings"]
chunk_docs = data["documents"]

def cosine_similarity(a, b):
    a = np.array(a)
    b = np.array(b)
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print("Similarity scores:\n")

for i, emb in enumerate(chunk_embeddings):
    score = cosine_similarity(question_embedding, emb)
    print(f"Chunk {i} similarity: {score:.4f}")
    print(f"Preview: {chunk_docs[i]}")
    print()

Similarity scores:

Chunk 0 similarity: 0.2210
Preview: Foundations of Data Science ∗
Avrim Blum, John Hopcroft, and Ravindran Kannan
Thursday 4th January, 2018
∗Copyright 2015. All rights reserved
1

Chunk 1 similarity: 0.2158
Preview: Contents
1 Introduction 9
2 High-Dimensional Space 12
2.1 Introduction . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 12
2.2 The Law of Large Numbers . . . . . . . . . . . . . . . . . . . . . . . . . . 12
2.3 The Geometry of High Dimensions . . . . . . . . . . . . . . . . . . . . . . 15
2.4 Properties of the Unit Ball . . . . . . . . . . . . . . . . . . . . . . . . . . . 17
2.4.1 Volume of the Unit Ball . . . . . . . . . . . . . . . . . . . . . . . . 17
2.4.2 Volume Near the Equator . . . . . . . . . . . . . . . . . . . . . . . 19
2.5 Generating Points Uniformly at Random from a Ball . . . . . . . . . . . . 22
2.6 Gaussians in High Dimension . . . . . . . . . . . . . . . . . . . . . . . . . 23
2.7 Random Projection and Johnson-Lin

In [23]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

test_query = "Linear Programming"
retrieved = retriever.invoke(test_query)

In [25]:
print(f"Query: {test_query}")
print(f"Retrieved chunks: {len(retrieved)}\n")

for i, doc in enumerate(retrieved, start=1):
    src = doc.metadata.get("source", "unknown")
    page = doc.metadata.get("page", "unknown")
    print(f"[{i}] {src} — page {page}")
    print(doc.page_content)
    print()

Query: Linear Programming
Retrieved chunks: 3

[1] foundations_of_data_science.pdf — page 374
depends on the error parameter as well as the presentation of the convex set. We do not
go into these details. But, in principle we can minimize a convex function over a convex
domain. We can also maximize a concave function over a concave domain. However, in
general, we do not have eﬃcient procedures to maximize a convex function over a convex
domain. It is easy to see that at a ﬁrst-order local minimum of a possibly non-convex
function, the gradient vanishes. But second-order local decrease of the function may be
possible. The steepest second-order decrease is in the direction of ±v, where, v is the
eigenvector of the Hessian corresponding to the largest absolute valued eigenvalue.
10.6 Linear Programming
Linear programming is an optimization problem that has been carefully studied and is
immensely useful. We consider linear programming problem in the following form where
A is an m × n matri

# Step 5: Querying

- Ask LLM question: answer question with retrieved context.
- No hallucination: I don't know.

In [28]:
# Zero temperature = no hallucination.
llm = ChatOpenAI(model=CHAT_MODEL, temperature=0)

# Prompting the LLM
RAG_SYSTEM_PROMPT = '''You are a helpful assistant.
Answer the user's question using ONLY the provided context.
If the answer is not contained in the context, say: "I don't know."
Keep the answer concise and clear. Only answer questions about the provided context. Do not use any information that is not in the context.
'''

In [31]:
def format_context(docs: List[Document], max_chars: int = 8000) -> str:
    parts = []
    total = 0
    for d in docs:
        src = d.metadata.get("source", "unknown")
        page = d.metadata.get("page", "unknown")
        header = f"\n\n---\nSOURCE: {src} | PAGE: {page}\n"
        text = d.page_content.strip()
        block = header + text
        if total + len(block) > max_chars:
            break
        parts.append(block)
        total += len(block)
    return "".join(parts).strip()

def dedupe_sources(docs: List[Document]) -> List[Tuple[str, int]]:
    seen = set()
    out = []
    for d in docs:
        src = d.metadata.get("source", "unknown")
        page = d.metadata.get("page", None)
        key = (src, page)
        if key not in seen:
            seen.add(key)
            out.append(key)
    return out

def rag_answer(question: str, k: int = 4) -> dict:
    retriever_k = vectorstore.as_retriever(search_kwargs={"k": k})
    retrieved_docs = retriever_k.invoke(question)

    if not retrieved_docs:
        return {"answer": "I don't know.", "sources": []}

    context = format_context(retrieved_docs)

    messages = [
        {"role": "system", "content": RAG_SYSTEM_PROMPT},
        {"role": "user", "content": f"Question: {question}\n\nContext:\n{context}"},
    ]

    response = llm.invoke(messages)
    sources = dedupe_sources(retrieved_docs)

    return {"answer": response.content, "sources": sources, "retrieved_docs": retrieved_docs}


In [38]:
question = "What is MLE?"
result = rag_answer(question, k=5)

print("Question:", question)
print("\nAnswer:\n", result["answer"])

print("\nSources used:")
for src, page in result["sources"]:
    print(f"- {src} — page {page}")


Question: What is MLE?

Answer:
 Maximum Likelihood Estimation (MLE) is a method used to find the best parameter value \( r \) for a probability distribution \( p(x|r) \) after observing values of a random variable \( x \). It involves maximizing the likelihood \( L(r|x) \), which is the probability of observing \( x \) given the parameter \( r \). In cases where a prior guess about the distribution of \( r \) exists, the likelihood \( p(x|r) \) is treated as the posterior to be maximized.

Sources used:
- foundations_of_data_science.pdf — page 428
- foundations_of_data_science.pdf — page 466
- foundations_of_data_science.pdf — page 346
- foundations_of_data_science.pdf — page 211
- foundations_of_data_science.pdf — page 129


## END