# Sell the information of what we are doing

# 1: Cell imports & paths

In [1]:
import os
import sys
import chromadb
import hashlib
import cryptography
import pypdf

from pathlib import Path
from typing import List, Optional, Dict
from langchain_community.document_loaders import PyPDFLoader, UnstructuredURLLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_ollama import OllamaLLM, OllamaEmbeddings
from pypdf.errors import WrongPasswordError

# Show which interpreter is running and which cryptography / pypdf versions were imported.
print("Python:", sys.executable)
print("cryptography:", cryptography.__version__)
print("pypdf:", pypdf.__version__)

Python: c:\Users\Christian\Desktop\LangChain-Local-RA\.venv\Scripts\python.exe
cryptography: 46.0.2
pypdf: 6.1.1


# 2: Data Directories

In [2]:
# Project root: the folder of this file when __file__ exists, otherwise the
# current working directory (the NameError branch, e.g. inside a notebook).
try:
    ROOT_DIR = Path(__file__).parent.resolve()
except NameError:
    ROOT_DIR = Path.cwd().resolve()

# Started from a "notebooks" folder? Then the project root is one level up.
if ROOT_DIR.name.lower() == "notebooks":
    ROOT_DIR = ROOT_DIR.parent

# Input PDFs live in data/, the Chroma database in chroma_db/.
DATA_DIR, DB_DIR = ROOT_DIR / "data", ROOT_DIR / "chroma_db"

# Create both folders if they are missing.
DATA_DIR.mkdir(parents=True, exist_ok=True)
DB_DIR.mkdir(parents=True, exist_ok=True)

print(f"ROOT_DIR: {ROOT_DIR}")
print(f"DATA_DIR: {DATA_DIR}")
print(f"DB_DIR: {DB_DIR}")
print("DATA_DIR contents:", [entry.name for entry in DATA_DIR.glob('*')])

ROOT_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA
DATA_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA\data
DB_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA\chroma_db
DATA_DIR contents: ['CalculusVolume1-OP.pdf']


# 3: Ollama Models

In [3]:
# Names of the Ollama models used for text generation and for embeddings.
LLM_MODEL   = "llama3.2:1b"          # ollama pull llama3.2:1b
EMBED_MODEL = "nomic-embed-text"  # ollama pull nomic-embed-text

# TODO: verify whether this setup also works on Kaggle or Colab

### 3a: Verification

In [4]:
# Print the resolved folders again to confirm where PDFs are read from and where the DB is stored.
print("DATA_DIR:", DATA_DIR)
print("DB_DIR:", DB_DIR)

DATA_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA\data
DB_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA\chroma_db


# 4: Sanity Check
- This will error if Ollama isn't running or the model isn't pulled yet.
- In a terminal (outside Python), run:
    - ollama pull llama3.2:1b
    - ollama pull nomic-embed-text
- Forces Chroma to use the ROOT DB_DIR (outside notebooks) <- hard pins the path
- Sanity check: confirm the DB file is in the right place

In [5]:
# Chat model and embedding model objects, configured with the model names chosen above.
llm = OllamaLLM(model=LLM_MODEL, temperature=0.2)
embeddings = OllamaEmbeddings(model=EMBED_MODEL)

# Persistent Chroma client rooted at DB_DIR.
client = chromadb.PersistentClient(path=str(DB_DIR))

# Vector store bound to that client and to the "local_research_assistant" collection.
vectorstore = Chroma(
    client=client,                             # <- use client, not persist_directory
    collection_name="local_research_assistant",
    embedding_function=embeddings,
)

print("LLM and Embeddings ready")
# Report the expected database file path and whether it exists under DB_DIR.
print("DB present at:", DB_DIR / "chroma.sqlite3", (DB_DIR / "chroma.sqlite3").exists())


LLM and Embeddings ready
DB present at: C:\Users\Christian\Desktop\LangChain-Local-RA\chroma_db\chroma.sqlite3 True


# 5: Cryptography
- add more info under the titles

In [6]:
# Passwords for protected PDFs, keyed by file name: load_pdf_pages_robust looks
# entries up with pdf_path.name. A file without an entry gets None as its password.
# If any PDF is password-protected, add it here:
PDF_PASSWORDS: Dict[str, Optional[str]] = {
    # "SomeLockedFile.pdf": "your_password_here",
}

def load_pdf_pages_robust(pdf_path: Path) -> List:
    """Return the pages of *pdf_path*, trying PyMuPDF first and PyPDF second.

    Every returned document gets ``metadata["source"]`` overwritten with the
    resolved absolute path of the file. The password registered under the file
    name in ``PDF_PASSWORDS`` (``None`` when absent) is handed to the PyPDF
    attempt only. Loader failures are printed rather than raised, and an empty
    list is returned when neither loader produces any pages.
    """

    def _stamp_source(loaded: List) -> List:
        # Point every page at the absolute path of the file it came from.
        for page_doc in loaded:
            page_doc.metadata["source"] = str(pdf_path.resolve())
        return loaded

    password = PDF_PASSWORDS.get(pdf_path.name)

    # 1) PyMuPDF (best text extraction on technical PDFs)
    try:
        loaded = _stamp_source(PyMuPDFLoader(str(pdf_path)).load())
    except Exception as err:
        print(f"PyMuPDF failed for {pdf_path.name}: {err}")
    else:
        if loaded:
            return loaded

    # 2) Fallback: PyPDF (needs 'cryptography' for encrypted files)
    try:
        loaded = _stamp_source(PyPDFLoader(str(pdf_path), password=password).load())
    except WrongPasswordError:
        print(f"Skipping encrypted PDF (password needed): {pdf_path.name}")
        return []
    except Exception as err:
        print(f"PyPDF failed for {pdf_path.name}: {err}")
        return []

    # An empty result from the fallback also ends up as an empty list.
    return loaded if loaded else []


# 6: Load Documents (PDFs and/or URLs)

In [7]:
# Dedupe across *.pdf and *.PDF
pdf_paths = sorted({str(p.resolve()) for p in DATA_DIR.glob("*.pdf")} |
                    {str(p.resolve()) for p in DATA_DIR.glob("*.PDF")})
pdfs = [Path(p) for p in pdf_paths]
print("PDFs found:", [p.name for p in pdfs])

raw_docs: List = []
for p in pdfs:
    raw_docs.extend(load_pdf_pages_robust(p))

# Optional web pages (if you need them)
urls: list[str] = []  # e.g., ["https://langchain.readthedocs.io/"]
# raw_docs += UnstructuredURLLoader(urls=urls, continue_on_failure=True).load()

print(f"Loaded {len(raw_docs)} pages across {len(pdfs)} PDF(s).")


PDFs found: ['CalculusVolume1-OP.pdf']
Loaded 873 pages across 1 PDF(s).


# 7: Chunking
- explain how this works

In [8]:
# Split the loaded pages into overlapping chunks that are later embedded and indexed.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,        # upper bound on chunk length, as measured by length_function
    chunk_overlap=200,      # length shared between neighbouring chunks
    length_function=len,    # len() of the text, so lengths are counted in characters
    add_start_index=True,   # ask the splitter to record each chunk's start offset in its metadata
)
docs = splitter.split_documents(raw_docs)
print(f"Split into {len(docs)} chunks.")


Split into 1543 chunks.


# 8: Vector Store (Create or Reuse; initial add only if empty)

In [9]:
# Only an emptiness check is needed, so fetch at most one record instead of the
# whole collection.
has_index = len(vectorstore.get(limit=1).get("ids", [])) > 0
if (not has_index) and len(docs) > 0:
    # Index with deterministic ids built exactly like _chunk_id() in the
    # folder-sync cell (source, page, md5 of the first 200 characters).
    # With auto-generated ids, sync_all_pdfs() would not recognise these chunks
    # and would add every one of them a second time.
    _docs_by_id = {}
    for _doc in docs:
        _head = _doc.page_content[:200].encode("utf-8", errors="ignore")
        _cid = (
            f"{_doc.metadata.get('source', '')}"
            f"::p{_doc.metadata.get('page', '')}"
            f"::{hashlib.md5(_head).hexdigest()}"
        )
        # Keep the first chunk per id so the batch never holds an id twice.
        _docs_by_id.setdefault(_cid, _doc)
    vectorstore.add_documents(list(_docs_by_id.values()), ids=list(_docs_by_id))  # auto-persist; no .persist()
    print("Indexed documents (auto-persisted).")
else:
    print("Using existing index (or no docs to add).")

Using existing index (or no docs to add).


# 9: Folder-wide sync (add new, remove deleted) — public API only

In [10]:
def _chunk_id(doc):
    src  = str(doc.metadata.get("source",""))
    page = str(doc.metadata.get("page",""))
    head = doc.page_content[:200].encode("utf-8", errors="ignore")
    return f"{src}::p{page}::{hashlib.md5(head).hexdigest()}"

def sync_all_pdfs():
    """Bring the Chroma index in line with the PDFs currently in DATA_DIR.

    Two passes are made:

    1. Chunks whose ``source`` metadata names a file that is no longer in the
       folder are deleted from the vector store.
    2. Every PDF in the folder is re-loaded and re-split; chunks whose id
       (see ``_chunk_id``) is not stored yet are added.

    Chunks of a file that is still present are never deleted here, so a PDF
    whose content changed keeps its old chunks next to the new ones.
    Progress is reported with ``print``; nothing is returned.
    """
    # Current files in folder (the set removes paths matched by both patterns)
    current_files = {
        str(p.resolve())
        for pattern in ("*.pdf", "*.PDF")
        for p in DATA_DIR.glob(pattern)
    }

    # What's in Chroma now. Only "metadatas" is requested via include; the ids
    # are read from the same result.
    all_data = vectorstore.get(include=["metadatas"])
    existing_ids = set(all_data.get("ids") or [])
    metas = all_data.get("metadatas") or []
    # Skip entries without metadata (None) or without a source.
    indexed_sources = {m["source"] for m in metas if m and m.get("source")}

    # 1) Remove chunks for PDFs that were deleted
    removed_sources = indexed_sources - current_files
    if removed_sources:
        print(f"Removing chunks for deleted PDFs: {[Path(s).name for s in removed_sources]}")
        for s in removed_sources:
            vectorstore.delete(where={"source": s})
    else:
        print("No deleted PDFs found.")

    # 2) Add chunks that are not indexed yet
    added = 0
    for pdf in sorted(current_files):
        pages = load_pdf_pages_robust(Path(pdf))
        if not pages:
            continue
        new_docs, new_ids = [], []
        for chunk in splitter.split_documents(pages):
            cid = _chunk_id(chunk)
            if cid in existing_ids:
                continue
            # Register the id right away: a later chunk of the same file that
            # maps to the same id must not enter this batch a second time.
            existing_ids.add(cid)
            new_docs.append(chunk)
            new_ids.append(cid)
        if new_docs:
            vectorstore.add_documents(new_docs, ids=new_ids)  # auto-persist
            added += len(new_docs)

    print(f"Sync complete. Added {added} new chunk(s).")


# 9b: Optional — Clear and Resync Database
- Not needed for presentation

In [11]:
def clear_chroma_all():
    """Delete every stored vector from the collection and report how many were removed.

    Prints a notice and does nothing else when the collection holds no ids.
    """
    stored_ids = vectorstore.get(include=["metadatas"]).get("ids", [])  # valid include
    if stored_ids:
        vectorstore.delete(ids=stored_ids)
        print(f"Cleared {len(stored_ids)} vectors from Chroma.")
    else:
        print("No vectors to delete.")


# Wipe the collection, then rebuild it from the PDFs currently in DATA_DIR.
clear_chroma_all()
sync_all_pdfs()


Cleared 1543 vectors from Chroma.
No deleted PDFs found.
Sync complete. Added 1543 new chunk(s).


# 10: Ask helpers
- Give more context on how the prompt can be changed

In [12]:
# Retriever over the vector store; search_kwargs k=4 sets how many chunks are requested per question.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# TODO: add a section that makes the prompt and the context variables it needs configurable

def ask(question: str) -> str:
    """Answer *question* with the LLM, using retrieved chunks as the only context.

    Each retrieved chunk is rendered as ``[<file name> p<page>] <text>`` with the
    text cut to its first 1000 characters. The snippets are joined with a
    ``---`` separator and placed in the prompt after the question. The return
    value is whatever ``llm.invoke`` produces for that prompt.
    """
    snippets = []
    for hit in retriever.invoke(question):  # new API
        file_name = Path(hit.metadata.get('source', '?')).name
        page_no = hit.metadata.get('page', '?')
        snippets.append(f"[{file_name} p{page_no}] {hit.page_content[:1000]}")
    context = "\n\n---\n\n".join(snippets)

    instructions = (
        "You are a helpful research assistant. Answer strictly from the context. "
        "If unsure, say you don't know. Add brief page refs.\n\n"
    )
    prompt = f"{instructions}Q: {question}\n\nContext:\n{context}\n\nA:"
    return llm.invoke(prompt)

def ask_fresh(question: str) -> str:
    """Run ``sync_all_pdfs()`` first, then answer *question* via ``ask``."""
    sync_all_pdfs()  # bring the index up to date with the data folder
    answer = ask(question)
    return answer

# 11: Quick checks for new data

In [13]:
# Compare what is on disk with what is in the index.
# NOTE: only the lower-case "*.pdf" pattern is listed here.
print("PDFs in data/:", [p.name for p in DATA_DIR.glob("*.pdf")])
print("Indexed chunks:", len((vectorstore.get() or {}).get("ids", [])))

PDFs in data/: ['CalculusVolume1-OP.pdf']
Indexed chunks: 1543


# 12: Question section

In [14]:
# Example queries:
# ask() does not call sync_all_pdfs(); use ask_fresh() to sync before answering.
ans = ask("Explain each method that is described in the CalculusVolume1 like a 5 year old.")
print(ans)

I'd be happy to explain each method in CalculusVolume1 like a 5-year-old.

**Defining the Derivative**

Imagine you're playing with a toy car on a track. You want to know how fast it's moving at any given moment. One way to do this is to look at how far it has traveled so far and divide that by the time it took to get there. That's kind of like what the derivative does - it helps us figure out how fast something (like a function) is changing at any given point.

Think of it like this: Imagine you have a toy box with some toys inside, and you want to know how fast they're moving around in the box. You can measure how many toys are in the box at one moment, and then divide that by how long it's been since you measured them. That gives you an idea of how fast the toys are moving.

**Calculating the Slope of a Tangent Line**

Now, let's say you have a toy car on the track, and you want to know its speed at any given moment. You can use the derivative to figure out that speed. The derivativ