# 1: Cell imports & paths

In [2]:
from pathlib import Path
from typing import List, Optional, Dict
import hashlib
import sys
import chromadb

from langchain_community.document_loaders import PyPDFLoader, UnstructuredURLLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_ollama import OllamaLLM, OllamaEmbeddings
from pypdf.errors import WrongPasswordError

import cryptography, pypdf

# Report which interpreter and which library versions this kernel is using.
for label, value in (
    ("Python:", sys.executable),
    ("cryptography:", cryptography.__version__),
    ("pypdf:", pypdf.__version__),
):
    print(label, value)

Python: c:\Users\Christian\Desktop\LangChain-Local-RA\.venv\Scripts\python.exe
cryptography: 46.0.2
pypdf: 6.1.1


# 2: Data Directories

In [3]:
# Absolute locations of the input PDFs (DATA_DIR) and of the persistent
# Chroma database (DB_DIR).
DATA_DIR = Path(r"C:\Users\Christian\Desktop\LangChain-Local-RA\data")
DB_DIR = Path(r"C:\Users\Christian\Desktop\LangChain-Local-RA\chroma_db")

# Create both folders up front; already-existing folders are left as they are.
for folder in (DATA_DIR, DB_DIR):
    folder.mkdir(parents=True, exist_ok=True)

# Show the working directory, the two folders, and what DATA_DIR holds.
data_dir_listing = [entry.name for entry in DATA_DIR.glob("*")]
print("CWD:", Path.cwd())
print("DATA_DIR:", DATA_DIR)
print("DB_DIR:", DB_DIR)
print("DATA_DIR contents:", data_dir_listing)


CWD: c:\Users\Christian\Desktop\LangChain-Local-RA\notebooks
DATA_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA\data
DB_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA\chroma_db
DATA_DIR contents: ['CalculusVolume1-OP.pdf']


# 3: Ollama Models

In [4]:
# Ollama model names: LLM_MODEL is passed to OllamaLLM for text generation and
# EMBED_MODEL to OllamaEmbeddings for vectorising chunks and queries.
# The trailing comment on each line is the shell command that pulls the model.
LLM_MODEL   = "llama3.1"          # ollama pull llama3.1
EMBED_MODEL = "nomic-embed-text"  # ollama pull nomic-embed-text

### 3a: Verification

In [5]:
# Re-print the two configured folders as a quick visual confirmation.
for label, folder in (("DATA_DIR:", DATA_DIR), ("DB_DIR:", DB_DIR)):
    print(label, folder)

DATA_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA\data
DB_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA\chroma_db


# 4: Sanity Check
- This will error if Ollama isn't running or the models aren't pulled yet.
- In a terminal (outside Python), run:
  - `ollama pull llama3.1`
  - `ollama pull nomic-embed-text`

In [6]:
# Generation model (low temperature) and embedding model, both addressed by
# their Ollama model names.
llm = OllamaLLM(model=LLM_MODEL, temperature=0.2)
embeddings = OllamaEmbeddings(model=EMBED_MODEL)

# Pin the Chroma database to DB_DIR by handing the vector store an explicit
# persistent client instead of a persist_directory.
client = chromadb.PersistentClient(path=str(DB_DIR))
vectorstore = Chroma(
    collection_name="local_research_assistant",
    embedding_function=embeddings,
    client=client,
)

print("LLM and Embeddings ready")
# Sanity check: show where the SQLite file is expected and whether it exists.
sqlite_file = DB_DIR / "chroma.sqlite3"
print("DB present at:", sqlite_file, sqlite_file.exists())


LLM and Embeddings ready
DB present at: C:\Users\Christian\Desktop\LangChain-Local-RA\chroma_db\chroma.sqlite3 True


# 5: PDF Loading Helper (PyMuPDF with PyPDF fallback)

In [None]:
# If any PDF is password-protected, add it here (key = file name, value = password):
PDF_PASSWORDS: Dict[str, Optional[str]] = {
    # "SomeLockedFile.pdf": "your_password_here",
}

def load_pdf_pages_robust(pdf_path: Path) -> List:
    """Load one PDF as page documents, trying PyMuPDF first and PyPDF second.

    Every returned page has ``metadata["source"]`` overwritten with the
    absolute, resolved path of ``pdf_path``.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        The pages from the first loader that yields a non-empty result.
        An empty list when both loaders fail or produce nothing, or when the
        file is encrypted and cannot be opened with the password configured
        for its file name in ``PDF_PASSWORDS``. Failures are reported with
        ``print`` rather than raised.
    """
    # Function-scope import keeps this cell self-contained. pypdf raises
    # FileNotDecryptedError when an encrypted file is read without a usable
    # password; WrongPasswordError only covers an explicitly wrong password.
    from pypdf.errors import FileNotDecryptedError

    pwd = PDF_PASSWORDS.get(pdf_path.name)

    def _with_source(pages: List) -> List:
        """Stamp the resolved absolute path of the PDF onto every page."""
        source = str(pdf_path.resolve())
        for page in pages:
            page.metadata["source"] = source
        return pages

    # 1) PyMuPDF (best text extraction on technical PDFs)
    try:
        pages = _with_source(PyMuPDFLoader(str(pdf_path)).load())
        if pages:
            return pages
    except Exception as e:
        # Deliberate best-effort: any PyMuPDF failure falls through to PyPDF.
        print(f"PyMuPDF failed for {pdf_path.name}: {e}")

    # 2) Fallback: PyPDF (needs 'cryptography' for encrypted files)
    try:
        pages = _with_source(PyPDFLoader(str(pdf_path), password=pwd).load())
        if pages:
            return pages
    except (WrongPasswordError, FileNotDecryptedError):
        # Covers both a wrong password and a missing one.
        print(f"Skipping encrypted PDF (password needed): {pdf_path.name}")
    except Exception as e:
        print(f"PyPDF failed for {pdf_path.name}: {e}")

    return []


# 6: Load Documents (PDFs and/or URLs)

In [None]:
# Collect every PDF file directly inside DATA_DIR. The extension test is
# case-insensitive (.pdf, .PDF, .Pdf, ...), directories are ignored, and the
# set of resolved paths removes duplicates before sorting.
pdf_paths = sorted(
    {
        str(p.resolve())
        for p in DATA_DIR.glob("*")
        if p.is_file() and p.suffix.lower() == ".pdf"
    }
)
pdfs = [Path(p) for p in pdf_paths]
print("PDFs found:", [p.name for p in pdfs])

# Load all pages of all PDFs; files that cannot be read contribute nothing.
raw_docs: List = []
for p in pdfs:
    raw_docs.extend(load_pdf_pages_robust(p))

# Optional web pages (if you need them)
urls: list[str] = []  # e.g., ["https://langchain.readthedocs.io/"]
# raw_docs += UnstructuredURLLoader(urls=urls, continue_on_failure=True).load()

print(f"Loaded {len(raw_docs)} pages across {len(pdfs)} PDF(s).")


PDFs found: ['CalculusVolume1-OP.pdf']
Loaded 873 pages across 1 PDF(s).


# 7: Chunking

In [8]:
# Recursive character splitter. Sizes are measured with len(), i.e. in
# characters: chunks of up to 1200 with 200 of overlap between neighbours.
# add_start_index asks the splitter to record each chunk's start offset in
# its metadata.
splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_size=1200,
    chunk_overlap=200,
    add_start_index=True,
)

# Split every loaded page into chunks and report how many were produced.
docs = splitter.split_documents(raw_docs)
chunk_total = len(docs)
print(f"Split into {chunk_total} chunks.")


Split into 1543 chunks.


# 8: Vector Store (Create or Reuse; initial add only if empty)

In [9]:
# Index the chunks only when the collection is still empty. Fetching a single
# record is enough to tell; there is no need to load the whole collection.
has_index = bool(vectorstore.get(limit=1, include=["metadatas"]).get("ids"))
if (not has_index) and docs:
    # Use the same deterministic id scheme as _chunk_id in the folder-sync
    # cell: source, page, and the md5 of the first 200 characters. With
    # auto-generated ids a later sync_all_pdfs() would not recognise these
    # chunks and would add every one of them a second time.
    # Keying a dict by id also drops in-batch duplicates, which Chroma rejects.
    docs_by_id = {}
    for doc in docs:
        head = doc.page_content[:200].encode("utf-8", errors="ignore")
        doc_id = (
            f"{str(doc.metadata.get('source', ''))}"
            f"::p{str(doc.metadata.get('page', ''))}"
            f"::{hashlib.md5(head).hexdigest()}"
        )
        docs_by_id.setdefault(doc_id, doc)
    vectorstore.add_documents(list(docs_by_id.values()), ids=list(docs_by_id))  # auto-persist; no .persist()
    print("Indexed documents (auto-persisted).")
else:
    print("Using existing index (or no docs to add).")

Indexed documents (auto-persisted).


# 9: Folder-wide sync (add new, remove deleted) — public API only

In [None]:
def _chunk_id(doc):
    """Build a deterministic id for a chunk.

    The id has the form ``<source>::p<page>::<md5>``, where source and page
    come from ``doc.metadata`` (empty string when absent) and the digest is
    taken over the first 200 characters of ``doc.page_content``.
    """
    meta = doc.metadata
    source_part = str(meta.get("source", ""))
    page_part = "p" + str(meta.get("page", ""))
    # Only the leading 200 characters are hashed; characters that cannot be
    # encoded as UTF-8 are dropped before hashing.
    leading_bytes = doc.page_content[:200].encode("utf-8", errors="ignore")
    digest = hashlib.md5(leading_bytes).hexdigest()
    return "::".join((source_part, page_part, digest))

def sync_all_pdfs():
    """Bring the Chroma index in line with the PDFs currently in DATA_DIR.

    Steps:
      1. Delete every chunk whose ``source`` file is no longer in the folder.
      2. For each PDF still present: re-load and re-split it, delete indexed
         chunks of that file whose ids are no longer produced (content
         changed, or chunks stored under ids ``_chunk_id`` did not generate),
         then add the chunks whose ids are missing from the index.

    PDFs that cannot be loaded are skipped and their indexed chunks are left
    untouched. Progress is reported with ``print``; nothing is returned.
    """
    # Current PDF files in the folder. The extension test is case-insensitive
    # and directories are ignored; the set removes duplicate paths.
    current_files = {
        str(p.resolve())
        for p in DATA_DIR.glob("*")
        if p.is_file() and p.suffix.lower() == ".pdf"
    }

    # What's in Chroma now: request a VALID include (ids are always returned).
    all_data = vectorstore.get(include=["metadatas"])
    stored_ids = all_data.get("ids") or []
    stored_metas = all_data.get("metadatas") or []
    existing_ids = set(stored_ids)

    # Group stored ids by source file; entries without metadata are skipped.
    ids_by_source = {}
    for cid, meta in zip(stored_ids, stored_metas):
        source = (meta or {}).get("source")
        if source:
            ids_by_source.setdefault(source, set()).add(cid)

    # 1) Remove chunks for PDFs that were deleted
    removed_sources = sorted(set(ids_by_source) - current_files)
    if removed_sources:
        print(f"🧹 Removing chunks for deleted PDFs: {[Path(s).name for s in removed_sources]}")
        for s in removed_sources:
            vectorstore.delete(where={"source": s})
            existing_ids -= ids_by_source.pop(s)
    else:
        print("✅ No deleted PDFs found.")

    # 2) Add new/changed PDFs
    added = 0
    stale_removed = 0
    for pdf in sorted(current_files):
        pages = load_pdf_pages_robust(Path(pdf))
        if not pages:
            # Unreadable right now: keep whatever is already indexed for it.
            continue

        new_docs, new_ids = [], []
        seen_ids = set()
        for chunk in splitter.split_documents(pages):
            cid = _chunk_id(chunk)
            if cid in seen_ids:
                # Same id twice in one file; Chroma rejects duplicate ids
                # inside a single add call, so keep only the first chunk.
                continue
            seen_ids.add(cid)
            if cid not in existing_ids:
                new_docs.append(chunk)
                new_ids.append(cid)

        # Indexed chunks of this file that the file no longer produces.
        stale_ids = ids_by_source.get(pdf, set()) - seen_ids
        if stale_ids:
            vectorstore.delete(ids=sorted(stale_ids))
            existing_ids -= stale_ids
            stale_removed += len(stale_ids)

        if new_docs:
            vectorstore.add_documents(new_docs, ids=new_ids)  # auto-persist
            existing_ids.update(new_ids)
            added += len(new_docs)

    if stale_removed:
        print(f"Removed {stale_removed} stale chunk(s) from changed PDFs.")
    print(f"Sync complete. Added {added} new chunk(s).")


# 9b: Optional — Clear and Resync Database

In [None]:
def clear_chroma_all():
    """Delete every vector currently stored in the collection.

    Prints how many vectors were removed, or a notice when the collection is
    already empty. Returns nothing.
    """
    # Metadatas are requested only because the call needs a valid include;
    # the ids come back in the result regardless.
    stored_ids = vectorstore.get(include=["metadatas"]).get("ids", [])
    if stored_ids:
        vectorstore.delete(ids=stored_ids)
        print(f"Cleared {len(stored_ids)} vectors from Chroma.")
    else:
        print("No vectors to delete.")

# Wipe the collection, then rebuild it from the PDFs currently in DATA_DIR.
clear_chroma_all()
sync_all_pdfs()


🧹 Cleared 1543 vectors from Chroma.
✅ No deleted PDFs found.
✅ Sync complete. Added 1543 new chunk(s).


# 10: Ask helpers

In [14]:
# Retriever over the vector store returning the 4 best-matching chunks.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

def ask(question: str) -> str:
    """Answer ``question`` from the chunks most similar to it.

    The retrieved chunks (each cut to 1000 characters and prefixed with a
    ``[file pN]`` tag) are placed in a prompt that tells the model to answer
    strictly from that context, and the prompt is sent to ``llm``.

    Args:
        question: The user's question.

    Returns:
        Whatever ``llm.invoke`` returns for the assembled prompt.
    """
    def _tag(meta) -> str:
        """Return the ``[file pN]`` prefix for one chunk's metadata."""
        page = meta.get("page", "?")
        # The PDF loaders store a zero-based page index; show the 1-based
        # number a PDF viewer displays so cited pages can be looked up.
        if isinstance(page, int):
            page += 1
        return f"[{Path(meta.get('source', '?')).name} p{page}]"

    rel_docs = retriever.invoke(question)  # new API
    context = "\n\n---\n\n".join(
        f"{_tag(d.metadata)} {d.page_content[:1000]}" for d in rel_docs
    )
    prompt = (
        "You are a helpful research assistant. Answer strictly from the context. "
        "If unsure, say you don't know. Add brief page refs.\n\n"
        f"Q: {question}\n\nContext:\n{context}\n\nA:"
    )
    return llm.invoke(prompt)

def ask_fresh(question: str) -> str:
    """Re-sync the index with DATA_DIR, then answer ``question`` via ``ask``."""
    sync_all_pdfs()
    answer = ask(question)
    return answer

# 11: Quick checks / examples

In [16]:
# Show which lower-case *.pdf files are in the data folder and how many
# chunks the collection currently holds.
pdf_names = [p.name for p in DATA_DIR.glob("*.pdf")]
print("PDFs in data/:", pdf_names)
index_snapshot = vectorstore.get() or {}
print("Indexed chunks:", len(index_snapshot.get("ids", [])))

# Example queries:
ans = ask("EXplain each method that is described in the pdf.")
print(ans)

PDFs in data/: ['CalculusVolume1-OP.pdf']
Indexed chunks: 1543
I'll explain each method described in the pdf.

**Newton's Method**

Newton's method is an iterative process used to find roots of a function f(x). It starts with an initial guess x0 and uses the formula:

xn = F(xn-1) = xn-1 - [f(xn-1)/f'(xn-1)]

where f'(x) is the derivative of f(x).

In the given example, we use Newton's method to find the root of f(x) = x^2 - 3 with an initial guess x0 = 3. We calculate x1 and x2 using the formula above.

**Iterative Process**

An iterative process is a type of process where each term is defined in terms of the previous term by repeating the same function. In Newton's method, we define F(x) = x - [f(x)/f'(x)], which is an example of an iterative process.

**Failures of Newton's Method**

Newton's method can fail to find a root if:

1. The derivative f'(x) is zero at xn, but f(xn) ≠ 0. This means that the tangent line of f at xn does not intersect the x-axis, so we cannot use it as an ap