# 1: Imports & Environment Check

In [10]:
from pathlib import Path
from typing import List, Optional, Dict
import hashlib
import sys

from langchain_community.document_loaders import PyPDFLoader, UnstructuredURLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_ollama import OllamaLLM, OllamaEmbeddings

import cryptography, pypdf

# Show which interpreter runs this kernel and the versions of the PDF-related packages.
print("Python:", sys.executable)
for label, module in (("cryptography:", cryptography), ("pypdf:", pypdf)):
    print(label, module.__version__)

Python: c:\Users\Christian\Desktop\LangChain-Local-RA\.venv\Scripts\python.exe
cryptography: 46.0.2
pypdf: 6.1.1


# 2: Data Directories

In [11]:
# Both locations are resolved against the current working directory.
DATA_DIR = Path("data").resolve()       # drop the PDFs to index in here
DB_DIR = Path("chroma_db").resolve()    # on-disk location of the Chroma store

# Create whichever of the two folders does not exist yet (no error if present).
for directory in (DATA_DIR, DB_DIR):
    directory.mkdir(parents=True, exist_ok=True)

for label, directory in (("DATA_DIR:", DATA_DIR), ("DB_DIR:", DB_DIR)):
    print(label, directory)

DATA_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA\notebooks\data
DB_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA\notebooks\chroma_db


# 3: Ollama Models

In [12]:
# Names of the Ollama models used by this notebook.
# Fetch them once from a terminal:
#   ollama pull llama3.1
#   ollama pull nomic-embed-text
LLM_MODEL = "llama3.1"            # text-generation model
EMBED_MODEL = "nomic-embed-text"  # embedding model

### 3a: Verification

In [13]:
# Echo the configured directories once more as a quick visual check.
for label, directory in (("DATA_DIR:", DATA_DIR), ("DB_DIR:", DB_DIR)):
    print(label, directory)

DATA_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA\notebooks\data
DB_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA\notebooks\chroma_db


# 4: Sanity Check
- This will error if Ollama isn't running or the model isn't pulled yet.
- In a terminal (outside Python), run:
-   ollama pull llama3.1
-   ollama pull nomic-embed-text

In [14]:

# Text-generation client for the configured Ollama model.
llm = OllamaLLM(model=LLM_MODEL, temperature=0.2)

# Embedding client; the vector store below uses it as its embedding function.
embeddings = OllamaEmbeddings(model=EMBED_MODEL)

# Chroma collection stored under DB_DIR.
chroma_options = {
    "collection_name": "local_research_assistant",
    "embedding_function": embeddings,
    "persist_directory": str(DB_DIR),
}
vectorstore = Chroma(**chroma_options)

print("LLM and Embeddings ready")

LLM and Embeddings ready


# 5: PDF Loading Helper

In [15]:
from pypdf.errors import FileNotDecryptedError, WrongPasswordError

# Passwords for encrypted PDFs, keyed by file name (looked up via path.name):
PDF_PASSWORDS: Dict[str, Optional[str]] = {
    # "LockedFile.pdf": "your_password_here",
}

def load_pdf_pages(path: Path) -> list:
    """Load a single PDF; skip gracefully if encrypted or unreadable.

    Args:
        path: Location of the PDF. Its file name is looked up in
            ``PDF_PASSWORDS`` to find an optional password.

    Returns:
        The documents returned by ``PyPDFLoader(...).load()``, or an empty
        list when the file cannot be decrypted or fails to load for any other
        reason. Failures are reported with ``print`` and never raised, so one
        bad file does not stop a folder-wide run.
    """
    pwd = PDF_PASSWORDS.get(path.name)
    try:
        return PyPDFLoader(str(path), password=pwd).load()
    except (WrongPasswordError, FileNotDecryptedError):
        # pypdf raises WrongPasswordError when the supplied password is wrong,
        # and FileNotDecryptedError when an encrypted file is read with no
        # password at all. Both mean the file needs a (correct) password.
        print(f"🔒 Skipping encrypted PDF (needs password): {path.name}")
        return []
    except Exception as e:
        # Deliberate best-effort: report the problem and carry on.
        print(f"⚠️ Could not load {path.name}: {e}")
        return []


# 6: Load Documents (PDFs and/or URLs)

In [17]:
# PDFs that sit directly inside DATA_DIR (the glob is not recursive).
pdfs = list(DATA_DIR.glob("*.pdf"))

# Web pages to ingest as well, e.g. ["https://langchain.readthedocs.io/"]
urls: list[str] = []

# Collect the documents of every PDF into one flat list.
# load_pdf_pages() returns [] for files it cannot read, so those add nothing.
raw_docs: List = []
for p in pdfs:
    raw_docs += load_pdf_pages(p)

# To include the URLs listed above, uncomment the next line:
# raw_docs += UnstructuredURLLoader(urls=urls, continue_on_failure=True).load()

print(f"Found {len(pdfs)} PDF(s). Loaded {len(raw_docs)} raw docs.")


Found 0 PDF(s). Loaded 0 raw docs.


# 7: Chunking

In [20]:
# Chunking parameters, measured with len() (i.e. in characters).
CHUNK_SIZE = 1200
CHUNK_OVERLAP = 200

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    add_start_index=True,
)

# Break the loaded documents into chunks for embedding.
docs = splitter.split_documents(raw_docs)
print(f"Split into {len(docs)} chunks.")


Split into 0 chunks.


# 8: Vector Store (Create or Reuse; initial add only if empty)

In [21]:
# Seed the store only when it is still empty; afterwards sync_all_pdfs()
# (section 9) is the way to add new material.

has_index = len(vectorstore.get().get("ids", [])) > 0
if (not has_index) and len(docs) > 0:
    # Use the same deterministic ID scheme as _chunk_id() in section 9
    # (source + page + md5 of the first 200 characters). Without explicit IDs
    # the store assigns generated ones, so a later sync_all_pdfs() would not
    # recognise these chunks and would index every one of them a second time.
    seed_docs = {}
    for d in docs:
        src = str(d.metadata.get("source", ""))
        page = str(d.metadata.get("page", ""))
        head = d.page_content[:200].encode("utf-8", errors="ignore")
        cid = f"{src}::p{page}::{hashlib.md5(head).hexdigest()}"
        # IDs must be unique within one batch; keep the first chunk per ID.
        seed_docs.setdefault(cid, d)

    vectorstore.add_documents(list(seed_docs.values()), ids=list(seed_docs))
    print("Indexed documents (auto-persisted).")
else:
    print("Using existing index (or no docs to add).")


Using existing index (or no docs to add).


# 9: Folder-wide sync (idempotent; skips duplicates)

In [22]:
def _chunk_id(doc):
    """Stable ID per chunk: source path + page + hash(first 200 chars)."""
    src  = str(doc.metadata.get("source",""))
    page = str(doc.metadata.get("page",""))
    head = doc.page_content[:200].encode("utf-8", errors="ignore")
    return f"{src}::p{page}::{hashlib.md5(head).hexdigest()}"

def sync_all_pdfs():
    """Index every PDF in ``DATA_DIR``, adding only chunks not stored yet.

    Each PDF is loaded with ``load_pdf_pages`` and split with ``splitter``.
    Every chunk gets an ID from ``_chunk_id``; chunks whose ID is already in
    the vector store, or was already queued during this run, are skipped.
    PDFs that yield no pages are skipped as well.

    Returns:
        None. A one-line summary is printed. If ``DATA_DIR`` holds no PDFs, a
        hint is printed and nothing else happens.
    """
    pdf_paths = list(DATA_DIR.glob("*.pdf"))
    if not pdf_paths:
        print("No PDFs found in data/. Drop files there and rerun.")
        return

    known_ids = set(vectorstore.get().get("ids", []))
    added, total = 0, 0

    for pdf in pdf_paths:
        pages = load_pdf_pages(pdf)
        if not pages:
            continue

        chunks = splitter.split_documents(pages)
        total += len(chunks)

        new_docs, new_ids = [], []
        for chunk in chunks:
            cid = _chunk_id(chunk)
            if cid in known_ids:
                continue
            # Register the ID right away: two chunks of the same PDF can map
            # to the same ID (same page, same first 200 characters), and the
            # store rejects a batch that contains a duplicate ID.
            known_ids.add(cid)
            new_docs.append(chunk)
            new_ids.append(cid)

        if new_docs:
            vectorstore.add_documents(new_docs, ids=new_ids)  # auto-persist
            added += len(new_docs)

    print(f"✅ Synced {len(pdf_paths)} PDF(s). Added {added} new chunk(s) (of {total} total).")


# 10: Ask helpers

In [23]:
# Retriever over the vector store; returns the 4 best-matching chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

def ask(question: str) -> str:
    """Answer a question from the indexed documents.

    Retrieves matching chunks for ``question``, builds a context block in
    which each chunk is prefixed with its source and page and truncated to
    1000 characters, and sends the resulting prompt to ``llm``.

    Args:
        question: The question to answer.

    Returns:
        The result of ``llm.invoke`` for the assembled prompt.
    """
    # ``invoke`` is the supported retriever entry point; the older
    # ``get_relevant_documents`` is deprecated and gone in LangChain 1.0.
    rel_docs = retriever.invoke(question)
    context = "\n\n---\n\n".join(
        f"[{d.metadata.get('source','?')} p{d.metadata.get('page','?')}] {d.page_content[:1000]}"
        for d in rel_docs
    )
    prompt = (
        "You are a helpful research assistant. Answer strictly from the context. "
        "If unsure, say you don't know. Add brief page refs.\n\n"
        f"Q: {question}\n\nContext:\n{context}\n\nA:"
    )
    return llm.invoke(prompt)

def ask_fresh(question: str) -> str:
    """Sync the PDF folder into the index, then answer ``question``.

    Runs ``sync_all_pdfs()`` first so files dropped into the data folder since
    the last run are indexed, then delegates to ``ask``.
    """
    # Incremental: chunks already in the store are skipped by the sync.
    sync_all_pdfs()
    answer = ask(question)
    return answer


# 11: Quick checks / examples

In [24]:
# File names of the PDFs currently in the data folder.
pdf_names = [pdf_path.name for pdf_path in DATA_DIR.glob("*.pdf")]
print("PDFs in data/:", pdf_names)

# Number of chunk IDs held by the vector store (0 when it returns nothing).
stored = vectorstore.get() or {}
print("Indexed chunks:", len(stored.get("ids", [])))

# Example queries:
# ans = ask("List the key study tactics and exam strategies across all PDFs.")
# print(ans)

# ans = ask_fresh("Summarize memory techniques mentioned.")
# print(ans)


PDFs in data/: []
Indexed chunks: 0
