# 1: Cell imports & paths

In [1]:
from pathlib import Path
from typing import List, Optional, Dict
import hashlib
import sys

from langchain_community.document_loaders import PyPDFLoader, UnstructuredURLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_ollama import OllamaLLM, OllamaEmbeddings

import cryptography, pypdf

print("Python:", sys.executable)
print("cryptography:", cryptography.__version__)
print("pypdf:", pypdf.__version__)

Python: c:\Users\Christian\Desktop\LangChain-Local-RA\.venv\Scripts\python.exe
cryptography: 46.0.2
pypdf: 6.1.1


# 2: Data Directories

In [11]:
# 2: Data Directories (fixed absolute path)
from pathlib import Path

DATA_DIR = Path(r"C:\Users\Christian\Desktop\LangChain-Local-RA\data")
DB_DIR   = Path(r"C:\Users\Christian\Desktop\LangChain-Local-RA\chroma_db")

DATA_DIR.mkdir(parents=True, exist_ok=True)
DB_DIR.mkdir(parents=True, exist_ok=True)

print("CWD:", Path.cwd())
print("DATA_DIR:", DATA_DIR)
print("DB_DIR:", DB_DIR)
print("DATA_DIR contents:", [p.name for p in DATA_DIR.glob('*')])


CWD: c:\Users\Christian\Desktop\LangChain-Local-RA\notebooks
DATA_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA\data
DB_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA\chroma_db
DATA_DIR contents: ['Fundamentals-of-Parallel-Processing-chapters1-5_22-43.pdf', 'Spark Charts - Study Tactics.pdf']


# 3: Ollama Models

In [3]:
LLM_MODEL   = "llama3.1"          # ollama pull llama3.1
EMBED_MODEL = "nomic-embed-text"  # ollama pull nomic-embed-text

### 3a: Verification

In [4]:
print("DATA_DIR:", DATA_DIR)
print("DB_DIR:", DB_DIR)

DATA_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA\notebooks\data
DB_DIR: C:\Users\Christian\Desktop\LangChain-Local-RA\notebooks\chroma_db


# 4: Sanity Check
- This will error if Ollama isn't running or the model isn't pulled yet.
- In a terminal (outside Python), run:
-   ollama pull llama3.1
-   ollama pull nomic-embed-text

In [5]:

llm = OllamaLLM(model=LLM_MODEL, temperature=0.2)
embeddings = OllamaEmbeddings(model=EMBED_MODEL)

vectorstore = Chroma(
    collection_name="local_research_assistant",
    embedding_function=embeddings,
    persist_directory=str(DB_DIR),  # auto-persist on writes
)

print("LLM and Embeddings ready")

LLM and Embeddings ready


# 5: Chunking

In [13]:
from pypdf.errors import WrongPasswordError

# If you have passwords for specific encrypted PDFs, add them here:
PDF_PASSWORDS: Dict[str, Optional[str]] = {
    # "LockedFile.pdf": "your_password_here",
}

def load_pdf_pages(path: Path):
    """Load a single PDF; skip gracefully if encrypted or unreadable."""
    pwd = PDF_PASSWORDS.get(path.name)
    try:
        return PyPDFLoader(str(path), password=pwd).load()
    except WrongPasswordError:
        print(f"Skipping encrypted PDF (needs password): {path.name}")
        return []
    except Exception as e:
        print(f"Could not load {path.name}: {e}")
        return []


# 6: Load Documents (PDF's and/or URLs)

In [12]:
pdfs = list(DATA_DIR.glob("*.pdf")) + list(DATA_DIR.glob("*.PDF"))  # case-insensitive

# Optionally add web pages:
urls: list[str] = []  # e.g., ["https://langchain.readthedocs.io/"]

raw_docs: List = []
for p in pdfs:
    raw_docs.extend(load_pdf_pages(p))

# If you want to load URLs too, uncomment:
# raw_docs += UnstructuredURLLoader(urls=urls, continue_on_failure=True).load()

print(f"Found {len(pdfs)} PDF(s). Loaded {len(raw_docs)} raw docs.")
print("PDFs in data/:", [p.name for p in pdfs])


Found 4 PDF(s). Loaded 52 raw docs.
PDFs in data/: ['Fundamentals-of-Parallel-Processing-chapters1-5_22-43.pdf', 'Spark Charts - Study Tactics.pdf', 'Fundamentals-of-Parallel-Processing-chapters1-5_22-43.pdf', 'Spark Charts - Study Tactics.pdf']


# 7: Chunking

In [14]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=200,
    length_function=len,
    add_start_index=True,
)
docs = splitter.split_documents(raw_docs)
print(f"Split into {len(docs)} chunks.")


Split into 326 chunks.


# 8: Vector Store (Create or Reuse; initial add only if empty)

In [15]:
has_index = len(vectorstore.get().get("ids", [])) > 0
if (not has_index) and len(docs) > 0:
    vectorstore.add_documents(docs)  # auto-persist; no .persist()
    print("Indexed documents (auto-persisted).")
else:
    print("Using existing index (or no docs to add).")

Indexed documents (auto-persisted).


# 9: Folder-wide sync (idempotent; skips duplicates)

In [16]:
def _chunk_id(doc):
    """Stable ID per chunk: source path + page + hash(first 200 chars)."""
    src  = str(doc.metadata.get("source",""))
    page = str(doc.metadata.get("page",""))
    head = doc.page_content[:200].encode("utf-8", errors="ignore")
    return f"{src}::p{page}::{hashlib.md5(head).hexdigest()}"

def sync_all_pdfs():
    pdf_paths = list(DATA_DIR.glob("*.pdf"))
    if not pdf_paths:
        print("No PDFs found in data/. Drop files there and rerun.")
        return

    existing_ids = set(vectorstore.get().get("ids", []))
    added, total = 0, 0

    for pdf in pdf_paths:
        pages = load_pdf_pages(pdf)
        if not pages:
            continue

        chunks = splitter.split_documents(pages)
        total += len(chunks)

        ids = [_chunk_id(c) for c in chunks]
        new_docs, new_ids = [], []
        for d, cid in zip(chunks, ids):
            if cid not in existing_ids:
                new_docs.append(d); new_ids.append(cid)

        if new_docs:
            vectorstore.add_documents(new_docs, ids=new_ids)  # auto-persist
            existing_ids.update(new_ids)
            added += len(new_docs)

    print(f"✅ Synced {len(pdf_paths)} PDF(s). Added {added} new chunk(s) (of {total} total).")


# 10: Ask helpers

In [17]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

def ask(question: str) -> str:
    rel_docs = retriever.get_relevant_documents(question)
    context = "\n\n---\n\n".join(
        f"[{d.metadata.get('source','?')} p{d.metadata.get('page','?')}] {d.page_content[:1000]}"
        for d in rel_docs
    )
    prompt = (
        "You are a helpful research assistant. Answer strictly from the context. "
        "If unsure, say you don't know. Add brief page refs.\n\n"
        f"Q: {question}\n\nContext:\n{context}\n\nA:"
    )
    return llm.invoke(prompt)

def ask_fresh(question: str) -> str:
    sync_all_pdfs()   # quick incremental sync; no duplicates re-embedded
    return ask(question)


# 11: Quick checks / examples

In [21]:
print("PDFs in data/:", [p.name for p in DATA_DIR.glob("*.pdf")])
print("Indexed chunks:", len((vectorstore.get() or {}).get("ids", [])))

# Example queries:
ans = ask("List the key study tactics and exam strategies across all PDFs.")
print(ans)

ans = ask_fresh("Summarize memory techniques mentioned.")
print(ans)


PDFs in data/: ['Fundamentals-of-Parallel-Processing-chapters1-5_22-43.pdf', 'Spark Charts - Study Tactics.pdf']
Indexed chunks: 326
Based on the provided PDFs, here are the key study tactics and exam strategies mentioned:

**Study Tactics:**

1. **SQ3R Method**: Survey, Question, Read, Recite, Review (p2)
2. **PQRST Method**: Preview, Question, Read, Summarize, Test (p2)
3. **Chunking**: Break down large amounts of information into smaller chunks (p2)
4. **Mnemonics**: Use associations and acronyms to remember key terms and concepts (p2)
5. **Self-Testing**: Test yourself on the material to identify areas for improvement (p3)

**Exam Strategies:**

1. **Read the Question Carefully**: Understand what is being asked before answering (p3)
2. **Use Process of Elimination**: Eliminate obviously incorrect answers and focus on the remaining options (p3)
3. **Manage Your Time Effectively**: Allocate time wisely to complete all questions within the allotted time (p3)

Please note that these ar