# 1: Imports

In [14]:
from pathlib import Path
from typing import List

from langchain_community.document_loaders import PyPDFLoader, UnstructuredURLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_ollama import OllamaLLM, OllamaEmbeddings

# 2: Data Directories

In [2]:
# Working folders, resolved to absolute paths relative to the current working
# directory. Both are created up front (including missing parents) so later
# cells can rely on them existing; re-running the cell is harmless.
DATA_DIR = Path("..", "data").resolve()
DB_DIR = Path("..", "chroma_db").resolve()
DB_DIR.mkdir(parents=True, exist_ok=True)
DATA_DIR.mkdir(parents=True, exist_ok=True)

# 3: Ollama Models

In [3]:
# Ollama model tags used by this notebook: one for text generation and one
# for computing embeddings. Both need to be pulled locally beforehand.
LLM_MODEL: str = "llama3.1"
EMBED_MODEL: str = "nomic-embed-text"

### 3a: Verification

In [4]:
# Echo the resolved folders so an unexpected working directory is spotted early.
print(f"DATA_DIR: {DATA_DIR}")
print(f"DB_DIR: {DB_DIR}")

DATA_DIR: C:\Users\Christian\OneDrive - The University of Colorado Denver\LangChain-Local-RA\data
DB_DIR: C:\Users\Christian\OneDrive - The University of Colorado Denver\LangChain-Local-RA\chroma_db


# 4: Sanity Check
- This will error if Ollama isn't running or the model isn't pulled yet.
- In a terminal (outside Python), run:
  - `ollama pull llama3.1`
  - `ollama pull nomic-embed-text`

In [9]:

# Build the generation and embedding clients for the configured models.
llm = OllamaLLM(model=LLM_MODEL, temperature=0.2)
embeddings = OllamaEmbeddings(model=EMBED_MODEL)

# Creating the client objects is not treated as proof that the Ollama server
# is reachable, so send one tiny request per model. If Ollama is not running
# or a model has not been pulled, the failure surfaces here instead of later
# during indexing or querying.
embeddings.embed_query("ping")
llm.invoke("Reply with the single word: OK")
print("LLM and Embeddings ready ✅")


LLM and Embeddings ready ✅


# 5: Load Documents (PDFs and/or URLs)

In [None]:
def load_pdfs(pdf_paths: List[Path]):
    """Load every usable PDF in ``pdf_paths`` into one flat list of documents.

    Args:
        pdf_paths: Candidate locations, given as ``Path`` objects or path
            strings. Entries that are not regular files, or whose suffix is
            not ``.pdf`` (compared case-insensitively), are skipped silently.

    Returns:
        The concatenated results of ``PyPDFLoader(...).load()`` for each
        accepted file, in input order. An empty list when nothing qualifies.
    """
    docs = []
    for candidate in pdf_paths:
        pdf_file = Path(candidate)
        # is_file() rather than exists(): a directory that happens to be named
        # "something.pdf" must be skipped, not handed to the PDF loader.
        if pdf_file.is_file() and pdf_file.suffix.lower() == ".pdf":
            docs.extend(PyPDFLoader(str(pdf_file)).load())
    return docs

def load_urls(urls: List[str]):
    """Fetch the given web pages as documents.

    Args:
        urls: Page addresses to load. A falsy value (empty list, ``None``)
            short-circuits to an empty result without creating a loader.

    Returns:
        Whatever ``UnstructuredURLLoader.load()`` yields for ``urls``; the
        loader is created with ``continue_on_failure=True``.
    """
    if urls:
        url_loader = UnstructuredURLLoader(urls=urls, continue_on_failure=True)
        return url_loader.load()
    return []

# Put any PDFs into ../data/ and they’ll be picked up automatically.
# The suffix is compared case-insensitively, so "REPORT.PDF" is also found on
# case-sensitive filesystems (matching the check inside load_pdfs), and the
# list is sorted so files are loaded in the same order on every run.
pdfs = sorted(
    entry for entry in DATA_DIR.iterdir() if entry.suffix.lower() == ".pdf"
)
urls = []  # e.g., ["https://langchain.readthedocs.io/"]

raw_docs = load_pdfs(pdfs) + load_urls(urls)
print(f"Loaded {len(raw_docs)} raw docs")

Loaded 0 raw docs


# 6: Chunking

In [None]:
# Cut the loaded documents into overlapping chunks. Sizes are measured with
# len(), i.e. in characters: up to 1200 per chunk, 200 shared with the
# neighbouring chunk.
splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_size=1200,
    chunk_overlap=200,
    add_start_index=True,
)
docs = splitter.split_documents(raw_docs)
print(f"Split into {len(docs)} chunks")

Split into 0 chunks


# 7: Vector Store (Create or Reuse)

In [15]:
# Open (or create) the Chroma collection stored under DB_DIR.
vectorstore = Chroma(
    collection_name="local_research_assistant",
    embedding_function=embeddings,
    persist_directory=str(DB_DIR),
)

# Only add if index is empty and we have docs.
# limit=1 is enough to tell "empty" from "non-empty" without fetching the
# whole collection.
needs_index = (len(vectorstore.get(limit=1).get("ids", [])) == 0) and len(docs) > 0
if needs_index:
    # No explicit persist() call: the langchain_chroma wrapper does not provide
    # one, and with persist_directory set the data is written to disk
    # automatically.
    vectorstore.add_documents(docs)
    print("Indexed and persisted documents.")
else:
    # NOTE: a non-empty index is reused as-is, so PDFs added to DATA_DIR after
    # the first indexing run are not picked up by this cell.
    print("Using existing index (or no docs to add).")


Using existing index (or no docs to add).


# 8: Ask the Research Assistant Questions

In [None]:
# Retriever view over the vector store, configured to return the top 4 matches.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=4))

def ask(q: str) -> str:
    """Answer ``q`` using only chunks retrieved from the vector store.

    Args:
        q: The question, in natural language.

    Returns:
        The LLM's reply to a prompt containing the retrieved context, or a
        fixed "I don't know" message when retrieval returns nothing (so the
        model is never asked to answer from an empty context).
    """
    # invoke() is the supported retriever entry point; it replaces the
    # deprecated get_relevant_documents().
    rel_docs = retriever.invoke(q)
    if not rel_docs:
        return "I don't know — no indexed content matched this question."
    # Build a compact context with source + page ref. Each chunk is capped at
    # 1000 characters to keep the prompt small.
    ctx = "\n\n---\n\n".join(
        f"[{d.metadata.get('source','?')} p{d.metadata.get('page','?')}] {d.page_content[:1000]}"
        for d in rel_docs
    )
    prompt = (
        "You are a helpful research assistant. Answer strictly from the context.\n"
        "If unsure, say you don't know. Add brief page refs.\n\n"
        f"Q: {q}\n\nContext:\n{ctx}\n\nA:"
    )
    return llm.invoke(prompt)

# Try it (after adding a PDF):
# ask("Give me a 3-sentence summary with page references.")