
# RAG Lab 2: Query → Retrieve (Chroma) → Rerank → HF LLM Answer

**Updated:** 2025-11-08

In this short lab you will:
1. Enter a natural-language **query** via a widget
2. **Embed** the query using the **same model** as your document embeddings
3. Retrieve **top k×2** candidates from an **existing ChromaDB** collection
4. **Rerank** those candidates with a **cross-encoder** reranker
5. Keep the **top k** passages and build a grounded prompt
6. Run **Hugging Face LLM** inference and display the **answer**

> Goal: illustrate a lean RAG pipeline with reranking. Designed for CPU-only machines and minimal code.


In [None]:

# Minimal dependencies (CPU friendly). If already installed, this is a no-op.
%pip -q install "chromadb==0.4.24" "sentence-transformers==2.5.1" "transformers>=4.41.0" "ipywidgets>=8.1.0" "tqdm>=4.66.0"

import sys, platform, importlib
print("Python:", sys.version.split()[0], "| Platform:", platform.platform())

def _ver(m):
    try:
        return importlib.import_module(m).__version__
    except Exception as e:
        return f"not found ({e})"

# Report the version of each key package (or the reason it could not be read),
# one "name: version" line per package.
print("\n".join(
    f"{name}: {_ver(name)}"
    for name in ("chromadb", "sentence_transformers", "transformers", "ipywidgets")
))


In [None]:

# ---- Imports & configuration ----
import os, numpy as np, pandas as pd
from typing import List, Dict
from tqdm import tqdm
import ipywidgets as w

import chromadb
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import pipeline

# === Paths & model names (match prior lab defaults) ===

# Directory where the prior lab persisted its vectors.
PERSIST_DIR = "./rag_chroma"

# Collection name used previously.
COLLECTION_NAME = "cnu_rag_lab"

# Embedding model for the query; must be the same embedder used for the documents.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Compact cross-encoder used for reranking.
RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Small Hugging Face seq2seq model (CPU-ok).
LLM_MODEL = "google/flan-t5-base"

# Initial value of the k slider (number of passages kept after reranking).
K_DEFAULT = 5

# Connect to the existing Chroma collection built by the previous lab.
client = chromadb.PersistentClient(path=PERSIST_DIR)
try:
    collection = client.get_collection(COLLECTION_NAME)
    # Counting inside the try also surfaces an unreadable/corrupt store here.
    n_vectors = collection.count()
except Exception as e:
    # Stop the notebook with an actionable message, but keep the underlying
    # cause visible (and chained) so problems other than "collection missing"
    # are not hidden behind the generic hint.
    raise SystemExit(
        f"[Error] Could not open Chroma collection '{COLLECTION_NAME}' at {PERSIST_DIR}.\n"
        "Run the previous RAG lab to build it, then re-run this notebook.\n"
        f"Underlying error: {type(e).__name__}: {e}"
    ) from e
else:
    print(f"Connected to collection '{COLLECTION_NAME}' with {n_vectors} vectors at {PERSIST_DIR}")


In [None]:

# Query embedder: this must be the SAME model that embedded the documents,
# so that query embeddings live in the same space as the stored vectors.
embedder = SentenceTransformer(EMBED_MODEL)
print(f"Embedder: {EMBED_MODEL}")

# Cross-encoder reranker: scores (query, passage) pairs; a higher score means
# the passage is more relevant to the query.
reranker = CrossEncoder(RERANK_MODEL)
print(f"Reranker: {RERANK_MODEL}")

# Lightweight HF text2text model used to write the grounded answer.
# It is fast on CPU relative to larger LLMs; swap in a chat model later if you
# have more compute.
gen = pipeline("text2text-generation", model=LLM_MODEL)
print(f"HF LLM: {LLM_MODEL}")


In [None]:

def embed_query(query: str) -> np.ndarray:
    '''Embed the user query with the document embedder and return it as float32.

    The query is encoded as a batch of one with `normalize_embeddings=True`;
    the single resulting row is returned.
    '''
    batch = embedder.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    query_vec = batch[0]
    return query_vec.astype("float32")

def chroma_retrieve(query: str, k2: int) -> List[Dict]:
    '''Retrieve the top `k2` candidates from Chroma using vector similarity.

    Args:
        query: Natural-language question; embedded with `embed_query`.
        k2: Number of nearest neighbours to request (the caller passes k*2 so
            the reranker has extra candidates to choose from).

    Returns:
        One dict per hit, in the order Chroma returned them, with keys
        "id", "text", "source", "page", "method" and "distance". The three
        metadata-derived fields are None when the record lacks them.
    '''
    q_emb = embed_query(query)
    res = collection.query(
        # Pass plain Python floats: Chroma validates each embedding as a list
        # of numbers, and a NumPy array nested in a list is not accepted by
        # every version.
        query_embeddings=[q_emb.tolist()],
        n_results=k2,
        # IDs are always part of the result; "ids" is not a valid `include`
        # value and makes query() raise a ValueError.
        include=["documents", "metadatas", "distances"],
    )

    # Results are nested per query; index 0 selects our single query.
    hits = zip(res["ids"][0], res["documents"][0], res["metadatas"][0], res["distances"][0])

    # Flatten results into a list of dicts for easy handling.
    items = []
    for cid, doc, meta, dist in hits:
        meta = meta or {}  # records stored without metadata come back as None
        items.append({
            "id": cid,
            "text": doc,
            "source": meta.get("source"),
            "page": meta.get("page"),
            "method": meta.get("method"),
            "distance": float(dist),
        })
    return items

def rerank(query: str, candidates: List[Dict], k: int) -> List[Dict]:
    '''Use a cross-encoder to rerank and return the top-k passages for the final prompt.

    Args:
        query: The user question.
        candidates: Retrieved passages; each dict must have a "text" key.
        k: Number of passages to keep.

    Returns:
        The `k` highest-scoring candidates, best first. An empty candidate
        list yields an empty list without invoking the model.

    Note:
        Each candidate dict is updated in place with a float "score" key; the
        returned list holds the same dict objects.
    '''
    # Nothing to score: skip the model call, which does not accept an empty batch.
    if not candidates:
        return []

    pairs = [(query, c["text"]) for c in candidates]
    scores = reranker.predict(pairs)  # one relevance score per (query, passage) pair
    for c, s in zip(candidates, scores):
        c["score"] = float(s)  # plain float so it displays/serializes cleanly

    # Sort by score descending and keep top-k
    return sorted(candidates, key=lambda x: x["score"], reverse=True)[:k]

def build_context(passages: List[Dict]) -> str:
    '''Format passages as a bulleted context block tagged with source, page and method.

    Each passage becomes one "\\n- [source p.page | method] text" entry, with the
    text stripped and its newlines replaced by spaces. Returns "" when there
    are no passages.
    '''
    bullets = []
    for passage in passages:
        source, page, method = (passage.get(key) for key in ("source", "page", "method"))
        flattened = passage["text"].strip().replace("\n", " ")
        bullets.append(f"\n- [{source} p.{page} | {method}] {flattened}")
    return "".join(bullets)


In [None]:

# Instruction placed at the top of every prompt sent to the LLM.
SYS = "You are a concise TA. Answer ONLY using the provided context. If insufficient, say you don't know. Cite filename and page when possible."

def answer_with_llm(query: str, passages: List[Dict], max_new_tokens: int = 256) -> str:
    '''Build a grounded prompt from the passages and return the model's answer text.

    The prompt consists of the system instruction, the question, the context
    block produced by `build_context`, and a trailing "Answer:" cue. Generation
    uses beam search (4 beams) without sampling.
    '''
    # FLAN-T5 is seq2seq, so a plain instruction-style prompt is used.
    prompt_lines = [
        SYS,
        "",
        f"Question: {query}",
        "Context:",
        build_context(passages),
        "",
        "Answer:",
    ]
    prompt = "\n".join(prompt_lines)

    generations = gen(
        prompt,
        max_new_tokens=max_new_tokens,
        num_beams=4,
        do_sample=False,
    )
    first = generations[0]
    return first["generated_text"].strip()


In [None]:

# Simple UI: a text area for the query, a slider for k, a run button,
# and an output area that receives everything the handler prints.
q_box = w.Textarea(
    value="",
    placeholder="Type your question here…",
    description="Query:",
    layout=w.Layout(width="100%", height="100px"),
)
k_slider = w.IntSlider(
    value=K_DEFAULT,
    min=2,
    max=10,
    step=1,
    description="k",
)
run_btn = w.Button(
    description="Retrieve → Rerank → Answer",
    button_style="primary",
)
out = w.Output()

def on_click(_):
    '''Button handler: retrieve k*2 candidates, rerank to k, show a preview table and the answer.

    All output is written into the `out` widget. The handler returns early
    when the query is blank or retrieval yields no candidates.
    '''
    out.clear_output(wait=True)
    question = q_box.value.strip()
    top_k = int(k_slider.value)

    with out:
        if not question:
            print("Please type a query.")
            return

        print("Retrieving from Chroma…")
        candidates = chroma_retrieve(question, k2=top_k * 2)
        if not candidates:
            print("No results. Make sure your Chroma collection is populated (run the first lab).")
            return

        print(f"Retrieved {len(candidates)} candidates. Reranking…")
        best = rerank(question, candidates, k=top_k)

        # Tiny preview table of the top-k reranked passages
        # (text is cut to 220 characters plus an ellipsis).
        rows = []
        for position, passage in enumerate(best, start=1):
            body = passage["text"]
            snippet = body if len(body) <= 220 else body[:220] + "…"
            rows.append({
                "rank": position,
                "score": round(passage["score"], 4),
                "source": passage["source"],
                "page": passage["page"],
                "snippet": snippet,
            })
        display(pd.DataFrame(rows))

        print("\nGenerating grounded answer…")
        answer = answer_with_llm(question, best, max_new_tokens=256)
        print("\n=== Answer ===\n", answer)

# Run the pipeline whenever the button is clicked.
run_btn.on_click(on_click)
# Stack the widgets vertically. This is the cell's final expression, so the
# notebook renders the resulting container as the cell output.
w.VBox([q_box, k_slider, run_btn, out])



### Tips & next steps
- **Why rerankers help:** bi-encoders (embeddings) are fast but approximate; cross-encoders read the *pair* (query, passage) and usually re-order the top candidates more accurately.
- Try other rerankers: `cross-encoder/ms-marco-MiniLM-L-12-v2`, `BAAI/bge-reranker-base` (larger models are slower but usually more accurate).
- You can swap the LLM (e.g., `TinyLlama/TinyLlama-1.1B-Chat-v1.0`) via `pipeline("text-generation", ...)` if you prefer chat-style models.
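- Keep the **same embedder** for documents and queries so that both live in the same vector space. The reranker can be swapped independently, because it reads the raw query and passage text rather than the stored vectors.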
- Experiment: different `k`, prompt templates, or add citation formatting to your final answer.
