# Project Title: Mini RAG
### A Lightweight, Colab-Friendly RAG Demo using ChromaDB + Hugging Face


#RAG Pipeline

In [6]:
# =========================
# Mini RAG (ChromaDB + HF)
# Single-block, Colab-friendly, production-minded
# =========================

# --- Install dependencies ---
import sys, subprocess, importlib.util

def pip_install(pkgs):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q"] + pkgs)

required = [
    "chromadb>=0.5.3",
    "sentence-transformers>=2.6.1",
    "transformers>=4.44.0",
    "torch>=2.1.0",
    "numpy",
    "pandas",
    "tqdm",
    "rank_bm25",
    "python-dotenv"
]

# ✅ Use importlib.util.find_spec instead of pkgutil.find_loader
missing = [
    p for p in required
    if importlib.util.find_spec(p.split("==")[0].split(">=")[0]) is None
]

if missing:
    pip_install(missing)


# --- Imports ---
import os, uuid, json, time, logging
import numpy as np, pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# --- Logging ---
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
logger = logging.getLogger("mini_rag")

# --- Config ---
load_dotenv()
CONFIG = {
    "PERSIST_DIR": "/content/chroma_store",
    "COLLECTION_NAME": "mini_rag_docs",
    "EMBED_MODEL": "sentence-transformers/all-MiniLM-L6-v2",
    "RERANK_MODEL": "cross-encoder/ms-marco-MiniLM-L-6-v2",
    "GEN_MODEL": "t5-base",
    "CHUNK_TOKENS_MIN": 800,
    "CHUNK_TOKENS_MAX": 1200,
    "CHUNK_OVERLAP_RATIO": 0.12,
    "TOP_K": 8,
    "FINAL_K": 4,
    "MMR_LAMBDA": 0.5,
    "MAX_NEW_TOKENS": 256,
    "TEMPERATURE": 0.3
}

# --- Tokenizer for chunking ---
TOKENIZER_FOR_CHUNK = AutoTokenizer.from_pretrained("bert-base-uncased")
def count_tokens(text): return len(TOKENIZER_FOR_CHUNK.encode(text, add_special_tokens=False))
def chunk_text(text, min_tokens=CONFIG["CHUNK_TOKENS_MIN"], max_tokens=CONFIG["CHUNK_TOKENS_MAX"], overlap_ratio=CONFIG["CHUNK_OVERLAP_RATIO"]):
    tokens = TOKENIZER_FOR_CHUNK.encode(text, add_special_tokens=False)
    chunks, i, overlap = [], 0, int(max_tokens * overlap_ratio)
    while i < len(tokens):
        end = min(i + max_tokens, len(tokens))
        chunk_tokens = tokens[i:end]
        if len(chunk_tokens) < min_tokens and end < len(tokens):
            end = min(i + min_tokens, len(tokens))
            chunk_tokens = tokens[i:end]
        chunks.append(TOKENIZER_FOR_CHUNK.decode(chunk_tokens))
        if end == len(tokens): break
        i = max(end - overlap, 0)
    return chunks

# --- Models ---
logger.info("Loading models...")
embedder = SentenceTransformer(CONFIG["EMBED_MODEL"])
reranker = CrossEncoder(CONFIG["RERANK_MODEL"])
gen_tokenizer = AutoTokenizer.from_pretrained(CONFIG["GEN_MODEL"])
gen_model = AutoModelForSeq2SeqLM.from_pretrained(CONFIG["GEN_MODEL"])

# --- ChromaDB client ---
client = chromadb.PersistentClient(path=CONFIG["PERSIST_DIR"])
if CONFIG["COLLECTION_NAME"] in [c.name for c in client.list_collections()]:
    client.delete_collection(CONFIG["COLLECTION_NAME"])
collection = client.create_collection(name=CONFIG["COLLECTION_NAME"], metadata={"hnsw:space": "cosine"})

# --- Example docs (replace with your JD ingestion) ---
docs = [
    {"source":"jd","title":"Intern JD","section":"Track B","text":"Goal: Build and host a small RAG app..."},
    {"source":"jd2","title":"Intern JD","section":"LLM","text":"Use any provider... generate grounded answers with citations."},
    {"source":"jd3","title":"Intern JD","section":"Frontend","text":"Frontend: upload/paste area, query box, answers panel... Hosting: deploy on free host..."},
    {"source":"jd4","title":"Intern JD","section":"Acceptance","text":"Acceptance: working URL; query → retrieved chunks → reranked → LLM answer with citations visible..."}
]

# --- Indexing ---
ids, metadatas, documents = [], [], []
for d_i, d in enumerate(docs):
    for c_i, chunk in enumerate(chunk_text(d["text"])):
        ids.append(f"{d['source']}::{d['section']}::{d_i}-{c_i}-{uuid.uuid4().hex[:8]}")
        metadatas.append({"source":d["source"],"title":d["title"],"section":d["section"],"position":c_i})
        documents.append(chunk)
embeddings = embedder.encode(documents, convert_to_numpy=True, normalize_embeddings=True)
collection.add(ids=ids, metadatas=metadatas, documents=documents, embeddings=embeddings.tolist())
logger.info(f"Indexed {len(documents)} chunks.")

# --- Retrieval (MMR) ---
def mmr(query_vec, doc_vecs, k=CONFIG["TOP_K"], lambda_div=CONFIG["MMR_LAMBDA"]):
    sim_to_query = doc_vecs @ query_vec
    selected, candidates = [], list(range(len(doc_vecs)))
    while len(selected) < min(k, len(doc_vecs)):
        if not selected:
            idx = int(np.argmax(sim_to_query)); selected.append(idx); candidates.remove(idx)
        else:
            sim_to_selected = np.max(doc_vecs[selected] @ doc_vecs.T, axis=0)
            mmr_scores = lambda_div * sim_to_query - (1 - lambda_div) * sim_to_selected
            mmr_scores[selected] = -np.inf
            idx = int(np.argmax(mmr_scores))
            if idx in candidates: selected.append(idx); candidates.remove(idx)
            else: break
    return selected

def rerank(query, docs_list, top_n=CONFIG["FINAL_K"]):
    scores = reranker.predict([(query, d) for d in docs_list])
    order = np.argsort(-scores)[:top_n]
    return [(docs_list[i], float(scores[i])) for i in order]

def generate_answer(query, contexts):
    prompt_context = "".join([f"[{i}] {c['text']}\nSource: {c['meta']['source']} | {c['meta']['section']}\n\n" for i,c in enumerate(contexts,1)])
    prompt = f"Answer the query using ONLY context. Cite sources inline as [1], [2].\nQuery: {query}\n\nContext:\n{prompt_context}\nAnswer:"
    inputs = gen_tokenizer(prompt, return_tensors="pt")
    output_ids = gen_model.generate(**inputs, do_sample=True, temperature=CONFIG["TEMPERATURE"], max_new_tokens=CONFIG["MAX_NEW_TOKENS"])
    return gen_tokenizer.decode(output_ids[0], skip_special_tokens=True)

def query_pipeline(user_query):
    q_vec = embedder.encode([user_query], convert_to_numpy=True, normalize_embeddings=True)[0]
    res = collection.query(query_embeddings=[q_vec.tolist()], n_results=CONFIG["TOP_K"], include=["documents","metadatas","embeddings"])
    docs_list, metas_list, emb_list = res["documents"][0], res["metadatas"][0], np.array(res["embeddings"][0])
    mmr_idxs = mmr(q_vec, emb_list)
    mmr_docs, mmr_metas = [docs_list[i] for i in mmr_idxs], [metas_list[i] for i in mmr_idxs]
    reranked = rerank(user_query, mmr_docs)
    contexts = [{"text":doc_text,"meta":mmr_metas[mmr_docs.index(doc_text)],"score":score} for doc_text,score in reranked]
    answer = generate_answer(user_query, contexts)
    citations = [{"label":f"[{i}]","source":c["meta"]["source"],"section":c["meta"]["section"],"snippet":c["text"][:100]} for i,c in enumerate(contexts,1)]
    return {"query":user_query,"answer":answer,"citations":citations}

# --- Demo ---
if __name__ == "__main__":
    result = query_pipeline("What are acceptance criteria?")
    print("Answer:", result["answer"])
    print("Citations:", result["citations"])


Answer: Acceptance
Citations: [{'label': '[1]', 'source': 'jd4', 'section': 'Acceptance', 'snippet': 'acceptance : working url ; query → retrieved chunks → reranked → llm answer with citations visible..'}, {'label': '[2]', 'source': 'jd2', 'section': 'LLM', 'snippet': 'use any provider... generate grounded answers with citations.'}, {'label': '[3]', 'source': 'jd', 'section': 'Track B', 'snippet': 'goal : build and host a small rag app...'}, {'label': '[4]', 'source': 'jd3', 'section': 'Frontend', 'snippet': 'frontend : upload / paste area, query box, answers panel... hosting : deploy on free host...'}]


#Query 1

In [7]:
result2 = query_pipeline("What is required in the frontend?")
print("Answer:", result2["answer"])
print("Citations:", result2["citations"])


Answer: Context
Citations: [{'label': '[1]', 'source': 'jd3', 'section': 'Frontend', 'snippet': 'frontend : upload / paste area, query box, answers panel... hosting : deploy on free host...'}, {'label': '[2]', 'source': 'jd2', 'section': 'LLM', 'snippet': 'use any provider... generate grounded answers with citations.'}, {'label': '[3]', 'source': 'jd', 'section': 'Track B', 'snippet': 'goal : build and host a small rag app...'}, {'label': '[4]', 'source': 'jd4', 'section': 'Acceptance', 'snippet': 'acceptance : working url ; query → retrieved chunks → reranked → llm answer with citations visible..'}]


#Query 2

In [8]:
result3 = query_pipeline("Summarize the overall responsibilities of the intern role.")
print("Answer:", result3["answer"])
print("Citations:", result3["citations"])


Answer: Query
Citations: [{'label': '[1]', 'source': 'jd2', 'section': 'LLM', 'snippet': 'use any provider... generate grounded answers with citations.'}, {'label': '[2]', 'source': 'jd', 'section': 'Track B', 'snippet': 'goal : build and host a small rag app...'}, {'label': '[3]', 'source': 'jd4', 'section': 'Acceptance', 'snippet': 'acceptance : working url ; query → retrieved chunks → reranked → llm answer with citations visible..'}, {'label': '[4]', 'source': 'jd3', 'section': 'Frontend', 'snippet': 'frontend : upload / paste area, query box, answers panel... hosting : deploy on free host...'}]


#Query 3

In [9]:
result4 = query_pipeline("what is RAG.")
print("Answer:", result4["answer"])
print("Citations:", result4["citations"])


Answer: RAG
Citations: [{'label': '[1]', 'source': 'jd', 'section': 'Track B', 'snippet': 'goal : build and host a small rag app...'}, {'label': '[2]', 'source': 'jd2', 'section': 'LLM', 'snippet': 'use any provider... generate grounded answers with citations.'}, {'label': '[3]', 'source': 'jd4', 'section': 'Acceptance', 'snippet': 'acceptance : working url ; query → retrieved chunks → reranked → llm answer with citations visible..'}, {'label': '[4]', 'source': 'jd3', 'section': 'Frontend', 'snippet': 'frontend : upload / paste area, query box, answers panel... hosting : deploy on free host...'}]


#Summary: This notebook demonstrates a lightweight, Colab‑friendly Retrieval‑Augmented Generation (RAG) pipeline using ChromaDB and Hugging Face models. It showcases document chunking, retrieval, reranking, and answer generation with citations, tailored for job description analysis.

#Note: This Mini RAG model is a simulated, Colab‑friendly version built with AI tools, and there remains scope for further improvement.