
# 🧪 Starter RAG Pipeline (Basic)

This notebook builds a **minimal Retrieval-Augmented Generation (RAG)** demo using:
- **SentenceTransformer** for embeddings (`all-MiniLM-L6-v2`)
- **FAISS** for vector search
- **FLAN-T5** for lightweight text generation (works on CPU)
- Simple file-based dataset (plain `.txt` files)


In [None]:

# If running in Colab, uncomment:
# !pip install -q sentence-transformers faiss-cpu transformers accelerate datasets


In [None]:

from pathlib import Path

# === Configuration ===
# Every tunable path and model name lives here; later cells read these names.
DOCS_DIR = Path("./sample_docs")  # Put your .txt files here
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
GEN_MODEL_NAME = "google/flan-t5-base"   # lightweight text2text model (good for Q&A-style prompts)
INDEX_DIR = Path("./rag_index")          # where to persist FAISS index + metadata

# Create both folders up front; exist_ok/parents make re-running this cell harmless.
INDEX_DIR.mkdir(exist_ok=True, parents=True)
DOCS_DIR.mkdir(exist_ok=True, parents=True)

# Print absolute paths so it is obvious which working directory the relative paths resolved against.
print(f"Docs folder: {DOCS_DIR.resolve()}")
print(f"Index folder: {INDEX_DIR.resolve()}")


In [None]:

# Seed DOCS_DIR with two small demo documents, but only when it contains no
# .txt files yet, so documents supplied by the user are never overwritten.
sample_files = list(DOCS_DIR.glob("*.txt"))
if not sample_files:
    samples = {
        "faq_bank.txt": """
Q: How do I report a lost credit card?
A: Call our 24/7 hotline immediately and freeze the card in the mobile app.

Q: What is a chargeback?
A: A chargeback is a reversal of funds to dispute a card transaction.
""",

        "llm_primer.txt": """
Transformers rely on self-attention to model relationships across tokens.
Retrieval-Augmented Generation (RAG) pairs an LLM with an external knowledge base.
The FLAN-T5 family of models is trained to follow instructions for text generation.
"""
    }
    for name, content in samples.items():
        # Write UTF-8 explicitly: load_text_docs() reads these files back as
        # UTF-8, whereas write_text() would otherwise use the platform default.
        (DOCS_DIR / name).write_text(content.strip(), encoding="utf-8")
    print(f"Created {len(samples)} sample docs in {DOCS_DIR}")
else:
    print(f"Found {len(sample_files)} existing docs.")


In [None]:

from typing import Dict, Any, List
import re

def load_text_docs(folder: Path) -> Dict[str, str]:
    """Read every ``*.txt`` file located directly inside ``folder``.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        Mapping of file name -> file contents, inserted in sorted path order.
        Bytes that are not valid UTF-8 are dropped (``errors="ignore"``).
    """
    docs: Dict[str, str] = {}
    # glob() yields entries in filesystem-dependent order; sorting keeps chunk
    # ids and index row numbers identical from one run to the next.
    for path in sorted(folder.glob("*.txt")):
        # A directory can match "*.txt" too; read_text() would raise on it.
        if not path.is_file():
            continue
        docs[path.name] = path.read_text(encoding="utf-8", errors="ignore")
    return docs

def simple_sent_split(text: str) -> List[str]:
    """Split ``text`` into sentences with a simple punctuation heuristic.

    A boundary is any run of whitespace that directly follows ``.``, ``!`` or
    ``?``. Each piece is stripped, and empty pieces are discarded.
    """
    sentences = []
    for piece in re.split(r'(?<=[.!?])\s+', text.strip()):
        piece = piece.strip()
        if piece:
            sentences.append(piece)
    return sentences

def chunk_text(text: str, max_tokens: int = 120, overlap: int = 0) -> List[str]:
    """Split ``text`` into chunks of at most ``max_tokens`` words.

    "Tokens" here are whitespace-separated words (``str.split()``), not model
    tokens. Words inside a chunk are re-joined with single spaces.

    Args:
        text: Text to split.
        max_tokens: Maximum number of words per chunk; must be >= 1.
        overlap: Number of trailing words of one chunk repeated at the start
            of the next. The default ``0`` gives disjoint chunks.

    Returns:
        The chunks in document order; an empty list if ``text`` has no words.

    Raises:
        ValueError: If ``max_tokens < 1`` or ``overlap`` is outside
            ``[0, max_tokens)``.
    """
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be >= 1, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, got {overlap}"
        )

    words = text.split()
    step = max_tokens - overlap
    chunks: List[str] = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_tokens]))
        # Stop once a chunk reaches the end of the text; with overlap > 0 the
        # next window would only repeat words that are already covered.
        if start + max_tokens >= len(words):
            break
    return chunks

def build_chunks(docs: Dict[str, str], max_tokens: int = 120) -> List[Dict[str, Any]]:
    """Chunk every document and tag each chunk with where it came from.

    Args:
        docs: Mapping of source name -> full document text.
        max_tokens: Forwarded to ``chunk_text`` as the per-chunk word limit.

    Returns:
        One dict per chunk with keys ``text``, ``source`` and ``chunk_id``.
        The id has the form ``<source>::chunk::<n>``, where ``n`` counts from 0
        within each source.
    """
    return [
        {"text": piece, "source": name, "chunk_id": f"{name}::chunk::{pos}"}
        for name, body in docs.items()
        for pos, piece in enumerate(chunk_text(body, max_tokens=max_tokens))
    ]

# Load the corpus from DOCS_DIR and split it into retrieval-sized chunks.
docs = load_text_docs(DOCS_DIR)
chunks = build_chunks(docs, max_tokens=120)
print(f"Loaded {len(docs)} docs -> {len(chunks)} text chunks")

# Stop here with a clear message when there is nothing to index; otherwise the
# failure only shows up later, as an opaque error while building the index.
if not chunks:
    raise ValueError(f"No text chunks found in {DOCS_DIR}; add at least one non-empty .txt file.")


In [None]:

from sentence_transformers import SentenceTransformer
import numpy as np
import faiss, json

# Embed every chunk. Vectors are normalised at encode time, so the
# inner-product index below scores by cosine similarity.
embed_model = SentenceTransformer(EMBED_MODEL_NAME)
embeddings = embed_model.encode(
    [chunk["text"] for chunk in chunks],
    normalize_embeddings=True,
    convert_to_numpy=True,
    show_progress_bar=True,
)

# Flat inner-product index sized to the embedding width.
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Persist the index together with the chunk metadata and the embedding model
# name; row i of the index corresponds to chunks[i].
faiss.write_index(index, str(INDEX_DIR / "faiss.index"))
(INDEX_DIR / "meta.json").write_text(
    json.dumps({"chunks": chunks, "embed_model": EMBED_MODEL_NAME}, ensure_ascii=False, indent=2),
    encoding="utf-8",
)

print("Index built and saved.")


In [None]:

def retrieve(query: str, k: int = 3) -> List[Dict]:
    """Return the chunks most similar to ``query``, best match first.

    Args:
        query: Natural-language question to embed and search for.
        k: Maximum number of chunks to return. Fewer are returned when the
            index holds fewer than ``k`` vectors.

    Returns:
        A list of chunk dicts (copies of the entries in ``chunks``), each with
        an added ``score`` key holding the inner-product similarity as a float.
        Empty when ``k <= 0`` or the index is empty.
    """
    # Never request more neighbours than the index holds: FAISS pads missing
    # results with id -1, and chunks[-1] would silently return the LAST chunk
    # with a meaningless score.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    q_emb = embed_model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, idxs = index.search(q_emb, k)

    results = []
    for score, ix in zip(scores[0], idxs[0]):
        if ix < 0:  # defensive: skip any padding slot
            continue
        results.append({**chunks[int(ix)], "score": float(score)})
    return results

# Quick smoke test: display the top-2 chunks (with similarity scores) for a
# question taken from the bundled sample FAQ document.
retrieve("How do I report a lost credit card?", k=2)


In [None]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Load the tokenizer and seq2seq model named by GEN_MODEL_NAME, then wrap both
# in a text2text-generation pipeline that rag_answer() calls with the prompt.
gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
generator = pipeline("text2text-generation", model=gen_model, tokenizer=gen_tokenizer)

def make_prompt(query: str, contexts: List[Dict], max_ctx_chars: int = 1200) -> str:
    """Assemble the instruction prompt handed to the generator.

    Args:
        query: The user's question.
        contexts: Retrieved chunks; only each item's ``"text"`` value is used.
        max_ctx_chars: The joined context is cut to this many characters.

    Returns:
        A prompt made of an instruction line, a ``CONTEXT:`` section, the
        ``QUESTION:`` line and a trailing ``ANSWER:`` cue.
    """
    # Chunks are separated by a blank line, then clipped as one string, so the
    # cut can land in the middle of a chunk.
    joined = "\n\n".join(item["text"] for item in contexts)
    clipped = joined[:max_ctx_chars]
    sections = (
        "You are a helpful assistant. Use the CONTEXT to answer the QUESTION.",
        "",
        "CONTEXT:",
        clipped,
        "",
        f"QUESTION: {query}",
        "",
        "ANSWER:",
    )
    return "\n".join(sections)


In [None]:

def rag_answer(query: str, k: int = 3) -> Dict:
    """Answer ``query`` using retrieved chunks as context.

    Args:
        query: The user's question.
        k: Number of chunks to request from ``retrieve``.

    Returns:
        A dict with ``query`` (the input), ``answer`` (the generated text of
        the first generator result) and ``contexts`` (the retrieved chunks).
    """
    contexts = retrieve(query, k=k)
    generations = generator(make_prompt(query, contexts), max_new_tokens=200)
    answer_text = generations[0]["generated_text"]
    return {"query": query, "answer": answer_text, "contexts": contexts}

# End-to-end demo: retrieve 2 chunks for the question and generate an answer from them.
rag_answer("What is a chargeback?", k=2)
