# RAG Generation with LLMs

Learn to combine retrieval with LLM generation for question answering.

In [None]:
from openai import OpenAI
from sentence_transformers import SentenceTransformer
import faiss
import json
import tiktoken

## Setup

**Important:** Set your OpenAI API key before running the next cell.

In [None]:
# Initialize OpenAI client.
# The key is read from the OPENAI_API_KEY environment variable so the secret
# does not have to be written into the notebook; the original placeholder is
# kept as the fallback value when the variable is not set.
import os

api_key = os.environ.get("OPENAI_API_KEY", "your-api-key-here")
client = OpenAI(api_key=api_key)

# Load models
# Sentence-embedding model used to encode queries for the index search.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Load index and chunks from previous notebook
index = faiss.read_index('../output/documents.index')
# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open('../output/chunks.json', 'r', encoding='utf-8') as f:
    chunks = json.load(f)

print(f"✓ Loaded {index.ntotal} chunks")

## Basic RAG Pipeline

In [None]:
def retrieve(query: str, k: int = 5):
    """Retrieve the chunks that best match a query.

    The query is encoded with ``embed_model`` (normalized embedding), searched
    against ``index``, and each returned position is looked up in ``chunks``.

    Args:
        query: The text to search for.
        k: Maximum number of chunks to return.

    Returns:
        A list of dicts, one per hit, in the order the index returned them.
        Each dict is a copy of the matching entry in ``chunks`` plus a
        ``'score'`` key holding the value reported by ``index.search`` as a
        Python float. The list can be shorter than ``k`` when the index has
        fewer than ``k`` results.
    """
    # The index is searched with a float32 matrix of shape (1, dim).
    query_emb = embed_model.encode(
        query,
        normalize_embeddings=True
    ).astype('float32').reshape(1, -1)

    scores, indices = index.search(query_emb, k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        # FAISS pads missing results with label -1. Without this guard,
        # chunks[-1] would silently return the last chunk as a bogus hit.
        if idx < 0:
            continue
        results.append({
            **chunks[int(idx)],
            'score': float(score)
        })

    return results

def rag(question: str, k: int = 5):
    """Answer a question using retrieved chunks as context.

    Args:
        question: The user's question; it is also used as the retrieval query.
        k: Number of chunks requested from ``retrieve``.

    Returns:
        The message content of the first choice in the chat completion.
    """
    # Step 1: fetch the chunks and flatten their text into one context block
    hits = retrieve(question, k=k)
    context = "\n\n".join(hit['text'] for hit in hits)

    # Step 2: wrap the context and the question in the answering prompt
    prompt = f"""Answer the question based on the context below.

Context:
{context}

Question: {question}

Answer:"""

    # Step 3: send the system instruction and the prompt to the chat model
    messages = [
        {"role": "system", "content": "You are a helpful assistant that answers questions based on provided context."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0,
    )

    return completion.choices[0].message.content

# Smoke-test the basic pipeline on a sample question
question = "What is Python?"
answer = rag(question)

# One print call; the output is the same two lines as before
print(f"Question: {question}\nAnswer: {answer}")

## RAG with Citations

In [None]:
def rag_with_citations(question: str, k: int = 5):
    """Answer a question and report which retrieved chunks were offered as sources.

    Args:
        question: The user's question; it is also used as the retrieval query.
        k: Number of chunks requested from ``retrieve``.

    Returns:
        A dict with two keys:
        ``'answer'``: the message content of the first completion choice.
        ``'sources'``: a list of ``{'id', 'source', 'score'}`` dicts, one per
        retrieved chunk, where ``id`` is the 1-based number used to label the
        chunk in the prompt.
    """
    # Number the hits once so prompt labels and source ids stay aligned
    numbered_hits = list(enumerate(retrieve(question, k=k), 1))

    context = "\n\n".join(
        f"[{n}] {hit['text']}" for n, hit in numbered_hits
    )
    sources = [
        {'id': n, 'source': hit['source'], 'score': hit['score']}
        for n, hit in numbered_hits
    ]

    # The prompt asks the model to cite using the same bracketed numbers
    prompt = f"""Answer the question based on the context below.
Cite sources using [1], [2], etc.

Context:
{context}

Question: {question}

Answer:"""

    messages = [
        {"role": "system", "content": "Answer questions with citations."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0,
    )

    answer_text = completion.choices[0].message.content
    return {'answer': answer_text, 'sources': sources}

# Try the citation-aware pipeline on a pricing question
result = rag_with_citations("How much does the Professional plan cost?")

print(f"Answer: {result['answer']}")
print("\nSources:")
# One line per source: citation number, origin, score to three decimals
for src in result['sources']:
    label, origin, score = src['id'], src['source'], src['score']
    print(f"  [{label}] {origin} (score: {score:.3f})")

## Token-Aware Context Construction

In [None]:
def build_context_with_limit(chunks, max_tokens=3000):
    """Build a numbered context string that stays within a token budget.

    Chunks are added in the given order, each prefixed with its 1-based
    number, until adding the next one would exceed ``max_tokens``. Building
    stops at the first chunk that does not fit; later chunks are not tried.

    Args:
        chunks: Iterable of dicts that each provide ``'text'`` and ``'source'``.
        max_tokens: Upper bound on the estimated token count of the context.

    Returns:
        A ``(context, sources)`` tuple: the accepted chunks joined by blank
        lines, and a list of ``{'id', 'source'}`` dicts for those chunks.
    """
    encoding = tiktoken.encoding_for_model("gpt-4o-mini")

    # The separator is part of the final context, so it must count against
    # the budget too; otherwise the joined text can exceed max_tokens.
    separator = "\n\n"
    separator_tokens = len(encoding.encode(separator))

    context_parts = []
    sources = []
    total_tokens = 0

    for i, chunk in enumerate(chunks, 1):
        chunk_text = f"[{i}] {chunk['text']}"
        chunk_tokens = len(encoding.encode(chunk_text))
        # Every chunk after the first is preceded by one separator.
        # The total is an estimate: pieces are tokenized independently.
        added_tokens = chunk_tokens + (separator_tokens if context_parts else 0)

        if total_tokens + added_tokens > max_tokens:
            print(f"⚠️  Stopped at chunk {i-1} (token limit reached)")
            break

        context_parts.append(chunk_text)
        sources.append({'id': i, 'source': chunk['source']})
        total_tokens += added_tokens

    context = separator.join(context_parts)
    print(f"✓ Built context: {total_tokens} tokens, {len(sources)} chunks")

    return context, sources

# Retrieve more chunks than a 500-token budget is likely to hold,
# then let the builder trim the context
relevant_chunks = retrieve("What is machine learning?", k=10)
context, sources = build_context_with_limit(relevant_chunks, max_tokens=500)

print("\nContext:")
print(context)

## Compare: With vs Without RAG

In [None]:
question = "What was our Q4 2023 revenue?"
divider = "=" * 60

# Baseline: send the bare question to the model with no retrieved context
print("WITHOUT RAG:")
print(divider)
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": question}],
)
print(response.choices[0].message.content)

# Same question through the retrieval pipeline
print("\n\nWITH RAG:")
print(divider)
answer = rag(question)
print(answer)

print("\n" + divider)
print("RAG provides accurate, up-to-date information!")

## Handling 'Not in Context'

In [None]:
def rag_strict(question: str, k: int = 5):
    """Answer a question, instructing the model to refuse if the context lacks it.

    Args:
        question: The user's question; it is also used as the retrieval query.
        k: Number of chunks requested from ``retrieve``.

    Returns:
        The message content of the first choice in the chat completion.
    """
    hits = retrieve(question, k=k)
    context = "\n\n".join(hit['text'] for hit in hits)

    # The prompt spells out the exact sentence to use when the context has no answer
    prompt = f"""Answer ONLY based on the context below.
If the answer is not in the context, respond with: "I don't have that information in the provided context."

Context:
{context}

Question: {question}

Answer:"""

    messages = [
        {"role": "system", "content": "Answer only from provided context."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0,
    )

    return completion.choices[0].message.content

# Test with question NOT in documents
question = "What is the weather today?"
answer = rag_strict(question)

print(f"Question: {question}")
print(f"Answer: {answer}")

# Only report success if the model produced the refusal sentence that
# rag_strict's prompt asks for; printing the checkmark unconditionally would
# claim success even when the model answered anyway.
# `answer or ""` guards against a completion with no text content.
if "I don't have that information" in (answer or ""):
    print("\n✓ Correctly refuses to hallucinate!")
else:
    print("\n✗ Model did not return the expected refusal - review the answer above.")

## Test Multiple Questions

In [None]:
# Load test queries
# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open('../fixtures/input/test_queries.json', 'r', encoding='utf-8') as f:
    test_queries = json.load(f)

# Test RAG on all queries
# Each entry is read for its 'query' and 'expected_doc_ids' keys.
for query_data in test_queries[:5]:  # First 5
    question = query_data['query']

    print(f"\nQuestion: {question}")
    print("="*60)

    result = rag_with_citations(question, k=3)

    print(f"Answer: {result['answer']}")
    # Print retrieved sources next to the expected documents for a manual check
    print(f"\nSources: {[s['source'] for s in result['sources']]}")
    print(f"Expected docs: {query_data['expected_doc_ids']}")

## Summary

✅ Built basic RAG pipeline  
✅ Added citation tracking  
✅ Implemented token-aware context  
✅ Handled non-answerable questions  
✅ Compared with/without RAG

**Key patterns:**
- Retrieve → Build Context → Generate
- Always check token limits
- Number chunks for citations
- Instruct LLM to stay in context
- Use temperature=0 for factual answers

**Next:** Build a production RAG pipeline in the tasks!