# Task 1: Build RAG Pipeline - SOLUTION

In [None]:
import json
import os

import faiss
import numpy as np
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI
from sentence_transformers import SentenceTransformer

In [None]:
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# pasted into (and possibly committed with) the notebook. The placeholder is
# kept as the fallback for anyone who edits it in place.
api_key = os.environ.get("OPENAI_API_KEY") or "your-api-key-here"
client = OpenAI(api_key=api_key)
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# JSON files are UTF-8; pass the encoding explicitly instead of relying on the
# platform's locale default, which can mis-decode non-ASCII text.
with open('../fixtures/input/documents.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

## Task 1: Chunk Documents

In [None]:
# SOLUTION
# Configure the splitter once; it is reused for every document.
splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=120)

# Flatten all documents into one list of chunk records. Each record keeps the
# originating document's metadata, and chunk_id restarts at 0 per document.
all_chunks = []
for doc in documents:
    pieces = splitter.split_text(doc['content'])
    all_chunks.extend(
        {
            'text': piece,
            'source': doc['source'],
            'doc_id': doc['doc_id'],
            'chunk_id': position,
        }
        for position, piece in enumerate(pieces)
    )

print(f"✓ Created {len(all_chunks)} chunks")

## Task 2: Build FAISS Index

In [None]:
# SOLUTION
# Embed the text of every chunk. The vectors are normalised at encode time,
# so the inner-product index below scores by cosine similarity.
chunk_texts = [chunk['text'] for chunk in all_chunks]
raw_vectors = embed_model.encode(
    chunk_texts,
    normalize_embeddings=True,
    show_progress_bar=True,
)
embeddings = raw_vectors.astype('float32')

# The index width comes from the embedding matrix's second axis.
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

print(f"✓ Indexed {index.ntotal} chunks")

## Task 3: Implement Retrieval

In [None]:
# SOLUTION
def retrieve(query: str, k: int = 5):
    """Return up to *k* chunks from ``all_chunks`` that best match *query*.

    The query is embedded with ``embed_model`` (normalised, float32) and
    searched against the module-level FAISS ``index``.

    Args:
        query: Free-text search query.
        k: Maximum number of chunks to return.

    Returns:
        A list of at most ``k`` dicts in the order returned by
        ``index.search``. Each dict is a copy of the matching ``all_chunks``
        entry with an added ``'score'`` key holding the similarity score as a
        Python float. The list is empty when ``k`` is not positive or the
        index holds no vectors.
    """
    # FAISS pads its output with label -1 when fewer than k neighbours exist.
    # Without clamping, all_chunks[-1] would silently return the last chunk.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_emb = embed_model.encode(
        query,
        normalize_embeddings=True
    ).astype('float32').reshape(1, -1)

    scores, indices = index.search(query_emb, k)

    # Skip any remaining -1 padding defensively and convert numpy scalars to
    # plain Python types.
    return [
        {**all_chunks[int(idx)], 'score': float(score)}
        for score, idx in zip(scores[0], indices[0])
        if idx >= 0
    ]

print("✓ Retrieve function created")

## Task 4: Implement RAG with Citations

In [None]:
# SOLUTION
def rag(question: str, k: int = 5):
    """Answer *question* from retrieved chunks and list the sources offered.

    Args:
        question: The user's question; it is also used as the retrieval query.
        k: Number of chunks requested from ``retrieve``.

    Returns:
        A dict with two keys: ``'answer'`` is the message content of the first
        completion choice, and ``'sources'`` is a list with one
        ``{'id', 'source', 'doc_id'}`` dict per retrieved chunk, where ``id``
        is the chunk's 1-based citation number.
    """
    # Number the retrieved chunks from 1 so the model can cite them as [1], [2], ...
    numbered = list(enumerate(retrieve(question, k=k), 1))

    context = "\n\n".join(f"[{n}] {hit['text']}" for n, hit in numbered)
    sources = [
        {'id': n, 'source': hit['source'], 'doc_id': hit['doc_id']}
        for n, hit in numbered
    ]

    prompt = f"""Answer based on context. Cite sources using [1], [2], etc.

Context:
{context}

Question: {question}

Answer:"""

    messages = [
        {"role": "system", "content": "Answer with citations."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0,
    )

    answer = completion.choices[0].message.content
    return {'answer': answer, 'sources': sources}

print("✓ RAG function created")

## Task 5: Test on Multiple Queries

In [None]:
# SOLUTION
# JSON files are UTF-8; pass the encoding explicitly instead of relying on the
# platform's locale default.
with open('../fixtures/input/test_queries.json', 'r', encoding='utf-8') as f:
    test_queries = json.load(f)

# A query counts as a hit when at least one of its expected documents appears
# among the sources returned for it (top 3 chunks requested).
correct = 0
for query_data in test_queries:
    result = rag(query_data['query'], k=3)
    retrieved_doc_ids = [s['doc_id'] for s in result['sources']]
    expected = query_data['expected_doc_ids']

    if any(doc_id in retrieved_doc_ids for doc_id in expected):
        correct += 1

# An empty query file would otherwise raise ZeroDivisionError; report 0.0.
accuracy = correct / len(test_queries) if test_queries else 0.0
print(f"✓ Accuracy: {accuracy:.1%}")