# Assignment: Implementing a Simple Retrieval-Augmented Generation (RAG) System

### Importing libraries and packages

In [9]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline

## Knowledge Base Creation

In [17]:
# Read the entire knowledge base (one UTF-8 text file) into a single string
with open("Film_Summary.txt", encoding="utf-8") as f:
    kb_text = f.read()

# Cut the text into consecutive, non-overlapping windows of `chunk_size`
# characters. The cut points ignore word and sentence boundaries, so a chunk
# can start or end in the middle of a word; the last chunk may be shorter.
chunk_size = 300
chunks = []
for start in range(0, len(kb_text), chunk_size):
    chunks.append(kb_text[start:start + chunk_size])
print("Total chunks:", len(chunks))

Total chunks: 5


In [18]:
# Dump every chunk between a start banner and an end banner so the exact
# cut points of the fixed-size chunking are visible.
for chunk in chunks:
    print("".join(["Beginning of chunk\n", chunk, "\n !!! This is the end of the chunk!!!\n"]))

Beginning of chunk
Sympathy for Lady Vengeance (2005), directed by Park Chan-wook, follows Lee Geum-ja, a woman who is released from prison after serving 13 years for the kidnapping and murder of a young boy—a crime she did not commit. During her sentence, she cultivated a reputation for angelic behavior, winning favo
 !!! This is the end of the chunk!!!

Beginning of chunk
r with guards and inmates alike. Yet beneath her gentle facade, she carefully plotted revenge against the real killer, a man who manipulated her into taking the fall.
Upon gaining her freedom, Geum-ja immediately begins executing her plan. She reconnects with individuals she helped in prison, callin
 !!! This is the end of the chunk!!!

Beginning of chunk
g in favors to gather resources and information. As she tracks down Mr. Baek, the true murderer, she wrestles with her own guilt, especially after being reunited with the daughter she was forced to give up. Her transformation becomes both emotional and moral as she

## Embedding and Indexing

In [19]:
# Sentence encoder shared by indexing (here) and by query-time retrieval
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# One embedding row per chunk, returned as a NumPy array
embeddings = embedder.encode(chunks, show_progress_bar=True, convert_to_numpy=True)

# Rescale every row to unit L2 length: with unit vectors, the inner product
# used for search is the cosine similarity
embeddings = np.divide(embeddings, np.linalg.norm(embeddings, axis=1, keepdims=True))


Batches: 100%|██████████| 1/1 [00:00<00:00, 26.71it/s]


In [20]:
# Width of one embedding vector (number of columns in the embedding matrix)
dimension = embeddings.shape[1]

# Flat inner-product index; the stored rows were L2-normalized beforehand,
# so the inner-product score is the cosine similarity
index = faiss.IndexFlatIP(dimension)

# Store all chunk vectors; row i of the index corresponds to chunks[i]
index.add(embeddings)
print(f"FAISS index populated with {index.ntotal} vectors.")

FAISS index populated with 5 vectors.


## Retrieval

In [21]:
def retrieve_relevant_chunks(query, top_k=2):
    """Return the knowledge-base chunks most similar to ``query``.

    The query is embedded with the module-level ``embedder``, scaled to unit
    L2 norm, and searched against the module-level FAISS ``index``. Hits are
    mapped back to text through the module-level ``chunks`` list.

    Args:
        query: The question or search text.
        top_k: Maximum number of chunks to return. Values larger than the
            number of indexed vectors are clamped; values <= 0 yield an
            empty result.

    Returns:
        A tuple ``(retrieved_chunks, scores)`` where ``retrieved_chunks`` is a
        list of chunk strings ordered as returned by the index search and
        ``scores`` is a 1-D NumPy array with the matching similarity scores.
    """
    # Never ask FAISS for more neighbours than it stores: it pads missing
    # results with label -1, and chunks[-1] would silently return the LAST
    # chunk as if it were a real hit.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return [], np.empty(0, dtype=np.float32)

    # Encode and normalize query
    query_embedding = embedder.encode([query], convert_to_numpy=True)
    query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)

    # Search for the k most similar chunks (single query -> use row 0)
    scores, indices = index.search(query_embedding, k)
    hit_ids, hit_scores = indices[0], scores[0]

    # Defensive: drop any remaining "no result" placeholders
    valid = hit_ids >= 0

    # Retrieve matching text chunks, keeping scores aligned with them
    retrieved_chunks = [chunks[i] for i in hit_ids[valid]]
    return retrieved_chunks, hit_scores[valid]

In [22]:
# Checkpoint name of the text-to-text model that writes the final answer
model_name = "google/flan-t5-small"

# Generation pipeline, built once and reused for every question
generator = pipeline(task="text2text-generation", model=model_name)

Device set to use cpu


## Generation (Augmentation)

In [26]:
def generate_answer(query, top_k=2, show_context=True):
    """Answer ``query`` with the generator, grounded on retrieved chunks.

    Retrieves the ``top_k`` most similar chunks via
    ``retrieve_relevant_chunks``, joins them with newlines into a context
    block, wraps context and question in an instruction prompt, and runs the
    module-level ``generator`` pipeline with sampling disabled.

    Args:
        query: The user question.
        top_k: Number of chunks to retrieve and place in the prompt
            (default 2).
        show_context: When True (default), print the retrieved chunks so the
            grounding of the answer can be inspected; pass False to silence it.

    Returns:
        The ``"generated_text"`` field of the first pipeline result.
    """
    # The similarity scores are not used for prompting
    context_chunks, _scores = retrieve_relevant_chunks(query, top_k=top_k)
    context = "\n".join(context_chunks)

    prompt = (
        "Use the context below to help answer the question. "
        "If the answer cannot be inferred by the context or general knowledge, "
        "respond with 'I don't know'."
        f"\n\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"
    )

    response = generator(prompt, max_length=750, do_sample=False)
    if show_context:
        print(context_chunks)
    return response[0]["generated_text"]

## Testing

### Test Case 1 (Factual)

In [27]:
# Factual lookup: the release year is expected to be stated in a retrieved chunk
query = "What year did Sympathy for Lady Vengeance come out?"
answer = generate_answer(query)

# Show the question, a blank line, then the generated answer
print(f"Query: {query}\n\nAnswer: {answer}")

['Sympathy for Lady Vengeance (2005), directed by Park Chan-wook, follows Lee Geum-ja, a woman who is released from prison after serving 13 years for the kidnapping and murder of a young boy—a crime she did not commit. During her sentence, she cultivated a reputation for angelic behavior, winning favo', 'past choices.\nThe story reaches a moral and psychological climax when Geum-ja discovers the full extent of Mr. Baek’s crimes—multiple murdered children—and gathers the victims’ parents, giving them the chance to exact justice themselves. What begins as a pure revenge mission evolves into a shared re']
Query: What year did Sympathy for Lady Vengeance come out?

Answer: (2005)


### Test Case 2 (Foil/General)

In [28]:
# General/foil question: asks for an overall judgement of the film's tone
query = "Is Sympathy for Lady Vengeance a happy or sad movie?"
answer = generate_answer(query)

# Show the question, a blank line, then the generated answer
print(f"Query: {query}\n\nAnswer: {answer}")

['Sympathy for Lady Vengeance (2005), directed by Park Chan-wook, follows Lee Geum-ja, a woman who is released from prison after serving 13 years for the kidnapping and murder of a young boy—a crime she did not commit. During her sentence, she cultivated a reputation for angelic behavior, winning favo', 'ckoning with grief, rage, and vengeance. The film ultimately explores themes of redemption, responsibility, and the messy, painful pursuit of closure in a world where forgiveness and justice rarely align neatly.\n\n']
Query: Is Sympathy for Lady Vengeance a happy or sad movie?

Answer: sad


### Test Case 3 (Synthesis)

In [29]:
# Synthesis question: asks for the name of the man who manipulated Geum-ja
query = "What is the name of the man who manipulated Geum-ja?"
answer = generate_answer(query)

# Show the question, a blank line, then the generated answer
print(f"Query: {query}\n\nAnswer: {answer}")

['r with guards and inmates alike. Yet beneath her gentle facade, she carefully plotted revenge against the real killer, a man who manipulated her into taking the fall.\nUpon gaining her freedom, Geum-ja immediately begins executing her plan. She reconnects with individuals she helped in prison, callin', 'past choices.\nThe story reaches a moral and psychological climax when Geum-ja discovers the full extent of Mr. Baek’s crimes—multiple murdered children—and gathers the victims’ parents, giving them the chance to exact justice themselves. What begins as a pure revenge mission evolves into a shared re']
Query: What is the name of the man who manipulated Geum-ja?

Answer: Baek
