In [1]:
"""
================================================================================
SIMPLE RETRIEVAL-AUGMENTED GENERATION (RAG) SYSTEM
================================================================================
Author: Ermiyas H.

Description:
This notebook implements a basic RAG pipeline using:
- Embedding Model: all-MiniLM-L6-v2 (Sentence Transformers)
- LLM: google/flan-t5-base
- Knowledge Base: The Moonbean Café information

Components:
1. Knowledge Base Creation and Chunking
2. Embedding and Indexing
3. Retrieval Function (Cosine Similarity)
4. Generation with LLM
5. Three Test Cases (Factual, Foil, Synthesis)
================================================================================
"""

# ============================================================================
# SECTION 1: INSTALLATION AND IMPORTS
# ============================================================================

# Install required packages
%pip install -q numpy sentence-transformers transformers torch scikit-learn

# Import required libraries
# If you get import errors, RESTART the kernel and run this cell again
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

print("All packages installed and imported successfully!", "-" * 80, sep="\n")

# ============================================================================
# SECTION 2: KNOWLEDGE BASE CREATION
# ============================================================================

# Raw knowledge-base text: paragraphs are separated by a blank line, which is
# what the chunking step below relies on.
knowledge_base = """
The Moonbean Café is a specialty coffee shop located in downtown Portland, Oregon, founded in 2019 by barista and entrepreneur Maria Chen. We are open Monday through Friday from 6:30 AM to 8:00 PM, Saturday from 7:00 AM to 9:00 PM, and Sunday from 8:00 AM to 6:00 PM. Our café is located at 456 Pearl Street, and we also operate a smaller kiosk location inside the Portland Public Library that is open weekdays from 8:00 AM to 5:00 PM. We offer a loyalty program called Moon Rewards where customers earn one star per dollar spent, and every 50 stars can be redeemed for a free drink of any size.

Our signature drinks include the Moonbeam Latte ($5.50), made with house-made vanilla bean syrup and locally sourced organic milk, the Cosmic Cold Brew ($4.75), which is steeped for 18 hours and served over ice, and the Galaxy Mocha ($6.00), featuring dark chocolate from a Portland chocolatier and topped with edible silver star sprinkles. We also serve classic espresso drinks, with prices ranging from $3.50 for a regular cappuccino to $5.00 for a large specialty latte. All of our coffee beans are ethically sourced from women-owned farms in Colombia and Ethiopia, and we offer oat milk, almond milk, and soy milk as dairy alternatives at no extra charge.

We accommodate various dietary restrictions and clearly label all menu items with allergen information. Our pastries are supplied fresh daily by a local bakery, and we always have at least two vegan and two gluten-free options available. If a customer is unsatisfied with their drink for any reason, we offer a full remake or refund within the same day of purchase, no questions asked. We do not accept returns on food items due to health regulations, but we encourage customers to speak with a manager if they have concerns. For catering orders over $100, we require 48 hours advance notice and offer a 10% discount for Moon Rewards members.
"""

# One chunk per paragraph: split on blank lines, trim each piece, and let
# filter(None, ...) discard pieces that are empty after trimming.
kb_chunks = list(filter(None, map(str.strip, knowledge_base.split('\n\n'))))

print(
    "KNOWLEDGE BASE CHUNKING COMPLETE",
    "-" * 80,
    f"Total chunks created: {len(kb_chunks)}",
    "",
    sep="\n",
)
# Show a short preview of every chunk, numbered from 1
for chunk_number, chunk_text in enumerate(kb_chunks, start=1):
    print(f"Chunk {chunk_number} (first 150 characters):\n  {chunk_text[:150]}...\n")


# ============================================================================
# SECTION 3: EMBEDDING AND INDEXING
# ============================================================================

print("=" * 80, "LOADING EMBEDDING MODEL", "=" * 80, sep="\n")

# Pre-trained Sentence Transformer; the same model embeds the KB chunks here
# and each query inside the retrieval function.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model: all-MiniLM-L6-v2\nStatus: Loaded successfully\n")

# Embed every knowledge-base chunk once, up front, so retrieval only has to
# embed the incoming query.
print("GENERATING EMBEDDINGS FOR KNOWLEDGE BASE", "-" * 80, sep="\n")
kb_embeddings = embedding_model.encode(kb_chunks)

# Quick sanity report on what was produced
print(
    f"Number of embeddings: {len(kb_embeddings)}",
    f"Embedding dimension: {kb_embeddings[0].shape[0]}",
    f"Total shape: {kb_embeddings.shape}",
    f"Data type: {type(kb_embeddings)}",
    "",
    sep="\n",
)


# ============================================================================
# SECTION 4: RETRIEVAL FUNCTION
# ============================================================================

def retrieve_relevant_chunks(query, top_k=2, verbose=True):
    """
    Retrieve the most relevant chunks from the knowledge base using cosine similarity.

    The query is embedded with the module-level ``embedding_model`` and scored
    against the precomputed ``kb_embeddings``; the matching entries of
    ``kb_chunks`` are returned best-first.

    Parameters:
    -----------
    query : str
        The user's question or query
    top_k : int or None, default=2
        Number of top chunks to retrieve. The value is clamped to the range
        [0, number of scored chunks]; None retrieves every chunk.
    verbose : bool, default=True
        Whether to print detailed information

    Returns:
    --------
    list of dict
        List containing top-k chunks (highest similarity first) with their metadata:
        - chunk_index: Index of the chunk in kb_chunks (plain int)
        - text: Full text of the chunk
        - similarity: Cosine similarity score (plain float)
    """

    if verbose:
        print("RETRIEVAL PROCESS")
        print("-" * 80)
        print(f"Query: {query}")
        print()

    # Step 1: Generate embedding for the query (encoded as a batch of one)
    query_embedding = embedding_model.encode([query])
    if verbose:
        print("Step 1: Query embedding generated")

    # Step 2: Calculate cosine similarity between query and all KB chunks;
    # [0] selects the single row that belongs to our one query.
    similarities = cosine_similarity(query_embedding, kb_embeddings)[0]
    if verbose:
        print("Step 2: Similarity scores calculated")
        print()

    # Clamp top_k before slicing. A negative value would otherwise slice from
    # the wrong end (e.g. [:-1] keeps all-but-one chunk), and a value larger
    # than the KB would make the "TOP k" header below report a wrong count.
    available = len(similarities)
    top_k = available if top_k is None else max(0, min(top_k, available))

    # Step 3: Get indices of top-k most similar chunks (argsort is ascending,
    # so reverse it to rank best-first)
    top_indices = np.argsort(similarities)[::-1][:top_k]

    # Step 4: Prepare results
    results = []
    if verbose:
        print(f"TOP {top_k} RETRIEVED CHUNKS:")
        print("-" * 80)

    for rank, idx in enumerate(top_indices, 1):
        if verbose:
            print(f"Rank {rank} | Similarity: {similarities[idx]:.4f} | Chunk {idx + 1}")
            print(f"Text preview: {kb_chunks[idx][:120]}...")
            print()

        # Convert numpy scalars to native Python types so the result dicts
        # compare, print and serialize like ordinary ints/floats.
        results.append({
            'chunk_index': int(idx),
            'text': kb_chunks[idx],
            'similarity': float(similarities[idx])
        })

    return results


# ============================================================================
# SECTION 5: LLM LOADING AND GENERATION
# ============================================================================

print("=" * 80, "LOADING LANGUAGE MODEL FOR GENERATION", "=" * 80, sep="\n")

# Tokenizer and seq2seq model are both loaded from the same checkpoint name
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

print(f"Model: {model_name}\nStatus: Loaded successfully\n")


def generate_answer(query, retrieved_chunks, verbose=True):
    """
    Generate an answer using the LLM with retrieved context.

    The texts of the retrieved chunks are joined into a single context block,
    wrapped in an instruction prompt together with the question, and decoded
    with beam search using the module-level ``tokenizer`` and ``llm_model``.

    Parameters:
    -----------
    query : str
        The user's question
    retrieved_chunks : list of dict
        Retrieved chunks from the retrieval function; only each dict's
        'text' entry is used here
    verbose : bool, default=True
        Whether to print the prompt and process

    Returns:
    --------
    str
        Generated answer from the LLM
    """

    # Combine retrieved chunks into context
    context = "\n\n".join(chunk['text'] for chunk in retrieved_chunks)

    # Construct prompt with retrieved context
    prompt = f"""Answer the following question based on the provided context. If the answer is not in the context, say "I don't have that information in my knowledge base."

Context:
{context}

Question: {query}

Answer:"""

    if verbose:
        print("GENERATION PROCESS")
        print("-" * 80)
        print("PROMPT SENT TO LLM:")
        print("=" * 80)
        print(prompt)
        print("=" * 80)
        print()

    # Generate answer using LLM.
    # NOTE(review): the prompt is capped at 512 tokens and the question sits
    # after the context, so a much longer context would presumably truncate
    # the question away - confirm before raising top_k.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    # Beam search without sampling is deterministic. The former
    # `temperature=0.7` argument only applies when sampling is enabled, so it
    # was ignored and produced an "invalid generation flags" warning.
    outputs = llm_model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # pass the tokenizer's mask explicitly
        max_length=150,
        num_beams=4,
        early_stopping=True
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if verbose:
        print("Generation complete")
        print()

    return answer


def rag_pipeline(query, top_k=2, verbose=True):
    """
    Run the full RAG flow for one question: retrieval first, then generation.

    Parameters:
    -----------
    query : str
        User's question
    top_k : int, default=2
        Number of chunks to retrieve
    verbose : bool, default=True
        Whether to print detailed information (banners here, plus whatever
        the retrieval and generation steps print when verbose)

    Returns:
    --------
    str
        Final generated answer
    """
    banner = "=" * 80

    if verbose:
        # Blank line, framed title, blank line
        print("", banner, "RAG PIPELINE EXECUTION", banner, "", sep="\n")

    # Step 1: Retrieval - pick the chunks most similar to the query
    context_chunks = retrieve_relevant_chunks(query, top_k=top_k, verbose=verbose)

    # Step 2: Generation - answer the query from those chunks
    final_answer = generate_answer(query, context_chunks, verbose=verbose)

    if verbose:
        # Framed "FINAL ANSWER:" header, the answer itself, then a blank line
        print(banner, "FINAL ANSWER:", banner, final_answer, "", sep="\n")

    return final_answer


# ============================================================================
# SECTION 6: TEST CASES
# ============================================================================

print("=" * 80)
print("RUNNING THREE TEST CASES")
print("=" * 80)
print()

# ----------------------------------------------------------------------------
# Test Case 1: Factual Question (Direct KB Answer)
# ----------------------------------------------------------------------------

print()
print("#" * 80)
print("TEST CASE 1: FACTUAL QUESTION")
print("#" * 80)
print("Description: Direct factual question answerable from KB")
print("Expected: Should retrieve correct chunk and answer accurately")
print()

query1 = "What are the hours for The Moonbean Café on Saturday?"
answer1 = rag_pipeline(query1, top_k=2)

# Derive the status from the generated answer instead of unconditionally
# reporting PASS: both Saturday times from the KB must appear in the answer.
expected_times1 = ("7:00", "9:00")
if all(time_text in answer1 for time_text in expected_times1):
    status = "PASS - Answer correctly retrieved from KB"
else:
    status = "FAIL - Expected Saturday hours not found in answer, review output"

print("TEST CASE 1 ANALYSIS:")
print("-" * 80)
print(f"Query: {query1}")
print("Expected Answer: Saturday 7:00 AM to 9:00 PM")
print(f"Actual Answer: {answer1}")
print(f"Status: {status}")
print("-" * 80)
print()


# ----------------------------------------------------------------------------
# Test Case 2: Foil/General Question (NOT in KB)
# ----------------------------------------------------------------------------

print()
print("#" * 80)
print("TEST CASE 2: FOIL/GENERAL QUESTION (NOT IN KB)")
print("#" * 80)
print("Description: Question about information NOT in the knowledge base")
print("Expected: Should say NO or state information not available")
print()

query2 = "Does The Moonbean Café serve pizza?"
answer2 = rag_pipeline(query2, top_k=2)

print("TEST CASE 2 ANALYSIS:")
print("-" * 80)
print(f"Query: {query2}")
print("Expected: Should indicate NO or not in knowledge base")
print(f"Actual Answer: {answer2}")

# Match refusal markers as whole words. A plain substring test would treat
# "cappuccino" or "know" as containing "no" and "another" as containing "not",
# reporting PASS for an answer that never refused. "cannot" is listed
# explicitly because the old substring test matched it via "not".
refusal_markers = {"no", "not", "don't", "cannot"}
answer2_words = {word.strip(".,;:!?\"'()") for word in answer2.lower().split()}

if refusal_markers & answer2_words:
    status = "PASS - LLM correctly identified information NOT in KB"
else:
    status = "PARTIAL - LLM may have hallucinated, review answer"

print(f"Status: {status}")
print("-" * 80)
print()


# ----------------------------------------------------------------------------
# Test Case 3: Synthesis Question (Multiple Chunks Required)
# ----------------------------------------------------------------------------

print(
    "",
    "#" * 80,
    "TEST CASE 3: SYNTHESIS QUESTION",
    "#" * 80,
    "Description: Question requiring synthesis from multiple chunks",
    "Expected: Should combine pricing info + dietary alternatives",
    "",
    sep="\n",
)

query3 = "I'm lactose intolerant and only have $5. What drink options do I have at The Moonbean Café?"
answer3 = rag_pipeline(query3, top_k=2)

print(
    "TEST CASE 3 ANALYSIS:",
    "-" * 80,
    f"Query: {query3}",
    "Expected: Should mention:",
    "  - Dairy alternatives (oat/almond/soy milk)",
    "  - Drinks under $5 (Cosmic Cold Brew $4.75, cappuccino $3.50)",
    f"Actual Answer: {answer3}",
    sep="\n",
)

# Keyword checks on the lower-cased answer: one for dairy alternatives,
# one for an affordable drink being named.
answer3_lower = answer3.lower()
has_dairy_alt = any(term in answer3_lower for term in ("milk", "dairy"))
has_drink_options = any(drink in answer3_lower for drink in ("cold brew", "cappuccino", "cosmic"))

# Both checks hit -> PASS, exactly one -> PARTIAL, none -> NEEDS IMPROVEMENT
status = {
    2: "PASS - LLM successfully synthesized info from multiple chunks",
    1: "PARTIAL - Answer includes some but not all required information",
    0: "NEEDS IMPROVEMENT - Answer does not synthesize required information",
}[int(has_dairy_alt) + int(has_drink_options)]

print(f"Status: {status}", "-" * 80, "", sep="\n")


# ============================================================================
# SECTION 7: FINAL SUMMARY
# ============================================================================

# Closing banner: states that the three cases ran and points the reader back
# at the per-case analysis printed earlier.
print(
    "",
    "=" * 80,
    "TEST EXECUTION SUMMARY",
    "=" * 80,
    "Test Case 1 (Factual):                 COMPLETED",
    "Test Case 2 (Foil/General):            COMPLETED",
    "Test Case 3 (Synthesis):               COMPLETED",
    "",
    "All test cases have been executed.",
    "Review the analysis sections above for detailed results.",
    "=" * 80,
    "",
    sep="\n",
)


# ============================================================================
# SECTION 8: COMPARISON - RAG vs RAW LLM (BONUS ANALYSIS)
# ============================================================================

# Introductory banner for the RAG-vs-raw-LLM comparison that follows
print(
    "",
    "=" * 80,
    "BONUS: COMPARISON - RAG vs RAW LLM (WITHOUT RETRIEVAL)",
    "=" * 80,
    "This section demonstrates the value of RAG by comparing answers",
    "with and without retrieval-augmented generation.",
    "",
    sep="\n",
)

def generate_raw_llm_answer(query):
    """
    Generate answer using LLM WITHOUT retrieval (no context).

    Serves as the baseline for the RAG comparison: the question is wrapped in
    a bare prompt with no knowledge-base context and decoded with the same
    beam-search settings as the RAG generation step.

    Parameters:
    -----------
    query : str
        The user's question

    Returns:
    --------
    str
        Generated answer from the LLM
    """
    prompt = f"""Answer the following question:

Question: {query}

Answer:"""

    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    # Beam search without sampling is deterministic. The former
    # `temperature=0.7` argument only applies when sampling is enabled, so it
    # was ignored and produced an "invalid generation flags" warning.
    outputs = llm_model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # pass the tokenizer's mask explicitly
        max_length=150,
        num_beams=4,
        early_stopping=True
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Same factual question as test case 1, this time sent to the LLM with no context
test_query = "What are the hours for The Moonbean Café on Saturday?"

print("COMPARISON TEST:", "-" * 80, f"Query: {test_query}", "", sep="\n")

raw_answer = generate_raw_llm_answer(test_query)

# Show the two answers side by side; the RAG answer is reused from test case 1
print(
    f"RAW LLM Answer (no RAG): {raw_answer}",
    f"RAG System Answer: {answer1}",
    "",
    "ANALYSIS:",
    "The RAG system provides accurate, grounded answers from the KB,",
    "while the raw LLM may hallucinate or provide generic responses.",
    "=" * 80,
    "",
    sep="\n",
)

Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


All packages installed and imported successfully!
--------------------------------------------------------------------------------
KNOWLEDGE BASE CHUNKING COMPLETE
--------------------------------------------------------------------------------
Total chunks created: 3

Chunk 1 (first 150 characters):
  The Moonbean Café is a specialty coffee shop located in downtown Portland, Oregon, founded in 2019 by barista and entrepreneur Maria Chen. We are open...

Chunk 2 (first 150 characters):
  Our signature drinks include the Moonbeam Latte ($5.50), made with house-made vanilla bean syrup and locally sourced organic milk, the Cosmic Cold Bre...

Chunk 3 (first 150 characters):
  We accommodate various dietary restrictions and clearly label all menu items with allergen information. Our pastries are supplied fresh daily by a loc...

LOADING EMBEDDING MODEL
Model: all-MiniLM-L6-v2
Status: Loaded successfully

GENERATING EMBEDDINGS FOR KNOWLEDGE BASE
---------------------------------------------

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Step 1: Query embedding generated
Step 2: Similarity scores calculated

TOP 2 RETRIEVED CHUNKS:
--------------------------------------------------------------------------------
Rank 1 | Similarity: 0.7147 | Chunk 1
Text preview: The Moonbean Café is a specialty coffee shop located in downtown Portland, Oregon, founded in 2019 by barista and entrep...

Rank 2 | Similarity: 0.3646 | Chunk 3
Text preview: We accommodate various dietary restrictions and clearly label all menu items with allergen information. Our pastries are...

GENERATION PROCESS
--------------------------------------------------------------------------------
PROMPT SENT TO LLM:
Answer the following question based on the provided context. If the answer is not in the context, say "I don't have that information in my knowledge base."

Context:
The Moonbean Café is a specialty coffee shop located in downtown Portland, Oregon, founded in 2019 by barista and entrepreneur Maria Chen. We are open Monday through Friday from 6:30 