In [1]:
import json
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import warnings
import re
from typing import List

warnings.filterwarnings('ignore')

# Check if GPU is available (industry standard practice)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


In [2]:
with open('../data/handbook_text.txt', 'r') as file:
    handbook_text = file.read()

# Load FAQ dataset from JSON format
data_path = '../data/campus_faq.json'

try:
    with open(data_path) as f:
        data = json.load(f)
    print("✓ Dataset loaded successfully!")
except FileNotFoundError:
    print("❌ Error: Could not find the dataset file. Please check the path.")
    raise

# Extract questions and answers from the nested structure
questions = []
answers = []

for item in data['faq']:
    questions.append(item['question'])
    answers.append(item['answer'])

# Create DataFrame for easier data manipulation
faq_data = pd.DataFrame({
    'question': questions,
    'answer': answers
})

print(f"Dataset contains {len(faq_data)} question-answer pairs")
print(f"Handbook text length: {len(handbook_text)} characters")

✓ Dataset loaded successfully!
Dataset contains 30 question-answer pairs
Handbook text length: 1924 characters


In [3]:
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """
    Split text into overlapping chunks for better retrieval.
    This is a simplified version - production systems use more sophisticated methods.
    
    Args:
        text: Input text to chunk
        chunk_size: Maximum characters per chunk
        overlap: Characters to overlap between chunks
    
    Returns:
        List of text chunks
    """
    # TODO: Split text into sentences (hint: use re.split with pattern r'[.!?]+')
    sentences = re.split(r'[.!?]+', text)
    
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
            
        # TODO: Check if adding this sentence would exceed chunk_size
        # If yes AND current_chunk is not empty, start a new chunk
        if current_chunk and len(current_chunk) + len(sentence) + 1 > chunk_size:
            # Add current_chunk to chunks
            chunks.append(current_chunk.strip())
            
            # Add overlap from the end of previous chunk
            if overlap > 0 and len(current_chunk) > overlap:
                current_chunk = current_chunk[-overlap:] + " " + sentence
            else:
                current_chunk = sentence
        else:
            # TODO: Add sentence to current_chunk (handle empty chunk case)
            current_chunk = sentence
    
    # Don't forget the last chunk!
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

# Test your chunking function
print("🧪 Testing your chunking function...")
sample_text = "This is sentence one. This is sentence two. This is sentence three. This is sentence four."
test_chunks = chunk_text(sample_text, chunk_size=50, overlap=10)
print(f"Original: {sample_text}")
print(f"Chunks: {test_chunks}")

🧪 Testing your chunking function...
Original: This is sentence one. This is sentence two. This is sentence three. This is sentence four.
Chunks: ['This is sentence four']


In [6]:
print("Chunking handbook text...")
# TODO: Use your chunking function to split handbook_text into chunks
# Use chunk_size=400 and overlap=50
handbook_chunks = (chunk_text(handbook_text,chunk_size=400,overlap=50))
print(f"✓ Created {len(handbook_chunks)} handbook chunks")

# Step 3b: Format FAQ documents
print("Preparing FAQ documents...")
faq_documents = []

# TODO: Loop through faq_data and create formatted documents
# Format each as: "Question: [question]\nAnswer: [answer]"
for _, row in faq_data.iterrows():
    faq_doc = f"Question: {row['question']} \nAnswer: {row['answer']}"
    faq_documents.append(faq_doc)

print(f"✓ Created {len(faq_documents)} FAQ documents")

# Step 3c: Combine and create metadata
all_documents = handbook_chunks + faq_documents
document_metadata = []

# Create metadata for handbook chunks (this is complete)
for i, chunk in enumerate(handbook_chunks):
    document_metadata.append({
        'source': 'handbook',
        'chunk_id': i,
        'type': 'text_chunk'
    })

# TODO: Create metadata for FAQ documents
for i, faq_doc in enumerate(faq_documents):
    metadata = {
        'source':'faq',
        'faq_id':'i',
        'type':'qa_pair',
        'question':faq_data.iloc[i]['question']
        # Include: 'source': 'faq', 'faq_id': i, 'type': 'qa_pair', 'question': faq_data.iloc[i]['question']
    }
    document_metadata.append(metadata)

print(f"\n📚 Knowledge Base Summary:")
print(f"- Total documents: {len(all_documents)}")
print(f"- Handbook chunks: {len(handbook_chunks)}")
print(f"- FAQ documents: {len(faq_documents)}")
print(f"- Metadata entries: {len(document_metadata)}")

# Let's examine a few examples
print(f"\n🔍 Sample Documents:")
if handbook_chunks:
    print(f"Handbook chunk example: {handbook_chunks[0][:200]}...")
if faq_documents:
    print(f"FAQ document example: {faq_documents[0][:200]}...")

Chunking handbook text...
✓ Created 6 handbook chunks
Preparing FAQ documents...
✓ Created 30 FAQ documents

📚 Knowledge Base Summary:
- Total documents: 36
- Handbook chunks: 6
- FAQ documents: 30
- Metadata entries: 36

🔍 Sample Documents:
Handbook chunk example: **Machine Learning Basics**
   - Overview of Machine Learning
   - Types of Machine Learning: Supervised, Unsupervised, Reinforcement Learning
   - Key Algorithms: Linear Regression, Decision Trees, N...
FAQ document example: Question: What are the library hours? 
Answer: The library is open from 8 AM to 10 PM, Monday to Friday....


In [7]:
print("🔧 Loading sentence transformer model...")
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
print("✓ Model loaded successfully!")

# TODO: Create embeddings for all documents
print(f"\n🧮 Creating embeddings for {len(all_documents)} documents...")
print("This might take a moment - we're converting text to mathematical vectors!")

# YOUR CODE HERE: Use sentence_model.encode() to create embeddings
# Hint: Use show_progress_bar=True to see progress
document_embeddings = (sentence_model.encode(all_documents,show_progress_bar=True))

print(f"✓ Document embeddings created!")
print(f"📊 Embedding Statistics:")
print(f"   - Shape: {document_embeddings.shape}")
print(f"   - Each document → {document_embeddings.shape[1]} numbers")
print(f"   - Memory usage: ~{document_embeddings.nbytes / 1024 / 1024:.1f} MB")

# 🧪 Let's explore what embeddings look like
print(f"\n🔬 Embedding Analysis:")
print(f"First document embedding (first 10 values): {document_embeddings[0][:10]}")
print(f"Embedding range: {document_embeddings.min():.3f} to {document_embeddings.max():.3f}")

# TODO: Calculate similarity between first two documents
# Hint: Use cosine_similarity([document_embeddings[0]], [document_embeddings[1]])[0][0]
similarity = cosine_similarity([document_embeddings[0]],[document_embeddings[1]])[0][0]
print(f"Similarity between first two documents: {similarity:.3f}")


🔧 Loading sentence transformer model...
✓ Model loaded successfully!

🧮 Creating embeddings for 36 documents...
This might take a moment - we're converting text to mathematical vectors!


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

✓ Document embeddings created!
📊 Embedding Statistics:
   - Shape: (36, 384)
   - Each document → 384 numbers
   - Memory usage: ~0.1 MB

🔬 Embedding Analysis:
First document embedding (first 10 values): [-4.5584787e-02 -8.5991524e-02  4.0799651e-02 -4.6326522e-02
  1.3739467e-02  3.0480000e-05 -5.2691088e-03 -5.7575550e-02
 -1.3184093e-01  2.9281942e-02]
Embedding range: -0.211 to 0.217
Similarity between first two documents: 0.638


In [9]:
def retrieve_relevant_documents(query: str, top_k: int = 3, similarity_threshold: float = 0.1):
    """
    Retrieve the most relevant documents for a given query.
    This function demonstrates the retrieval process step-by-step.
    """
    print(f"🔍 RETRIEVAL PROCESS for: '{query}'")
    print("=" * 50)
    
    # Step 1: Convert query to embedding
    print("Step 1: Converting query to embedding...")
    # TODO: Create embedding for the query using sentence_model
    query_embedding = sentence_model.encode([query],show_progress_bar=True)
    print(f"✓ Query converted to {query_embedding.shape[1]}-dimensional vector")
    
    # Step 2: Calculate similarities
    print("Step 2: Calculating similarities with all documents...")
    # TODO: Calculate cosine similarities between query and all documents
    similarities = cosine_similarity(query_embedding,document_embeddings)
    similarities = similarities[0]  # Extract the array
    print(f"✓ Calculated {len(similarities)} similarity scores")
    print(f"   Similarity range: {similarities.min():.3f} to {similarities.max():.3f}")
    
    # Step 3: Find top-k most similar documents
    print(f"Step 3: Finding top-{top_k} most relevant documents...")
    # TODO: Get indices of top-k highest similarity scores
    # Hint: Use np.argsort(similarities)[::-1][:top_k]
    top_indices = np.argsort(similarities)[::-1][:top_k]
    print(f"✓ Top document indices: {top_indices}")
    
    # Step 4: Filter by threshold and prepare results
    print(f"Step 4: Filtering by threshold ({similarity_threshold})...")
    results = []
    
    for idx in top_indices:
        score = similarities[idx]
        # TODO: Check if score meets threshold
        if score>= similarity_threshold:
            # Prepare result dictionary
            result = {
                'document': all_documents[idx],
                'score': float(score),
                'metadata': document_metadata[idx],
                'index': int(idx)
            }
            results.append(result)
    
    print(f"✓ {len(results)} documents passed the threshold")
    
    # Step 5: Display results for learning
    print(f"\n📋 RETRIEVAL RESULTS:")
    for i, result in enumerate(results):
        source = result['metadata']['source']
        score = result['score']
        preview = result['document'][:80] + "..." if len(result['document']) > 80 else result['document']
        print(f"  {i+1}. [{source.upper()}] Score: {score:.3f}")
        print(f"      Preview: {preview}")
        print()
    
    return results

# 🧪 Test your retrieval function
print("🧪 Testing Your Retrieval Function")
print("=" * 40)
test_queries = [
    "What are the library hours?",
    "How do I register for classes?"
]

for query in test_queries:
    retrieved_docs = retrieve_relevant_documents(query, top_k=3)
    print(f"Retrieved {len(retrieved_docs)} documents for: '{query}'\n")

🧪 Testing Your Retrieval Function
🔍 RETRIEVAL PROCESS for: 'What are the library hours?'
Step 1: Converting query to embedding...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✓ Query converted to 384-dimensional vector
Step 2: Calculating similarities with all documents...
✓ Calculated 36 similarity scores
   Similarity range: -0.042 to 0.798
Step 3: Finding top-3 most relevant documents...
✓ Top document indices: [ 6 33 17]
Step 4: Filtering by threshold (0.1)...
✓ 3 documents passed the threshold

📋 RETRIEVAL RESULTS:
  1. [FAQ] Score: 0.798
      Preview: Question: What are the library hours? 
Answer: The library is open from 8 AM to ...

  2. [FAQ] Score: 0.554
      Preview: Question: Where can I study late at night? 
Answer: The library has 24-hour stud...

  3. [FAQ] Score: 0.482
      Preview: Question: What are the dining hours? 
Answer: The cafeteria is open from 7 AM to...

Retrieved 3 documents for: 'What are the library hours?'

🔍 RETRIEVAL PROCESS for: 'How do I register for classes?'
Step 1: Converting query to embedding...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✓ Query converted to 384-dimensional vector
Step 2: Calculating similarities with all documents...
✓ Calculated 36 similarity scores
   Similarity range: 0.063 to 0.562
Step 3: Finding top-3 most relevant documents...
✓ Top document indices: [18 15 21]
Step 4: Filtering by threshold (0.1)...
✓ 3 documents passed the threshold

📋 RETRIEVAL RESULTS:
  1. [FAQ] Score: 0.562
      Preview: Question: How do I add or drop a course? 
Answer: You can add or drop courses th...

  2. [FAQ] Score: 0.503
      Preview: Question: What should I do if I have a question about my classes? 
Answer: Conta...

  3. [FAQ] Score: 0.418
      Preview: Question: How do I get a student ID card? 
Answer: New student ID cards can be o...

Retrieved 3 documents for: 'How do I register for classes?'



In [10]:
# Load GPT-2 model and tokenizer for response generation
print("🤖 Loading GPT-2 model for text generation...")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

generation_model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(device)
print("✓ Generation model loaded successfully!")
print(f"Model running on: {next(generation_model.parameters()).device}")

🤖 Loading GPT-2 model for text generation...
✓ Generation model loaded successfully!
Model running on: cpu


In [None]:
def generate_rag_response(query: str, max_new_tokens: int = 100):
    """
    Complete RAG pipeline: retrieve relevant documents and generate response.
    🛠️ YOUR TASK: Complete the missing parts of this function!
    """
    print(f"\n🎯 RAG PIPELINE for: '{query}'")
    print("=" * 60)
    
    # Step 1: Retrieve relevant documents
    print("STEP 1: RETRIEVAL")
    retrieved_docs = retrieve_relevant_documents(query, top_k=3)
    
    # Step 2: Prepare context
    print("STEP 2: CONTEXT PREPARATION")
    if not retrieved_docs:
        print("⚠️ No relevant documents found - generating without context")
        context = "No specific information found in the knowledge base."
    else:
        # TODO: Combine retrieved documents into context
        context_parts = []
        for doc in retrieved_docs:
            source_label = f"[{doc['metadata']['source'].upper()}]"
            # YOUR CODE HERE: Add formatted document to context_parts
            # Format: f"{source_label} {doc['document']}"
            context_parts.append(f"{source_label} {doc['document']}")

        
        context = "\n\n".join(context_parts)
    
    print(f"✓ Context prepared ({len(context)} characters)")
    
    # Step 3: Create prompt
    print("STEP 3: PROMPT ENGINEERING")
    # TODO: Create a well-structured prompt
    # Include: context, user question, and clear instructions
    prompt = f"""Based on the following context, please answer the user's question accurately and helpfully.

Context:
{context}

User Question: {query}

Answer:"""
    
    print(f"✓ Prompt created ({len(prompt)} characters)")
    
    # Step 4: Generate response
    print("STEP 4: RESPONSE GENERATION")
    # TODO: Tokenize the prompt and move to device
    inputs = tokenizer.encode(prompt,return_tensors='pt').to(device)
    
    with torch.no_grad():
        # TODO: Generate response using the model
        outputs = generation_model.generate(
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
        # Include: max_new_tokens, temperature=0.7, do_sample=True, pad_token_id, eos_token_id
    
    # Step 5: Extract and clean response
    print("STEP 5: RESPONSE EXTRACTION")
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # TODO: Extract just the generated part (after the prompt)
    generated_answer = full-response.split('Answer:')[1].strip() if 'Answer:' in full_response else full_response.strip()
    
    print(f"✓ Response generated ({len(generated_answer)} characters)")
    
    return {
        'query': query,
        'retrieved_documents': retrieved_docs,
        'context_used': context,
        'generated_answer': generated_answer,
        'num_docs_retrieved': len(retrieved_docs)
    }

# 🧪 Test your complete RAG system!
print("🚀 Testing Your Complete RAG System!")
print("If you've implemented everything correctly, this should work:")
result = generate_rag_response("What are the library hours?")
print(f"\n🎉 FINAL ANSWER: {result['generated_answer']}")

🚀 Testing Your Complete RAG System!
If you've implemented everything correctly, this should work:

🎯 RAG PIPELINE for: 'What are the library hours?'
STEP 1: RETRIEVAL
🔍 RETRIEVAL PROCESS for: 'What are the library hours?'
Step 1: Converting query to embedding...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


✓ Query converted to 384-dimensional vector
Step 2: Calculating similarities with all documents...
✓ Calculated 36 similarity scores
   Similarity range: -0.042 to 0.798
Step 3: Finding top-3 most relevant documents...
✓ Top document indices: [ 6 33 17]
Step 4: Filtering by threshold (0.1)...
✓ 3 documents passed the threshold

📋 RETRIEVAL RESULTS:
  1. [FAQ] Score: 0.798
      Preview: Question: What are the library hours? 
Answer: The library is open from 8 AM to ...

  2. [FAQ] Score: 0.554
      Preview: Question: Where can I study late at night? 
Answer: The library has 24-hour stud...

  3. [FAQ] Score: 0.482
      Preview: Question: What are the dining hours? 
Answer: The cafeteria is open from 7 AM to...

STEP 2: CONTEXT PREPARATION
✓ Context prepared (423 characters)
STEP 3: PROMPT ENGINEERING
✓ Prompt created (578 characters)
STEP 4: RESPONSE GENERATION
STEP 5: RESPONSE EXTRACTION
✓ Response generated (534 characters)

🎉 FINAL ANSWER: The Obama administration is threatening t

In [13]:
test_queries = [
    "What are the library hours?",
    "How do I register for classes?", 
    "What dining options are available on campus?",
    "Tell me about student support services",
    "What happens if I lose my student ID?",
    "How can I get involved in campus activities?"
]

print("🏆 COMPREHENSIVE RAG SYSTEM TESTING")
print("=" * 60)
print("🛠️ YOUR TASK: Uncomment and run the testing code once everything is implemented!")

# TODO: Uncomment the code below once you've completed all the functions

"""
all_results = []

for i, query in enumerate(test_queries, 1):
    print(f"\n📝 TEST {i}: {query}")
    print("-" * 50)
    
    result = generate_rag_response(query)
    all_results.append(result)
    
    print(f"\n📊 RESULTS SUMMARY:")
    print(f"- Documents retrieved: {result['num_docs_retrieved']}")
    print(f"- Context length: {len(result['context_used'])} characters")
    print(f"- Generated answer: {result['generated_answer']}")
    
    if i < len(test_queries):
        print("\n" + "="*60)

print(f"\n✅ Testing complete! ")
"""

print(f"\n🎯 ANALYSIS QUESTIONS (Discuss with your team):")
print("- Which queries worked best? Why?")
print("- Where did the system struggle? What could be the reasons?") 
print("- How could you improve the chunking strategy?")
print("- What about the prompts could be enhanced?")
print("- How might you handle queries with no relevant context?")
print("- What would you change for a production system?")

print(f"\n🏅 BONUS CHALLENGES:")
print("1. Implement a similarity threshold that adapts based on query complexity")
print("2. Add a re-ranking step that considers document diversity")
print("3. Implement caching for embeddings to speed up repeated queries")
print("4. Add evaluation metrics to measure RAG system quality")

🏆 COMPREHENSIVE RAG SYSTEM TESTING
🛠️ YOUR TASK: Uncomment and run the testing code once everything is implemented!

🎯 ANALYSIS QUESTIONS (Discuss with your team):
- Which queries worked best? Why?
- Where did the system struggle? What could be the reasons?
- How could you improve the chunking strategy?
- What about the prompts could be enhanced?
- How might you handle queries with no relevant context?
- What would you change for a production system?

🏅 BONUS CHALLENGES:
1. Implement a similarity threshold that adapts based on query complexity
2. Add a re-ranking step that considers document diversity
3. Implement caching for embeddings to speed up repeated queries
4. Add evaluation metrics to measure RAG system quality
