In [2]:
# ============================================================
# NOTEBOOK 2: RAG SYSTEM WITH QDRANT VECTOR DATABASE
# ============================================================
# Purpose: Create embeddings and set up vector search
# What YOU need to do:
#   1. Run Notebook 1 first (to create math_knowledge_base.json)
#   2. No API key is needed for the steps below (embeddings run locally via sentence-transformers)
#   3. Run all cells
# ============================================================

# ============================================================
# STEP 1: Install Dependencies
# ============================================================
!pip install  qdrant-client sentence-transformers

# ============================================================
# STEP 2: Import Libraries
# ============================================================
import json
import os
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import numpy as np

# ============================================================
# STEP 3: Load Knowledge Base
# ============================================================
# Reads the JSON knowledge base that Notebook 1 wrote to disk.
import json
from pathlib import Path

# The data folder sits at the project root, parallel to the notebooks
# folder, so the working directory is assumed to be notebooks/.
project_root = Path.cwd().parent
data_dir = project_root / "data"
kb_file = data_dir / "math_knowledge_base.json"

try:
    with kb_file.open("r") as f:
        math_dataset = json.load(f)
except FileNotFoundError:
    # Tell the user how to recover, then let the error stop the cell.
    print(f"❌ ERROR: '{kb_file}' not found!")
    print("   Please run Notebook 1 first to create the dataset.")
    raise
else:
    print(f"✅ Loaded {len(math_dataset)} problems from knowledge base")




# ============================================================
# STEP 4: Initialize Embedding Model
# ============================================================
# Embeddings come from SentenceTransformers; this model turns text into
# 384-dimensional vectors.
print("🔄 Loading embedding model (this may take a minute)...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Embedding model loaded!")

# Smoke-test the model on a single sentence and report the vector size
sample_text = "What is 2 + 2?"
sample_embedding = embedding_model.encode(sample_text)
print(f"\n📊 Embedding dimension: {len(sample_embedding)}")

# ============================================================
# STEP 5: Create Embeddings for All Questions
# ============================================================
# One vector is produced per question in the knowledge base.
print("\n🔄 Creating embeddings for all questions...")

questions = [entry['question'] for entry in math_dataset]
embeddings = embedding_model.encode(questions, show_progress_bar=True)

print(f"✅ Created {len(embeddings)} embeddings")
print(f"   Shape: {embeddings.shape}")

# ============================================================
# STEP 6: Initialize Qdrant (In-Memory for Colab)
# ============================================================
# Qdrant is used in in-memory mode, so no separate server is needed.
print("\n🔄 Initializing Qdrant vector database...")

client = QdrantClient(":memory:")

COLLECTION_NAME = "math_problems"

# The vector size is taken from the embedding matrix so the collection
# always matches the model's output width; similarity is cosine.
vector_params = VectorParams(
    size=embeddings.shape[1],
    distance=Distance.COSINE,
)
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=vector_params,
)

print(f"✅ Created collection: {COLLECTION_NAME}")

# ============================================================
# STEP 7: Upload Data to Qdrant
# ============================================================
# Each point stores a question's vector plus its metadata as payload.
print("\n🔄 Uploading data to Qdrant...")

# Fields copied from every knowledge-base record into the point payload
PAYLOAD_FIELDS = ("question", "solution", "answer", "topic", "difficulty", "id")

# The point id is the record's position in the dataset; the record's own
# 'id' field is kept inside the payload.
points = [
    PointStruct(
        id=idx,
        vector=embedding.tolist(),
        payload={field: item[field] for field in PAYLOAD_FIELDS},
    )
    for idx, (item, embedding) in enumerate(zip(math_dataset, embeddings))
]

# Send all points in a single upsert call
client.upsert(collection_name=COLLECTION_NAME, points=points)

print(f"✅ Uploaded {len(points)} problems to Qdrant")

# ============================================================
# STEP 8: Create Search Function
# ============================================================
"""
This function searches the knowledge base for similar questions
"""
def search_knowledge_base(query, top_k=3, score_threshold=0.5):
    """
    Search for similar questions in the knowledge base.

    The query is embedded with the module-level ``embedding_model`` and
    matched against the ``COLLECTION_NAME`` collection through the
    module-level Qdrant ``client``.

    Args:
        query: User's math question
        top_k: Maximum number of candidates requested from Qdrant
        score_threshold: Minimum similarity score a candidate must reach
            to be included in the result

    Returns:
        List of dicts, in the order Qdrant returned the hits, each with the
        keys 'score', 'question', 'solution', 'answer', 'topic' and
        'difficulty'. The list can hold fewer than ``top_k`` items (or be
        empty) because the threshold is applied after retrieval.
    """
    # Create embedding for query
    query_vector = embedding_model.encode(query).tolist()

    # `QdrantClient.search` is deprecated in favour of `query_points`.
    # Prefer the new API and only fall back to `search` on older
    # qdrant-client releases that do not provide it yet.
    if hasattr(client, "query_points"):
        hits = client.query_points(
            collection_name=COLLECTION_NAME,
            query=query_vector,
            limit=top_k,
        ).points
    else:
        hits = client.search(
            collection_name=COLLECTION_NAME,
            query_vector=query_vector,
            limit=top_k,
        )

    # Filter by score threshold and keep only the fields callers use
    return [
        {
            'score': hit.score,
            'question': hit.payload['question'],
            'solution': hit.payload['solution'],
            'answer': hit.payload['answer'],
            'topic': hit.payload['topic'],
            'difficulty': hit.payload['difficulty'],
        }
        for hit in hits
        if hit.score >= score_threshold
    ]

print("✅ Search function created")

# ============================================================
# STEP 9: Test the Search System
# ============================================================
# Runs a handful of sample queries through the search function and
# prints whatever comes back.
print("\n" + "="*60)
print("🧪 TESTING KNOWLEDGE BASE SEARCH")
print("="*60)

test_queries = [
    "How do I solve 2x plus 5 equals 13?",
    "What is the derivative of a polynomial?",
    "How to calculate circle area?",
    "Solve quadratic equation"  # Should find quadratic problems
]

for query in test_queries:
    print(f"\n📝 Query: {query}")
    print("-" * 60)

    results = search_knowledge_base(query, top_k=2, score_threshold=0.3)

    # Nothing cleared the threshold: report it and move to the next query
    if not results:
        print("   ❌ No matches found in knowledge base")
        continue

    for rank, hit in enumerate(results, 1):
        print(f"\n   Result {rank} (Score: {hit['score']:.3f}):")
        print(f"   Question: {hit['question']}")
        print(f"   Topic: {hit['topic']} | Difficulty: {hit['difficulty']}")

# ============================================================
# STEP 10: Save RAG System State
# ============================================================
# Persists the settings the next notebooks need.
# YOU DON'T NEED TO CHANGE ANYTHING HERE
import os
import json
from pathlib import Path

rag_config = dict(
    collection_name=COLLECTION_NAME,
    embedding_model_name='all-MiniLM-L6-v2',
    embedding_dim=embeddings.shape[1],
    total_problems=len(math_dataset),
)

# Project root is the parent of the notebooks folder (the working
# directory is assumed to be notebooks/); data/ is created there if missing.
project_root = Path.cwd().parent
data_dir = project_root / "data"
data_dir.mkdir(exist_ok=True)

# Write the config as indented JSON
rag_config_path = data_dir / "rag_config.json"
rag_config_path.write_text(json.dumps(rag_config, indent=2))
print(f"\n✅ RAG configuration saved to '{rag_config_path}'")






# ============================================================
# STEP 11: Determine Routing Decision
# ============================================================
"""
This function decides: Should we use Knowledge Base or Web Search?
"""
def should_use_knowledge_base(query, confidence_threshold=0.5):
    """
    Decide whether to answer from the knowledge base or via web search.

    Looks up the single closest knowledge-base entry with
    ``search_knowledge_base`` and compares its score with the threshold.

    Args:
        query: User's question
        confidence_threshold: Minimum score needed to trust the KB match

    Returns:
        dict with the keys 'use_kb' (bool), 'confidence' (score of the best
        match, or 0.0 when nothing was found), 'reason' (human-readable
        explanation) and 'best_match' (the best hit, or None when nothing
        was found)
    """
    # Ask for the top hit only; the 0.0 threshold leaves the trust
    # decision to the comparison below.
    matches = search_knowledge_base(query, top_k=1, score_threshold=0.0)

    if not matches:
        return {
            'use_kb': False,
            'confidence': 0.0,
            'reason': 'No matches found',
            'best_match': None
        }

    top_match = matches[0]
    confidence = top_match['score']
    trusted = bool(confidence >= confidence_threshold)

    if trusted:
        reason = f'High confidence match (score: {confidence:.3f})'
    else:
        reason = f'Low confidence (score: {confidence:.3f}), using web search'

    # The best match is returned in both cases so callers can inspect it
    return {
        'use_kb': trusted,
        'confidence': confidence,
        'reason': reason,
        'best_match': top_match
    }

print("✅ Routing decision function created")

# ============================================================
# STEP 12: Test Routing Logic
# ============================================================
# Sends a mix of in-scope and out-of-scope questions through the router
# and prints the decision for each.
banner = "=" * 60

print("\n" + banner)
print("🧪 TESTING ROUTING LOGIC")
print(banner)

routing_tests = [
    "Solve for x: 2x + 5 = 13",  # Should use KB
    "What is quantum entanglement in physics?",  # Should use web search
    "Find the derivative of 3x squared",  # Should use KB
    "Who won the 2024 Nobel Prize in Mathematics?"  # Should use web search
]

for query in routing_tests:
    print(f"\n📝 Query: {query}")
    decision = should_use_knowledge_base(query, confidence_threshold=0.5)

    # Web-search branch first, so the KB branch reads without nesting
    if not decision['use_kb']:
        print("   🌐 USE WEB SEARCH")
        print(f"   Reason: {decision['reason']}")
        continue

    print("   ✅ USE KNOWLEDGE BASE")
    print(f"   Confidence: {decision['confidence']:.3f}")
    print(f"   Matched: {decision['best_match']['question']}")

# Closing summary, printed one entry per line
closing_lines = (
    "\n" + banner,
    "✅ NOTEBOOK 2 COMPLETE!",
    banner,
    "\n📝 WHAT YOU DID:",
    "   - Created embeddings for all questions",
    "   - Set up Qdrant vector database",
    "   - Built search and routing logic",
    "   - Tested retrieval system",
    "\n🔜 NEXT: Move to Notebook 3 (MCP Web Search)",
    banner,
)
for line in closing_lines:
    print(line)


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\brije\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
✅ Loaded 20 problems from knowledge base
🔄 Loading embedding model (this may take a minute)...
✅ Embedding model loaded!

📊 Embedding dimension: 384

🔄 Creating embeddings for all questions...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Created 20 embeddings
   Shape: (20, 384)

🔄 Initializing Qdrant vector database...
✅ Created collection: math_problems

🔄 Uploading data to Qdrant...
✅ Uploaded 20 problems to Qdrant
✅ Search function created

🧪 TESTING KNOWLEDGE BASE SEARCH

📝 Query: How do I solve 2x plus 5 equals 13?
------------------------------------------------------------

   Result 1 (Score: 0.894):
   Question: Solve for x: 2x + 5 = 13
   Topic: Linear Equations | Difficulty: easy

   Result 2 (Score: 0.533):
   Question: Solve the quadratic equation: x² - 7x + 12 = 0
   Topic: Quadratic Equations | Difficulty: medium

📝 Query: What is the derivative of a polynomial?
------------------------------------------------------------

   Result 1 (Score: 0.538):
   Question: Find the derivative of f(x) = 3x² + 2x - 1
   Topic: Derivatives | Difficulty: easy

   Result 2 (Score: 0.412):
   Question: Find the derivative of f(x) = sin(x) + cos(x)
   Topic: Derivatives of Trig Functions | Difficulty: easy

📝 Query: H

  search_results = client.search(
