In [None]:
!pip install langchain langgraph langchain-community langchain-text-splitters langchain-groq langchain-huggingface langchain-chroma pymupdf arxiv sentence-transformers

In [9]:
import time
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

In [2]:
from google.colab import userdata
import os
os.environ['GROQ_API_KEY'] = userdata.get('GROQ_API_KEY')
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

In [3]:
from langchain_community.document_loaders import WebBaseLoader
url = "https://en.wikipedia.org/wiki/Retrieval-augmented_generation"
loader = WebBaseLoader(url)
docs = loader.load()



In [4]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)
splits = text_splitter.split_documents(docs)

In [6]:
from langchain_chroma import Chroma
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

In [7]:
from langchain_groq import ChatGroq
llm = ChatGroq(model_name="openai/gpt-oss-120b", temperature=0)

In [8]:
from langchain.chains import RetrievalQA
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever, return_source_documents=True)

In [10]:
metrics = {
    "retrieval_latency": [],
    "embedding_drift": [],
    "cache_hits": 0,
    "cache_misses": 0,
    "response_quality": []
}

cache = {}
previous_embedding = None

In [11]:
def calculate_embedding_drift(old_embed, new_embed):
    if old_embed is None:
        return 0.0
    similarity = cosine_similarity([old_embed], [new_embed])[0][0]
    drift = 1 - similarity
    return drift

def simple_response_quality_score(response, query):
    if len(response) < 20:
        return 0.3
    elif any(word.lower() in response.lower() for word in query.split()):
        return 0.8
    else:
        return 0.5

In [12]:
def monitored_rag_query(query):
    global previous_embedding

    start_time = time.time()

    # Check cache first
    if query in cache:
        metrics["cache_hits"] += 1
        result = cache[query]
        logger.info(f"📋 Cache HIT for query: {query[:50]}...")
    else:
        metrics["cache_misses"] += 1

        # Execute RAG pipeline
        result = qa_chain({"query": query})
        cache[query] = result
        logger.info(f"🔍 Cache MISS - Executed RAG for: {query[:50]}...")

    # Calculate latency
    latency = time.time() - start_time
    metrics["retrieval_latency"].append(latency)

    # Calculate embedding drift
    query_embedding = embeddings.embed_query(query)
    drift = calculate_embedding_drift(previous_embedding, query_embedding)
    metrics["embedding_drift"].append(drift)
    previous_embedding = query_embedding

    # Simple response quality scoring
    quality = simple_response_quality_score(result["result"], query)
    metrics["response_quality"].append(quality)

    # Log metrics
    logger.info(f"⏱️ Latency: {latency:.3f}s | 📊 Drift: {drift:.3f} | ⭐ Quality: {quality:.3f}")

    return result

In [13]:
# Test queries
test_queries = [
    "What is retrieval-augmented generation?",
    "How does RAG work with vector databases?",
    "What are the benefits of using RAG?",
    "What is retrieval-augmented generation?",  # Duplicate to test cache
    "Explain the RAG architecture components"
]

print("🚀 Running monitored RAG queries...\n")

for query in test_queries:
    print(f"❓ Query: {query}")
    result = monitored_rag_query(query)
    print(f"💬 Answer: {result['result'][:200]}...\n")
    print("-" * 50)

🚀 Running monitored RAG queries...

❓ Query: What is retrieval-augmented generation?


  result = qa_chain({"query": query})


💬 Answer: **Retrieval‑augmented generation (RAG)** is a technique that couples a **retrieval component** with a **generative language model** so that the model can pull in up‑to‑date or domain‑specific informat...

--------------------------------------------------
❓ Query: How does RAG work with vector databases?
💬 Answer: **Retrieval‑Augmented Generation (RAG) and vector databases**

1. **Turn the source material into vectors**  
   * The raw data you want the model to be able to cite (documents, web pages, knowledge‑g...

--------------------------------------------------
❓ Query: What are the benefits of using RAG?
💬 Answer: **Benefits of using Retrieval‑Augmented Generation (RAG)**  

1. **Reduced need for retraining** – Because the model can fetch up‑to‑date information at inference time, you don’t have to constantly re...

--------------------------------------------------
❓ Query: What is retrieval-augmented generation?
💬 Answer: **Retrieval‑augmented generation (RAG)** is a te

In [14]:
def display_monitoring_dashboard():
    print("📊 RAG MONITORING DASHBOARD")
    print("=" * 40)

    # Latency metrics
    avg_latency = np.mean(metrics["retrieval_latency"])
    max_latency = np.max(metrics["retrieval_latency"])
    min_latency = np.min(metrics["retrieval_latency"])

    print(f"⏱️ RETRIEVAL LATENCY:")
    print(f"   Average: {avg_latency:.3f}s")
    print(f"   Min: {min_latency:.3f}s | Max: {max_latency:.3f}s")

    # Cache performance
    total_queries = metrics["cache_hits"] + metrics["cache_misses"]
    cache_hit_rate = metrics["cache_hits"] / total_queries * 100 if total_queries > 0 else 0

    print(f"\n📋 CACHE PERFORMANCE:")
    print(f"   Hit Rate: {cache_hit_rate:.1f}%")
    print(f"   Hits: {metrics['cache_hits']} | Misses: {metrics['cache_misses']}")

    # Embedding drift
    avg_drift = np.mean(metrics["embedding_drift"])
    max_drift = np.max(metrics["embedding_drift"])

    print(f"\n📊 EMBEDDING DRIFT:")
    print(f"   Average: {avg_drift:.3f}")
    print(f"   Max: {max_drift:.3f}")

    # Response quality
    avg_quality = np.mean(metrics["response_quality"])

    print(f"\n⭐ RESPONSE QUALITY:")
    print(f"   Average Score: {avg_quality:.3f}")

    print("\n" + "=" * 40)

# Display the dashboard
display_monitoring_dashboard()

📊 RAG MONITORING DASHBOARD
⏱️ RETRIEVAL LATENCY:
   Average: 1.278s
   Min: 0.000s | Max: 2.080s

📋 CACHE PERFORMANCE:
   Hit Rate: 20.0%
   Hits: 1 | Misses: 4

📊 EMBEDDING DRIFT:
   Average: 0.567
   Max: 0.805

⭐ RESPONSE QUALITY:
   Average Score: 0.800

