# Simple RAG Implementation with Constitution of Kenya

This notebook implements a simple Retrieval-Augmented Generation (RAG) system using the Constitution of Kenya 2010 PDF.

In [1]:
# Install required packages
!pip install pymupdf sentence-transformers faiss-cpu openai tiktoken


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Import libraries
import fitz  # PyMuPDF
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import re
from typing import List

print("Libraries imported successfully!")

  from .autonotebook import tqdm as notebook_tqdm


Libraries imported successfully!


In [3]:
# Load PDF and extract text
def load_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    
    for page in doc:
        page_text = page.get_text()
        # Clean up text
        page_text = re.sub(r'\s+', ' ', page_text)
        text += page_text + "\n\n"
    
    doc.close()
    return text.strip()

# Load the Constitution PDF
pdf_path = "/Users/pikachu/Downloads/llms-and-a-bit-more/Signed National-AI-Strategy_Final 26 Mar25.pdf"
strategy_text = load_pdf(pdf_path)

print(f"Document loaded: {len(strategy_text)} characters")
print(f"First 300 characters: {strategy_text[:300]}...")

Document loaded: 172217 characters
First 300 characters: KENYA ARTIFICIAL INTELLIGENCE STRATEGY 2025-2030 March 2025 



KENYA ARTIFICIAL INTELLIGENCE STRATEGY 2025-2030 March 2025 

4 / Kenya AI Strategy List of Abbreviations.�.�.�.�.�.�.�.�.�.�.�.�.�.�.�.�.6 EXECUTIVE SUMMAR.......................7 FOREWOR.........................................9...


In [4]:
# Split text into chunks
def split_text(text, chunk_size=1000, overlap=200):
    words = text.split()
    chunks = []
    
    for i in range(0, len(words), chunk_size - overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)
        
        if i + chunk_size >= len(words):
            break
    
    return chunks

# Create chunks
chunks = split_text(strategy_text)
print(f"Created {len(chunks)} chunks")
print(f"Sample chunk: {chunks[0][:200]}...")

Created 31 chunks
Sample chunk: KENYA ARTIFICIAL INTELLIGENCE STRATEGY 2025-2030 March 2025 KENYA ARTIFICIAL INTELLIGENCE STRATEGY 2025-2030 March 2025 4 / Kenya AI Strategy List of Abbreviations.�.�.�.�.�.�.�.�.�.�.�.�.�.�.�.�.6 ...


In [5]:
# Create embeddings
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')

print("Generating embeddings...")
embeddings = model.encode(chunks, show_progress_bar=True)

print(f"Embeddings shape: {embeddings.shape}")
print("Embeddings created successfully!")

Loading embedding model...
Generating embeddings...


Batches: 100%|██████████| 1/1 [00:00<00:00,  4.28it/s]

Embeddings shape: (31, 384)
Embeddings created successfully!





In [6]:
# Create vector database
print("Setting up vector database...")
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner product for similarity

# Normalize embeddings for cosine similarity
normalized_embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
index.add(normalized_embeddings.astype('float32'))

print(f"Vector database created with {index.ntotal} vectors")
print("Ready for similarity search!")

Setting up vector database...
Vector database created with 31 vectors
Ready for similarity search!


In [7]:
# Simple RAG query function
def query_strategy(question, k=3):
    """Query the strategy and return relevant chunks"""
    
    # Create embedding for the question
    question_embedding = model.encode([question])
    
    # Normalize question embedding
    question_embedding = question_embedding / np.linalg.norm(question_embedding)
    
    # Search for similar chunks
    scores, indices = index.search(question_embedding.astype('float32'), k)
    
    # Collect results
    results = []
    for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
        results.append({
            'chunk': chunks[idx],
            'score': float(score),
            'rank': i + 1
        })
    
    return results

print("RAG query function ready!")
print("Use: query_strategy on AI to search the Stratregy document")

RAG query function ready!
Use: query_strategy on AI to search the Stratregy document


In [8]:
# ask a sample question
sample_questions = [
    "What are the key objectives of the National AI Strategy?",
    "How does the strategy address ethical considerations in AI?",
    "What are the main challenges identified in the strategy?",
    "What is the role of public-private partnerships in the strategy?",
]

def query_strategy(question: str, top_k: int = 5):
    # Generate embedding for the question
    question_embedding = model.encode([question])[0]
    question_embedding = question_embedding / np.linalg.norm(question_embedding)  # Normalize
    
    # Search in the vector database
    distances, indices = index.search(np.array([question_embedding], dtype='float32'), top_k)
    
    results = []
    for i, idx in enumerate(indices[0]):
        results.append({
            'chunk': chunks[idx],
            'distance': distances[0][i]
        })
    
    return results

results = query_strategy("What are the key objectives of the National AI Strategy?")

In [9]:
# Fix the query_strategy function to match what display_results expects
def query_strategy(question: str, k: int = 5):
    """Query the strategy and return relevant chunks"""
    
    # Create embedding for the question
    question_embedding = model.encode([question])
    
    # Normalize question embedding
    question_embedding = question_embedding / np.linalg.norm(question_embedding)
    
    # Search for similar chunks
    scores, indices = index.search(question_embedding.astype('float32'), k)
    
    # Collect results
    results = []
    for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
        results.append({
            'chunk': chunks[idx],
            'score': float(score),  # This matches what display_results expects
            'rank': i + 1          # This matches what display_results expects
        })
    
    return results

# Function to display results nicely
def display_results(question, results):
    print(f"\n{'='*60}")
    print(f"QUESTION: {question}")
    print('='*60)
    
    for result in results:
        print(f"\nRank {result['rank']} (Score: {result['score']:.3f})")
        print(f"Text: {result['chunk'][:300]}...")
        print("-" * 40)

# Test with first question
question = sample_questions[0]
results = query_strategy(question)
display_results(question, results)


QUESTION: What are the key objectives of the National AI Strategy?

Rank 1 (Score: 0.650)
Text: As a leading hub for technology and innovation in Africa, Kenya is well positioned to provide leadership and set the pace for how AI can be applied to address our unique challenges and drive sustainable development. By integrating AI into critical sectors—including agriculture, healthcare, finance, ...
----------------------------------------

Rank 2 (Score: 0.599)
Text: potential of AI to improve the economic growth, productivity, and quality of life for the Mauritian state. The focal areas of the plan include matching existing and new AI solutions to specific sectors and regions, establishing a “Mauritian unique selling point” of AI, building an appropriate ecosys...
----------------------------------------

Rank 3 (Score: 0.582)
Text: / Kenya AI Strategy This strategy aims to be comprehensive, addressing multiple facets of AI development, adoption, and governance. The strategy will creat

In [10]:
# Install Azure AI Evaluation SDK for Azure AI Foundry evaluators
!pip install azure-ai-evaluation azure-identity python-dotenv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [11]:
import os
import json
from typing import Dict, List, Any
from azure.ai.evaluation import (
    GroundednessEvaluator,
    RelevanceEvaluator,
    RetrievalEvaluator,
    CoherenceEvaluator,
    FluencyEvaluator,
    evaluate
)
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Azure OpenAI configuration for evaluators
# You'll need to set these environment variables or replace with your actual values
model_config = {
    "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),  # e.g., "https://your-resource.openai.azure.com/"
    "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
    "azure_deployment": os.getenv("AZURE_OPENAI_DEPLOYMENT"),  # e.g., "gpt-4"
    "api_version": "2025-01-01-preview"
}

# Check if configuration is available
config_available = all([
    model_config["azure_endpoint"],
    model_config["api_key"], 
    model_config["azure_deployment"]
])

if config_available:
    print("✅ Azure OpenAI configuration loaded successfully!")
    print(f"Deployment: {model_config['azure_deployment']}")
    
    # Initialize evaluators
    try:
        groundedness_evaluator = GroundednessEvaluator(model_config=model_config)
        relevance_evaluator = RelevanceEvaluator(model_config=model_config)
        coherence_evaluator = CoherenceEvaluator(model_config=model_config)
        fluency_evaluator = FluencyEvaluator(model_config=model_config)
        print("✅ Azure AI Foundry evaluators initialized successfully!")
    except Exception as e:
        print(f"⚠️ Error initializing evaluators: {e}")
        print("Continuing without AI-assisted evaluators...")
        groundedness_evaluator = None
        relevance_evaluator = None
        coherence_evaluator = None
        fluency_evaluator = None
else:
    print("⚠️ Azure OpenAI configuration not found!")
    print("Please set the following environment variables:")
    print("Or modify the model_config dictionary above with your values.")
    print("Continuing without AI-assisted evaluators...")
    groundedness_evaluator = None
    relevance_evaluator = None
    coherence_evaluator = None
    fluency_evaluator = None

✅ Azure OpenAI configuration loaded successfully!
Deployment: gpt-4.1-mini
✅ Azure AI Foundry evaluators initialized successfully!


In [12]:
# Enhanced RAG query function with evaluation support and observability
class EnhancedRAGSystem:
    def __init__(self, chunks, index, model, evaluators=None):
        self.chunks = chunks
        self.index = index
        self.model = model
        self.evaluators = evaluators or {}
        self.query_history = []
        
    def query_with_context(self, question: str, k: int = 5) -> Dict[str, Any]:
        """
        Enhanced query function that returns data in Azure AI Foundry evaluation format
        """
        # Create embedding for the question
        question_embedding = self.model.encode([question])
        
        # Normalize question embedding
        question_embedding = question_embedding / np.linalg.norm(question_embedding)
        
        # Search for similar chunks
        scores, indices = self.index.search(question_embedding.astype('float32'), k)
        
        # Collect retrieved chunks
        retrieved_chunks = []
        for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
            retrieved_chunks.append({
                'chunk': self.chunks[idx],
                'score': float(score),
                'rank': i + 1,
                'chunk_id': int(idx)
            })
        
        # Create context from top chunks (concatenate all retrieved text)
        context = "\n\n".join([chunk['chunk'] for chunk in retrieved_chunks])
        
        # For now, we'll create a simple response (you can integrate with a generation model later)
        # This is a placeholder - in a full RAG system you'd use an LLM to generate the response
        response = f"Based on the retrieved information, here are the key points about '{question}': {context[:500]}..."
        
        # Structure data for Azure AI Foundry evaluation
        result = {
            'query': question,
            'response': response,
            'context': context,
            'retrieved_chunks': retrieved_chunks,
            'metadata': {
                'num_chunks_retrieved': k,
                'top_score': float(scores[0][0]) if len(scores[0]) > 0 else 0.0,
                'timestamp': str(np.datetime64('now'))
            }
        }
        
        # Store for observability
        self.query_history.append(result)
        
        return result
    
    def evaluate_query_result(self, result: Dict[str, Any]) -> Dict[str, float]:
        """
        Evaluate a query result using Azure AI Foundry evaluators
        """
        evaluation_scores = {}
        
        if not self.evaluators:
            print("⚠️ No evaluators available for evaluation")
            return evaluation_scores
            
        try:
            # Groundedness evaluation
            if 'groundedness' in self.evaluators and self.evaluators['groundedness']:
                try:
                    groundedness_result = self.evaluators['groundedness'](
                        query=result['query'],
                        response=result['response'],
                        context=result['context']
                    )
                    evaluation_scores['groundedness'] = groundedness_result.get('groundedness', 0)
                    evaluation_scores['groundedness_reason'] = groundedness_result.get('gpt_groundedness_reason', 'N/A')
                except Exception as e:
                    print(f"Error in groundedness evaluation: {e}")
            
            # Relevance evaluation
            if 'relevance' in self.evaluators and self.evaluators['relevance']:
                try:
                    relevance_result = self.evaluators['relevance'](
                        query=result['query'],
                        response=result['response']
                    )
                    evaluation_scores['relevance'] = relevance_result.get('relevance', 0)
                    evaluation_scores['relevance_reason'] = relevance_result.get('gpt_relevance_reason', 'N/A')
                except Exception as e:
                    print(f"Error in relevance evaluation: {e}")
            
            # Coherence evaluation
            if 'coherence' in self.evaluators and self.evaluators['coherence']:
                try:
                    coherence_result = self.evaluators['coherence'](
                        query=result['query'],
                        response=result['response']
                    )
                    evaluation_scores['coherence'] = coherence_result.get('coherence', 0)
                    evaluation_scores['coherence_reason'] = coherence_result.get('gpt_coherence_reason', 'N/A')
                except Exception as e:
                    print(f"Error in coherence evaluation: {e}")
            
            # Fluency evaluation
            if 'fluency' in self.evaluators and self.evaluators['fluency']:
                try:
                    fluency_result = self.evaluators['fluency'](
                        query=result['query'],
                        response=result['response']
                    )
                    evaluation_scores['fluency'] = fluency_result.get('fluency', 0)
                    evaluation_scores['fluency_reason'] = fluency_result.get('gpt_fluency_reason', 'N/A')
                except Exception as e:
                    print(f"Error in fluency evaluation: {e}")
            
        except Exception as e:
            print(f"Error during evaluation: {e}")
        
        return evaluation_scores
    
    def save_evaluation_dataset(self, filename: str = "rag_evaluation_dataset.jsonl"):
        """
        Save query history as JSONL for Azure AI Foundry evaluation
        """
        with open(filename, 'w') as f:
            for entry in self.query_history:
                # Format for Azure AI Foundry evaluation
                eval_entry = {
                    "query": entry['query'],
                    "response": entry['response'],
                    "context": entry['context']
                }
                f.write(json.dumps(eval_entry) + '\n')
        print(f"✅ Saved {len(self.query_history)} entries to {filename}")
    
    def get_observability_summary(self) -> Dict[str, Any]:
        """
        Get observability summary of the RAG system
        """
        if not self.query_history:
            return {"message": "No queries processed yet"}
        
        total_queries = len(self.query_history)
        avg_top_score = np.mean([q['metadata']['top_score'] for q in self.query_history])
        
        return {
            "total_queries": total_queries,
            "average_retrieval_score": avg_top_score,
            "latest_query": self.query_history[-1]['query'],
            "chunks_per_query": self.query_history[-1]['metadata']['num_chunks_retrieved']
        }

# Initialize the enhanced RAG system
evaluators_dict = {}
if config_available:
    if groundedness_evaluator:
        evaluators_dict['groundedness'] = groundedness_evaluator
    if relevance_evaluator:
        evaluators_dict['relevance'] = relevance_evaluator
    if coherence_evaluator:
        evaluators_dict['coherence'] = coherence_evaluator
    if fluency_evaluator:
        evaluators_dict['fluency'] = fluency_evaluator

enhanced_rag = EnhancedRAGSystem(
    chunks=chunks,
    index=index,
    model=model,
    evaluators=evaluators_dict
)

print("✅ Enhanced RAG system with Azure AI Foundry evaluators initialized!")
print(f"Available evaluators: {list(evaluators_dict.keys())}")

✅ Enhanced RAG system with Azure AI Foundry evaluators initialized!
Available evaluators: ['groundedness', 'relevance', 'coherence', 'fluency']


In [13]:
# Test the enhanced RAG system with evaluations
def test_query_with_evaluation(question: str):
    """Test a query and evaluate the results"""
    print(f"\n{'='*80}")
    print(f"🔍 TESTING QUERY: {question}")
    print('='*80)
    
    # Get query result
    result = enhanced_rag.query_with_context(question, k=3)
    
    # Display query results
    print(f"\n📝 RESPONSE:")
    print(f"{result['response'][:300]}...")
    
    print(f"\n📚 RETRIEVED CONTEXT:")
    for i, chunk in enumerate(result['retrieved_chunks'][:2]):  # Show top 2 chunks
        print(f"\nChunk {i+1} (Score: {chunk['score']:.3f}):")
        print(f"{chunk['chunk'][:200]}...")
    
    # Evaluate the result
    print(f"\n📊 EVALUATION RESULTS:")
    evaluation_scores = enhanced_rag.evaluate_query_result(result)
    
    if evaluation_scores:
        for metric, score in evaluation_scores.items():
            if not metric.endswith('_reason'):
                reason_key = f"{metric}_reason"
                reason = evaluation_scores.get(reason_key, "N/A")
                print(f"  • {metric.upper()}: {score}/5")
                if reason != "N/A":
                    print(f"    Reasoning: {reason[:100]}...")
    else:
        print("  ⚠️ No evaluation scores available (Azure OpenAI not configured)")
    
    return result, evaluation_scores

# Test with sample questions
print("🚀 Testing Enhanced RAG System with Azure AI Foundry Evaluators")

# Test 1: Key objectives
result1, scores1 = test_query_with_evaluation(sample_questions[0])

# Test 2: Ethical considerations  
result2, scores2 = test_query_with_evaluation(sample_questions[1])

🚀 Testing Enhanced RAG System with Azure AI Foundry Evaluators

🔍 TESTING QUERY: What are the key objectives of the National AI Strategy?

📝 RESPONSE:
Based on the retrieved information, here are the key points about 'What are the key objectives of the National AI Strategy?': As a leading hub for technology and innovation in Africa, Kenya is well positioned to provide leadership and set the pace for how AI can be applied to address our unique chal...

📚 RETRIEVED CONTEXT:

Chunk 1 (Score: 0.650):
As a leading hub for technology and innovation in Africa, Kenya is well positioned to provide leadership and set the pace for how AI can be applied to address our unique challenges and drive sustainab...

Chunk 2 (Score: 0.599):
potential of AI to improve the economic growth, productivity, and quality of life for the Mauritian state. The focal areas of the plan include matching existing and new AI solutions to specific sector...

📊 EVALUATION RESULTS:
  • GROUNDEDNESS: 4.0/5
  • RELEVANCE: 3.0

In [14]:
# Batch evaluation and observability dashboard
def run_batch_evaluation(questions: List[str], save_results: bool = True):
    """Run batch evaluation on multiple questions"""
    print(f"\n🔄 RUNNING BATCH EVALUATION ON {len(questions)} QUESTIONS")
    print("="*60)
    
    batch_results = []
    batch_scores = []
    
    for i, question in enumerate(questions, 1):
        print(f"\n[{i}/{len(questions)}] Processing: {question[:50]}...")
        
        try:
            # Get result
            result = enhanced_rag.query_with_context(question)
            
            # Evaluate
            scores = enhanced_rag.evaluate_query_result(result)
            
            batch_results.append(result)
            batch_scores.append(scores)
            
            # Show brief progress
            if scores:
                avg_score = np.mean([v for k, v in scores.items() if not k.endswith('_reason')])
                print(f"  ✅ Avg evaluation score: {avg_score:.2f}/5")
            else:
                print(f"  ⚠️ No evaluation scores (Azure OpenAI not configured)")
                
        except Exception as e:
            print(f"  ❌ Error: {e}")
            continue
    
    # Summary statistics
    print(f"\n📈 BATCH EVALUATION SUMMARY")
    print("="*40)
    
    if batch_scores and any(batch_scores):
        # Calculate average scores for each metric
        metrics = set()
        for score_dict in batch_scores:
            metrics.update([k for k in score_dict.keys() if not k.endswith('_reason')])
        
        for metric in metrics:
            scores_for_metric = [s.get(metric, 0) for s in batch_scores if s.get(metric, 0) > 0]
            if scores_for_metric:
                avg_score = np.mean(scores_for_metric)
                print(f"  • Average {metric.upper()}: {avg_score:.2f}/5")
    
    # Observability summary
    obs_summary = enhanced_rag.get_observability_summary()
    print(f"\n🔍 OBSERVABILITY SUMMARY")
    print("="*30)
    for key, value in obs_summary.items():
        print(f"  • {key.replace('_', ' ').title()}: {value}")
    
    # Save evaluation dataset
    if save_results:
        enhanced_rag.save_evaluation_dataset()
    
    return batch_results, batch_scores

# Run batch evaluation on all sample questions
batch_results, batch_scores = run_batch_evaluation(sample_questions)


🔄 RUNNING BATCH EVALUATION ON 4 QUESTIONS

[1/4] Processing: What are the key objectives of the National AI Str...
  ✅ Avg evaluation score: 3.75/5

[2/4] Processing: How does the strategy address ethical consideratio...
  ✅ Avg evaluation score: 3.00/5

[3/4] Processing: What are the main challenges identified in the str...
  ✅ Avg evaluation score: 1.50/5

[4/4] Processing: What is the role of public-private partnerships in...
  ✅ Avg evaluation score: 2.00/5

📈 BATCH EVALUATION SUMMARY
  • Average COHERENCE: 2.50/5
  • Average GROUNDEDNESS: 2.50/5
  • Average FLUENCY: 3.00/5
  • Average RELEVANCE: 2.25/5

🔍 OBSERVABILITY SUMMARY
  • Total Queries: 6
  • Average Retrieval Score: 0.5321155687173208
  • Latest Query: What is the role of public-private partnerships in the strategy?
  • Chunks Per Query: 5
✅ Saved 6 entries to rag_evaluation_dataset.jsonl


In [15]:
pip install azure-ai-projects

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [16]:
# Azure AI Foundry Cloud Evaluation Integration
def setup_cloud_evaluation():
    """
    Set up cloud evaluation with Azure AI Foundry
    Requires Azure AI Foundry project
    """
    try:
        from azure.ai.projects import AIProjectClient
        from azure.identity import DefaultAzureCredential
        
        # These should be set in your environment variables
        project_endpoint = os.getenv("AZURE_AI_PROJECT_ENDPOINT")  # e.g., "https://<project-name>.services.ai.azure.com/api/projects/<project-id>"
        
        if not project_endpoint:
            print("⚠️ Azure AI Foundry project endpoint not configured")
            print("Set AZURE_AI_PROJECT_ENDPOINT environment variable to use cloud evaluation")
            return None
        
        # Create project client
        credential = DefaultAzureCredential()
        project_client = AIProjectClient(
            endpoint=project_endpoint,
            credential=credential
        )
        
        print("✅ Azure AI Foundry project client initialized")
        return project_client
        
    except ImportError:
        print("⚠️ Azure AI Projects SDK not installed")
        print("Run: pip install azure-ai-projects")
        return None
    except Exception as e:
        print(f"⚠️ Error setting up cloud evaluation: {e}")
        return None

def run_cloud_evaluation(project_client, dataset_path: str = "rag_evaluation_dataset.jsonl"):
    """
    Run evaluation in Azure AI Foundry cloud
    """
    if not project_client:
        print("❌ No project client available for cloud evaluation")
        return None
    
    try:
        # This is a placeholder for cloud evaluation
        # The actual implementation would depend on your Azure AI Foundry project setup
        print(f"🌩️ Running cloud evaluation with dataset: {dataset_path}")
        print("📊 Results would be available in Azure AI Foundry portal")
        print("🔗 Check your Azure AI Foundry project for detailed evaluation results")
        
        # In a real implementation, you would:
        # 1. Upload the dataset to your Azure AI Foundry project
        # 2. Run the evaluation using the project client
        # 3. Monitor the results in the Azure AI Foundry portal
        
        return "cloud_evaluation_placeholder"
        
    except Exception as e:
        print(f"❌ Error running cloud evaluation: {e}")
        return None

# Set up cloud evaluation (optional)
project_client = setup_cloud_evaluation()

# Create monitoring and logging utilities
class RAGObservability:
    """Enhanced observability for RAG system"""
    
    def __init__(self, rag_system):
        self.rag_system = rag_system
        self.evaluation_history = []
    
    def log_evaluation(self, query: str, result: dict, scores: dict):
        """Log evaluation for monitoring"""
        log_entry = {
            'timestamp': str(np.datetime64('now')),
            'query': query,
            'response_length': len(result.get('response', '')),
            'context_length': len(result.get('context', '')),
            'num_retrieved_chunks': len(result.get('retrieved_chunks', [])),
            'scores': scores,
            'retrieval_score': result.get('metadata', {}).get('top_score', 0)
        }
        self.evaluation_history.append(log_entry)
    
    def get_performance_metrics(self):
        """Get performance metrics over time"""
        if not self.evaluation_history:
            return {}
        
        metrics = {}
        
        # Average evaluation scores
        all_scores = {}
        for entry in self.evaluation_history:
            for metric, score in entry['scores'].items():
                if not metric.endswith('_reason') and isinstance(score, (int, float)):
                    if metric not in all_scores:
                        all_scores[metric] = []
                    all_scores[metric].append(score)
        
        for metric, scores in all_scores.items():
            metrics[f'avg_{metric}'] = np.mean(scores)
            metrics[f'std_{metric}'] = np.std(scores)
        
        # System performance metrics
        metrics['avg_response_length'] = np.mean([e['response_length'] for e in self.evaluation_history])
        metrics['avg_context_length'] = np.mean([e['context_length'] for e in self.evaluation_history])
        metrics['avg_retrieval_score'] = np.mean([e['retrieval_score'] for e in self.evaluation_history])
        
        return metrics
    
    def export_monitoring_data(self, filename: str = "rag_monitoring.json"):
        """Export monitoring data for external analysis"""
        monitoring_data = {
            'system_info': {
                'total_chunks': len(self.rag_system.chunks),
                'embedding_model': 'all-MiniLM-L6-v2',
                'total_evaluations': len(self.evaluation_history)
            },
            'performance_metrics': self.get_performance_metrics(),
            'evaluation_history': self.evaluation_history
        }
        
        with open(filename, 'w') as f:
            json.dump(monitoring_data, f, indent=2)
        
        print(f"✅ Monitoring data exported to {filename}")

# Initialize observability
rag_observability = RAGObservability(enhanced_rag)

print("✅ Azure AI Foundry integration and observability setup complete!")
print("\n Available Features:")
print("  • Enhanced RAG with evaluation support")
print("  • Azure AI Foundry evaluators (Groundedness, Relevance, Coherence, Fluency)")
print("  • Batch evaluation capabilities")
print("  • Observability and monitoring")
print("  • Cloud evaluation integration (when configured)")
print("  • Evaluation dataset export (JSONL format)")
print("\n Next Steps:")
print("  1. Set Azure OpenAI environment variables for AI-assisted evaluation")
print("  2. Set AZURE_AI_PROJECT_ENDPOINT for cloud evaluation")
print("  3. Run queries and monitor performance metrics")
print("  4. Export evaluation datasets for further analysis")

✅ Azure AI Foundry project client initialized
✅ Azure AI Foundry integration and observability setup complete!

 Available Features:
  • Enhanced RAG with evaluation support
  • Azure AI Foundry evaluators (Groundedness, Relevance, Coherence, Fluency)
  • Batch evaluation capabilities
  • Observability and monitoring
  • Cloud evaluation integration (when configured)
  • Evaluation dataset export (JSONL format)

 Next Steps:
  1. Set Azure OpenAI environment variables for AI-assisted evaluation
  2. Set AZURE_AI_PROJECT_ENDPOINT for cloud evaluation
  3. Run queries and monitor performance metrics
  4. Export evaluation datasets for further analysis
